Let's begin, amigos!

In [21]:
import logging
import os
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
import torch
from bs4 import BeautifulSoup
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModel,
)


In [22]:
class NewsArticle:
    """Simple record describing one scraped news article.

    Attributes are stored exactly as passed in; no validation or parsing
    is performed on any of them.
    """

    def __init__(self, title: str, content: str, date: str, source: str, url: str):
        # Headline and body text.
        self.title, self.content = title, content
        # Publication date string and the name of the originating site.
        self.date, self.source = date, source
        # Link to the article as supplied by the caller.
        self.url = url


In [23]:
class FinancialNewsScraper:
    """Best-effort scraper for the landing pages of a few financial news sites.

    Each configured source is fetched once and its HTML is handed to a method
    named ``_parse_<source>``. Only a Reuters parser is defined in this class;
    sources without a parser are skipped with a warning instead of being
    fetched. All network and parsing failures are logged and never propagate
    to the caller.
    """

    # Base used to resolve relative article links found on the Reuters page.
    REUTERS_BASE_URL = 'https://reuters.com'

    def __init__(self, timeout: float = 10.0):
        """Set up request headers and the list of sources.

        Args:
            timeout: Seconds to wait on each HTTP request before giving up.
                Without a timeout ``requests.get`` can block indefinitely.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.sources = {
            'reuters': 'https://www.reuters.com/markets/companies',
            'marketwatch': 'https://www.marketwatch.com/markets',
            'investing': 'https://www.investing.com/news/stock-market-news'
        }
        self.timeout = timeout

    def scrape_news(self, days_back: int = 7) -> List[NewsArticle]:
        """Scrape every configured source and return all articles found.

        Args:
            days_back: Forwarded to ``_scrape_source``. It is currently not
                used for filtering because no publication dates are parsed.

        Returns:
            A flat list of ``NewsArticle`` objects; empty if nothing could be
            scraped.
        """
        articles = []
        for source, url in self.sources.items():
            try:
                articles.extend(self._scrape_source(source, url, days_back))
                time.sleep(2)  # Polite delay between sources
            except Exception as e:
                logging.error(f"Error scraping {source}: {str(e)}")
        return articles

    def _scrape_source(self, source: str, url: str, days_back: int) -> List[NewsArticle]:
        """Fetch one source page and run the matching parser on it.

        Args:
            source: Key from ``self.sources``; selects ``_parse_<source>``.
            url: Page to download.
            days_back: Accepted for interface compatibility; unused.

        Returns:
            Articles produced by the parser, or an empty list when the source
            has no parser or the request/parsing failed.
        """
        # Look the parser up first so that sources without one do not cost a
        # network round-trip and do not surface as an AttributeError.
        parser = getattr(self, f"_parse_{source}", None)
        if parser is None:
            logging.warning(f"No parser implemented for source '{source}'; skipping")
            return []

        articles = []
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Treat HTTP error responses as failures rather than parsing the
            # error page as if it were the news listing.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            articles.extend(parser(soup))
        except Exception as e:
            logging.error(f"Error in _scrape_source for {source}: {str(e)}")

        return articles

    def _parse_reuters(self, soup: BeautifulSoup) -> List[NewsArticle]:
        """Build articles from every ``<article>`` element in the parsed page.

        The title is taken from the first ``<h3>`` and the link from the first
        ``<a>`` inside each element. Elements missing either one are skipped.
        The article date is set to today's date because no date is read from
        the page.
        """
        articles = []
        for article in soup.find_all('article'):
            try:
                heading = article.find('h3')
                anchor = article.find('a')
                if heading is None or anchor is None or not anchor.get('href'):
                    # Not a headline card we can use; nothing to report.
                    continue

                title = heading.text.strip()
                # urljoin handles both relative and already-absolute hrefs,
                # unlike plain string concatenation with the site root.
                link = urljoin(self.REUTERS_BASE_URL, anchor['href'])
                article_content = self._get_article_content(link)
                date = datetime.now().strftime('%Y-%m-%d')  # no date is parsed from the page
                articles.append(NewsArticle(title, article_content, date, 'Reuters', link))
            except Exception as e:
                logging.error(f"Error parsing Reuters article: {str(e)}")
        return articles

    def _get_article_content(self, url: str) -> str:
        """Download ``url`` and return the text of all ``<p>`` tags joined by spaces.

        Returns an empty string if the request fails, times out, or returns an
        HTTP error status.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            paragraphs = soup.find_all('p')
            return ' '.join(p.text.strip() for p in paragraphs)
        except Exception as e:
            logging.error(f"Error getting article content: {str(e)}")
            return ""

In [24]:
class FinancialDataset(Dataset):
    """Sliding-window dataset over time-ordered rows.

    Item ``i`` is ``(window, target)`` where ``window`` holds rows
    ``i`` to ``i + sequence_length - 1`` and ``target`` is the value at
    ``target_column`` in the row immediately following the window.

    Args:
        data: Array of rows; converted to a float32 tensor. The indexing in
            ``__getitem__`` requires at least two dimensions.
        sequence_length: Number of consecutive rows in each window.
        target_column: Column index of the value to predict.
    """

    def __init__(self, data: np.ndarray, sequence_length: int, target_column: int):
        self.data = torch.FloatTensor(data)
        self.sequence_length = sequence_length
        self.target_column = target_column

    def __len__(self):
        # Clamp at zero: a negative length makes len() raise ValueError when
        # there are fewer rows than one full window plus its target row.
        return max(len(self.data) - self.sequence_length, 0)

    def __getitem__(self, idx):
        """Return the ``(window, target)`` pair at position ``idx``.

        Negative indices count from the end, as with a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            # Without this, a negative idx would slice an empty or misaligned
            # window instead of failing or wrapping around.
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for dataset of length {size}")

        window_end = idx + self.sequence_length
        x = self.data[idx:window_end]
        y = self.data[window_end][self.target_column]
        return x, y


In [25]:
import torch
import logging
import yfinance as yf
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

class StockPredictor:
    """Wraps a Hugging Face time-series model and a sentiment classifier.

    The class loads tabular indicator data from CSV, builds PyTorch data
    loaders, trains the time-series model with an MSE objective, produces
    predictions, and scores text sentiment with a separate classifier.
    """

    # Environment variable consulted when no access token is passed explicitly.
    HF_TOKEN_ENV_VAR = "HF_TOKEN"

    def __init__(self,
                 time_series_model_name: str = "huggingface/TimeSeriesTransformer",
                 sentiment_model_name: str = "ProsusAI/finbert",
                 api_access_token: Optional[str] = None):
        """Load both models and create the feature scaler.

        Args:
            time_series_model_name: Checkpoint passed to ``AutoModel``.
            sentiment_model_name: Checkpoint passed to
                ``AutoModelForSequenceClassification`` and ``AutoTokenizer``.
            api_access_token: Hugging Face access token. When omitted, the
                ``HF_TOKEN`` environment variable is used; if that is unset
                too, ``None`` is passed to ``from_pretrained``.
        """
        # SECURITY: an access token used to be hard-coded as this parameter's
        # default. Secrets must not live in source; that token should be
        # treated as leaked and revoked.
        token = api_access_token or os.environ.get(self.HF_TOKEN_ENV_VAR)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Load time series model for regression
        # NOTE(review): confirm this checkpoint name exists and that the
        # model it loads accepts a single feature tensor and exposes
        # `.logits`, as `_regress` below assumes.
        self.time_series_model = AutoModel.from_pretrained(time_series_model_name, token=token).to(self.device)

        # Load sentiment analysis model
        self.sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name, token=token).to(self.device)
        self.sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name, token=token)

        # Initialize scaler
        self.scaler = MinMaxScaler()

    def load_data_from_csv(self, file_path: str):
        """Load stock data from a CSV file and split it into features and target.

        Rows with an unparseable date or a non-numeric value in any expected
        column are dropped, and the remaining rows are sorted by date so the
        last rows are the most recent ones.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            ``(X, y)`` as NumPy arrays, where ``X`` holds every column except
            ``Target`` and ``y`` holds ``Target``. Returns ``(None, None)`` if
            the file cannot be read, columns are missing, or no valid rows
            remain; the reason is logged.
        """
        try:
            df = pd.read_csv(file_path)

            # Standardize column names (remove leading/trailing spaces)
            df.columns = df.columns.str.strip()

            # Expected columns
            expected_columns = [
                "date", "1. open", "2. high", "3. low", "4. close", "5. volume",
                "EMA", "Volume_Oscillator", "RSI", "%K", "%D", "+DI", "-DI", "ADX", "PVT", "Target"
            ]

            missing_cols = [col for col in expected_columns if col not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing columns in CSV file: {missing_cols}")

            df["date"] = pd.to_datetime(df["date"], errors="coerce")
            # Chronological order matters for the time-ordered split done in
            # prepare_data_for_training.
            df = df.dropna(subset=["date"]).set_index("date").sort_index()

            numeric_cols = expected_columns[1:]
            df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")
            # Coercion turns bad cells into NaN; keeping them would feed NaN
            # into the tensors and the loss.
            df = df.dropna(subset=numeric_cols)
            if df.empty:
                raise ValueError("No valid rows left after cleaning")

            X = df.drop(columns=["Target"]).values  # Features
            y = df["Target"].values  # Target variable

            return X, y

        except Exception as e:
            logging.error(f"Error loading CSV: {e}")
            return None, None

    def prepare_data_for_training(self, X, y, test_size=0.2, batch_size=32):
        """Split, scale and wrap the data in PyTorch data loaders.

        Args:
            X: Feature rows in chronological order.
            y: Target values aligned with ``X``.
            test_size: Fraction of rows (taken from the end) used for validation.
            batch_size: Batch size for both loaders.

        Returns:
            ``(train_loader, val_loader)``.
        """
        # shuffle=False keeps the validation rows strictly after the training
        # rows; a random split would let the model train on future data.
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, shuffle=False)

        # Normalize features; the scaler is fitted on training rows only
        X_train = self.scaler.fit_transform(X_train)
        X_val = self.scaler.transform(X_val)

        # Convert to tensors
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(self.device)
        y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(self.device)
        X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(self.device)
        y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(self.device)

        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        return train_loader, val_loader

    def _regress(self, batch_x):
        """Run the time-series model on a feature batch and return 1-D outputs."""
        # NOTE(review): relies on the model output having a `.logits`
        # attribute — confirm for the checkpoint in use.
        outputs = self.time_series_model(batch_x).logits
        # Drop only the trailing dimension so a batch of one stays 1-D
        # instead of collapsing to a scalar.
        return outputs.squeeze(-1)

    def _evaluate(self, loader, criterion):
        """Return the mean batch loss over ``loader``, or None if it is empty."""
        if loader is None or len(loader) == 0:
            return None

        self.time_series_model.eval()
        total_loss = 0.0
        with torch.no_grad():
            for batch_x, batch_y in loader:
                total_loss += criterion(self._regress(batch_x), batch_y).item()
        return total_loss / len(loader)

    def train_model(self, train_loader: DataLoader, val_loader: DataLoader, epochs: int = 10):
        """Train the time-series model with Adam and MSE loss.

        After each epoch the mean training loss is printed, together with the
        mean validation loss when ``val_loader`` yields any batches.

        Args:
            train_loader: Batches of ``(features, target)`` used for training.
            val_loader: Batches used only for evaluation; may be ``None``.
            epochs: Number of passes over ``train_loader``.
        """
        optimizer = torch.optim.Adam(self.time_series_model.parameters(), lr=1e-4)
        criterion = torch.nn.MSELoss()

        for epoch in range(epochs):
            self.time_series_model.train()
            train_loss = 0.0

            for batch_x, batch_y in train_loader:
                optimizer.zero_grad()

                loss = criterion(self._regress(batch_x), batch_y)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            # Guard against an empty loader, which would divide by zero.
            mean_train_loss = train_loss / max(len(train_loader), 1)
            mean_val_loss = self._evaluate(val_loader, criterion)

            if mean_val_loss is None:
                print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {mean_train_loss}")
            else:
                print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {mean_train_loss}, "
                      f"Validation Loss: {mean_val_loss}")

    def analyze_sentiment(self, text: str):
        """Classify the sentiment of ``text`` with the sentiment model.

        The input is truncated to 128 tokens.

        Returns:
            ``(prediction, probabilities)`` where ``prediction`` is the index
            of the most probable class and ``probabilities`` is a NumPy array
            of softmax scores. The index-to-label mapping is defined by the
            loaded model and is not visible here.
        """
        inputs = self.sentiment_tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128
        ).to(self.device)

        with torch.no_grad():
            outputs = self.sentiment_model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
            prediction = torch.argmax(probabilities, dim=1)

        return prediction.item(), probabilities.cpu().numpy()

    def predict_stock_price(self, X):
        """Predict targets for feature rows ``X`` using the trained model.

        ``X`` is scaled with the scaler fitted in
        ``prepare_data_for_training``, so that method must have been called
        first.

        Returns:
            NumPy array of model outputs, one per input row.
        """
        self.time_series_model.eval()

        X_scaled = self.scaler.transform(X)
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(self.device)

        with torch.no_grad():
            predictions = self._regress(X_tensor).cpu().numpy()

        return predictions


In [None]:

def main(file_path: Optional[str] = None):
    """Run the end-to-end demo: load data, train, predict, score one sentence.

    Args:
        file_path: CSV file with the feature data. When omitted, the
            ``FEATURES_CSV_PATH`` environment variable is used, falling back
            to the original author's local path.

    Any exception raised along the way, including while loading the models,
    is logged with its full traceback and a short notice is printed; nothing
    is re-raised.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    # The last fallback is machine-specific; pass file_path or set the
    # environment variable to run this anywhere else.
    predictor_file_path = (
        file_path
        or os.environ.get("FEATURES_CSV_PATH")
        or r"C:\Users\Ajay\Desktop\DOT-SLASH-\features.csv"
    )

    try:
        # Constructed inside the try block so that model download/load
        # failures are reported like every other error.
        predictor = StockPredictor()

        # Load stock data
        X, y = predictor.load_data_from_csv(predictor_file_path)

        if X is None or y is None:
            logging.error("Failed to load data.")
            return

        # Prepare data loaders
        train_loader, val_loader = predictor.prepare_data_for_training(X, y)

        # Train model
        print("Training model...")
        predictor.train_model(train_loader, val_loader)

        # Predict stock price
        X_test = X[-10:]  # Example: last 10 rows for prediction
        predictions = predictor.predict_stock_price(X_test)

        print(f"\nPredicted Stock Prices: {predictions}")

        # Sentiment analysis example
        sentiment_result = predictor.analyze_sentiment("The stock market is experiencing a strong bullish trend.")
        print(f"\nSentiment Analysis: {sentiment_result}")

    except Exception as e:
        # logging.exception records the traceback; the bare message alone
        # does not show which call actually failed.
        logging.exception(f"Error in main execution: {str(e)}")
        print("An error occurred. Check logs for details.")


# Run the demo when executed as a script or as a notebook cell.
if __name__ == "__main__":
    main()


ERROR:root:Error in main execution:too many values to unpack (expected 2)


An error occurred. Check logs for details.
