Let's begin, amigos!

In [2]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModel,
)
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta
import logging
import time
from typing import List, Dict, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Cell 2: Define Classes for News Scraping
class NewsArticle:
    """Plain record describing one scraped news item.

    Attributes are stored exactly as passed in; no validation or parsing
    is performed.
    """

    def __init__(self, title: str, content: str, date: str, source: str, url: str):
        """Store the article fields.

        Args:
            title: Headline text.
            content: Body text of the article.
            date: Date string associated with the article.
            source: Name of the outlet the article came from.
            url: Link to the article.
        """
        self.title = title
        self.content = content
        self.date = date
        self.source = source
        self.url = url

    def __repr__(self) -> str:
        # The body text is left out on purpose: it can be very long and would
        # make logs and notebook output unreadable.
        return (
            f"{type(self).__name__}(title={self.title!r}, date={self.date!r}, "
            f"source={self.source!r}, url={self.url!r})"
        )

class FinancialNewsScraper:
    """Best-effort scraper for a fixed set of financial news pages.

    Each entry in ``self.sources`` is fetched once and handed to the method
    named ``_parse_<source>`` when such a method exists. Network and parsing
    failures are logged and the affected source or article is skipped, so
    the public API never raises because one site misbehaves.
    """

    def __init__(self, timeout: float = 10.0):
        """Configure request headers, source URLs and the HTTP timeout.

        Args:
            timeout: Seconds to wait on each HTTP request before giving up.
                Without a timeout a stalled server would block forever.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.sources = {
            'reuters': 'https://www.reuters.com/markets/companies',
            'marketwatch': 'https://www.marketwatch.com/markets',
            'investing': 'https://www.investing.com/news/stock-market-news'
        }
        self.timeout = timeout

    def scrape_news(self, days_back: int = 7) -> List[NewsArticle]:
        """Scrape every configured source and return all articles found.

        Args:
            days_back: Forwarded to ``_scrape_source``. It is currently not
                used for filtering: the only parser in this class stamps
                articles with the scrape date rather than a publish date.

        Returns:
            A flat list of ``NewsArticle`` objects; empty when nothing could
            be scraped.
        """
        articles = []
        for position, (source, url) in enumerate(self.sources.items()):
            if position:
                time.sleep(2)  # Polite delay between sources (not after the last one)
            try:
                articles.extend(self._scrape_source(source, url, days_back))
            except Exception as e:
                logging.error("Error scraping %s: %s", source, e)
        return articles

    def _scrape_source(self, source: str, url: str, days_back: int) -> List[NewsArticle]:
        """Fetch one listing page and parse it with the matching parser.

        Args:
            source: Key from ``self.sources``; selects ``_parse_<source>``.
            url: Listing page to download.
            days_back: Accepted for interface compatibility; unused here.

        Returns:
            Articles produced by the parser, or an empty list when the source
            has no parser or the request/parse fails.
        """
        parser = getattr(self, f"_parse_{source}", None)
        if parser is None:
            # Skip the download entirely instead of fetching a page and then
            # failing with an AttributeError on the missing parser method.
            logging.warning("No parser implemented for source %s; skipping", source)
            return []

        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Do not try to parse error pages (403/404/5xx) as listings.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            return list(parser(soup))
        except Exception as e:
            logging.error("Error in _scrape_source for %s: %s", source, e)
            return []

    def _parse_reuters(self, soup: BeautifulSoup) -> List[NewsArticle]:
        """Build articles from the ``<article>`` elements of a Reuters page.

        Elements without an ``<h3>`` heading or a link are skipped. The
        article date is the scrape date because no publish date is parsed.
        The stored ``url`` is the absolute article URL.
        """
        from urllib.parse import urljoin

        articles = []
        date = datetime.now().strftime('%Y-%m-%d')
        for article in soup.find_all('article'):
            heading = article.find('h3')
            anchor = article.find('a', href=True)
            if heading is None or anchor is None:
                continue  # Not a headline teaser; nothing to extract.
            try:
                title = heading.text.strip()
                # urljoin keeps absolute hrefs intact and resolves relative
                # ones, unlike plain string concatenation.
                article_url = urljoin("https://reuters.com", anchor['href'])
                article_content = self._get_article_content(article_url)
                articles.append(NewsArticle(title, article_content, date, 'Reuters', article_url))
            except Exception as e:
                logging.error("Error parsing Reuters article: %s", e)
        return articles

    def _get_article_content(self, url: str) -> str:
        """Download ``url`` and return the text of all its ``<p>`` elements.

        Returns:
            Paragraph texts joined by single spaces, or ``""`` when the page
            cannot be fetched or parsed (the error is logged).
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            return ' '.join(p.text.strip() for p in soup.find_all('p'))
        except Exception as e:
            logging.error("Error getting article content: %s", e)
            return ""

In [None]:
# Cell 3: Define Classes for Stock Prediction
class FinancialDataset(Dataset):
    """Sliding-window dataset over a 2-D array of time-ordered rows.

    Item ``i`` is the pair ``(x, y)`` where ``x`` is the block of
    ``sequence_length`` consecutive rows starting at row ``i`` and ``y`` is
    the value of ``target_column`` in the row immediately after that block.
    """

    def __init__(self, data: np.ndarray, sequence_length: int, target_column: int):
        """Store the data as a float tensor together with the window settings.

        Args:
            data: Array indexed as ``data[row][column]``.
            sequence_length: Number of rows in each input window.
            target_column: Column index of the value to predict.
        """
        self.data = torch.FloatTensor(data)
        self.sequence_length = sequence_length
        self.target_column = target_column

    def __len__(self):
        # Clamp at zero: with fewer rows than one window plus a target the
        # raw difference is negative, and len() raises on negative values.
        return max(0, len(self.data) - self.sequence_length)

    def __getitem__(self, idx):
        """Return ``(window, target)`` for window number ``idx``.

        Negative indices count from the end, as for sequences.

        Raises:
            IndexError: If ``idx`` is outside the valid window range.
        """
        size = len(self)
        if idx < 0:
            # Without this, a negative idx produced an empty or mis-aligned
            # slice instead of the expected window.
            idx = idx + size
        if not 0 <= idx < size:
            raise IndexError("FinancialDataset index out of range")
        x = self.data[idx:idx + self.sequence_length]
        y = self.data[idx + self.sequence_length][self.target_column]
        return x, y

class StockPredictor:
    """Bundle a time-series model, a sentiment classifier and data helpers.

    The instance owns the feature scaler, so the scaler fitted in
    ``prepare_data_for_training`` is the one later used for prediction.
    """

    def __init__(self,
                 time_series_model_name: str = "huggingface/time-series-transformer",
                 sentiment_model_name: str = "ProsusAI/finbert"):
        """Load both pretrained models onto the available device.

        Args:
            time_series_model_name: Checkpoint id passed to
                ``AutoModel.from_pretrained``.
            sentiment_model_name: Checkpoint id used for both the sequence
                classification model and its tokenizer.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # NOTE(review): train_model calls this model with a single
        # (batch, sequence, features) tensor and treats the result as a
        # tensor -- confirm the chosen checkpoint supports that call.
        self.time_series_model = AutoModel.from_pretrained(time_series_model_name).to(self.device)
        self.sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name).to(self.device)
        self.sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)

        # Initialize scaler and news scraper
        self.scaler = MinMaxScaler()
        self.news_scraper = FinancialNewsScraper()

    def load_data_from_csv(self, file_path: str) -> pd.DataFrame:
        """Load stock data from a CSV file and add moving averages.

        The file must contain ``Date`` and ``Close`` columns. Rows are
        indexed by date and sorted chronologically before the 20- and
        50-row simple moving averages of ``Close`` are computed, because a
        rolling mean over unsorted rows would mix unrelated days.

        Returns:
            The frame with ``SMA_20`` and ``SMA_50`` added and every row
            containing a missing value dropped (this removes the warm-up
            rows of the moving averages).
        """
        df = pd.read_csv(file_path)
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.set_index('Date').sort_index()

        close = df['Close']
        df['SMA_20'] = close.rolling(window=20).mean()
        df['SMA_50'] = close.rolling(window=50).mean()

        return df.dropna()

    def prepare_data_for_training(self, df: pd.DataFrame, sequence_length: int = 30,
                                  batch_size: int = 32,
                                  train_fraction: float = 0.8) -> Tuple[DataLoader, DataLoader]:
        """Scale the data and build chronological train/validation loaders.

        The split is chronological (earliest windows train, latest windows
        validate) and the scaler is fitted only on rows used by training
        windows. Consecutive windows overlap, so a random split or a scaler
        fitted on all rows would leak validation data into training.

        Args:
            df: Numeric frame in time order containing a ``Close`` column.
            sequence_length: Rows per input window.
            batch_size: Batch size of both loaders.
            train_fraction: Share of windows assigned to training.

        Returns:
            ``(train_loader, val_loader)``; only the training loader shuffles.

        Raises:
            ValueError: If ``df`` is too short to give at least one training
                and one validation window.
        """
        n_windows = len(df) - sequence_length
        if n_windows < 2:
            raise ValueError(
                f"Need at least {sequence_length + 2} rows for "
                f"sequence_length={sequence_length}, got {len(df)}"
            )
        # Keep at least one window on each side of the split.
        train_size = min(max(int(train_fraction * n_windows), 1), n_windows - 1)

        values = df.to_numpy(dtype=float)
        # Training windows 0..train_size-1 read rows up to and including
        # row train_size + sequence_length - 1 (the last training target).
        self.scaler.fit(values[:train_size + sequence_length])
        scaled_data = self.scaler.transform(values)

        dataset = FinancialDataset(
            data=scaled_data,
            sequence_length=sequence_length,
            target_column=df.columns.get_loc('Close'),
        )

        train_dataset = torch.utils.data.Subset(dataset, range(train_size))
        val_dataset = torch.utils.data.Subset(dataset, range(train_size, n_windows))

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        return train_loader, val_loader

    def train_model(self, train_loader: DataLoader, val_loader: DataLoader,
                    epochs: int = 10) -> Dict[str, List[float]]:
        """Train the time series model with Adam and mean-squared error.

        After every epoch the model is also evaluated on ``val_loader`` and
        both average losses are printed.

        Args:
            train_loader: Batches of ``(inputs, targets)`` used for updates.
            val_loader: Batches used only for evaluation; may be ``None``.
            epochs: Number of passes over ``train_loader``.

        Returns:
            ``{'train_loss': [...], 'val_loss': [...]}`` with one average per
            epoch. An average is ``nan`` when its loader yielded no batches.
        """
        optimizer = torch.optim.Adam(self.time_series_model.parameters())
        criterion = torch.nn.MSELoss()
        history: Dict[str, List[float]] = {'train_loss': [], 'val_loss': []}

        for epoch in range(epochs):
            self.time_series_model.train()
            running_loss = 0.0
            batches = 0

            for batch_x, batch_y in train_loader:
                batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)
                optimizer.zero_grad()
                outputs = self.time_series_model(batch_x)
                # reshape (rather than squeeze) keeps a batch of one as
                # shape (1,) so prediction and target shapes always match.
                loss = criterion(outputs.reshape(batch_y.shape), batch_y)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                batches += 1

            # Count batches manually so an empty loader cannot divide by zero.
            train_loss = running_loss / batches if batches else float('nan')
            val_loss = self._evaluate_loss(val_loader, criterion)
            history['train_loss'].append(train_loss)
            history['val_loss'].append(val_loss)

            print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {train_loss}, Validation Loss: {val_loss}')

        return history

    def _evaluate_loss(self, loader, criterion) -> float:
        """Return the mean loss of the time series model over ``loader``.

        Runs in eval mode without gradient tracking. Returns ``nan`` when
        ``loader`` is ``None`` or yields no batches.
        """
        if loader is None:
            return float('nan')

        self.time_series_model.eval()
        running_loss = 0.0
        batches = 0
        with torch.no_grad():
            for batch_x, batch_y in loader:
                batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)
                outputs = self.time_series_model(batch_x)
                running_loss += criterion(outputs.reshape(batch_y.shape), batch_y).item()
                batches += 1
        return running_loss / batches if batches else float('nan')

    def analyze_sentiment(self, text: str) -> Tuple[int, np.ndarray]:
        """Classify the sentiment of one piece of financial text.

        The text is truncated to 128 tokens before classification.

        Args:
            text: A single string to classify.

        Returns:
            ``(prediction, probabilities)`` where ``prediction`` is the index
            of the most probable class and ``probabilities`` is the softmax
            output as a NumPy array with one row for the input text. What
            each class index means is defined by the sentiment model's
            configuration, not by this method.
        """
        inputs = self.sentiment_tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128
        ).to(self.device)

        # Make sure dropout is disabled even if the model was switched to
        # training mode elsewhere.
        self.sentiment_model.eval()
        with torch.no_grad():
            logits = self.sentiment_model(**inputs).logits
            probabilities = torch.nn.functional.softmax(logits, dim=1)
            prediction = int(probabilities[0].argmax().item())

        return prediction, probabilities.cpu().numpy()


In [None]:
# Cell 4: Scrape news articles from the configured sources
news_scraper = FinancialNewsScraper()
news_articles = news_scraper.scrape_news(7)

# One summary line per article: "<date> - <source>: <title>".
for article in news_articles:
    print(
        f"{article.date} - {article.source}: {article.title}"
    )

In [None]:
# Cell 5: Load stock data from CSV, build the data loaders and train the model
# Point this at your own CSV file before running the cell.
file_path_to_csv = 'path/to/your/stock_data.csv'

predictor = StockPredictor()
df_stock_data = predictor.load_data_from_csv(file_path_to_csv)

# Split the loaded data into training and validation loaders.
train_loader, val_loader = predictor.prepare_data_for_training(df_stock_data)

# Fit the time series model; raise or lower the epoch count as needed.
print("Training model...")
predictor.train_model(train_loader, val_loader, epochs=10)


In [None]:
# Cell 6: Make Predictions with Sentiment Analysis (Combine Results)
def _positive_negative_indices(id2label):
    """Return ``(positive_idx, negative_idx)`` from a label map, else ``None``."""
    by_name = {str(name).lower(): int(idx) for idx, name in (id2label or {}).items()}
    if 'positive' in by_name and 'negative' in by_name:
        return by_name['positive'], by_name['negative']
    return None


def predict_stock_price_with_sentiment(predictor: StockPredictor,
                                        df_stock_data: pd.DataFrame,
                                        news_articles: List[NewsArticle],
                                        sequence_length: int = 30,
                                        sentiment_weight: float = 0.01) -> Dict:
    """Predict the next close price and nudge it by average news sentiment.

    The last ``sequence_length`` rows are scaled with ``predictor.scaler``
    and fed to the time series model. The first value of the model output is
    converted back from the scaler's range to price units before the
    sentiment adjustment, so the result is a price rather than a scaled
    value.

    Sentiment is scored per article on a 0..2 scale with 1 meaning neutral:
    ``1 + P(positive) - P(negative)``. The class positions are looked up in
    the sentiment model's ``config.id2label`` because the class order is
    model-specific. If that mapping has no positive/negative labels, the
    predicted class index is used as the score instead.

    Args:
        predictor: Object providing ``scaler``, ``device``,
            ``time_series_model``, ``sentiment_model`` and
            ``analyze_sentiment``. The scaler must already be fitted.
        df_stock_data: Frame with the columns the scaler was fitted on,
            including ``Close``.
        news_articles: Articles whose title and content are scored. May be
            empty, in which case sentiment is neutral.
        sequence_length: Number of most recent rows fed to the model.
        sentiment_weight: Relative price change applied per unit of
            sentiment away from neutral.

    Returns:
        Dict with ``'predicted_price'`` (sentiment-adjusted, price units) and
        ``'sentiment_score'`` (average score, 1 = neutral).

    Raises:
        ValueError: If ``df_stock_data`` has fewer than ``sequence_length`` rows.
    """
    if len(df_stock_data) < sequence_length:
        raise ValueError(
            f"Need at least {sequence_length} rows to predict, got {len(df_stock_data)}"
        )

    # Prepare input sequence from latest stock data.
    scaled_data = predictor.scaler.transform(df_stock_data.values)
    input_sequence = torch.FloatTensor(scaled_data[-sequence_length:]).unsqueeze(0).to(predictor.device)

    # Make prediction using the time series model.
    predictor.time_series_model.eval()
    with torch.no_grad():
        prediction_output = predictor.time_series_model(input_sequence)

    predicted_price_scaled = float(prediction_output.reshape(-1)[0].item())

    # Undo the min-max scaling of the Close column: the scaler computes
    # scaled = raw * scale_ + min_ per column.
    close_idx = df_stock_data.columns.get_loc('Close')
    predicted_price = (
        (predicted_price_scaled - predictor.scaler.min_[close_idx])
        / predictor.scaler.scale_[close_idx]
    )

    # Analyze sentiment from news articles.
    model_config = getattr(predictor.sentiment_model, 'config', None)
    label_indices = _positive_negative_indices(getattr(model_config, 'id2label', None))

    sentiment_scores = []
    for article in news_articles:
        predicted_class, probabilities = predictor.analyze_sentiment(
            article.title + " " + article.content
        )
        if label_indices is None:
            # Unknown label names: fall back to the raw class index.
            sentiment_scores.append(float(predicted_class))
        else:
            positive_idx, negative_idx = label_indices
            row = probabilities[0]
            sentiment_scores.append(1.0 + float(row[positive_idx]) - float(row[negative_idx]))

    # No news means neutral (1.0), not the most negative value.
    average_sentiment_score = float(np.mean(sentiment_scores)) if sentiment_scores else 1.0

    # Adjust prediction based on sentiment.
    predicted_price_adjusted = predicted_price * (1 + sentiment_weight * (average_sentiment_score - 1))

    return {
        'predicted_price': predicted_price_adjusted,
        'sentiment_score': average_sentiment_score,
    }

In [None]:

# Run the combined price/sentiment prediction on the objects built in the
# earlier cells, then report both numbers.
prediction_results = predict_stock_price_with_sentiment(
    predictor=predictor,
    df_stock_data=df_stock_data,
    news_articles=news_articles,
)

print(f"\nPrediction Results:")
print(
    f"Predicted Price: ${prediction_results['predicted_price']:.2f}"
)
print(
    f"Average Sentiment Score: {prediction_results['sentiment_score']:.2f}"
)