In [1]:
import os
import pickle
import traceback
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import (
    Input, Dense, LSTM, Conv1D, MaxPooling1D, concatenate,
    Flatten, Dropout, Bidirectional, Activation, RepeatVector,
    Permute, TimeDistributed, multiply
)
from tensorflow.keras.losses import Huber as huber_loss
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer  # News sentiment analysis

# Stock symbols to process
STOCK_SYMBOLS = ["AAPL", "TSLA", "UNP", "META", "MSFT", "NVDA", "XOM", "INTC", "AVGO", "PLTR", "NXPI"]

# NewsAPI credential. The NEWS_API_KEY environment variable takes precedence so
# the secret does not have to live in the notebook.
# FIXME(security): the literal fallback below is a credential committed to the
# notebook; it is kept only so existing runs keep working. Rotate the key and
# remove the fallback once the environment variable is configured.
NEWS_API_KEY = os.environ.get("NEWS_API_KEY") or "2025548ba34a4294a0f3c18c36311f39"

In [None]:
# Function to fetch news data
def fetch_news(api_key, query):
    """Query the NewsAPI ``/v2/everything`` endpoint for articles matching ``query``.

    Args:
        api_key: NewsAPI key, sent as the ``apiKey`` query parameter.
        query: Search term, sent as the ``q`` query parameter.

    Returns:
        The decoded JSON payload when the response status is 200 and its
        ``articles`` entry is non-empty. In every other case (non-200 status,
        empty article list, or any exception) a placeholder dict
        ``{"status": "error", "totalResults": 0, "articles": []}`` is returned,
        so callers can always read ``["articles"]``.
    """
    url = "https://newsapi.org/v2/everything"
    # Log only the query: printing the full URL would put the API key into the
    # notebook output.
    print(f"Fetching news for query: {query}")
    try:
        # `params` lets requests URL-encode the query; the timeout keeps a
        # stalled connection from blocking the whole notebook run.
        response = requests.get(
            url,
            params={"q": query, "apiKey": api_key},
            timeout=10,
        )

        if response.status_code == 200:
            news_data = response.json()
            if news_data.get('articles'):
                return news_data
        else:
            print(f"Failed to fetch news for {query}: {response.status_code}")

    except Exception as e:
        message = str(e)
        # Exception text from the HTTP layer can contain the request URL, which
        # includes the key; redact it before printing.
        if api_key:
            message = message.replace(str(api_key), "***")
        print(f"Error fetching news: {message}")
    return {"status": "error", "totalResults": 0, "articles": []}

def preprocess_news(articles):
    """Turn raw article dicts into one mean sentiment score per publication date.

    Each article needs a truthy ``publishedAt`` value (its first 10 characters
    are parsed as ``YYYY-MM-DD``) and at least one of ``title`` /
    ``description``. The available text fields are joined with a space and
    scored with VADER; the ``compound`` score is kept.

    Returns:
        DataFrame with columns ``date`` and ``sentiment`` (mean compound score
        per date). An empty frame with those two columns is returned when there
        are no articles or none of them could be scored.
    """
    if not articles:
        return pd.DataFrame(columns=['date', 'sentiment'])

    scorer = SentimentIntensityAnalyzer()
    rows = []

    for item in articles:
        try:
            published = item.get('publishedAt')
            if not published:
                continue

            # Only the calendar date is kept; the time portion is discarded.
            day = datetime.strptime(published[:10], '%Y-%m-%d')
            title = item.get('title', '')
            description = item.get('description', '')

            if not (title or description):
                continue

            # Skip whichever of the two fields is empty/None before joining.
            text = ' '.join(part for part in (title, description) if part)
            compound = scorer.polarity_scores(text)['compound']
            rows.append({'date': day, 'sentiment': compound})

        except (KeyError, ValueError) as e:
            print(f"Error processing article: {str(e)}")
            continue

    if not rows:
        return pd.DataFrame(columns=['date', 'sentiment'])

    scored = pd.DataFrame(rows)
    daily_mean = scored.groupby('date')['sentiment'].mean()
    return daily_mean.reset_index()

def calculate_rsi(prices, periods=14):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Args:
        prices: pandas Series of prices.
        periods: Rolling window length used for both averages.

    Returns:
        Series of ``100 - 100 / (1 + avg_gain / avg_loss)``. Positions whose
        window is not yet full are NaN.
    """
    change = prices.diff()

    # Non-positive (and NaN) changes count as zero gain; non-negative (and NaN)
    # changes count as zero loss.
    upward = change.where(change > 0, 0)
    downward = -change.where(change < 0, 0)

    avg_gain = upward.rolling(window=periods).mean()
    avg_loss = downward.rolling(window=periods).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

def add_technical_indicators(df, window=10):
    """Return a copy of ``df`` extended with technical-indicator columns.

    Args:
        df: DataFrame with at least ``Close`` and ``Volume`` columns. It is not
            modified; all new columns are added to a copy.
        window: Rolling window for volatility, moving averages and Bollinger
            Bands.

    Returns:
        New DataFrame with these columns appended, in order: ``volatility``,
        ``ma``, ``volume_ma``, ``price_momentum``, ``rsi``, ``bb_middle``,
        ``bb_upper``, ``bb_lower``, ``macd``, ``macd_signal``, ``macd_hist``.
        Missing values are forward-filled and then back-filled, so the leading
        warm-up rows take the first later valid value of each column.
    """
    # Work on a copy: the caller's frame must not pick up half-filled columns
    # as a side effect.
    enriched = df.copy()
    close = enriched['Close']

    # The rolling mean/std of Close feed both the plain indicators and the
    # Bollinger Bands, so compute them once.
    rolling_close = close.rolling(window=window)
    rolling_mean = rolling_close.mean()
    rolling_std = rolling_close.std()

    # Stock indicators for better predictions
    enriched['volatility'] = rolling_std
    enriched['ma'] = rolling_mean
    enriched['volume_ma'] = enriched['Volume'].rolling(window=window).mean()
    enriched['price_momentum'] = close.pct_change(periods=5)

    # RSI (fixed at 10 periods, independent of `window`)
    enriched['rsi'] = calculate_rsi(close, periods=10)

    # Bollinger Bands: rolling mean +/- 2 rolling standard deviations
    enriched['bb_middle'] = rolling_mean
    enriched['bb_upper'] = rolling_mean + (rolling_std * 2)
    enriched['bb_lower'] = rolling_mean - (rolling_std * 2)

    # MACD: 12-span EMA minus 26-span EMA, with a 9-span EMA signal line
    fast_ema = close.ewm(span=12, adjust=False).mean()
    slow_ema = close.ewm(span=26, adjust=False).mean()
    enriched['macd'] = fast_ema - slow_ema
    enriched['macd_signal'] = enriched['macd'].ewm(span=9, adjust=False).mean()
    enriched['macd_hist'] = enriched['macd'] - enriched['macd_signal']

    return enriched.ffill().bfill()

def fetch_and_save_data(ticker, news_api_key):
    """Download price history and news sentiment for one ticker.

    Ensures the ``data`` directory exists, downloads daily prices from
    2021-01-01 up to today's date via yfinance, flattens MultiIndex columns,
    adds technical indicators, then fetches and scores news for the ticker.
    Despite the name, nothing is written to disk here.

    Args:
        ticker: Symbol passed to yfinance and used as the news query.
        news_api_key: Key forwarded to ``fetch_news``.

    Returns:
        ``(stock_frame, news_frame)`` on success; ``(None, None)`` when the
        download is empty or any exception occurs (the traceback is printed).
    """
    try:
        Path("data").mkdir(exist_ok=True)

        print(f"\nFetching data for {ticker}")

        # Get stock data
        today = datetime.now().strftime('%Y-%m-%d')
        prices = yf.download(ticker, start="2021-01-01", end=today)

        if prices.empty:
            return None, None

        # Collapse MultiIndex columns to their first-level names
        if isinstance(prices.columns, pd.MultiIndex):
            prices.columns = [column_key[0] for column_key in prices.columns]

        # Move the date index into a regular, datetime-typed 'Date' column
        prices = prices.reset_index()
        prices['Date'] = pd.to_datetime(prices['Date'])

        print(f"Processed columns: {prices.columns.tolist()}")

        # Add technical indicators
        prices = add_technical_indicators(prices)

        # Get and process news data
        news_payload = fetch_news(news_api_key, query=ticker)
        articles = news_payload.get('articles', [])
        daily_sentiment = preprocess_news(articles)

        return prices, daily_sentiment

    except Exception as e:
        print(f"Error in fetch_and_save_data: {str(e)}")
        traceback.print_exc()
        return None, None


    
# Fetch price and news frames for every configured ticker; tickers whose
# fetch returned None for either frame are left out of data_dict.
data_dict = {}
for ticker in STOCK_SYMBOLS:
    stock_data, news_data = fetch_and_save_data(ticker, NEWS_API_KEY)
    if stock_data is None or news_data is None:
        continue
    data_dict[ticker] = (stock_data, news_data)

In [3]:
def merge_data(stock_data, news_data):
    """Left-join daily news sentiment onto the stock frame by date.

    Args:
        stock_data: DataFrame with a ``Date`` column (MultiIndex columns are
            flattened to their first level). Not modified.
        news_data: DataFrame with ``date`` and ``sentiment`` columns. Not
            modified.

    Returns:
        New DataFrame containing every stock row plus a float ``sentiment``
        column; rows without matching news get 0 (neutral). Returns ``None``
        when either input is ``None`` or when merging fails (the traceback is
        printed).
    """
    if stock_data is None or news_data is None:
        print("Stock data or news data is None")
        return None

    try:
        # Operate on copies so the caller's frames keep their original columns
        # and dtypes.
        stock_frame = stock_data.copy()
        news_frame = news_data.copy()

        # Handle MultiIndex columns
        if isinstance(stock_frame.columns, pd.MultiIndex):
            stock_frame.columns = [col[0] for col in stock_frame.columns]

        # Ensure both join keys are datetime
        stock_frame['Date'] = pd.to_datetime(stock_frame['Date'])
        news_frame['date'] = pd.to_datetime(news_frame['date'])

        # Merge on date, keeping every stock row
        merged_data = pd.merge(
            stock_frame,
            news_frame,
            left_on='Date',
            right_on='date',
            how='left'
        )

        # Fill missing sentiment with 0 (neutral sentiment). Coercing to numeric
        # first keeps the column float even when the news frame was empty and
        # carried an object-dtype 'sentiment' column.
        merged_data['sentiment'] = (
            pd.to_numeric(merged_data['sentiment'], errors='coerce').fillna(0.0)
        )

        # Drop the redundant right-hand join key
        if 'date' in merged_data.columns:
            merged_data = merged_data.drop(columns='date')

        return merged_data

    except Exception as e:
        print(f"Error in merge_data: {str(e)}")
        traceback.print_exc()
        return None
    


# Merge sentiment into each ticker's price frame and persist the result as
# data/cleaned_<ticker>_stock_data.csv; failed merges are skipped.
merged_data_dict = {}
for ticker, (stock_data, news_data) in data_dict.items():
    merged_data = merge_data(stock_data, news_data)
    if merged_data is None:
        continue
    merged_data.to_csv(f"data/cleaned_{ticker.lower()}_stock_data.csv", index=False)
    merged_data_dict[ticker] = merged_data

In [None]:
# Load a cleaned per-ticker CSV and build sliding-window training sequences.
def prepare_sets(stock_file, sequence_length=60):  # Consistently use 60 days
    """Build scaled sliding-window inputs and next-step Close targets.

    All feature columns are min-max scaled together. Window ``i`` covers rows
    ``i .. i + sequence_length - 1`` and its target is the scaled ``Close`` of
    row ``i + sequence_length``.

    NOTE: the scaler is fit on every row of the file, so any train/test split
    made afterwards uses scaling statistics that include the held-out rows.

    Args:
        stock_file: Path of the CSV to read.
        sequence_length: Number of consecutive rows per input window.

    Returns:
        ``(X_stock, X_sentiment, y, scaler)`` where ``X_stock`` has shape
        ``(n_windows, sequence_length, 16)``, ``X_sentiment`` has shape
        ``(n_windows, sequence_length, 1)``, ``y`` has shape ``(n_windows,)``
        and ``scaler`` is the fitted ``MinMaxScaler``. On any error (missing
        file or columns, too few rows, ...) a message is printed and
        ``(None, None, None, None)`` is returned.
    """
    try:
        stock_data = pd.read_csv(stock_file)
        print(f"Data columns available: {stock_data.columns.tolist()}")

        # Ensure all required columns
        feature_columns = [
            'Open', 'High', 'Low', 'Close', 'Volume',
            'volatility', 'rsi', 'ma', 'volume_ma',
            'price_momentum', 'bb_upper', 'bb_middle', 'bb_lower',
            'macd', 'macd_signal', 'macd_hist',
            'sentiment'
        ]

        print(f"Features needed: {feature_columns}")

        missing_columns = [name for name in feature_columns if name not in stock_data.columns]
        if missing_columns:
            raise KeyError(f"missing required columns: {missing_columns}")

        # At least one full window plus one target row is required; otherwise
        # the resulting arrays would be empty and lack the expected 3-D shape.
        n_windows = len(stock_data) - sequence_length
        if n_windows <= 0:
            raise ValueError(
                f"need more than {sequence_length} rows to build sequences, "
                f"got {len(stock_data)}"
            )

        # Scale all features together
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(stock_data[feature_columns])

        # Derive column positions from the feature list rather than
        # hard-coding them, so reordering the list cannot silently mis-slice.
        sentiment_idx = feature_columns.index('sentiment')
        close_idx = feature_columns.index('Close')
        stock_idx = [pos for pos, name in enumerate(feature_columns) if name != 'sentiment']

        stock_block = scaled_data[:, stock_idx]
        sentiment_block = scaled_data[:, [sentiment_idx]]

        X_stock = np.array([stock_block[i:i + sequence_length] for i in range(n_windows)])
        X_sentiment = np.array([sentiment_block[i:i + sequence_length] for i in range(n_windows)])
        # Target is the scaled Close of the row right after each window
        y = np.array(scaled_data[sequence_length:, close_idx])

        print(f"Final shapes: X_stock={X_stock.shape}, X_sentiment={X_sentiment.shape}")

        return X_stock, X_sentiment, y, scaler

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None, None, None, None
    


# Build model-ready sequences for each ticker from its cleaned CSV; tickers
# for which prepare_sets returned any None are skipped.
prepared_data = {}
for symbol in STOCK_SYMBOLS:
    stock_file = f"data/cleaned_{symbol.lower()}_stock_data.csv"
    print(f"\nProcessing sets for {symbol}")

    X_stock, X_sentiment, y, scaler = prepare_sets(stock_file, sequence_length=60)
    if any(part is None for part in (X_stock, X_sentiment, y, scaler)):
        continue
    prepared_data[symbol] = (X_stock, X_sentiment, y, scaler)

In [None]:
def build_model(sequence_length, n_features):
    """Build and compile the two-input CNN/LSTM/attention regression model.

    The stock branch runs three parallel causal Conv1D layers over the stock
    features, pools and concatenates them, feeds the result through a
    bidirectional LSTM, re-weights its time steps with a softmax attention
    vector and summarises them with a second LSTM. The sentiment branch applies
    a per-time-step Dense layer followed by an LSTM. Both branches, plus the
    flattened CNN features, are concatenated and passed through dense layers to
    a single linear output unit.

    Args:
        sequence_length: Number of time steps per input sample.
        n_features: Number of feature channels in the stock input.

    Returns:
        A compiled ``Model`` taking ``[stock_input, sentiment_input]`` with
        shapes ``(sequence_length, n_features)`` and ``(sequence_length, 1)``,
        using Adam (learning rate 0.001), Huber loss and MSE/MAE metrics.
    """
    stock_input = Input(shape=(sequence_length, n_features))
    
    # Multiple time scales: three parallel causal convolutions whose receptive
    # fields, (kernel_size - 1) * dilation_rate + 1, span 1, 3 and 17 time
    # steps respectively. The daily/weekly/monthly names are labels only.
    conv1_daily = Conv1D(64, kernel_size=1, padding='causal', activation='relu')(stock_input)
    conv1_weekly = Conv1D(64, kernel_size=2, padding='causal', dilation_rate=2, activation='relu')(stock_input)
    conv1_monthly = Conv1D(64, kernel_size=5, padding='causal', dilation_rate=4, activation='relu')(stock_input)
    
    # Reduce dims: max-pool each branch along the time axis with pool size 2
    pool_size = 2
    pool_daily = MaxPooling1D(pool_size=pool_size)(conv1_daily)
    pool_weekly = MaxPooling1D(pool_size=pool_size)(conv1_weekly)
    pool_monthly = MaxPooling1D(pool_size=pool_size)(conv1_monthly)
    
    # Merge the three branches along the channel axis (3 x 64 filters)
    cnn_merged = concatenate([pool_daily, pool_weekly, pool_monthly], axis=2)
    
    # LSTM - long-term dependencies; return_sequences=True keeps one output
    # per time step so attention can weight them
    lstm = Bidirectional(LSTM(100, return_sequences=True))(cnn_merged)
    
    # Self-attention: one tanh score per time step, softmax-normalised across
    # time, then repeated/permuted to the LSTM output's shape so it can be
    # multiplied element-wise with it
    attention = Dense(1, activation='tanh')(lstm)
    attention = Flatten()(attention)
    attention = Activation('softmax')(attention)
    attention = RepeatVector(lstm.shape[-1])(attention)
    attention = Permute([2, 1])(attention)
    
    # NOTE: despite its name, `sent_lstm` is the attention-weighted *stock*
    # branch; the sentiment branch is `sent_lstm_2` below.
    sent_lstm = multiply([lstm, attention])
    sent_lstm = LSTM(100)(sent_lstm)
    
    # Sentiment: per-step Dense projection followed by an LSTM summary
    sentiment_input = Input(shape=(sequence_length, 1))
    sent_dense = TimeDistributed(Dense(32, activation='relu'))(sentiment_input)
    sent_lstm_2 = LSTM(50)(sent_dense)
    
    # Combine raw CNN features, attended stock summary and sentiment summary
    combined = concatenate([Flatten()(cnn_merged), sent_lstm, sent_lstm_2])
    
    dense1 = Dense(128, activation='relu')(combined)
    dense2 = Dense(64, activation='relu')(dense1)
    # Skip connection by concatenation: dense1 is passed forward alongside dense2
    residual = concatenate([dense1, dense2])
    
    dropout = Dropout(0.3)(residual)
    # Single linear unit: the regression target
    output = Dense(1)(dropout)
    
    model = Model(inputs=[stock_input, sentiment_input], outputs=output)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=huber_loss(),
        metrics=['mse', 'mae']
    )
    
    return model

def train_model_with_validation(X_stock, X_sentiment, y, symbol,
                                train_frac=0.7, val_frac=0.15,
                                epochs=50, batch_size=32):
    """Train a fresh model for ``symbol`` using a single hold-out validation split.

    The samples are split in order: the first ``train_frac`` share is used for
    fitting and the next ``val_frac`` share for validation. Any samples after
    that are not used by this function.

    Args:
        X_stock: Stock-feature windows, indexed as
            ``(samples, sequence_length, n_features)``.
        X_sentiment: Sentiment windows aligned with ``X_stock``.
        y: Targets aligned with ``X_stock``.
        symbol: Ticker, used to name the checkpoint file
            ``models/<symbol>_best_model.keras``.
        train_frac: Fraction of samples used for training.
        val_frac: Fraction of samples used for validation.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        ``(model, history)``: the trained model and the object returned by
        ``model.fit``.

    Raises:
        ValueError: If the split leaves the training or validation set empty.
    """
    n_samples = len(X_stock)
    train_size = int(n_samples * train_frac)
    val_size = int(n_samples * val_frac)
    if train_size == 0 or val_size == 0:
        raise ValueError(
            f"Not enough samples to split for {symbol}: "
            f"{n_samples} samples -> train={train_size}, val={val_size}"
        )
    val_end = train_size + val_size

    # Identical slicing for stock features, sentiment and targets
    X_stock_train, X_stock_val = X_stock[:train_size], X_stock[train_size:val_end]
    X_sentiment_train, X_sentiment_val = X_sentiment[:train_size], X_sentiment[train_size:val_end]
    y_train, y_val = y[:train_size], y[train_size:val_end]

    model = build_model(X_stock.shape[1], X_stock.shape[2])

    # The checkpoint callback writes into models/; make sure the directory
    # exists here instead of relying on a later notebook cell to create it.
    Path("models").mkdir(parents=True, exist_ok=True)

    callbacks = [
        EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
            mode='min'
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=0.0001
        ),
        ModelCheckpoint(
            f'models/{symbol}_best_model.keras',
            monitor='val_loss',
            save_best_only=True
        )
    ]

    # Train with a fixed hold-out validation set (this is not cross-validation)
    history = model.fit(
        [X_stock_train, X_sentiment_train],
        y_train,
        validation_data=([X_stock_val, X_sentiment_val], y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        verbose=1
    )

    return model, history



models = {}
scalers = {}
results = {}

for symbol, (X_stock, X_sentiment, y, scaler) in prepared_data.items():
    # Split data: the first 80% of the sequences go to training, the rest is
    # held out for the final evaluation.
    split_idx = int(0.8 * len(X_stock))
    arrays_by_name = {'X_stock': X_stock, 'X_sentiment': X_sentiment, 'y': y}
    train_data = {name: values[:split_idx] for name, values in arrays_by_name.items()}
    test_data = {name: values[split_idx:] for name, values in arrays_by_name.items()}

    try:
        # Training
        model, history = train_model_with_validation(
            train_data['X_stock'],
            train_data['X_sentiment'],
            train_data['y'],
            symbol
        )

        # Evaluate on the held-out portion
        test_inputs = [test_data['X_stock'], test_data['X_sentiment']]
        test_loss = model.evaluate(test_inputs, test_data['y'], verbose=0)

        # Save results
        results[symbol] = {
            'test_loss': test_loss,
            'model': model,
            'scaler': scaler,
            'test_data': test_data
        }

        print(f"{symbol} Test Loss: {test_loss}")

    except Exception as e:
        # A failure for one symbol is reported and the loop moves on to the next.
        print(f"Error processing {symbol}: {str(e)}")

In [None]:
Path("models").mkdir(exist_ok=True)

# Persist each trained model and its fitted scaler as pickle files.
# NOTE(review): whether a Keras model pickles cleanly depends on the installed
# Keras version - confirm. Only load these .pkl files from trusted sources,
# since unpickling can execute arbitrary code.
for symbol, result in results.items():
    artifact_targets = (
        (f"models/{symbol}_model.pkl", 'model'),
        (f"models/{symbol}_scaler.pkl", 'scaler'),
    )
    for target_path, result_key in artifact_targets:
        with open(target_path, "wb") as f:
            pickle.dump(result[result_key], f)
    print(f"Saved model and scaler for {symbol}")