In [2]:
import pandas as pd
import numpy as np
import os

## Features

In [2]:
def calculate_technical_indicators(df, close_col='close', high_col='high', low_col='low', volume_col='volume'):
    """
    Calculate technical indicators for stock price analysis.
    
    Parameters:
    df (pandas.DataFrame): DataFrame with columns for close, high, low, and volume
    close_col (str): Name of closing price column
    high_col (str): Name of high price column
    low_col (str): Name of low price column
    volume_col (str): Name of volume column
    
    Returns:
    pandas.DataFrame: Original data with additional technical indicators
    """
    df = df.copy()
    
    # Trend Indicators
    # Moving Averages
    df['sma_5'] = df[close_col].rolling(window=5).mean()
    df['sma_20'] = df[close_col].rolling(window=20).mean()
    df['sma_50'] = df[close_col].rolling(window=50).mean()
    
    # Exponential Moving Average
    df['ema_12'] = df[close_col].ewm(span=12, adjust=False).mean()
    df['ema_26'] = df[close_col].ewm(span=26, adjust=False).mean()
    
    # MACD
    df['macd'] = df['ema_12'] - df['ema_26']
    df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']
    
    # Momentum Indicators
    # Relative Strength Index (RSI)
    delta = df[close_col].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['rsi'] = 100 - (100 / (1 + rs))
    
    # Stochastic Oscillator
    lookback = 14
    df['lowest_low'] = df[low_col].rolling(window=lookback).min()
    df['highest_high'] = df[high_col].rolling(window=lookback).max()
    df['stoch_k'] = 100 * (df[close_col] - df['lowest_low']) / (df['highest_high'] - df['lowest_low'])
    df['stoch_d'] = df['stoch_k'].rolling(window=3).mean()
    
    # Volatility Indicators
    # Bollinger Bands
    df['bb_middle'] = df[close_col].rolling(window=20).mean()
    df['bb_upper'] = df['bb_middle'] + 2 * df[close_col].rolling(window=20).std()
    df['bb_lower'] = df['bb_middle'] - 2 * df[close_col].rolling(window=20).std()
    
    # Average True Range (ATR)
    high_low = df[high_col] - df[low_col]
    high_close = np.abs(df[high_col] - df[close_col].shift())
    low_close = np.abs(df[low_col] - df[close_col].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    df['atr'] = true_range.rolling(14).mean()
    
    # Volume-based Indicators
    # On-Balance Volume (OBV)
    df['daily_ret'] = df[close_col].pct_change()
    df['obv'] = np.where(df['daily_ret'] > 0, df[volume_col], 
                        np.where(df['daily_ret'] < 0, -df[volume_col], 0)).cumsum()
    
    # Volume-Weighted Average Price (VWAP)
    df['vwap'] = (df[close_col] * df[volume_col]).cumsum() / df[volume_col].cumsum()
    
    # Price Rate of Change
    df['roc_5'] = df[close_col].pct_change(periods=5) * 100
    df['roc_20'] = df[close_col].pct_change(periods=20) * 100
    
    # Additional Derived Features
    df['price_volatility'] = df[close_col].rolling(window=20).std()
    df['volume_volatility'] = df[volume_col].rolling(window=20).std()
    
    return df

## Join with features and sentiment

In [59]:
apple_df_features = pd.read_csv('price/raw_with_features/AAPL.csv')
apple_df_prophet = pd.read_csv('price/raw_with_prophet/AAPL_prophet_predictions.csv')
apple_df_llama_sentiment = pd.read_csv('sentiments/AAPL_sentiment.csv')
apple_df_llama_sentiment.rename(columns={"date_of_tweets": "Date"}, inplace=True)

gemini_sentiments = pd.read_csv('sentiments/gemini_sentiment_predictions_all.csv')

apple_df_gemini_sentiment = gemini_sentiments[gemini_sentiments['ticker'] == 'AAPL']
apple_df_gemini_sentiment.rename(columns={"date_of_tweets": "Date"}, inplace=True)

apple_df_combined = apple_df_features.merge(apple_df_prophet[['Date', 'prophet_predicted_price']], on='Date', how='inner')
apple_df_combined = apple_df_combined.merge(apple_df_llama_sentiment[['Date', 'prediction', 'confidence']].rename(
        columns={'prediction': 'llama_sentiment', 'confidence': 'llama_sentiment_confidence'}
    ), on='Date', how='inner')
apple_df_combined = apple_df_combined.merge(apple_df_gemini_sentiment[['Date', 'prediction', 'confidence']].rename(
        columns={'prediction': 'gemini_sentiment', 'confidence': 'gemini_sentiment_confidence'}
    ), on='Date', how='inner')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple_df_gemini_sentiment.rename(columns={"date_of_tweets": "Date"}, inplace=True)


In [61]:
def create_df_with_features(ticker: str):
    """
    Loads and combines data for a given ticker from various sources.

    Parameters:
    ticker (str): The stock ticker (e.g., "AAPL").

    Returns:
    pd.DataFrame: A combined DataFrame containing features, predictions, and sentiments.
    """
    try:
        # Define file paths
        features_file = f'price/raw_with_features/{ticker}.csv'
        prophet_file = f'price/raw_with_prophet/{ticker}_prophet_predictions.csv'
        llama_sentiment_file = f'sentiments/{ticker}_sentiment.csv'
        gemini_sentiment_file = 'sentiments/gemini_sentiment_predictions_all.csv'

        # Check file existence
        for file_path in [features_file, prophet_file, llama_sentiment_file, gemini_sentiment_file]:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")

        # Load data
        features_df = pd.read_csv(features_file)
        prophet_df = pd.read_csv(prophet_file)
        llama_sentiment_df = pd.read_csv(llama_sentiment_file)
        gemini_sentiments_df = pd.read_csv(gemini_sentiment_file)

        # Standardize column names for consistency
        llama_sentiment_df.rename(columns={"date_of_tweets": "Date"}, inplace=True)
        gemini_sentiment_df = gemini_sentiments_df[gemini_sentiments_df['ticker'] == ticker]
        gemini_sentiment_df.rename(columns={"date_of_tweets": "Date"}, inplace=True)

        # Merge datasets
        combined_df = features_df.merge(
            prophet_df[['Date', 'prophet_predicted_price']], on='Date', how='inner'
        )
        combined_df = combined_df.merge(
            llama_sentiment_df[['Date', 'prediction', 'confidence']].rename(
                columns={'prediction': 'llama_sentiment', 'confidence': 'llama_sentiment_confidence'}
            ), on='Date', how='inner'
        )
        combined_df = combined_df.merge(
            gemini_sentiment_df[['Date', 'prediction', 'confidence']].rename(
                columns={'prediction': 'gemini_sentiment', 'confidence': 'gemini_sentiment_confidence'}
            ), on='Date', how='inner'
        )

        return combined_df

    except FileNotFoundError as e:
        print(f"Error: {e}")
        return pd.DataFrame()  # Return an empty DataFrame if files are missing
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on any other exception

In [63]:
AAPL_combined_df = create_df_with_features("AAPL")
KO_combined_df = create_df_with_features("KO")
TSLA_combined_df = create_df_with_features("TSLA")
V_combined_df = create_df_with_features("V")
XOM_combined_df = create_df_with_features("XOM")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gemini_sentiment_df.rename(columns={"date_of_tweets": "Date"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gemini_sentiment_df.rename(columns={"date_of_tweets": "Date"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gemini_sentiment_df.rename(columns={"date_of_tweets": "Date"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

In [64]:
os.makedirs("data", exist_ok=True)

AAPL_combined_df.to_csv("data/AAPL_combined.csv", index=False)
KO_combined_df.to_csv("data/KO_combined.csv", index=False)
TSLA_combined_df.to_csv("data/TSLA_combined.csv", index=False)
V_combined_df.to_csv("data/V_combined.csv", index=False)
XOM_combined_df.to_csv("data/XOM_combined.csv", index=False)

In [75]:
AAPL_combined_df['Close'].isna()    

0       75.087502
1       74.357498
2       74.949997
3       74.597504
4       75.797501
          ...    
747    132.369995
748    132.300003
749    135.449997
750    132.229996
751    131.860001
Name: Close, Length: 752, dtype: float64

## Train Test Split

In [71]:
from sklearn.model_selection import TimeSeriesSplit

In [85]:
def save_timeseries_splits(data, ticker, n_splits=5):
    """
    Performs Time Series Split on the training part of data, reserves last 30% of data as a test set.
    
    Args:
        data: DataFrame with features and target for time-series analysis
        ticker: The stock ticker name to organize data into its own folder
        n_splits: Number of splits for TimeSeriesSplit
    """
    
    # Determine the split for training-validation and test set
    test_size = int(len(data) * 0.3)  # Reserve last 30% for testing
    train_val_data = data.iloc[:-test_size]  # First 70% for training-validation
    test_data = data.iloc[-test_size:]  # Last 30% for testing

    # Set up time-series split
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Directory to save splits
    split_dir = os.path.join('data_split', ticker)
    os.makedirs(split_dir, exist_ok=True)

    # Perform splits
    for i, (train_index, val_index) in enumerate(tscv.split(train_val_data), 1):
        # Split the data
        train_df = train_val_data.iloc[train_index]
        val_df = train_val_data.iloc[val_index]

        # Save training and validation split
        split_subdir = os.path.join(split_dir, f"split_{i:02d}")
        os.makedirs(split_subdir, exist_ok=True)
        
        train_df.to_csv(os.path.join(split_subdir, 'train.csv'), index=False)
        val_df.to_csv(os.path.join(split_subdir, 'val.csv'), index=False)

        print(f"Saved split {i} for {ticker} at {split_subdir}")
    
    # Save the test set
    test_data_dir = os.path.join(split_dir, "test")
    os.makedirs(test_data_dir, exist_ok=True)
    test_data.to_csv(os.path.join(test_data_dir, 'test.csv'), index=False)
    print(f"Saved test set for {ticker} at {test_data_dir}")

In [86]:
tickers = ['AAPL', 'KO', 'TSLA', 'V', 'XOM']
n_splits = 5

for ticker in tickers:
    file_path = f"data/{ticker}_combined.csv"
    if os.path.exists(file_path):
        data = pd.read_csv(file_path)
        save_timeseries_splits(data, ticker, n_splits)
    else:
        print(f"File {file_path} not found.")

Saved split 1 for AAPL at data_split/AAPL/split_01
Saved split 2 for AAPL at data_split/AAPL/split_02
Saved split 3 for AAPL at data_split/AAPL/split_03
Saved split 4 for AAPL at data_split/AAPL/split_04
Saved split 5 for AAPL at data_split/AAPL/split_05
Saved test set for AAPL at data_split/AAPL/test
Saved split 1 for KO at data_split/KO/split_01
Saved split 2 for KO at data_split/KO/split_02
Saved split 3 for KO at data_split/KO/split_03
Saved split 4 for KO at data_split/KO/split_04
Saved split 5 for KO at data_split/KO/split_05
Saved test set for KO at data_split/KO/test
Saved split 1 for TSLA at data_split/TSLA/split_01
Saved split 2 for TSLA at data_split/TSLA/split_02
Saved split 3 for TSLA at data_split/TSLA/split_03
Saved split 4 for TSLA at data_split/TSLA/split_04
Saved split 5 for TSLA at data_split/TSLA/split_05
Saved test set for TSLA at data_split/TSLA/test
Saved split 1 for V at data_split/V/split_01
Saved split 2 for V at data_split/V/split_02
Saved split 3 for V at da