In [40]:
import pandas as pd
import numpy as np
import os
import json

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import TimeSeriesSplit

## Features

In [25]:
def calculate_technical_indicators(df, close_col='close', high_col='high', low_col='low', volume_col='volume'):
    """
    Calculate technical indicators for stock price analysis.
    
    Parameters:
    df (pandas.DataFrame): DataFrame with columns for close, high, low, and volume
    close_col (str): Name of closing price column
    high_col (str): Name of high price column
    low_col (str): Name of low price column
    volume_col (str): Name of volume column
    
    Returns:
    pandas.DataFrame: Original data with additional technical indicators
    """
    df = df.copy()
    
    # Trend Indicators
    # Moving Averages
    df['sma_5'] = df[close_col].rolling(window=5).mean()
    df['sma_20'] = df[close_col].rolling(window=20).mean()
    df['sma_50'] = df[close_col].rolling(window=50).mean()
    
    # Exponential Moving Average
    df['ema_12'] = df[close_col].ewm(span=12, adjust=False).mean()
    df['ema_26'] = df[close_col].ewm(span=26, adjust=False).mean()
    
    # MACD
    df['macd'] = df['ema_12'] - df['ema_26']
    df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']
    
    # Momentum Indicators
    # Relative Strength Index (RSI)
    delta = df[close_col].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['rsi'] = 100 - (100 / (1 + rs))
    
    # Stochastic Oscillator
    lookback = 14
    df['lowest_low'] = df[low_col].rolling(window=lookback).min()
    df['highest_high'] = df[high_col].rolling(window=lookback).max()
    df['stoch_k'] = 100 * (df[close_col] - df['lowest_low']) / (df['highest_high'] - df['lowest_low'])
    df['stoch_d'] = df['stoch_k'].rolling(window=3).mean()
    
    # Volatility Indicators
    # Bollinger Bands
    df['bb_middle'] = df[close_col].rolling(window=20).mean()
    df['bb_upper'] = df['bb_middle'] + 2 * df[close_col].rolling(window=20).std()
    df['bb_lower'] = df['bb_middle'] - 2 * df[close_col].rolling(window=20).std()
    
    # Average True Range (ATR)
    high_low = df[high_col] - df[low_col]
    high_close = np.abs(df[high_col] - df[close_col].shift())
    low_close = np.abs(df[low_col] - df[close_col].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    df['atr'] = true_range.rolling(14).mean()
    
    # Volume-based Indicators
    # On-Balance Volume (OBV)
    df['daily_ret'] = df[close_col].pct_change()
    df['obv'] = np.where(df['daily_ret'] > 0, df[volume_col], 
                        np.where(df['daily_ret'] < 0, -df[volume_col], 0)).cumsum()
    
    # Volume-Weighted Average Price (VWAP)
    df['vwap'] = (df[close_col] * df[volume_col]).cumsum() / df[volume_col].cumsum()
    
    # Price Rate of Change
    df['roc_5'] = df[close_col].pct_change(periods=5) * 100
    df['roc_20'] = df[close_col].pct_change(periods=20) * 100
    
    # Additional Derived Features
    df['price_volatility'] = df[close_col].rolling(window=20).std()
    df['volume_volatility'] = df[volume_col].rolling(window=20).std()
    
    return df

## Join with features and sentiment

In [26]:
# apple_df_features = pd.read_csv('price/raw_with_features/AAPL.csv')
# apple_df_prophet = pd.read_csv('price/raw_with_prophet/AAPL_prophet_predictions.csv')
# apple_df_llama_sentiment = pd.read_csv('sentiments/AAPL_sentiment.csv')
# apple_df_llama_sentiment.rename(columns={"date_of_tweets": "Date"}, inplace=True)

# gemini_sentiments = pd.read_csv('sentiments/gemini_sentiment_predictions_all.csv')

# apple_df_gemini_sentiment = gemini_sentiments[gemini_sentiments['ticker'] == 'AAPL']
# apple_df_gemini_sentiment.rename(columns={"date_of_tweets": "Date"}, inplace=True)

# apple_df_combined = apple_df_features.merge(apple_df_prophet[['Date', 'prophet_predicted_price']], on='Date', how='inner')
# apple_df_combined = apple_df_combined.merge(apple_df_llama_sentiment[['Date', 'prediction', 'confidence']].rename(
#         columns={'prediction': 'llama_sentiment', 'confidence': 'llama_sentiment_confidence'}
#     ), on='Date', how='inner')
# apple_df_combined = apple_df_combined.merge(apple_df_gemini_sentiment[['Date', 'prediction', 'confidence']].rename(
#         columns={'prediction': 'gemini_sentiment', 'confidence': 'gemini_sentiment_confidence'}
#     ), on='Date', how='inner')


In [27]:
def create_df_with_features(ticker: str):
    """
    Loads and combines data for a given ticker from various sources.

    Parameters:
    ticker (str): The stock ticker (e.g., "AAPL").

    Returns:
    pd.DataFrame: A combined DataFrame containing features, predictions, and sentiments.
    """
    try:
        # Define file paths
        features_file = f'price/raw_with_features/{ticker}.csv'
        prophet_file = f'price/raw_with_prophet_adj/{ticker}_prophet_predictions.csv'
        llama_sentiment_file = f'sentiments/{ticker}_sentiment.csv'
        gemini_sentiment_file = 'sentiments/gemini_sentiment_predictions_all.csv'

        # Check file existence
        for file_path in [features_file, prophet_file, llama_sentiment_file, gemini_sentiment_file]:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")

        # Load data
        features_df = pd.read_csv(features_file)
        prophet_df = pd.read_csv(prophet_file)
        llama_sentiment_df = pd.read_csv(llama_sentiment_file)
        gemini_sentiments_df = pd.read_csv(gemini_sentiment_file)

        # Standardize column names for consistency
        llama_sentiment_df.rename(columns={"date_of_tweets": "Date"}, inplace=True)
        gemini_sentiment_df = gemini_sentiments_df[gemini_sentiments_df['ticker'] == ticker]
        gemini_sentiment_df.rename(columns={"date_of_tweets": "Date"}, inplace=True)

        # Merge datasets
        combined_df = features_df.merge(
            prophet_df[['Date', 'prophet_predicted_price']], on='Date', how='inner'
        )
        combined_df = combined_df.merge(
            llama_sentiment_df[['Date', 'prediction', 'confidence']].rename(
                columns={'prediction': 'llama_sentiment', 'confidence': 'llama_sentiment_confidence'}
            ), on='Date', how='inner'
        )
        combined_df = combined_df.merge(
            gemini_sentiment_df[['Date', 'prediction', 'confidence']].rename(
                columns={'prediction': 'gemini_sentiment', 'confidence': 'gemini_sentiment_confidence'}
            ), on='Date', how='inner'
        )

        combined_df['pct_change_adj_close'] = combined_df['Adj Close'].pct_change() * 100 # Percentage change in adjusted price compared to the previous day
        combined_df['price_change_direction'] = combined_df['Adj Close'].diff().apply(lambda x: 1 if x >= 0 else -1) 

        # List of sentiment columns to shift
        sentiment_columns = ['prophet_predicted_price', 'llama_sentiment', 
                            'llama_sentiment_confidence', 'gemini_sentiment', 
                            'gemini_sentiment_confidence']

        # Shift the sentiment columns down by one row
        combined_df[sentiment_columns] = combined_df[sentiment_columns].shift(1)

        combined_df = combined_df.dropna()  # Drop rows with missing values - these values do not have enough historical data to calculate indicators
       
        combined_df = combined_df.iloc[1:]  # Drop the first row since we shifted the sentiment columns

        return combined_df

    except FileNotFoundError as e:
        print(f"Error: {e}")
        return pd.DataFrame()  # Return an empty DataFrame if files are missing
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on any other exception

In [28]:
AAPL_combined_df = create_df_with_features("AAPL")
KO_combined_df = create_df_with_features("KO")
TSLA_combined_df = create_df_with_features("TSLA")
V_combined_df = create_df_with_features("V")
XOM_combined_df = create_df_with_features("XOM")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gemini_sentiment_df.rename(columns={"date_of_tweets": "Date"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gemini_sentiment_df.rename(columns={"date_of_tweets": "Date"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gemini_sentiment_df.rename(columns={"date_of_tweets": "Date"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

In [29]:
os.makedirs("data", exist_ok=True)

AAPL_combined_df.to_csv("data/AAPL_combined.csv", index=False)
KO_combined_df.to_csv("data/KO_combined.csv", index=False)
TSLA_combined_df.to_csv("data/TSLA_combined.csv", index=False)
V_combined_df.to_csv("data/V_combined.csv", index=False)
XOM_combined_df.to_csv("data/XOM_combined.csv", index=False)

In [30]:
file_paths = ["data/AAPL_combined.csv", "data/KO_combined.csv", "data/TSLA_combined.csv", "data/V_combined.csv", "data/XOM_combined.csv"]
tickers = ["AAPL", "KO", "TSLA", "V", "XOM"]

# Read and combine data
dataframes = []
for file, ticker in zip(file_paths, tickers):
    df = pd.read_csv(file)
    df['ticker'] = ticker  # Add ticker column
    dataframes.append(df)

# Combine all dataframes
combined_df = pd.concat(dataframes, ignore_index=True)

# One-hot encode the ticker column
combined_df = pd.get_dummies(combined_df, columns=['ticker'], prefix='', prefix_sep='')
combined_df = combined_df.sort_values(by='Date')

# Save combined DataFrame (optional)
combined_df.to_csv("data/combined_data.csv", index=False)

## Train Test Split

In [31]:
def save_timeseries_splits(data, n_splits=5):
    """
    Performs Time Series Split on the training part of data, reserves last 30% of data as a test set.
    
    Args:
        data: DataFrame with features and target for time-series analysis
        ticker: The stock ticker name to organize data into its own folder
        n_splits: Number of splits for TimeSeriesSplit
    """
    
    # Determine the split for training-validation and test set
    test_size = int(len(data) * 0.3)  # Reserve last 30% for testing
    train_val_data = data.iloc[:-test_size]  # First 70% for training-validation
    test_data = data.iloc[-test_size:]  # Last 30% for testing

    # Set up time-series split
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Directory to save splits
    os.makedirs('data_split', exist_ok=True)

    # Perform splits
    for i, (train_index, val_index) in enumerate(tscv.split(train_val_data), 1):
        # Split the data
        train_df = train_val_data.iloc[train_index]
        val_df = train_val_data.iloc[val_index]

        # Save training and validation split
        split_subdir = os.path.join('data_split', f"split_{i:02d}")
        os.makedirs(split_subdir, exist_ok=True)
        
        train_df.to_csv(os.path.join(split_subdir, 'train.csv'), index=False)
        val_df.to_csv(os.path.join(split_subdir, 'val.csv'), index=False)

        print(f"Saved split {i} at {split_subdir}")
    
    # Save the test set
    test_data_dir = os.path.join('data_split', "test")
    os.makedirs(test_data_dir, exist_ok=True)
    test_data.to_csv(os.path.join(test_data_dir, 'test.csv'), index=False)
    print(f"Saved test set {test_data_dir}")

In [32]:
n_splits = 5

file_path = f"data/combined_data.csv"
if os.path.exists(file_path):
    data = pd.read_csv(file_path)
    save_timeseries_splits(data, n_splits)
else:
    print(f"File {file_path} not found.")

Saved split 1 at data_split/split_01
Saved split 2 at data_split/split_02
Saved split 3 at data_split/split_03
Saved split 4 at data_split/split_04
Saved split 5 at data_split/split_05
Saved test set data_split/test


In [33]:
def create_train_test_split(data):
    # Determine the split for training-validation and test set
    test_size = int(len(data) * 0.3)  # Reserve last 30% for testing
    train_val_data = data.iloc[:-test_size]  # First 70% for training-validation
    test_data = data.iloc[-test_size:]  # Last 30% for testing

    train_val_data_dir = os.path.join('data_split', "train_val")
    os.makedirs(train_val_data_dir, exist_ok=True)
    train_val_data.to_csv(os.path.join(train_val_data_dir, 'train_val.csv'), index=False)
    print(f"Saved train val set {train_val_data_dir}")

    test_data_dir = os.path.join('data_split', "test")
    os.makedirs(test_data_dir, exist_ok=True)
    test_data.to_csv(os.path.join(test_data_dir, 'test.csv'), index=False)
    print(f"Saved test set {test_data_dir}")

In [34]:
file_path = f"data/combined_data.csv"
data = pd.read_csv(file_path)
create_train_test_split(data)

Saved train val set data_split/train_val
Saved test set data_split/test


## Modeling

In [77]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
    "XGBoost": XGBRegressor(random_state=42, n_jobs=-1),
    "LightGBM": lgb.LGBMRegressor(random_state=42, num_threads=-1)
}

param_grids = {
    "Linear Regression": {},
    "Ridge": {
        "alpha": [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Regularization strength
    },
    "Lasso": {
        "alpha": [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Regularization strength
        "max_iter": [1000, 5000, 10000],  # Maximum number of iterations
    },
    # "Random Forest": {
    #     "n_estimators": [64, 128, 256],  # Number of trees in the forest
    #     "max_depth": [None, 4, 16, 32, 64],  # Maximum depth of the tree
    # },
    # "XGBoost": {
    #     "n_estimators": [64, 128, 256],  # Number of boosting rounds
    #     "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],  # Step size at each iteration
    #     "max_depth": [None, 4, 16, 32, 64],  # Maximum depth of the tree
    # },
    # "LightGBM": {
    #     "n_estimators": [64, 128, 256],  # Number of boosting rounds
    #     "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],  # Step size at each iteration
    #     "max_depth": [None, 4, 16, 32, 64],  # Maximum depth of the tree
    #     "num_leaves": [31, 50, 100],  # Maximum number of leaves in one tree
    # },
    "Random Forest": {
        "n_estimators": [64, 128, 256],  # Number of trees in the forest
        "max_depth": [None, 4, 16, 32, 64],  # Maximum depth of the tree
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "XGBoost": {
        "n_estimators": [64, 128, 256],  # Number of boosting rounds
        "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],  # Step size at each iteration
        "max_depth": [None, 4, 16],  # Maximum depth of the tree
        'gamma': [0, 0.1, 0.2],
        'min_child_weight': [1, 3, 5],
    },
    "LightGBM": {
        "n_estimators": [64, 128, 256],  # Number of boosting rounds
        "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],  # Step size at each iteration
        "max_depth": [None, 4, 16],  # Maximum depth of the tree
        "num_leaves": [15, 31, 50, 100],  # Maximum number of leaves in one tree
    }
}

# TimeSeriesSplit for automatic splitting
time_series_cv = TimeSeriesSplit(n_splits=5)

In [78]:
def train_and_evaluate_with_sentiment(train_val_df, model_name):
    # Select features and target
    X_columns = train_val_df.columns[train_val_df.columns.get_loc("Volume") + 1:-1]  # Select all columns after 'Volume', remove the last ticker column
    X_columns = X_columns.drop(['price_change_direction', 'pct_change_adj_close'])
    y_column = 'pct_change_adj_close'

    # TimeSeriesSplit for automatic splitting
    time_series_cv = TimeSeriesSplit(n_splits=5)    
    
    X = train_val_df[X_columns].values
    y = train_val_df[y_column].values
    
    # Initialize the model and parameter grid
    model = models[model_name]
    param_grid = param_grids[model_name]
    
    # Set up GridSearchCV with TimeSeriesSplit
    grid_search = GridSearchCV(model, param_grid, cv=time_series_cv, scoring='neg_mean_squared_error')
    
    # Fit the grid search (this will automatically split the data)
    grid_search.fit(X, y)
    
    # Best model and parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    # Predict using the best model found
    predictions = best_model.predict(X)
    
    # Evaluate performance (RMSE)
    rmse = np.sqrt(mean_squared_error(y, predictions))
    
    return rmse, best_params

In [79]:
train_val_file = '/Users/abhinavkrishnan/Documents/Fall 2024/Capstone/sn2/data_split/train_val/train_val.csv'
train_val_df = pd.read_csv(train_val_file)

model_results = {}
for model_name in models:
    print(model_name)
    rmse, best_params = train_and_evaluate_with_sentiment(train_val_df, model_name)
    model_results[model_name] = {
        'RMSE': rmse,
        'Best Params': best_params
    }

Linear Regression
Ridge
Lasso


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

Random Forest
XGBoost
LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3218
[LightGBM] [Info] Number of data points in the train set: 372, number of used features: 33
[LightGBM] [Info] Start training from score 0.496268
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6572
[LightGBM] [Info] Number of data points in the train set: 744, number of used features: 33
[LightGBM] [Info] Start training from score 0.369900
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6852
[LightGBM] [Info] Number of data points in the train set: 1116, number of used features: 33
[

In [90]:
model_results

{'Linear Regression': {'RMSE': 0.26721488239372626, 'Best Params': {}},
 'Ridge': {'RMSE': 0.2672245185587161, 'Best Params': {'alpha': 0.001}},
 'Lasso': {'RMSE': 0.27196235118520873,
  'Best Params': {'alpha': 0.001, 'max_iter': 5000}},
 'Random Forest': {'RMSE': 0.31219431849564916,
  'Best Params': {'max_depth': 4,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'n_estimators': 128}},
 'XGBoost': {'RMSE': 0.22431734235161024,
  'Best Params': {'gamma': 0.2,
   'learning_rate': 0.05,
   'max_depth': 4,
   'min_child_weight': 1,
   'n_estimators': 128}},
 'LightGBM': {'RMSE': 0.5014795377021516,
  'Best Params': {'learning_rate': 0.05,
   'max_depth': None,
   'n_estimators': 64,
   'num_leaves': 31}}}

In [91]:
with open('tuned_model_configs.json', 'w') as json_file:
    json.dump(model_results, json_file)

In [92]:
# # Set the directory where your splits are stored
# base_dir = 'data_split/'  # Replace with the actual path to your splits directory

# # Loop through the directory splits
# split_dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]

# # Function to train and evaluate the Random Forest model on a given split
# # def train_and_evaluate_with_sentiment(train_df, val_df):
# #     # Select features and target
# #     X_columns = train_df.columns[train_df.columns.get_loc("Volume") + 1:-1]  # Select all columns after 'Volume', remove the last ticker column
# #     y_column = "Open"
    
# #     X_train = train_df[X_columns].values
# #     y_train = train_df[y_column].values
# #     X_val = val_df[X_columns].values
# #     y_val = val_df[y_column].values
    
# #     # Initialize the Random Forest model
# #     model = RandomForestRegressor(n_estimators=100, random_state=42)
    
# #     # Train the model
# #     model.fit(X_train, y_train)
    
# #     # Predict on validation data
# #     predictions = model.predict(X_val)
    
# #     # Evaluate performance
# #     rmse = np.sqrt(mean_squared_error(y_val, predictions))
# #     return rmse

# def train_and_evaluate_with_sentiment(train_df, val_df, model_name):
#     # Select features and target
#     X_columns = train_df.columns[train_df.columns.get_loc("Volume") + 1:-1]  # Select all columns after 'Volume', remove the last ticker column
#     X_columns = X_columns.drop(['price_change_direction', 'pct_change_open'])
#     y_column = 'pct_change_open'
    
#     X_train = train_df[X_columns].values
#     y_train = train_df[y_column].values
#     X_val = val_df[X_columns].values
#     y_val = val_df[y_column].values
    
#     # Initialize the Random Forest model
#     model = models[model_name]
#     param_grid = param_grids[model_name]

#     if param_grid:
#         grid_search = GridSearchCV(model, param_grid)
    
#     # Train the model
#     model.fit(X_train, y_train)
    
#     # Predict on validation data
#     predictions = model.predict(X_val)
    
#     # Evaluate performance
#     rmse = np.sqrt(mean_squared_error(y_val, predictions))
#     return rmse

In [93]:
# # Loop through each split directory, load train/val, and evaluate the model
# for split in split_dirs:
#     train_path = os.path.join(base_dir, split, 'train.csv')  # Path to the training CSV
#     val_path = os.path.join(base_dir, split, 'val.csv')  # Path to the validation CSV
    
#     # Ensure both train and val files exist
#     if os.path.exists(train_path) and os.path.exists(val_path):
#         # Load train and val data
#         train_df = pd.read_csv(train_path)
#         val_df = pd.read_csv(val_path)
        
#         # Train and evaluate the Random Forest model
#         rmse = train_and_evaluate_with_sentiment(train_df, val_df)
#         print(f"Split: {split} -> RMSE: {rmse}")
#     else:
#         print(f"Skipping split {split} because train or val file is missing.")

In [94]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "LightGBM": lgb.LGBMRegressor(random_state=42)
}

# best_models = {'Linear Regression': {'RMSE': 2.0771440433879818, 'Best Params': {}},
#  'Ridge': {'RMSE': 2.188524030080895, 'Best Params': {'alpha': 1000}},
#  'Lasso': {'RMSE': 2.24708918748917,
#   'Best Params': {'alpha': 1, 'max_iter': 1000}},
#  'Random Forest': {'RMSE': 1.7603466363749978,
#   'Best Params': {'max_depth': 10,
#    'min_samples_leaf': 8,
#    'min_samples_split': 20,
#    'n_estimators': 300}},
#  'XGBoost': {'RMSE': 2.06736247346571,
#   'Best Params': {'learning_rate': 0.01,
#    'max_depth': 3,
#    'min_child_weight': 10,
#    'n_estimators': 200}},
#  'LightGBM': {'RMSE': 2.0790936710408663,
#   'Best Params': {'learning_rate': 0.01,
#    'max_depth': 3,
#    'n_estimators': 200,
#    'num_leaves': 31}}}

In [95]:
with open('tuned_model_configs.json', 'r') as json_file:
    best_models = json.load(json_file)

In [96]:
def evaluate_price_direction(trained_model, X_test, y_test):
    # Generate predictions for pct_change_open using the trained model
    pct_change_open_predictions = trained_model.predict(X_test)
    
    # Transform pct_change_open predictions to price change direction (+1 or -1)
    price_change_direction_predictions = np.where(pct_change_open_predictions > 0, 1, -1)
    
    # Calculate accuracy against the actual labels
    accuracy = accuracy_score(y_test, price_change_direction_predictions)
    
    return price_change_direction_predictions, accuracy

In [97]:
train_val_df = pd.read_csv('data_split/train_val/train_val.csv')
test_df = pd.read_csv('data_split/test/test.csv')

predictions = {}
trained_models = {}
price_direction_results = {}
tickers = ['AAPL', 'KO', 'TSLA', 'V', 'XOM']

for i, (model_name, model_info) in enumerate(best_models.items()):
    # Skip Linear Regression Model
    if model_name == 'Linear Regression':
            continue
        
    X_columns = test_df.columns[test_df.columns.get_loc("Volume") + 1:-1]  # Select all columns after 'Volume', remove the last ticker column
    X_columns = X_columns.drop(['price_change_direction', 'pct_change_adj_close'])
    y_column = 'pct_change_adj_close'

    X_train = train_val_df[X_columns].values
    y_train = train_val_df[y_column].values
    
    X_test = test_df[X_columns].values
    y_test = test_df[y_column].values

    # Get the instantiated model object
    model_instance = models[model_name]
    best_params = model_info['Best Params']

    print(model_instance)
    print(best_params)
    
    # Instantiate and fit the model
    model_instance.set_params(**best_params)
    model_instance.fit(X_train, y_train)
    
    # Save the trained model
    trained_models[model_name] = model_instance
    
    # Generate predictions on the test data
    y_pred = model_instance.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    predictions[model_name] = {}
    predictions[model_name]['Adjusted Closing Price Predictions'] = y_pred
    predictions[model_name]['Adjusted Closing Price RMSE'] = rmse

    # Call the evaluation function
    price_direction_preds, price_direction_acc = evaluate_price_direction(model_instance, X_test, test_df['price_change_direction'].values)

    predictions[model_name]['Price Direction Predictions'] = price_direction_preds
    predictions[model_name]['Price Direction Accuracy'] = price_direction_acc

    for ticker in tickers:
        ticker_mask = test_df[ticker]
        X_test_ticker = X_test[ticker_mask]
        test_df_ticker = test_df[ticker_mask]
        y_test_ticker = y_test[ticker_mask]
        y_pred_ticker = y_pred[ticker_mask]

        rmse_ticker = np.sqrt(mean_squared_error(y_test_ticker, y_pred_ticker))
        _, price_direction_acc_ticker = evaluate_price_direction(model_instance, X_test_ticker, test_df_ticker['price_change_direction'].values)

        predictions[model_name][f'Adjusted Closing Price RMSE {ticker}'] = rmse_ticker
        predictions[model_name][f'Price Direction Accuracy {ticker}'] = price_direction_acc_ticker

Ridge()
{'alpha': 0.001}
Lasso()
{'alpha': 0.001, 'max_iter': 5000}


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  model = cd_fast.enet_coordinate_descent(


RandomForestRegressor(random_state=42)
{'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 128}
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)
{'gamma': 0.2, 'learning_rate': 0.05, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 128}
LGBMRegressor(random_state=42)
{'

In [98]:
results_df = pd.DataFrame(predictions)

In [99]:
results_df

Unnamed: 0,Ridge,Lasso,Random Forest,XGBoost,LightGBM
Adjusted Closing Price Predictions,"[0.25608472111534764, -6.682021883592404, 0.63...","[0.2766369114006393, -6.558363193002503, 0.674...","[0.2918745883675888, -6.640348095436275, 0.531...","[0.2583671, -7.069083, 0.58652353, -1.8503718,...","[0.21687016156048566, -8.64163933152768, 0.607..."
Adjusted Closing Price RMSE,1.010203,1.011594,1.907065,1.880608,2.097145
Price Direction Predictions,"[1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, 1, 1, -...","[1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, 1, 1, -...","[1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, 1, 1, -...","[1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, 1, 1, -...","[1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, 1, 1, -..."
Price Direction Accuracy,0.98431,0.981172,0.986402,0.978033,0.974895
Adjusted Closing Price RMSE AAPL,0.042529,0.054745,0.110635,0.106582,0.150215
Price Direction Accuracy AAPL,1.0,1.0,1.0,0.995833,0.995833
Adjusted Closing Price RMSE KO,0.363975,0.360922,0.365878,0.368915,0.376187
Price Direction Accuracy KO,0.967532,0.967532,0.974026,0.961039,0.954545
Adjusted Closing Price RMSE TSLA,2.108453,2.11299,4.432019,4.366246,4.906061
Price Direction Accuracy TSLA,0.97561,0.963415,0.987805,0.969512,0.957317


In [100]:
results_df.to_csv('results.csv')