In [1]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data=pd.read_csv("../artifacts/data/sp500_all.csv")
data.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619040 entries, 0 to 619039
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    619040 non-null  object 
 1   open    619029 non-null  float64
 2   high    619032 non-null  float64
 3   low     619032 non-null  float64
 4   close   619040 non-null  float64
 5   volume  619040 non-null  int64  
 6   Name    619040 non-null  object 
dtypes: float64(4), int64(1), object(2)
memory usage: 33.1+ MB


In [4]:
data.describe()


Unnamed: 0,open,high,low,close,volume
count,619029.0,619032.0,619032.0,619040.0,619040.0
mean,83.023334,83.778311,82.256096,83.043763,4321823.0
std,97.378769,98.207519,96.507421,97.389748,8693610.0
min,1.62,1.69,1.5,1.59,0.0
25%,40.22,40.62,39.83,40.245,1070320.0
50%,62.59,63.15,62.02,62.62,2082094.0
75%,94.37,95.18,93.54,94.41,4284509.0
max,2044.0,2067.99,2035.11,2049.0,618237600.0


In [5]:
data.isna().sum()

date       0
open      11
high       8
low        8
close      0
volume     0
Name       0
dtype: int64

In [6]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy='mean')
data['open']=imputer.fit_transform(data[['open']])
data['high']=imputer.fit_transform(data[['high']])
data['low']=imputer.fit_transform(data[['low']])

data.isna().sum()

date      0
open      0
high      0
low       0
close     0
volume    0
Name      0
dtype: int64

In [7]:
data.duplicated().sum()

np.int64(0)

In [8]:
from sklearn.preprocessing import  LabelEncoder
encoder=LabelEncoder()
data['Name']=encoder.fit_transform(data['Name'])


In [9]:
data['date'] = pd.to_datetime(data['date'])

# Day, month, year
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

# Day of the week (0 = Monday, 6 = Sunday)
data['day_of_week'] = data['date'].dt.dayofweek

# Week of the year
data['week_of_year'] = data['date'].dt.isocalendar().week

# Quarter of the year
data['quarter'] = data['date'].dt.quarter

# Is month start/end
data['is_month_start'] = data['date'].dt.is_month_start.astype(int)
data['is_month_end'] = data['date'].dt.is_month_end.astype(int)

# Is quarter start/end
data['is_quarter_start'] = data['date'].dt.is_quarter_start.astype(int)
data['is_quarter_end'] = data['date'].dt.is_quarter_end.astype(int)



In [10]:
cols = ['open', 'high', 'low', 'close']
windows = [5, 10, 20]

for col in cols:
    for w in windows:
        data[f'{col}_EMA_{w}'] = data[col].ewm(span=w, adjust=False).mean()


In [11]:
data.head()

Unnamed: 0,date,open,high,low,close,volume,Name,year,month,day,...,open_EMA_20,high_EMA_5,high_EMA_10,high_EMA_20,low_EMA_5,low_EMA_10,low_EMA_20,close_EMA_5,close_EMA_10,close_EMA_20
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,1,2013,2,8,...,15.07,15.12,15.12,15.12,14.63,14.63,14.63,14.75,14.75,14.75
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,1,2013,2,11,...,15.052857,15.083333,15.1,15.109524,14.506667,14.562727,14.594762,14.653333,14.697273,14.722381
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,1,2013,2,12,...,14.995442,14.892222,14.992727,15.052426,14.371111,14.478595,14.547642,14.525556,14.619587,14.679297
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,1,2013,2,13,...,14.92921,14.908148,14.98314,15.041719,14.330741,14.437032,14.519295,14.57037,14.626935,14.677459
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,1,2013,2,14,...,14.930237,14.925432,14.978933,15.033936,13.940494,14.204845,14.389838,14.376914,14.511128,14.611987


In [12]:
X=data.drop(columns=['date','close']).values
Y=data['close'].values

In [13]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, shuffle=False)

pd.DataFrame(x_train).to_csv("../artifacts/data/x_train.csv", index=False)
pd.DataFrame(x_test).to_csv("../artifacts/data/x_test.csv", index=False)
pd.DataFrame(y_train).to_csv("../artifacts/data/y_train.csv", index=False)
pd.DataFrame(y_test).to_csv("../artifacts/data/y_test.csv", index=False)

In [14]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

In [34]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV

# Regression models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor

# XGBoost, LightGBM, CatBoost
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [35]:
models={
    "LinearRegression": LinearRegression(),
    "SVR": SVR(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "BaggingRegressor": BaggingRegressor(),
    "XGBRegressor": XGBRegressor(objective='reg:squarederror'),
    "LGBMRegressor": LGBMRegressor(),
    "CatBoostRegressor": CatBoostRegressor(verbose=0)
}

# ===============================
# Comprehensive Hyperparameter Dictionary
# ===============================

param_grids_comprehensive = {

    # ----- Linear Models -----
    'LinearRegression': {},

    # ----- Tree / Ensemble Models -----
    'DecisionTreeRegressor': {
        'max_depth': [5, 10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5],
        'max_features': [None, 'sqrt', 'log2'],
        'criterion': ['squared_error', 'absolute_error', 'poisson'],
        'splitter': ['best', 'random']
    },
    'RandomForestRegressor': {
        'n_estimators': [100, 200, 500],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2'],
        'bootstrap': [True, False],
        'criterion': ['squared_error', 'absolute_error']
    },
    'GradientBoostingRegressor': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.7, 1.0],
        'max_features': [None, 'sqrt', 'log2']
    },
    'AdaBoostRegressor': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
        'loss': ['linear', 'square', 'exponential']
    },
    'BaggingRegressor': {
        'n_estimators': [10, 50, 100],
        'max_samples': [0.5, 0.7, 1.0],
        'max_features': [0.5, 0.7, 1.0],
        'bootstrap': [True, False],
        'bootstrap_features': [True, False]
    },

    # ----- KNN / SVM -----
    'KNeighborsRegressor': {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'p': [1, 2],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size': [20, 30, 40]
    },
    'SVR': {
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto'],
        'epsilon': [0.01, 0.1, 0.2],
        'degree': [2, 3, 4],  # only for poly kernel
        'shrinking': [True, False]
    },

    # ----- Neural Network -----
    'MLPRegressor': {
        'hidden_layer_sizes': [(50,), (100,), (100,50), (100,100)],
        'activation': ['relu', 'tanh', 'logistic'],
        'solver': ['adam', 'lbfgs'],
        'alpha': [0.0001, 0.001, 0.01],
        'batch_size': ['auto', 32, 64],
        'learning_rate': ['constant', 'adaptive'],
        'learning_rate_init': [0.001, 0.01],
        'max_iter': [500, 1000, 2000],
        'early_stopping': [True, False],
        'validation_fraction': [0.1, 0.2]
    },

    # ----- Boosting Libraries -----
    'XGBRegressor': {
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.7, 1],
        'colsample_bytree': [0.7, 1],
        'gamma': [0, 0.1, 0.3],
        'reg_alpha': [0, 0.01, 0.1],
        'reg_lambda': [1, 1.5, 2]
    },
    'LGBMRegressor': {
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 5, 7, -1],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [31, 50, 100],
        'min_child_samples': [20, 30, 50],
        'subsample': [0.7, 1.0],
        'colsample_bytree': [0.7, 1.0],
        'reg_alpha': [0, 0.01, 0.1],
        'reg_lambda': [0, 0.1, 0.5]
    },
    'CatBoostRegressor': {
        'iterations': [500, 1000, 1500],
        'depth': [4, 6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'l2_leaf_reg': [1, 3, 5, 7],
        'bagging_temperature': [0, 1, 2],
        'rsm': [0.7, 0.8, 1.0],
        'border_count': [32, 50, 100],
        'random_strength': [1, 2, 5],
        'verbose': [0]
    }
}



In [36]:
def model_training(models,param_grids,x_train,y_train,x_test,y_test):
    results = {}
    for name, model in models.items():
        print(f"Training {name}...")
        param_grid = param_grids.get(name, {})
        if param_grid:
            grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
            grid_search.fit(x_train, y_train)
            best_model = grid_search.best_estimator_
            best_params = grid_search.best_params_
            print(f"Best parameters for {name}: {best_params}")
        else:
            model.fit(x_train, y_train)
            best_model = model
            best_params = "Default parameters used"
        
        y_pred = best_model.predict(x_test)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        results[name] = {
            'model': best_model,
            'best_params': best_params,
            'mse': mse,
            'mae': mae,
            'r2': r2
        }
        
        print(f"{name} - MSE: {mse}, MAE: {mae}, R2: {r2}\n")
    
    return results

In [None]:
model_training(models,param_grids_comprehensive,x_train,y_train,x_test,y_test)

Training LinearRegression...
LinearRegression - MSE: 0.06381808301810107, MAE: 0.1534024524048429, R2: 0.9999694004701849

Training SVR...
