In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import time
from joblib import dump, load

import numpy as np


In [6]:

combined_df = pd.read_csv("combined_stock_data.csv")

# Drop the 'Date' column
combined_df_normalized=combined_df.drop(columns=['Date'], inplace=True)

# Separate out the macd_histogram and next_day_macd_histogram columns
macd_histogram = combined_df['macd_histogram']
Next_Day_macd_histogram = combined_df['Next_Day_macd_histogram']
combined_df.drop(columns=['macd_histogram', 'Next_Day_macd_histogram'], inplace=True)

# Normalize numerical columns excluding macd_histogram and next_day_macd_histogram
scaler = MinMaxScaler()
combined_df_normalized = scaler.fit_transform(combined_df.select_dtypes(include=['float64', 'int64']))

# Convert normalized array back to DataFrame
combined_df_normalized = pd.DataFrame(combined_df_normalized, columns=combined_df.select_dtypes(include=['float64', 'int64']).columns)

# Concatenate normalized DataFrame with macd_histogram and next_day_macd_histogram
combined_df_normalized['macd_histogram'] = macd_histogram.values
combined_df_normalized['Next_Day_macd_histogram'] = Next_Day_macd_histogram.values

# Display the normalized combined dataframe
display(combined_df_normalized.head())


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,sma_50,ema_50,sma_200,ema_200,rsi,macd,signal,Next_Day_Close,Next_Day_rsi,macd_histogram,Next_Day_macd_histogram
0,0.13792,0.140526,0.139093,0.140923,0.14935,0.055437,0.146695,0.145086,0.13751,0.141892,0.617499,0.599909,0.589251,0.143224,0.641722,0.56003,0.827646
1,0.141626,0.142949,0.142656,0.143224,0.151792,0.049471,0.146754,0.145256,0.137621,0.142012,0.641722,0.604786,0.591849,0.143745,0.695305,0.827646,0.99498
2,0.143386,0.143553,0.144306,0.143745,0.152345,0.037838,0.146834,0.145441,0.137726,0.142137,0.695305,0.609062,0.594973,0.143974,0.680079,0.99498,1.070549
3,0.1435,0.143438,0.144044,0.143974,0.152588,0.039783,0.146913,0.145629,0.137858,0.142263,0.680079,0.61259,0.598334,0.143151,0.798345,1.070549,1.017745
4,0.144416,0.144,0.144853,0.143151,0.151715,0.038139,0.146909,0.145774,0.13796,0.142378,0.798345,0.614662,0.601529,0.143099,0.767012,1.017745,0.932152


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load your data
# combined_df_normalized = pd.read_csv('your_data.csv')  # Load your data here
target_feature_list=[('Close','Next_Day_Close'),('rsi','Next_Day_rsi'),('macd_histogram','Next_Day_macd_histogram')]

for target,feature in target_feature_list: 
# Feature and target
    X = combined_df_normalized[[target]]
    y = combined_df_normalized[feature]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define a dictionary of models
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Support Vector Machine': SVR()
    }

    # Function to evaluate models
    # Function to evaluate models
    def evaluate_model(model, X_train, y_train, X_test, y_test):
        start_time = time.time()
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        
        metrics = {
            'Training Score': model.score(X_train, y_train),
            'Test Score': model.score(X_test, y_test),
            'MAE': mean_absolute_error(y_test, y_pred_test),
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'R-squared': r2_score(y_test, y_pred_test),
            'Time (s)': time.time() - start_time  # Calculate time taken for model evaluation
        }
        return metrics

    # Evaluate all models
    results = {name: evaluate_model(model, X_train, y_train, X_test, y_test) for name, model in models.items()}


    # Create a DataFrame from results
    results_df = pd.DataFrame.from_dict(results, orient='index')

    # Display the results in tabular format
    print(f"target : {target}  |  feature : {feature}")
    display(results_df)


target : Close  |  feature : Next_Day_Close


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.998927,0.998962,0.002761,0.005066,0.998962,0.020644
Decision Tree,0.999843,0.997976,0.003969,0.007076,0.997976,0.116519
Random Forest,0.999681,0.998497,0.003471,0.006096,0.998497,7.446156
Support Vector Machine,0.840593,0.84111,0.058791,0.062685,0.84111,0.066609


target : rsi  |  feature : Next_Day_rsi


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.861138,0.857895,0.050954,0.067053,0.857895,0.018171
Decision Tree,0.997597,0.717932,0.073285,0.094469,0.717932,0.148523
Random Forest,0.970277,0.791494,0.06285,0.081221,0.791494,9.666749
Support Vector Machine,0.8598,0.856927,0.051677,0.067281,0.856927,34.950465


target : macd_histogram  |  feature : Next_Day_macd_histogram


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.935418,0.935414,0.235036,0.416462,0.935414,0.02004
Decision Tree,0.999937,0.866609,0.362303,0.598509,0.866609,0.146549
Random Forest,0.986764,0.903583,0.310454,0.508842,0.903583,9.750061
Support Vector Machine,0.9241,0.930514,0.236617,0.431973,0.930514,151.289204


### **Using Principle Component Analysis (PCA)**

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# List of target columns
target_columns = ['Next_Day_Close', 'Next_Day_rsi', 'Next_Day_macd_histogram']

# Iterate over each target variable
for target in target_columns:
    # Separate features and target variable
    y = combined_df_normalized[target]  # Current target variable
    X = combined_df_normalized.drop(target_columns, axis=1)  # Features excluding the current target



    # Initialize PCA model
    pca = PCA(n_components=10)  # You can adjust n_components based on your data and requirements

    # Fit PCA and transform the data
    X_pca = pca.fit_transform(X)
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error

    # Assuming X_pca and y_train are already defined
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Define a dictionary of models
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Support Vector Machine': SVR()
    }

    # Function to evaluate models
    def evaluate_model(model, X_train, y_train, X_test, y_test):
        start_time = time.time()
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        
        metrics = {
            'Training Score': model.score(X_train, y_train),
            'Test Score': model.score(X_test, y_test),
            'MAE': mean_absolute_error(y_test, y_pred_test),
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'R-squared': r2_score(y_test, y_pred_test),
            'Time (s)': time.time() - start_time  # Calculate time taken for model evaluation
        }
        return metrics

    # Evaluate all models
    results = {name: evaluate_model(model, X_train, y_train, X_test, y_test) for name, model in models.items()}


    # Create a DataFrame from results
    results_df = pd.DataFrame.from_dict(results, orient='index')

    # Display the results in tabular format
    print(f"target : {target}  ")
    display(results_df)



target : Next_Day_Close  


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.998928,0.998958,0.002781,0.005076,0.998958,0.026025
Decision Tree,1.0,0.997165,0.004699,0.008373,0.997165,0.695455
Random Forest,0.999789,0.998642,0.003314,0.005795,0.998642,45.856898
Support Vector Machine,0.946235,0.946262,0.029955,0.036455,0.946262,0.500077


target : Next_Day_rsi  


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.863963,0.860164,0.050769,0.066515,0.860164,0.0359
Decision Tree,1.0,0.718989,0.073329,0.094291,0.718989,0.714749
Random Forest,0.98068,0.858615,0.051742,0.066882,0.858615,47.567861
Support Vector Machine,0.867743,0.864432,0.050921,0.065492,0.864432,36.217016


target : Next_Day_macd_histogram  


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.963157,0.962933,0.173313,0.315499,0.962933,0.034866
Decision Tree,1.0,0.908715,0.281413,0.495115,0.908715,0.911769
Random Forest,0.993734,0.954458,0.191259,0.349714,0.954458,64.906527
Support Vector Machine,0.936273,0.944119,0.207226,0.387382,0.944119,154.313299


## **Exporting the models**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load your data
# combined_df_normalized = pd.read_csv('your_data.csv')  # Load your data here
target_feature_list=[('Close','Next_Day_Close'),('rsi','Next_Day_rsi'),('macd_histogram','Next_Day_macd_histogram')]

for feature,target in target_feature_list: 
# Feature and target
    X = combined_df_normalized[[target]]
    y = combined_df_normalized[feature]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define a dictionary of models
    models = {
        'Linear Regression': LinearRegression(),
    }

    # Function to evaluate models
    def evaluate_and_export_model (name,model, X_train, y_train, X_test, y_test):
        start_time = time.time()
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        # Save the model
        dump(model, f'models/{target}_{name}.joblib')
        print(f'\033[92m{name} Model for {target} exported Successfully !!\033[0m')


        metrics = {
            'Training Score': model.score(X_train, y_train),
            'Test Score': model.score(X_test, y_test),
            'MAE': mean_absolute_error(y_test, y_pred_test),
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'R-squared': r2_score(y_test, y_pred_test),
            'Time (s)': time.time() - start_time  # Calculate time taken for model evaluation
        }
        
        return metrics

    # Evaluate all models
    results = {name: evaluate_and_export_model(name,model, X_train, y_train, X_test, y_test) for name, model in models.items()}


    # Create a DataFrame from results
    results_df = pd.DataFrame.from_dict(results, orient='index')

    # Display the results in tabular format
    print(f"target : {target}  |  feature : {feature}")
    display(results_df)
    print("\n\n")


[92mLinear Regression Model for Next_Day_Close exported Successfully !![0m
target : Next_Day_Close  |  feature : Close


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.998927,0.998962,0.002761,0.005064,0.998962,0.017994





[92mLinear Regression Model for Next_Day_rsi exported Successfully !![0m
target : Next_Day_rsi  |  feature : rsi


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.861138,0.85788,0.050936,0.067164,0.85788,0.022728





[92mLinear Regression Model for Next_Day_macd_histogram exported Successfully !![0m
target : Next_Day_macd_histogram  |  feature : macd_histogram


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.935418,0.935412,0.236964,0.420867,0.935412,0.015862







### with PCA

In [10]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# List of target columns
target_columns = ['Next_Day_Close', 'Next_Day_rsi', 'Next_Day_macd_histogram']

# Iterate over each target variable
for target in target_columns:
    # Separate features and target variable
    y = combined_df_normalized[target]  # Current target variable
    X = combined_df_normalized.drop(target_columns, axis=1)  # Features excluding the current target



    # Initialize PCA model
    pca = PCA(n_components=10)  # You can adjust n_components based on your data and requirements
    # Save the PCA object
    dump(pca, 'models/pca_model.joblib')
    # Fit PCA and transform the data
    X_pca = pca.fit_transform(X)
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error

    # Assuming X_pca and y_train are already defined
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Define a dictionary of models
    models = {
        'Linear Regression': LinearRegression(),
    }

 # Function to evaluate models
    def evaluate_and_export_model (name,model, X_train, y_train, X_test, y_test):
        start_time = time.time()
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        # Save the model
        dump(model, f'models/{target}_{name}.joblib')
        print(f'\033[92m{name} Model for {target} exported Successfully !!\033[0m')


        metrics = {
            'Training Score': model.score(X_train, y_train),
            'Test Score': model.score(X_test, y_test),
            'MAE': mean_absolute_error(y_test, y_pred_test),
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'R-squared': r2_score(y_test, y_pred_test),
            'Time (s)': time.time() - start_time  # Calculate time taken for model evaluation
        }
        
        return metrics

    # Evaluate all models
    results = {name: evaluate_and_export_model(name,model, X_train, y_train, X_test, y_test) for name, model in models.items()}



    # Create a DataFrame from results
    results_df = pd.DataFrame.from_dict(results, orient='index')

    # Display the results in tabular format
    # print(f"target : {target}  |  feature : {feature}")
    display(results_df)



[92mLinear Regression Model for Next_Day_Close exported Successfully !![0m


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.998928,0.998958,0.002781,0.005076,0.998958,0.014426


[92mLinear Regression Model for Next_Day_rsi exported Successfully !![0m


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.863963,0.860164,0.050769,0.066515,0.860164,0.037009


[92mLinear Regression Model for Next_Day_macd_histogram exported Successfully !![0m


Unnamed: 0,Training Score,Test Score,MAE,RMSE,R-squared,Time (s)
Linear Regression,0.963157,0.962933,0.173313,0.315499,0.962933,0.062543


In [None]:
dump(scaler, 'models/scaler.joblib')


['models/scaler.joblib']