In [3]:
# Machine Learning Algorithms used to predict the Stock Market Behaviour with Exogenous variables

In [4]:
# Linear Regression. (Supervised Learning)
# Linear Regression is a supervised machine learning algorithm(where machined is trained that a particular inut will give a particular output). 
# It basically fits a straight line which predicts a dependent term based on the independent term. Ex- Y=mx+c

# Prediction of the stock market behaviour using Linear Regression

import pandas as pd                                  # pandas are used for data operations
from sklearn.model_selection import train_test_split # it is used to split the data (some for training and some for testing)
from sklearn.linear_model import LinearRegression    # used to fit a LinearRegression model
from sklearn.preprocessing import StandardScaler     # used to rescale all the features used as no high value feature dominates low value feature.
                                                     # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                     # it is used to obtain the observed results
from collections import Counter                      # it is used to count the occurance af a particular element

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(       # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting a Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict confidence
y_pred = model.predict(X_test_scaled)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculating by subtracting predicted values with actual values(error) then square it and take average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")



 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.627543      Neutral         Neutral
1293           0.639770              0.662385      Neutral         Bullish
1562           0.423568              0.486638      Neutral         Neutral
1101           0.255415              0.406810      Bearish         Neutral
1161           0.635864              0.588450      Neutral         Neutral
382            0.457954              0.490487      Neutral         Neutral
1197           0.735602              0.722561      Bullish         Bullish
777            0.513502              0.498765      Neutral         Neutral
643            0.585349              0.589357      Neutral         Neutral
275            0.511786              0.540493      Neutral         Neutral

 Mean Squared Error: 0.00210467949629224

 Classification Report:
              precision    recall  f1-score   support

     Bearish       0

In [5]:
# Ridge Regression. (Supervised Learning)
# Ridge Regression is the smart version of the Linear Regressino. Sometimes Linear Regression gives more weights to some features. Ridge Regression keeps
# the weight small and balanced. Y = w1x1 + w2x2 +.........+ wnxn where w1,w2....wn are the weights

# Prediction of the stock market behaviour using Ridge Regression

import pandas as pd                                  # pandas are used for data operations
from sklearn.model_selection import train_test_split # it is used to split the data (some for training and some for testing)
from sklearn.linear_model import Ridge               # used to fit a Ridge Regression model
from sklearn.preprocessing import StandardScaler     # used to rescale all the features used as no high value feature dominates low value feature.
                                                     # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                     # it is used to obtain the observed results
from collections import Counter                      # it is used to count the occurance of a particular element

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(       # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting a Ridge Regression model
model = Ridge(alpha=1.0)    # higher the alpha value lower and appropriate is the weight. lower the alpha value more it behaves like linear regression
model.fit(X_train_scaled, y_train)

# Predict confidence
y_pred = model.predict(X_test_scaled)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculating by subtracting predicted values with actual values(error) then square it and take average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")



 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.616329      Neutral         Neutral
1293           0.639770              0.668870      Neutral         Bullish
1562           0.423568              0.504211      Neutral         Neutral
1101           0.255415              0.435123      Bearish         Neutral
1161           0.635864              0.600252      Neutral         Neutral
382            0.457954              0.503097      Neutral         Neutral
1197           0.735602              0.697902      Bullish         Bullish
777            0.513502              0.506101      Neutral         Neutral
643            0.585349              0.591749      Neutral         Neutral
275            0.511786              0.542211      Neutral         Neutral

 Mean Squared Error: 0.0022548728917693474

 Classification Report:
              precision    recall  f1-score   support

     Bearish      

In [6]:
# Lasso Regression. (Supervised Learning)
# Lasso Regression is a linear regression with some of the less importnat feaures are been removed.
# Y = w1x1 + w2x2 +.........+ wnxn where w1,w2....wn are the weights with some of the weights as 0 which is less important. 

# Prediction of the stock market behaviour using Lasso Regression

import pandas as pd                                  # pandas are used for data operations
from sklearn.model_selection import train_test_split # it is used to split the data (some for training and some for testing)
from sklearn.linear_model import Lasso               # used to fit a Lasso Regression model
from sklearn.preprocessing import StandardScaler     # used to rescale all the features used as no high value feature dominates low value feature.
                                                     # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                     # it is used to obtain the observed results
from collections import Counter                      # it is used to count the occurance of a particular element

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(       # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting a Lasso Regression model
model = Lasso(alpha=0.01) # larger the alpha value stronger is the module as unwanted feaures are removed by making unwanted weights 0.
model.fit(X_train_scaled, y_train)

# Predict confidence
y_pred = model.predict(X_test_scaled)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculating by subtracting predicted values with actual values(error) then square it and take average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")



 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.575428      Neutral         Neutral
1293           0.639770              0.656835      Neutral         Neutral
1562           0.423568              0.606721      Neutral         Neutral
1101           0.255415              0.592229      Bearish         Neutral
1161           0.635864              0.650599      Neutral         Neutral
382            0.457954              0.555656      Neutral         Neutral
1197           0.735602              0.595077      Bullish         Neutral
777            0.513502              0.536533      Neutral         Neutral
643            0.585349              0.605111      Neutral         Neutral
275            0.511786              0.556848      Neutral         Neutral

 Mean Squared Error: 0.006538738689870886

 Classification Report:
              precision    recall  f1-score   support

     Bearish       

In [7]:
# Decision Tree Regression. (Supervised Algorithm)
# Decision Tree Regression is a non-linear machine learning algorithm. Instead of fitting a straight line, it splits the data in the yes no form like
# flowchart.
# Prediction of the stock market behaviour using Decision Tree Regression

import pandas as pd                                  # pandas are used for data operations
from sklearn.model_selection import train_test_split # it is used to split the data (some for training and some for testing)
from sklearn.tree import DecisionTreeRegressor       # used to fit a Decision Tree Regressor model
from sklearn.preprocessing import StandardScaler     # used to rescale all the features used as no high value feature dominates low value feature.
                                                     # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                     # it is used to obtain the observed results
from collections import Counter                      # it is used to count the occurance of a particular element

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(       # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting a Decision Tree Regression model
model = DecisionTreeRegressor(max_depth=5, random_state=42)   # 5 roots 
model.fit(X_train_scaled, y_train)

# Predict confidence
y_pred = model.predict(X_test_scaled)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculating by subtracting predicted values with actual values(error) then square it and take average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")



 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.646743      Neutral         Neutral
1293           0.639770              0.640439      Neutral         Neutral
1562           0.423568              0.593609      Neutral         Neutral
1101           0.255415              0.593609      Bearish         Neutral
1161           0.635864              0.640439      Neutral         Neutral
382            0.457954              0.582426      Neutral         Neutral
1197           0.735602              0.795242      Bullish         Bullish
777            0.513502              0.435019      Neutral         Neutral
643            0.585349              0.593609      Neutral         Neutral
275            0.511786              0.570874      Neutral         Neutral

 Mean Squared Error: 0.007307169285807634

 Classification Report:
              precision    recall  f1-score   support

     Bearish       

In [8]:
# Random Forest Regression. (Supervised Algorithm)
# Random Forest Regressor takes multiple Decision Trees and take average of the predictions.

# Prediction of the stock market behaviour using Random Forest Regression

import pandas as pd                                  # pandas are used for data operations
from sklearn.model_selection import train_test_split # it is used to split the data (some for training and some for testing)
from sklearn.ensemble import RandomForestRegressor   # used to fit a Random Forest Regressor model
from sklearn.preprocessing import StandardScaler     # used to rescale all the features used as no high value feature dominates low value feature.
                                                     # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                     # it is used to obtain the observed results
from collections import Counter                      # it is used to count the occurance of a particular element

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(       # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting a Random Forest Regression model
model = RandomForestRegressor(n_estimators=100, random_state=42)   # 100 trees are been taken into consideration
model.fit(X_train_scaled, y_train)

# Predict confidence
y_pred = model.predict(X_test_scaled)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculating by subtracting predicted values with actual values(error) then square it and take average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")



 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.592317      Neutral         Neutral
1293           0.639770              0.666071      Neutral         Bullish
1562           0.423568              0.580331      Neutral         Neutral
1101           0.255415              0.556146      Bearish         Neutral
1161           0.635864              0.649530      Neutral         Neutral
382            0.457954              0.527018      Neutral         Neutral
1197           0.735602              0.585333      Bullish         Neutral
777            0.513502              0.532431      Neutral         Neutral
643            0.585349              0.612472      Neutral         Neutral
275            0.511786              0.540570      Neutral         Neutral

 Mean Squared Error: 0.004570971017688775

 Classification Report:
              precision    recall  f1-score   support

     Bearish       

In [9]:
# Gradient Boosting Regression. (Supervised Algorithm)
# Gradient Boosting Algorithm is a method where instead of building one big model, multiple models (Decision Tree) are built simultaneously where each 
# model is improving by the mistakes made by previous model.

# Prediction of the stock market behaviour using Gradient Boosting Regression

import pandas as pd                                          # pandas are used for data operations
from sklearn.model_selection import train_test_split         # it is used to split the data (some for training and some for testing)
from sklearn.ensemble import GradientBoostingRegressor       # used to fit a Gradient Boosting Regressor model
from sklearn.preprocessing import StandardScaler             # used to rescale all the features so high value features don’t dominate low value features.
                                                             # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                             # it is used to obtain the observed results
from collections import Counter                              # it is used to count the occurrence of a particular element

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(         # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting a Gradient Boosting Regression model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42) # 100 decision trees, slower the learning rate better the model
model.fit(X_train_scaled, y_train)

# Predict confidence
y_pred = model.predict(X_test_scaled)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculating by subtracting predicted values with actual values(error) then square it and take average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")



 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.576908      Neutral         Neutral
1293           0.639770              0.652724      Neutral         Neutral
1562           0.423568              0.577411      Neutral         Neutral
1101           0.255415              0.445630      Bearish         Neutral
1161           0.635864              0.616019      Neutral         Neutral
382            0.457954              0.518533      Neutral         Neutral
1197           0.735602              0.582962      Bullish         Neutral
777            0.513502              0.552472      Neutral         Neutral
643            0.585349              0.597301      Neutral         Neutral
275            0.511786              0.533722      Neutral         Neutral

 Mean Squared Error: 0.0036282527344308232

 Classification Report:
              precision    recall  f1-score   support

     Bearish      

In [10]:
# XGBoost Regression. (Supervised Learning)
# XGBoost (Extreme Gradient Boosting) is an optimized gradient boosting algorithm that is fast and highly accurate.
# It builds multiple trees sequentially to correct previous errors and includes regularization to reduce overfitting.

# Prediction of the stock market behaviour using XGBoost Regression

import pandas as pd                                  # pandas are used for data operations
from sklearn.model_selection import train_test_split # it is used to split the data (some for training and some for testing)
from xgboost import XGBRegressor                     # used to fit an XGBoost Regression model
from sklearn.preprocessing import StandardScaler     # used to rescale all the features used as no high value feature dominates low value feature.
                                                     # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                     # it is used to obtain the observed results
from collections import Counter                      # it is used to count the occurance of a particular element

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(       # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting an XGBoost Regression model
model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42) # same as Gradient
model.fit(X_train_scaled, y_train)

# Predict confidence
y_pred = model.predict(X_test_scaled)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculating by subtracting predicted values with actual values(error) then square it and take average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")



 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.594097      Neutral         Neutral
1293           0.639770              0.692315      Neutral         Bullish
1562           0.423568              0.585731      Neutral         Neutral
1101           0.255415              0.503842      Bearish         Neutral
1161           0.635864              0.622813      Neutral         Neutral
382            0.457954              0.522217      Neutral         Neutral
1197           0.735602              0.696231      Bullish         Bullish
777            0.513502              0.529144      Neutral         Neutral
643            0.585349              0.596162      Neutral         Neutral
275            0.511786              0.527390      Neutral         Neutral

 Mean Squared Error: 0.003190792360547677

 Classification Report:
              precision    recall  f1-score   support

     Bearish       

In [11]:
# LightGBM Regression. (Supervised Learning)
# LightGBM (Light Gradient Boosting Machine) is a high-performance gradient boosting framework.
# It is optimized for speed and efficiency and is suitable for large datasets and fast training.

# Prediction of the stock market behaviour using LightGBM Regression

import pandas as pd                                      # pandas are used for data operations
from sklearn.model_selection import train_test_split     # it is used to split the data (some for training and some for testing)
from lightgbm import LGBMRegressor                       # used to fit a LightGBM Regression model
from sklearn.preprocessing import StandardScaler         # used to rescale all the features used as no high value feature dominates low value feature.
                                                         # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                         # it is used to obtain the observed results
from collections import Counter                          # it is used to count the occurance of a particular element

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(     # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting a LightGBM Regression model
model = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42) # same as aboe algorithms
model.fit(X_train_scaled, y_train)

# Predict confidence
y_pred = model.predict(X_test_scaled)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculating by subtracting predicted values with actual values(error) then square it and take average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5259
[LightGBM] [Info] Number of data points in the train set: 1461, number of used features: 30
[LightGBM] [Info] Start training from score 0.592248

 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.623226      Neutral         Neutral
1293           0.639770              0.645474      Neutral         Neutral
1562           0.423568              0.581528      Neutral         Neutral
1101           0.255415              0.429706      Bearish         Neutral
1161           0.635864              0.656699      Neutral         Neutral
382            0.457954              0.506017      Neutral         Neutral
1197           0.735602              0.593091      Bullish         Neutral
777            0.513502   

In [12]:
# CatBoost Regression. (Supervised Learning)
# CatBoost (Categorical Boosting) is a gradient boosting algorithm that handles categorical variables automatically.
# It is fast, accurate, and prevents overfitting using built-in regularization.

# Prediction of the stock market behaviour using CatBoost Regression

import pandas as pd                                      # pandas are used for data operations
from sklearn.model_selection import train_test_split     # it is used to split the data (some for training and some for testing)
from catboost import CatBoostRegressor                   # used to fit a CatBoost Regression model
from sklearn.preprocessing import StandardScaler         # used to rescale all the features used as no high value feature dominates low value feature.
                                                         # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                         # it is used to obtain the observed results
from collections import Counter                          # it is used to count the occurrence of a particular element

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(     # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting a CatBoost Regression model
model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=5, verbose=0, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict confidence
y_pred = model.predict(X_test_scaled)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculating by subtracting predicted values with actual values(error) then square it and take average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")



 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.588520      Neutral         Neutral
1293           0.639770              0.665065      Neutral         Bullish
1562           0.423568              0.587688      Neutral         Neutral
1101           0.255415              0.476847      Bearish         Neutral
1161           0.635864              0.622817      Neutral         Neutral
382            0.457954              0.535863      Neutral         Neutral
1197           0.735602              0.608186      Bullish         Neutral
777            0.513502              0.563085      Neutral         Neutral
643            0.585349              0.591648      Neutral         Neutral
275            0.511786              0.542333      Neutral         Neutral

 Mean Squared Error: 0.003954102658382247

 Classification Report:
              precision    recall  f1-score   support

     Bearish       

In [13]:
# Support Vector Regression (SVR).
# Support Vector Regression is used just like Linear Regression which uses 2D plane to fit and seperatre the distinct values.
# It is a non linear algorithm

# Prediction of the stock market behaviour using Support Vector Regression

import pandas as pd                                  # pandas are used for data operations
from sklearn.model_selection import train_test_split # it is used to split the data (some for training and some for testing)
from sklearn.svm import SVR                          # used to fit a Support Vector Regressor model
from sklearn.preprocessing import StandardScaler     # used to rescale all the features used as no high value feature dominates low value feature.
                                                     # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                     # it is used to obtain the observed results
from collections import Counter                      # it is used to count the occurance of a particular element

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(       # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting a Support Vector Regression model
model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
# kernel='rbf'	Use a non-linear kernel to fit curved patterns
# C=1.0	Controls penalty for errors (how hard the model tries to avoid mistakes)
# epsilon=0.1	Sets the margin of tolerance – the zone where errors are not punished

model.fit(X_train_scaled, y_train)

# Predict confidence
y_pred = model.predict(X_test_scaled)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculating by subtracting predicted values with actual values(error) then square it and take average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")



 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.578944      Neutral         Neutral
1293           0.639770              0.631419      Neutral         Neutral
1562           0.423568              0.530219      Neutral         Neutral
1101           0.255415              0.533908      Bearish         Neutral
1161           0.635864              0.627519      Neutral         Neutral
382            0.457954              0.516345      Neutral         Neutral
1197           0.735602              0.531822      Bullish         Neutral
777            0.513502              0.529600      Neutral         Neutral
643            0.585349              0.620129      Neutral         Neutral
275            0.511786              0.570003      Neutral         Neutral

 Mean Squared Error: 0.006974891625762776

 Classification Report:
              precision    recall  f1-score   support

     Bearish       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Deep Learning Algorithms used to predict the Stock Market Behaviour with Exogenous variables

In [15]:
# MLP Regression. (Supervised Learning)
# MLP (Multilayer Perceptron) is a deep learning model that consists of layers of neurons.
# It is used to predict continuous values like Market_Confidence by learning patterns from input features.
# It works like an advanced version of Linear Regression, but with hidden layers for better learning.

# Prediction of the stock market behaviour using MLP Regression

import pandas as pd                                          # pandas are used for data operations
from sklearn.model_selection import train_test_split         # it is used to split the data (some for training and some for testing)
from sklearn.preprocessing import StandardScaler             # used to rescale all the features used as no high value feature dominates low value feature.
                                                             # a particular value of the feature is subtracted by mean and divided by the variance
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                             # it is used to obtain the observed results
from collections import Counter                              # it is used to count the occurrence of a particular element
import tensorflow as tf                                      # tensorflow is used for building deep learning models
from tensorflow.keras.models import Sequential               # Sequential is used to define the model structure
from tensorflow.keras.layers import Dense                    # Dense layers are fully connected layers in the neural network
from tensorflow.keras.optimizers import Adam                 # Adam is an optimizer used to update weights in neural networks

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(         # 80% data is been trained and 20% of the data is been tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting a MLP Regression model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')                             # Output value between 0 and 1 (because Market_Confidence is in that range)
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='mse') # Compiling the model using mean squared error as loss
model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.1, verbose=0) # Training the model

# Predict confidence
y_pred = model.predict(X_test_scaled).flatten()

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculated by subtracting predicted values with actual values(error) then squaring it and taking the average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step

 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.613181      Neutral         Neutral
1293           0.639770              0.705979      Neutral         Bullish
1562           0.423568              0.543546      Neutral         Neutral
1101           0.255415              0.511410      Bearish         Neutral
1161           0.635864              0.709671      Neutral         Bullish
382            0.457954              0.504433      Neutral         Neutral
1197           0.735602              0.527491      Bullish         Neutral
777            0.513502              0.568437      Neutral         Neutral
643            0.585349              0.551178      Neutral         Neutral
275            0.511786              0.551440      Neutral         Neutral

 Mean Squared Error: 0.004995286408950402

 Classification Report:
 

In [16]:
# LSTM Regression. (Supervised Learning)
# LSTM is a type of deep learning model (RNN) used for sequential data. It remembers past values to better predict future values.
# It is best suited for time series tasks like stock prediction, where the order of data matters.

# Prediction of the stock market behaviour using LSTM Regression

import pandas as pd                                          # pandas are used for data operations
from sklearn.model_selection import train_test_split         # it is used to split the data (some for training and some for testing)
from sklearn.preprocessing import StandardScaler             # used to rescale all the features
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                             # used to evaluate the model
from collections import Counter                              # it is used to count the occurrence of each class
import numpy as np                                           
import tensorflow as tf                                      # tensorflow for deep learning
from tensorflow.keras.models import Sequential               # Sequential to define model
from tensorflow.keras.layers import LSTM, Dense              # LSTM and Dense layers
from tensorflow.keras.optimizers import Adam

# Load dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# Columns to drop from training (used in target calculation)
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Define target and features
y = df['Market_Confidence']
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Use only numeric columns
X = X.select_dtypes(include='number')

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert data into time-based sequences for LSTM (use 5 time steps)
time_steps = 5

def create_sequences(X, y, time_steps=5):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:i+time_steps])
        ys.append(y[i+time_steps])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = create_sequences(X_scaled, y.values, time_steps)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

# Define LSTM model
model = Sequential([
    LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')   # Output between 0 and 1
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

# Predict
y_pred = model.predict(X_test).flatten()

# Convert predictions to class labels
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]
y_pred_class = [confidence_to_class(val) for val in y_pred]

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))

# Evaluation
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")


  super().__init__(**kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step

 Top 10 Predictions vs Actuals:
   Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
0           0.626303              0.630695      Neutral         Neutral
1           0.581940              0.478558      Neutral         Neutral
2           0.606372              0.560391      Neutral         Neutral
3           0.568237              0.496782      Neutral         Neutral
4           0.485682              0.463311      Neutral         Neutral
5           0.655102              0.646418      Neutral         Neutral
6           0.703647              0.706889      Bullish         Bullish
7           0.612046              0.613495      Neutral         Neutral
8           0.590268              0.638489      Neutral         Neutral
9           0.574182              0.595111      Neutral         Neutral

 Mean Squared Error: 0.0044497249743812965

 Classification Report:
              precision    recal

In [17]:
# 1D CNN Regression. (Supervised Learning)
# A 1D Convolutional Neural Network learns from sequences by applying filters to detect patterns in time steps.
# It is useful for time-based regression problems like stock prediction, where local patterns in recent time steps matter.

# Prediction of the stock market behaviour using 1D CNN Regression

import pandas as pd                                          # pandas are used for data operations
from sklearn.model_selection import train_test_split         # it is used to split the data (some for training and some for testing)
from sklearn.preprocessing import StandardScaler             # used to rescale all the features
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                             # used to evaluate the model
from collections import Counter                              # it is used to count the occurrence of each class
import numpy as np                                           
import tensorflow as tf                                      # tensorflow for deep learning
from tensorflow.keras.models import Sequential               # Sequential to define model
from tensorflow.keras.layers import Conv1D, Dense, Flatten   # 1D Convolution and fully connected layers
from tensorflow.keras.optimizers import Adam

# Load dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# Columns to drop from training (used in target calculation)
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Define target and features
y = df['Market_Confidence']
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Use only numeric columns
X = X.select_dtypes(include='number')

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert data into time-based sequences for 1D CNN (use 5 time steps)
time_steps = 5

def create_sequences(X, y, time_steps=5):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:i+time_steps])
        ys.append(y[i+time_steps])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = create_sequences(X_scaled, y.values, time_steps)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

# Define 1D CNN model
model = Sequential([
    Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')   # Output between 0 and 1
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

# Predict
y_pred = model.predict(X_test).flatten()

# Convert predictions to class labels
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]
y_pred_class = [confidence_to_class(val) for val in y_pred]

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))

# Evaluation
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

 Top 10 Predictions vs Actuals:
   Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
0           0.626303              0.586069      Neutral         Neutral
1           0.581940              0.518853      Neutral         Neutral
2           0.606372              0.567551      Neutral         Neutral
3           0.568237              0.599833      Neutral         Neutral
4           0.485682              0.554576      Neutral         Neutral
5           0.655102              0.596630      Neutral         Neutral
6           0.703647              0.624043      Bullish         Neutral
7           0.612046              0.678469      Neutral         Bullish
8           0.590268              0.637863      Neutral         Neutral
9           0.574182              0.557567      Neutral         Neutral

 Mean Squared Error: 0.0063193712447092015

 Classification Report:
              precision    recal

In [18]:
!pip install pytorch-tabnet --quiet

In [19]:
# GRU Regression. (Supervised Learning)
# GRU is a recurrent neural network that learns time-based patterns.
# It is similar to LSTM but faster and uses fewer parameters. Ideal for sequence data like stock predictions.

# Prediction of the stock market behaviour using GRU Regression

import pandas as pd                                          # pandas are used for data operations
from sklearn.model_selection import train_test_split         # split the dataset into train and test sets
from sklearn.preprocessing import StandardScaler             # normalize features
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                             # evaluation metrics
from collections import Counter                              # count class distributions
import numpy as np
import tensorflow as tf                                      # deep learning framework
from tensorflow.keras.models import Sequential               # used to define the GRU model
from tensorflow.keras.layers import GRU, Dense               # GRU and Dense layers
from tensorflow.keras.optimizers import Adam                 # optimizer for training

# Load dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# Drop columns used in Market_Confidence formula
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Define target and features
y = df['Market_Confidence']
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Keep only numeric columns
X = X.select_dtypes(include='number')

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert to sequences (time steps)
time_steps = 5
def create_sequences(X, y, time_steps=5):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:i+time_steps])
        ys.append(y[i+time_steps])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = create_sequences(X_scaled, y.values, time_steps)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

# Build GRU model
model = Sequential([
    GRU(64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')   # Output between 0 and 1
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

# Predict
y_pred = model.predict(X_test).flatten()

# Convert to class labels
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]
y_pred_class = [confidence_to_class(val) for val in y_pred]

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))

# Evaluation
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")


  super().__init__(**kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step

 Top 10 Predictions vs Actuals:
   Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
0           0.626303              0.608351      Neutral         Neutral
1           0.581940              0.532641      Neutral         Neutral
2           0.606372              0.621988      Neutral         Neutral
3           0.568237              0.556664      Neutral         Neutral
4           0.485682              0.525000      Neutral         Neutral
5           0.655102              0.610133      Neutral         Neutral
6           0.703647              0.618184      Bullish         Neutral
7           0.612046              0.633621      Neutral         Neutral
8           0.590268              0.679550      Neutral         Bullish
9           0.574182              0.611146      Neutral         Neutral

 Mean Squared Error: 0.004745434981333837

 Classification Report:
              precision    recall

In [20]:
# Transformer Regression. (Supervised Learning)
# Transformer models use attention to capture long-range relationships in time series data.
# Unlike LSTM/GRU, they don't process data sequentially, making them faster and better for long sequences.

# Prediction of the stock market behaviour using Transformer Regression

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
from collections import Counter
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout
from tensorflow.keras.layers import MultiHeadAttention, Add, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam

# Load dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# Define features and target
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]
y = df['Market_Confidence']
X = df.drop(columns=formula_columns + ['Market_Confidence'])
X = X.select_dtypes(include='number')

# Scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create time sequences
time_steps = 5
def create_sequences(X, y, time_steps=5):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:i+time_steps])
        ys.append(y[i+time_steps])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = create_sequences(X_scaled, y.values, time_steps)
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

# Define Transformer block
def transformer_encoder(inputs, num_heads=4, key_dim=32, ff_dim=64, dropout=0.1):
    attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(inputs, inputs)
    attn_output = Dropout(dropout)(attn_output)
    out1 = LayerNormalization(epsilon=1e-6)(inputs + attn_output)

    ffn_output = Dense(ff_dim, activation='relu')(out1)
    ffn_output = Dense(inputs.shape[-1])(ffn_output)
    ffn_output = Dropout(dropout)(ffn_output)
    return LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

# Build Transformer model
input_shape = X_train.shape[1:]
inputs = Input(shape=input_shape)
x = transformer_encoder(inputs)
x = GlobalAveragePooling1D()(x)
x = Dense(64, activation='relu')(x)
x = Dense(32, activation='relu')(x)
outputs = Dense(1, activation='sigmoid')(x)

model = Model(inputs, outputs)
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

# Predict
y_pred = model.predict(X_test).flatten()

# Convert to classes
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]
y_pred_class = [confidence_to_class(val) for val in y_pred]

# Evaluation
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))

print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step

 Top 10 Predictions vs Actuals:
   Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
0           0.626303              0.591683      Neutral         Neutral
1           0.581940              0.532231      Neutral         Neutral
2           0.606372              0.590778      Neutral         Neutral
3           0.568237              0.600054      Neutral         Neutral
4           0.485682              0.432901      Neutral         Neutral
5           0.655102              0.660102      Neutral         Bullish
6           0.703647              0.737073      Bullish         Bullish
7           0.612046              0.665465      Neutral         Bullish
8           0.590268              0.647631      Neutral         Neutral
9           0.574182              0.569720      Neutral         Neutral

 Mean Squared Error: 0.005249526077902524

 Classification Report:
              precision    recall

In [23]:
# Hybrid models used to predict the Stock Market Behaviour with Exogenous variables

In [29]:
# MLP + XGBoost Hybrid. (Supervised Learning)
# This hybrid model first uses an MLP (neural network) to extract hidden features.
# Then XGBoost is used to perform the final prediction using those deep features.

# Prediction of the stock market behaviour using MLP + XGBoost Hybrid Model

import pandas as pd                                          # pandas are used for data operations
from sklearn.model_selection import train_test_split         # split dataset into train and test sets
from sklearn.preprocessing import StandardScaler             # scale features
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                             # evaluation metrics
from collections import Counter                              # count class frequencies
import numpy as np
import tensorflow as tf                                      # for MLP
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from xgboost import XGBRegressor                             # XGBoost model for final prediction

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(         # 80% data is trained, 20% is tested
    X, y, test_size=0.2, random_state=42
)

# Scaling of the Training data and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting an MLP model for feature extraction
input_layer = Input(shape=(X_train_scaled.shape[1],))
x = Dense(64, activation='relu')(input_layer)
x = Dense(32, activation='relu')(x)
feature_model = Model(inputs=input_layer, outputs=x)

feature_model.compile(optimizer='adam', loss='mse')
feature_model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

# Extract deep features from MLP
X_train_deep = feature_model.predict(X_train_scaled)
X_test_deep = feature_model.predict(X_test_scaled)

# Train XGBoost Regressor on extracted features
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(X_train_deep, y_train)

# Predict confidence
y_pred = xgb_model.predict(X_test_deep)

# Class assigning based upon the Market_Confidence (Bullish/Bearish/Neutral)
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]        # Predicted class of the Market_Confidence

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculated by subtracting predicted values with actual values(error) then squaring it and taking the average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.622066      Neutral         Neutral
1293           0.639770              0.639133      Neutral         Neutral
1562           0.423568              0.515659      Neutral         Neutral
1101           0.255415              0.540983      Bearish         Neutral
1161           0.635864              0.583498      Neutral         Neutral
382            0.457954              0.524266      Neutral         Neutral
1197           0.735602              0.636637      Bullish         Neutral
777            0.513502              0.514264      Neutral         Neutral
643            0.585349              0.578544      Neutral         Neutral
275            0.511786              0.537159      Neutral         Neu

In [33]:
# CNN + CatBoost Hybrid. (Supervised Learning)
# This hybrid model first uses CNN to learn local patterns in time sequences.
# Then CatBoost is used to perform the final prediction using those CNN-based features.

# Prediction of the stock market behaviour using CNN + CatBoost Hybrid Model

import pandas as pd                                          # pandas are used for data operations
from sklearn.model_selection import train_test_split         # split dataset into train and test sets
from sklearn.preprocessing import StandardScaler             # scale features
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                             # evaluation metrics
from collections import Counter                              # count class frequencies
import numpy as np
import tensorflow as tf                                      # for CNN
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, Flatten, Dense
from catboost import CatBoostRegressor                       # CatBoost model for final prediction

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Scaling of the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create sequences for CNN input
time_steps = 5
def create_sequences(X, y, time_steps=5):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:i+time_steps])
        ys.append(y[i+time_steps])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = create_sequences(X_scaled, y.values, time_steps)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y_seq, test_size=0.2, random_state=42
)

# Fix target shape to avoid shape mismatch
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# Fitting CNN feature extractor
input_layer = Input(shape=(X_train.shape[1], X_train.shape[2]))
x = Conv1D(filters=64, kernel_size=2, activation='relu')(input_layer)
x = Flatten()(x)
x = Dense(32, activation='relu')(x)
cnn_model = Model(inputs=input_layer, outputs=x)

cnn_model.compile(optimizer='adam', loss='mse')
cnn_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

# Extract features
X_train_cnn = cnn_model.predict(X_train)
X_test_cnn = cnn_model.predict(X_test)

# Fit CatBoost on CNN features
cat_model = CatBoostRegressor(verbose=0, iterations=100, learning_rate=0.1, depth=5, random_state=42)
cat_model.fit(X_train_cnn, y_train.ravel())

# Predict
y_pred = cat_model.predict(X_test_cnn)

# Class assignment based on Market_Confidence
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test.flatten()]        # Actual class of the Market_Confidence
y_pred_class = [confidence_to_class(val) for val in y_pred]                  # Predicted class of the Market_Confidence

# Comparison DataFrame
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test.flatten(),
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # top 10 from test set

# Evaluation
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

 Top 10 Predictions vs Actuals:
   Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
0           0.626303              0.899270      Neutral         Bullish
1           0.581940              0.606801      Neutral         Neutral
2           0.606372              0.579566      Neutral         Neutral
3           0.568237              0.558737      Neutral         Neutral
4           0.485682              0.582087      Neutral         Neutral
5           0.655102              0.588408      Neutral         Neutral
6           0.703647              0.525968      Bullish         Neutral
7           0.612046              0.706794      Neutral         Bullish
8           0.590268              0.648358      Neutral         Neutral
9           0.574182              0.520569      Neutral         Neutral

 Mean Squared Error: 0.0056

In [35]:
# AutoEncoder + Random Forest Hybrid. (Supervised Learning)
# This hybrid model first uses an AutoEncoder to compress input features and remove noise.
# Then Random Forest is used to perform the final prediction using those compressed features.

# Prediction of the stock market behaviour using AutoEncoder + Random Forest Hybrid Model

import pandas as pd                                          # pandas are used for data operations
from sklearn.model_selection import train_test_split         # used to split the data into train and test sets
from sklearn.preprocessing import StandardScaler             # used to normalize features
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score
                                                             # used for model evaluation
from collections import Counter                              # used to count predicted classes
import numpy as np
from tensorflow.keras.models import Model                    # base class for AutoEncoder
from tensorflow.keras.layers import Input, Dense             # layers for AutoEncoder
from sklearn.ensemble import RandomForestRegressor           # final regressor

# Loading the dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")

# Clean column names(remove all the characters that are not letter, number or underscore)
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# These columns are used to obtain Market_Confidence hence noting it down to drop these from the training data
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]

# Defining the target column
y = df['Market_Confidence']

# Defining the training column
X = df.drop(columns=formula_columns + ['Market_Confidence'])

# Selecting only numeric features (Not including Dates, Top1 - Top25, All news)
X = X.select_dtypes(include='number')

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(         # 80% data is trained, 20% is tested
    X, y, test_size=0.2, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define AutoEncoder architecture
input_layer = Input(shape=(X_train_scaled.shape[1],))
encoded = Dense(32, activation='relu')(input_layer)
encoded = Dense(16, activation='relu')(encoded)
decoded = Dense(32, activation='relu')(encoded)
output_layer = Dense(X_train_scaled.shape[1], activation='linear')(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
encoder = Model(inputs=input_layer, outputs=encoded)  # for feature extraction

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

# Extract encoded features
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Train Random Forest on encoded features
rf_model = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=42)
rf_model.fit(X_train_encoded, y_train)

# Predict confidence
y_pred = rf_model.predict(X_test_encoded)

# Class assignment based on Market_Confidence
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]
y_pred_class = [confidence_to_class(val) for val in y_pred]

# Creating a DataFrame consisting of 4 columns
comparison_df = pd.DataFrame({
    'Actual_Confidence': y_test,
    'Predicted_Confidence': y_pred,
    'Actual_Class': y_test_class,
    'Predicted_Class': y_pred_class
})
print("\n Top 10 Predictions vs Actuals:")
print(comparison_df.head(10))                                      # out of the 20% tested data, it shows the top 10

# Evaluation

# Mean square method is used to obtain how far our model predicted from the actual values.
# It is calculated by subtracting predicted values with actual values(error) then squaring it and taking the average.
print("\n Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Classification report gives detail about how well the model handled each class
print("\n Classification Report:")
print(classification_report(y_test_class, y_pred_class))

# A table which gives actual(rows) vs predicted(columns) value
print("\n Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))

# Print accuracy score
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 

 Top 10 Predictions vs Actuals:
      Actual_Confidence  Predicted_Confidence Actual_Class Predicted_Class
247            0.572501              0.571974      Neutral         Neutral
1293           0.639770              0.608579      Neutral         Neutral
1562           0.423568              0.651972      Neutral         Neutral
1101           0.255415              0.568429      Bearish         Neutral
1161           0.635864              0.669990      Neutral         Bullish
382            0.457954              0.503044      Neutral         Neutral
1197           0.735602              0.547747      Bullish         Neutral
777            0.513502              0.564839      Neutral         Neutral
643            0.585349              0.600415      Neutral         Neutral
275            0.511786              0.556863      Neutral         Neu

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load and clean dataset
df = pd.read_csv("C:/Users/luvkh/data/Datasets/stock_data_with_exogenous_variables.csv")
df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

# Drop formula columns
formula_columns = [
    'Price_Change', 'StockMarket_Trend', 'Sentiment',
    'ATR', 'Stochastic', 'Momentum', 'IsFedMeetingDay', 'CoreCPI'
]
y = df['Market_Confidence']
X = df.drop(columns=formula_columns + ['Market_Confidence'])
X = X.select_dtypes(include='number')

# Shuffle the target
y_shuffled = np.random.permutation(y.values)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_shuffled, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base Model 1: Random Forest
rf = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=42)
rf.fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)

# Base Model 2: XGBoost
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb.fit(X_train_scaled, y_train)
xgb_pred = xgb.predict(X_test_scaled)

# Base Model 3: MLP
mlp = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
mlp.compile(optimizer='adam', loss='mse')
mlp.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=0)
mlp_pred = mlp.predict(X_test_scaled).flatten()

# Stack base predictions
stacked_X = np.column_stack((rf_pred, xgb_pred, mlp_pred))

# Meta-learner
meta_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
meta_model.fit(stacked_X, y_test)
final_pred = meta_model.predict(stacked_X)

# Convert prediction to class
def confidence_to_class(conf):
    if conf < 0.33:
        return "Bearish"
    elif conf <= 0.66:
        return "Neutral"
    else:
        return "Bullish"

y_test_class = [confidence_to_class(val) for val in y_test]
final_pred_class = [confidence_to_class(val) for val in final_pred]

# Evaluation
print("\n Mean Squared Error (shuffled):", mean_squared_error(y_test, final_pred))
print("\n Classification Report (shuffled):")
print(classification_report(y_test_class, final_pred_class))
print("\n Confusion Matrix (shuffled):")
print(confusion_matrix(y_test_class, final_pred_class))
accuracy = accuracy_score(y_test_class, final_pred_class)
print(f"\n ✅ Shuffled Label Accuracy: {accuracy * 100:.2f}%")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step

 Mean Squared Error (shuffled): 0.0012595392324101329

 Classification Report (shuffled):
              precision    recall  f1-score   support

     Bearish       0.00      0.00      0.00         2
     Bullish       0.98      0.59      0.73        87
     Neutral       0.88      1.00      0.93       277

    accuracy                           0.89       366
   macro avg       0.62      0.53      0.56       366
weighted avg       0.90      0.89      0.88       366


 Confusion Matrix (shuffled):
[[  0   0   2]
 [  0  51  36]
 [  0   1 276]]

 ✅ Shuffled Label Accuracy: 89.34%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
