In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, RFE

import pickle
import time 

In [2]:
# Creating Function(s) :

def SelectKBest_Regression(indep_X,dep_Y,n):
        test = SelectKBest(score_func=chi2, k=n)
        print(f"Selecting top {n} features using Chi-Square test")
        
        fit1= test.fit(indep_X,dep_Y)
        selectk_features = fit1.transform(indep_X)
        return selectk_features

def train_test_split_and_StandardScaler(indep_X,dep_Y):
        X_train, X_test, Y_train, Y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)    
        return X_train, X_test, Y_train, Y_test
    
def R2_Prediction(regressor,X_test,Y_test):
    y_pred = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    R2_Score = r2_score(Y_test,y_pred)
    return R2_Score

def LinearRegression(X_train,Y_train,X_test):       
    # Fitting K-NN to the Training set
    from sklearn.linear_model import LinearRegression
    regressor = LinearRegression()
    regressor.fit(X_train, Y_train)
        
    # Calling a Created Function - R2_Prediction(regressor,X_test,Y_test) which returns - R2_Score
    R2_Score = R2_Prediction(regressor,X_test,Y_test)
    return R2_Score  

def SVM_Linear(X_train,Y_train,X_test):
    
    from sklearn.svm import SVR
    regressor = SVR(kernel = 'linear')
    regressor.fit(X_train, Y_train)

    # Calling a Created Function - R2_Prediction(regressor,X_test,Y_test) which returns - R2_Score
    R2_Score = R2_Prediction(regressor,X_test,Y_test)
    return R2_Score

def SVM_Non_Linear(X_train,Y_train,X_test):

    from sklearn.svm import SVR
    regressor = SVR(kernel = 'rbf')
    regressor.fit(X_train, Y_train)

    # Calling a Created Function - R2_Prediction(regressor,X_test,Y_test) which returns - R2_Score
    R2_Score = R2_Prediction(regressor,X_test,Y_test)
    return R2_Score

def DecisionTree(X_train,Y_train,X_test):

    # Fitting K-NN to the Training set
    from sklearn.tree import DecisionTreeRegressor
    regressor = DecisionTreeRegressor(random_state = 0)
    regressor.fit(X_train, Y_train)

    # Calling a Created Function - R2_Prediction(regressor,X_test,Y_test) which returns - R2_Score
    R2_Score = R2_Prediction(regressor,X_test,Y_test)
    return R2_Score

def RandomForest(X_train,Y_train,X_test):

    from sklearn.ensemble import RandomForestRegressor
    regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
    regressor.fit(X_train, Y_train)

    # Calling a Created Function - R2_Prediction(regressor,X_test,Y_test) which returns - R2_Score
    R2_Score = R2_Prediction(regressor,X_test,Y_test)
    return R2_Score

def SelectK_Regression(R2_LinearRegression, R2_SVM_Linear, R2_SVM_NonLinear, R2_DecisionTree, R2_RandomForest): 

    dataframe=pd.DataFrame(index=['ChiSquare'],columns=['Linear Regression','SVM Linear','SVM Non Linear',
                                                        'Decision Tree','Random Forest'])
    
    #Function - enumerate() acts as a Counter which Iterates index starting from 0 (by default) and their item(s) from the iterable
    #Use enumerate() when We need both Position in the loop (number) and its value from the iterable (idex)
    
    for indexCount,indexValue in enumerate(dataframe.index):      
        dataframe['Linear Regression'][indexValue]=R2_LinearRegression[indexCount]       
        dataframe['SVM Linear'][indexValue]=R2_SVM_Linear[indexCount]
        dataframe['SVM Non Linear'][indexValue]=R2_SVM_NonLinear[indexCount]
        dataframe['Decision Tree'][indexValue]=R2_DecisionTree[indexCount]
        dataframe['Random Forest'][indexValue]=R2_RandomForest[indexCount]
    return dataframe

In [3]:
# ✅ 1.Loading Original Dataset : 
dataset=pd.read_csv("Preprocessed_Data_Cryptos - One Percent of Actual Dataset.csv",index_col=None)
print(dataset.shape)
dataset.head()

(5242, 18)


Unnamed: 0,Crypto,Date,Time - 24 Hour Format,Time - 12 Hour Format,volume,open,Close Minus Open,Adj Close Minus Open,Close Minus Adj Close,High Minus Open,High Minus Close,High Minus Adj Close,High Minus Low,close,adj_close,high,low,Trade Impact
0,SPC-USD,03-10-2020,00:00:00,12:00 AM,1954.0,0.005614,0.000301,0.000301,0.0,0.000305,4e-06,4e-06,0.00039,0.005915,0.005915,0.005919,0.005529,Positive Impact
1,CONX28135-USD,21-03-2025,00:00:00,12:00 AM,46719.0,17.543959,0.109655,0.109655,0.0,0.180546,0.07089,0.07089,0.242294,17.653614,17.653614,17.724504,17.48221,Positive Impact
2,SHX-USD,24-11-2020,00:00:00,12:00 AM,8190.0,0.000192,-3e-06,-3e-06,0.0,1.9e-05,2.2e-05,2.2e-05,3.2e-05,0.000189,0.000189,0.000211,0.000179,Negative Impact
3,PALLA-USD,16-12-2022,00:00:00,12:00 AM,84921.0,0.011293,0.000133,0.000133,0.0,0.000147,1.4e-05,1.4e-05,0.000162,0.011426,0.011426,0.01144,0.011278,Positive Impact
4,PORK29220-USD,16-05-2024,00:00:00,12:00 AM,1864946.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No Impact


In [4]:
#✅ 2.Duplicating the Original Dataset
dataset2 = dataset.drop(['Crypto','Date','Time - 24 Hour Format','Time - 12 Hour Format'],axis=1)

#✅ 3.Classifying the Nominal Columns in Dataset : 
dataset2 = pd.get_dummies(dataset2, drop_first=True)
print(dataset2.shape)
dataset2

(5242, 15)


Unnamed: 0,volume,open,Close Minus Open,Adj Close Minus Open,Close Minus Adj Close,High Minus Open,High Minus Close,High Minus Adj Close,High Minus Low,close,adj_close,high,low,Trade Impact_No Impact,Trade Impact_Positive Impact
0,1954.0,0.005614,0.000301,0.000301,0.0,0.000305,0.000004,0.000004,0.000390,0.005915,0.005915,0.005919,0.005529,False,True
1,46719.0,17.543959,0.109655,0.109655,0.0,0.180546,0.070890,0.070890,0.242294,17.653614,17.653614,17.724504,17.482210,False,True
2,8190.0,0.000192,-0.000003,-0.000003,0.0,0.000019,0.000022,0.000022,0.000032,0.000189,0.000189,0.000211,0.000179,False,False
3,84921.0,0.011293,0.000133,0.000133,0.0,0.000147,0.000014,0.000014,0.000162,0.011426,0.011426,0.011440,0.011278,False,True
4,1864946.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237,211093892.0,375.328049,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,375.659451,375.627867,400.833600,365.791182,True,False
5238,3487036.0,0.073854,-0.009214,-0.009214,0.0,0.004684,0.013898,0.013898,0.017769,0.064640,0.064640,0.078538,0.060769,False,False
5239,6895906.0,0.000381,0.000000,0.000000,0.0,0.000018,0.000018,0.000018,0.000022,0.000381,0.000381,0.000399,0.000377,True,False
5240,15719.0,0.006267,0.000142,0.000142,0.0,0.000198,0.000056,0.000056,0.000198,0.006409,0.006409,0.006465,0.006267,False,True


In [5]:
dataset2.columns

Index(['volume', 'open', 'Close Minus Open', 'Adj Close Minus Open',
       'Close Minus Adj Close', 'High Minus Open', 'High Minus Close',
       'High Minus Adj Close', 'High Minus Low', 'close', 'adj_close', 'high',
       'low', 'Trade Impact_No Impact', 'Trade Impact_Positive Impact'],
      dtype='object')

In [6]:
#✅ 4.Assigning Variables (Independent/Dependent) : 

indep_X = dataset2.drop(['Trade Impact_Positive Impact'], axis=1)
print(indep_X.shape)

dep_Y = dataset2[['Trade Impact_Positive Impact']]
print(dep_Y.shape)

(5242, 14)
(5242, 1)


In [7]:
# Scale for Chi-Square
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(indep_X)

In [8]:
#Creating Empty Lists
R2_LinearRegression = []
R2_SVM_Linear = []
R2_SVM_NonLinear = []
R2_DecisionTree = []
R2_RandomForest = []

In [9]:
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Enter the Number of Features
No_Of_Features = 10
Feature_List = []

# Loop through and print each regressor name with its value
for value in range(No_Of_Features, 0, -1):  # from 6 to 1 (inclusive)
    print(f"Feature Count: {value}")  
    
    #5.Calling a Created Function - select_K_Best(indep_X,dep_Y,n): which returns - selectk_features
    k_Best = SelectKBest_Regression(X_scaled,dep_Y,1)      
    k_Best

    #6.Calling a Created Function - train_test_split_and_StandardScaler: which returns - X_train, X_test, Y_train, Y_test
    #Here before creating a Model, We are Selecting Few Best Columns as Input 
    #Hence, Passing (k_Best, dep_Y) along with Selected Number of Features instead of Usual (indep_X, dep_Y)
    X_train, X_test, Y_train, Y_test = train_test_split_and_StandardScaler(k_Best, dep_Y)   
    
    #Creating Various Models as follows :
    
    #7.Calling a Created Function - LogisticRegression(X_train,Y_train,X_test): which returns - R2_score
    R2_score = LinearRegression(X_train,Y_train,X_test)
    R2_LinearRegression.append(R2_score)
    
    #8.Calling a Created Function - SVM_Linear(X_train,Y_train,X_test): which returns - R2_score
    R2_score = SVM_Linear(X_train,Y_train,X_test)  
    R2_SVM_Linear.append(R2_score)
    
    #9.Calling a Created Function - SVM_Non_Linear(X_train,Y_train,X_test): which returns - R2_score
    R2_score = SVM_Non_Linear(X_train,Y_train,X_test)  
    R2_SVM_NonLinear.append(R2_score)
    
    #10.Calling a Created Function - DecisionTree(X_train,Y_train,X_test): which returns - R2_score
    R2_score = DecisionTree(X_train,Y_train,X_test)  
    R2_DecisionTree.append(R2_score)
    
    #11.Calling a Created Function - RandomForest(X_train,Y_train,X_test): which returns - R2_score
    R2_score = RandomForest(X_train,Y_train,X_test)  
    R2_RandomForest.append(R2_score)
        
    #12.Calling a Created Function - SelectK(With Below Parameters): which returns - dataframe    
    result = SelectK_Regression(R2_LinearRegression, R2_SVM_Linear, R2_SVM_NonLinear, R2_DecisionTree, R2_RandomForest)

    # Append a dictionary with feature count and result
    Feature_List.append({
        "Feature_Count": value, # Number of features used
        "Result": result # DataFrame of accuracy scores from multiple classifiers
    })


Feature Count: 10
Selecting top 1 features using Chi-Square test
Feature Count: 9
Selecting top 1 features using Chi-Square test
Feature Count: 8
Selecting top 1 features using Chi-Square test
Feature Count: 7
Selecting top 1 features using Chi-Square test
Feature Count: 6
Selecting top 1 features using Chi-Square test
Feature Count: 5
Selecting top 1 features using Chi-Square test
Feature Count: 4
Selecting top 1 features using Chi-Square test
Feature Count: 3
Selecting top 1 features using Chi-Square test
Feature Count: 2
Selecting top 1 features using Chi-Square test
Feature Count: 1
Selecting top 1 features using Chi-Square test


In [11]:
# Combine all results into one DataFrame
combined_df = pd.concat([item["Result"].assign(Feature_Count = item["Feature_Count"]) for item in Feature_List], axis=0)

# Optional: reset index for cleanliness
combined_df = combined_df.reset_index()

# Convert all string columns to int (if safe)
combined_df = combined_df.apply(pd.to_numeric, errors='coerce')

# Show the combined DataFrame
print(combined_df)

   index  Linear Regression  SVM Linear  SVM Non Linear  Decision Tree  \
0    NaN           0.028897   -0.337168       -0.337168       0.028897   
1    NaN           0.028897   -0.337168       -0.337168       0.028897   
2    NaN           0.028897   -0.337168       -0.337168       0.028897   
3    NaN           0.028897   -0.337168       -0.337168       0.028897   
4    NaN           0.028897   -0.337168       -0.337168       0.028897   
5    NaN           0.028897   -0.337168       -0.337168       0.028897   
6    NaN           0.028897   -0.337168       -0.337168       0.028897   
7    NaN           0.028897   -0.337168       -0.337168       0.028897   
8    NaN           0.028897   -0.337168       -0.337168       0.028897   
9    NaN           0.028897   -0.337168       -0.337168       0.028897   

   Random Forest  Feature_Count  
0       0.028812             10  
1       0.028812              9  
2       0.028812              8  
3       0.028812              7  
4       0.02881

In [12]:
# Get only the first mode per Row
mode_max = combined_df.max().drop("Feature_Count")
mode_max

index                     NaN
Linear Regression    0.028897
SVM Linear          -0.337168
SVM Non Linear      -0.337168
Decision Tree        0.028897
Random Forest        0.028812
dtype: float64

In [13]:
# Get the maximum value
mode_max_final = mode_max.max()

# Get all regressors with the maximum value
mode_max_final_items = mode_max[mode_max == mode_max_final]

# Display names along with values
print(f"Final Regressors with Highest Repeatation and Maximum Value(s):\n{mode_max_final_items}")

Final Regressors with Highest Repeatation and Maximum Value(s):
Decision Tree    0.028897
dtype: float64
