In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, RFE

import pickle
import time 

In [2]:
# Creating Function(s) :

def SelectKBest_Classification(indep_X,dep_Y,n):
        test = SelectKBest(score_func=chi2, k=n)
        print(f"Selecting top {n} features using Chi-Square test")
        
        fit1= test.fit(indep_X,dep_Y)
        selectk_features = fit1.transform(indep_X)
        return selectk_features

def train_test_split_and_StandardScaler(indep_X,dep_Y):
        X_train, X_test, Y_train, Y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)    
        return X_train, X_test, Y_train, Y_test

def Confusion_Matrix(classifier,X_test):
    y_pred = classifier.predict(X_test)

    from sklearn.metrics import confusion_matrix
    confusion_matrix = confusion_matrix(Y_test, y_pred)

    from sklearn.metrics import classification_report 
    classification_report=classification_report(Y_test, y_pred)

    from sklearn.metrics import accuracy_score
    accuracy_score=accuracy_score(Y_test, y_pred)         

    return classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score 

def LogisticRegression(X_train,Y_train,X_test):       
    # Fitting K-NN to the Training set
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state = 0) 
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score   

def SVM_Linear(X_train,Y_train,X_test):

    from sklearn.svm import SVC
    classifier = SVC(kernel = 'linear', random_state = 0)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score

def SVM_Non_Linear(X_train,Y_train,X_test):

    from sklearn.svm import SVC
    classifier = SVC(kernel = 'rbf', random_state = 0)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score

def Naive_Bayes(X_train,Y_train,X_test):       

    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score

def KNN(X_train,Y_train,X_test):

    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score

def DecisionTree(X_train,Y_train,X_test):

    # Fitting K-NN to the Training set
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score

def RandomForest(X_train,Y_train,X_test):

    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score


def SelectK_Classification(accuracy_LogisticRegression, accuracy_SVM_Linear, accuracy_SVM_NonLinear, 
                           accuracy_KNN, accuracy_NaiveBayes, accuracy_DecisionTree, accuracy_RandomForest): 

    dataframe=pd.DataFrame(index=['ChiSquare'],columns=['Logistic Regression','SVM Linear','SVM Non Linear','KNN',
                                                        'Naive Bayes','Decision Tree','Random Forest'])
    
    #Function - enumerate() acts as a Counter which Iterates index starting from 0 (by default) and their item(s) from the iterable
    #Use enumerate() when We need both Position in the loop (number) and its value from the iterable (idex)
    
    for indexCount,indexValue in enumerate(dataframe.index):      
        dataframe['Logistic Regression'][indexValue]=accuracy_LogisticRegression[indexCount]       
        dataframe['SVM Linear'][indexValue]=accuracy_SVM_Linear[indexCount]
        dataframe['SVM Non Linear'][indexValue]=accuracy_SVM_NonLinear[indexCount]
        dataframe['KNN'][indexValue]=accuracy_KNN[indexCount]
        dataframe['Naive Bayes'][indexValue]=accuracy_NaiveBayes[indexCount]
        dataframe['Decision Tree'][indexValue]=accuracy_DecisionTree[indexCount]
        dataframe['Random Forest'][indexValue]=accuracy_RandomForest[indexCount]
    return dataframe

In [3]:
# ✅ 1.Loading Original Dataset : 
dataset=pd.read_csv("Preprocessed_Data_Cryptos - One Percent of Actual Dataset.csv",index_col=None)
print(dataset.shape)
dataset.head()

(5242, 18)


Unnamed: 0,Crypto,Date,Time - 24 Hour Format,Time - 12 Hour Format,volume,open,Close Minus Open,Adj Close Minus Open,Close Minus Adj Close,High Minus Open,High Minus Close,High Minus Adj Close,High Minus Low,close,adj_close,high,low,Trade Impact
0,SPC-USD,03-10-2020,00:00:00,12:00 AM,1954.0,0.005614,0.000301,0.000301,0.0,0.000305,4e-06,4e-06,0.00039,0.005915,0.005915,0.005919,0.005529,Positive Impact
1,CONX28135-USD,21-03-2025,00:00:00,12:00 AM,46719.0,17.543959,0.109655,0.109655,0.0,0.180546,0.07089,0.07089,0.242294,17.653614,17.653614,17.724504,17.48221,Positive Impact
2,SHX-USD,24-11-2020,00:00:00,12:00 AM,8190.0,0.000192,-3e-06,-3e-06,0.0,1.9e-05,2.2e-05,2.2e-05,3.2e-05,0.000189,0.000189,0.000211,0.000179,Negative Impact
3,PALLA-USD,16-12-2022,00:00:00,12:00 AM,84921.0,0.011293,0.000133,0.000133,0.0,0.000147,1.4e-05,1.4e-05,0.000162,0.011426,0.011426,0.01144,0.011278,Positive Impact
4,PORK29220-USD,16-05-2024,00:00:00,12:00 AM,1864946.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No Impact


In [4]:
#✅ 2.Duplicating the Original Dataset
dataset2 = dataset.drop(['Crypto','Date','Time - 24 Hour Format','Time - 12 Hour Format'],axis=1)

#✅ 3.Classifying the Nominal Columns in Dataset : 
dataset2 = pd.get_dummies(dataset2, drop_first=True)
print(dataset2.shape)
dataset2

(5242, 15)


Unnamed: 0,volume,open,Close Minus Open,Adj Close Minus Open,Close Minus Adj Close,High Minus Open,High Minus Close,High Minus Adj Close,High Minus Low,close,adj_close,high,low,Trade Impact_No Impact,Trade Impact_Positive Impact
0,1954.0,0.005614,0.000301,0.000301,0.0,0.000305,0.000004,0.000004,0.000390,0.005915,0.005915,0.005919,0.005529,False,True
1,46719.0,17.543959,0.109655,0.109655,0.0,0.180546,0.070890,0.070890,0.242294,17.653614,17.653614,17.724504,17.482210,False,True
2,8190.0,0.000192,-0.000003,-0.000003,0.0,0.000019,0.000022,0.000022,0.000032,0.000189,0.000189,0.000211,0.000179,False,False
3,84921.0,0.011293,0.000133,0.000133,0.0,0.000147,0.000014,0.000014,0.000162,0.011426,0.011426,0.011440,0.011278,False,True
4,1864946.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237,211093892.0,375.328049,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,375.659451,375.627867,400.833600,365.791182,True,False
5238,3487036.0,0.073854,-0.009214,-0.009214,0.0,0.004684,0.013898,0.013898,0.017769,0.064640,0.064640,0.078538,0.060769,False,False
5239,6895906.0,0.000381,0.000000,0.000000,0.0,0.000018,0.000018,0.000018,0.000022,0.000381,0.000381,0.000399,0.000377,True,False
5240,15719.0,0.006267,0.000142,0.000142,0.0,0.000198,0.000056,0.000056,0.000198,0.006409,0.006409,0.006465,0.006267,False,True


In [5]:
dataset2.columns

Index(['volume', 'open', 'Close Minus Open', 'Adj Close Minus Open',
       'Close Minus Adj Close', 'High Minus Open', 'High Minus Close',
       'High Minus Adj Close', 'High Minus Low', 'close', 'adj_close', 'high',
       'low', 'Trade Impact_No Impact', 'Trade Impact_Positive Impact'],
      dtype='object')

In [6]:
#✅ 4.Assigning Variables (Independent/Dependent) : 

indep_X = dataset2.drop(['Trade Impact_Positive Impact'], axis=1)
print(indep_X.shape)

dep_Y = dataset2[['Trade Impact_Positive Impact']]
print(dep_Y.shape)

(5242, 14)
(5242, 1)


In [7]:
# Scale for Chi-Square
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(indep_X)

In [25]:
#5.Calling a Created Function - select_K_Best(indep_X,dep_Y,n): which returns - selectk_features
#Here the number = n as Feature Selection which takes 5 Parameters as Input
k_Best = SelectKBest_Classification(X_scaled, dep_Y, 1)       

#Creating Empty Lists
accuracy_LogisticRegression = []
accuracy_SVM_Linear = []
accuracy_SVM_NonLinear = []
accuracy_KNN = []
accuracy_NaiveBayes = []
accuracy_DecisionTree = []
accuracy_RandomForest = []

k_Best

Selecting top 1 features using Chi-Square test


array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [0.],
       [0.]], shape=(5242, 1))

In [26]:
#6.Calling a Created Function - train_test_split_and_StandardScaler: which returns - X_train, X_test, Y_train, Y_test
#Here before creating a Model, We are Selecting Few Best Columns as Input 
#Hence, Passing (k_Best, dep_Y) along with Selected Number of Features instead of Usual (indep_X, dep_Y)
X_train, X_test, Y_train, Y_test = train_test_split_and_StandardScaler(k_Best, dep_Y)   

#Creating Various Models as follows :

#7.Calling a Created Function - LogisticRegression(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score
classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = LogisticRegression(X_train,Y_train,X_test)
accuracy_LogisticRegression.append(accuracy_score)

#8.Calling a Created Function - SVM_Linear(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score
classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = SVM_Linear(X_train,Y_train,X_test)  
accuracy_SVM_Linear.append(accuracy_score)

#9.Calling a Created Function - SVM_Non_Linear(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score
classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = SVM_Non_Linear(X_train,Y_train,X_test)  
accuracy_SVM_NonLinear.append(accuracy_score)

#10.Calling a Created Function - KNN(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score
classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = KNN(X_train,Y_train,X_test)  
accuracy_KNN.append(accuracy_score)

#11.Calling a Created Function - Naive_Bayes(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score
classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = Naive_Bayes(X_train,Y_train,X_test)  
accuracy_NaiveBayes.append(accuracy_score)

#12.Calling a Created Function - DecisionTree(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score
classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = DecisionTree(X_train,Y_train,X_test)  
accuracy_DecisionTree.append(accuracy_score)

#13.Calling a Created Function - RandomForest(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score
classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = RandomForest(X_train,Y_train,X_test)  
accuracy_RandomForest.append(accuracy_score)
    
#14.Calling a Created Function - SelectK(With Below Parameters): which returns - dataframe    
result = SelectK_Classification(accuracy_LogisticRegression, accuracy_SVM_Linear, accuracy_SVM_NonLinear, 
                           accuracy_KNN, accuracy_NaiveBayes, accuracy_DecisionTree, accuracy_RandomForest)

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  return self._fit(X, y)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier

In [10]:
# Calling the Final Results by Selecting Top 6 Features
result

Unnamed: 0,Logistic Regression,SVM Linear,SVM Non Linear,KNN,Naive Bayes,Decision Tree,Random Forest
ChiSquare,0.633105,0.630816,0.629291,0.960336,0.431732,0.971014,0.961098


In [13]:
# Calling the Final Results by Selecting Top 5 Features
result

Unnamed: 0,Logistic Regression,SVM Linear,SVM Non Linear,KNN,Naive Bayes,Decision Tree,Random Forest
ChiSquare,0.633105,0.630816,0.628528,0.965675,0.432494,0.970252,0.971014


In [17]:
# Calling the Final Results by Selecting Top 4 Features
result

Unnamed: 0,Logistic Regression,SVM Linear,SVM Non Linear,KNN,Naive Bayes,Decision Tree,Random Forest
ChiSquare,0.633867,0.630816,0.629291,0.988558,0.433257,0.97254,0.979405


In [20]:
# Calling the Final Results by Selecting Top 3 Features
result

Unnamed: 0,Logistic Regression,SVM Linear,SVM Non Linear,KNN,Naive Bayes,Decision Tree,Random Forest
ChiSquare,0.617849,0.617849,0.617849,0.575896,0.43402,0.566743,0.57971


In [23]:
# Calling the Final Results by Selecting Top 2 Features
result

Unnamed: 0,Logistic Regression,SVM Linear,SVM Non Linear,KNN,Naive Bayes,Decision Tree,Random Forest
ChiSquare,0.617849,0.617849,0.617849,0.575896,0.431732,0.56598,0.577422


In [27]:
# Calling the Final Results by Selecting Top 1 Feature
result

Unnamed: 0,Logistic Regression,SVM Linear,SVM Non Linear,KNN,Naive Bayes,Decision Tree,Random Forest
ChiSquare,0.617849,0.617849,0.617849,0.617849,0.427155,0.617849,0.617849
