# Ballanced dataset model implementation

## conventional techniques:
    
    Random Forest
    K-nearest neighbors
    Logistic regression
    Support vector machine (SVM) techniques
    linear regression


## Ballanced dataset Models

In [21]:
# Importing necessary libraries

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option('float_format', '{:f}'.format)

import matplotlib.pyplot as plt
import matplotlib.colors as colors  
import matplotlib.patches as mpatches
%matplotlib inline
import seaborn as sns

import time

import warnings
warnings.filterwarnings('ignore')


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, recall_score, precision_score

In [22]:
df = pd.read_csv('datasets/data2_balanced.csv')

In [23]:
df.head()

Unnamed: 0,s_amount,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,0.496764,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,0.637735,-0.98702,0.293438,-0.941386,0.54902,1.804879,0.215598,0.512307,0.333644,0.12427,0.091202,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,0
1,-0.458942,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,0.529808,0.140107,1.564246,0.574074,0.627719,0.706121,0.789188,0.40381,0.201799,-0.340687,-0.233984,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,0
2,-0.794279,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,0.690708,-0.272985,0.659201,0.805173,0.616874,3.069025,-0.577514,0.886526,0.239442,-2.366079,0.361652,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,0
3,-0.554667,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,0.575231,-0.752581,0.737483,0.592994,0.559535,-0.697664,-0.030669,0.242629,2.178616,-1.34506,-0.378223,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,0
4,0.187692,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,0.968046,-1.203171,1.029577,1.43931,0.241454,0.153008,0.224538,0.366466,0.291782,0.445317,0.247237,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,0


In [24]:
# Spliting Training and testing data
X = df.drop('Class', axis=1)
y = df['Class']


In [25]:
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [26]:
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

# Check the Distribution of the labels
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

# Check to see if the distribution of the test and train labels is similar.
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)

print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))

Train: [ 57020  57021  57022 ... 568627 568628 568629] Test: [     0      1      2 ... 341175 341176 341177]
Train: [     0      1      2 ... 568627 568628 568629] Test: [ 57020  57021  57022 ... 398038 398039 398040]
Train: [     0      1      2 ... 568627 568628 568629] Test: [113967 113968 113969 ... 454901 454902 454903]
Train: [     0      1      2 ... 568627 568628 568629] Test: [170949 170950 170951 ... 511764 511765 511766]
Train: [     0      1      2 ... 511764 511765 511766] Test: [227869 227870 227871 ... 568627 568628 568629]
----------------------------------------------------------------------------------------------------
Label Distributions: 

[0.5 0.5]
[0.5 0.5]


In [27]:
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Random Forest

In [28]:
import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define different values of n_estimators to test
n_estimators_list = [10, 20, 100]

# Initialize a list to store the results
results = []

# Loop over the n_estimators_list
for n_estimators in n_estimators_list:
    start_time = time.time()  # Start timing
    
    # Instantiate the Random Forest classifier with the current number of estimators
    rf_classifier = RandomForestClassifier(n_estimators=n_estimators, class_weight='balanced', random_state=42)
    
    # Fit the classifier to the training data
    rf_classifier.fit(X_train, y_train)
    
    # Predict and evaluate on training data
    y_train_pred = rf_classifier.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    
    # Predict and evaluate on testing data
    y_test_pred = rf_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    
    # Calculate elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Append training and testing results separately, including n_estimators value and execution time
    results.append({
        'Parameters': f'n_estimators: {n_estimators}',
        'Data Split': 'Training',
        'Accuracy': train_accuracy,
        'F1 Score': train_f1,
        'Recall': train_recall,
        'Precision': train_precision,
        'Execution Time (s)': elapsed_time
    })
    results.append({
        'Parameters': f'n_estimators: {n_estimators}',
        'Data Split': 'Testing',
        'Accuracy': test_accuracy,
        'F1 Score': test_f1,
        'Recall': test_recall,
        'Precision': test_precision,
        'Execution Time (s)': elapsed_time
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

In [29]:
results_df.to_excel('data2_results/RF_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Parameters,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,n_estimators: 10,Training,0.999996,0.999996,1.0,0.999991,122.482803
1,n_estimators: 10,Testing,0.999833,0.999833,0.999965,0.999702,122.482803
2,n_estimators: 20,Training,0.999998,0.999998,1.0,0.999996,223.15144
3,n_estimators: 20,Testing,0.999894,0.999895,1.0,0.999789,223.15144
4,n_estimators: 100,Training,1.0,1.0,1.0,1.0,1493.506191


## K-Nearest Neighbors (KNN)

In [30]:
import time
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define the different values of neighbors to test
neighbors_list = [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]

# Initialize a list to store the results
results = []

# Loop over the neighbors_list
for n_neighbors in neighbors_list:
    start_time = time.time()  # Start timing
    
    # Instantiate and fit the KNN classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
    knn_classifier.fit(X_train, y_train)
    
    # Predict and evaluate on training data
    y_train_pred = knn_classifier.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    
    # Predict and evaluate on testing data
    y_test_pred = knn_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    
    # Calculate elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Append training and testing results separately, repeating execution time for both
    results.append({
        'Parameters': f'n_neighbors: {n_neighbors}',
        'Data Split': 'Training',
        'Accuracy': train_accuracy,
        'F1 Score': train_f1,
        'Recall': train_recall,
        'Precision': train_precision,
        'Execution Time (s)': elapsed_time
    })
    results.append({
        'Parameters': f'n_neighbors: {n_neighbors}',
        'Data Split': 'Testing',
        'Accuracy': test_accuracy,
        'F1 Score': test_f1,
        'Recall': test_recall,
        'Precision': test_precision,
        'Execution Time (s)': elapsed_time
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

In [31]:
results_df.to_excel('data2_results/KNN_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Parameters,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,n_neighbors: 2,Training,1.0,1.0,1.0,1.0,646.04681
1,n_neighbors: 2,Testing,0.999314,0.999316,0.99993,0.998703,646.04681
2,n_neighbors: 3,Training,1.0,1.0,1.0,1.0,3476.791402
3,n_neighbors: 3,Testing,0.998848,0.998852,0.999965,0.997741,3476.791402
4,n_neighbors: 4,Training,1.0,1.0,1.0,1.0,552.213389


## Logistic Regression LR

While Logistic Regression doesn't have a parameter like the number of neighbors in KNN, there are still several hyperparameters you can adjust to observe different performance outcomes. Some of the commonly tweaked hyperparameters in Logistic Regression include:

    C (Inverse of regularization strength): Smaller values specify stronger regularization. Regularization can help prevent overfitting by penalizing larger coefficients.

    Solver: The algorithm to use for optimization ('liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'). Different solvers are suitable for different types of data and different penalty configurations.

    Penalty: Specifies the norm used in the penalization ('l1', 'l2', 'elasticnet', 'none'). Different penalties can lead to different decision boundaries.

In [32]:
import time
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define different values of C to test
C_values = [0.01, 0.1, 1, 10, 100]

# Initialize a list to store the results
results = []

# Loop over the C_values
for C in C_values:
    start_time = time.time()  # Start timing
    
    # Instantiate and fit the Logistic Regression classifier
    logreg_classifier = LogisticRegression(C=C, solver='liblinear')
    logreg_classifier.fit(X_train, y_train)
    
    # Predict and evaluate on training data
    y_train_pred = logreg_classifier.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    
    # Predict and evaluate on testing data
    y_test_pred = logreg_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    
    # Calculate elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Append training and testing results separately, including C value and execution time
    results.append({
        'Parameters': f'C: {C}',
        'Data Split': 'Training',
        'Accuracy': train_accuracy,
        'F1 Score': train_f1,
        'Recall': train_recall,
        'Precision': train_precision,
        'Execution Time (s)': elapsed_time
    })
    results.append({
        'Parameters': f'C: {C}',
        'Data Split': 'Testing',
        'Accuracy': test_accuracy,
        'F1 Score': test_f1,
        'Recall': test_recall,
        'Precision': test_precision,
        'Execution Time (s)': elapsed_time
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

In [33]:
results_df.to_excel('data2_results/LogisticReg_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Parameters,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,C: 0.01,Training,0.962493,0.961703,0.942337,0.981882,7.648315
1,C: 0.01,Testing,0.962788,0.962126,0.943432,0.981575,7.648315
2,C: 0.1,Training,0.964593,0.964042,0.949767,0.978754,8.111438
3,C: 0.1,Testing,0.96481,0.96439,0.95112,0.978036,8.111438
4,C: 1,Training,0.964861,0.96436,0.951297,0.977787,9.126934


## SVM - Linear
    One often adjusted hyperparameter for SVM is the C parameter, which manages the trade-off between lowering the weights' norm and obtaining a low error on the training set. This effectively regulates the margin size and, consequently, the model's tolerance for incorrectly categorized points. 
    
    The LinearSVC class is optimized for linear SVMs and can handle large datasets more efficiently than SVC with a linear kernel.    

In [None]:
import time
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define different values of C to test
C_values = [0.01, 0.1, 1, 10, 100]

# Initialize a list to store the results
results = []

# Loop over the C_values
for C in C_values:
    start_time = time.time()  # Start timing
    
    # Instantiate and fit the LinearSVC classifier
    svm_classifier = LinearSVC(C=C, max_iter=10000)  # Adjust max_iter as needed
    svm_classifier.fit(X_train, y_train)
    
    # Predict and evaluate on training data
    y_train_pred = svm_classifier.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    
    # Predict and evaluate on testing data
    y_test_pred = svm_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    
    # Calculate elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Append training and testing results separately, including C value and execution time
    results.append({
        'Parameters': f'C: {C}',
        'Data Split': 'Training',
        'Accuracy': train_accuracy,
        'F1 Score': train_f1,
        'Recall': train_recall,
        'Precision': train_precision,
        'Execution Time (s)': elapsed_time
    })
    results.append({
        'Parameters': f'C: {C}',
        'Data Split': 'Testing',
        'Accuracy': test_accuracy,
        'F1 Score': test_f1,
        'Recall': test_recall,
        'Precision': test_precision,
        'Execution Time (s)': elapsed_time
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

In [None]:
results_df.to_excel('data2_results/SVM_results.xlsx', index=False)
results_df.head()

## Linear Regression
    While Linear Regression isn't typically used for classification tasks, it's possible to apply it for binary classification (e.g., predicting 1/0) by setting a threshold on the regression output. A common approach is to interpret the regression output as a probability (after some transformation, if necessary) and classify outputs greater than 0.5 as 1 and less than or equal to 0.5 as 0.

    

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

start_time = time.time()  # Start timing

# Instantiate and fit the Linear Regression model
linreg_classifier = LinearRegression()
linreg_classifier.fit(X_train, y_train)

# Predict on training data
y_train_pred_raw = linreg_classifier.predict(X_train)
# Convert predictions to binary outcomes
y_train_pred = np.where(y_train_pred_raw > 0.5, 1, 0)

# Calculate metrics for training data
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)

# Predict on testing data
y_test_pred_raw = linreg_classifier.predict(X_test)
# Convert predictions to binary outcomes
y_test_pred = np.where(y_test_pred_raw > 0.5, 1, 0)

# Calculate metrics for testing data
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)

# Calculate elapsed time
end_time = time.time()
elapsed_time = end_time - start_time

# Create a results dictionary and convert it to a DataFrame
# Create two separate dictionaries for training and testing results
training_results = {
    'Model': 'Linear Regression',
    'Data Split': 'Training',
    'Accuracy': train_accuracy,
    'F1 Score': train_f1,
    'Recall': train_recall,
    'Precision': train_precision,
    'Execution Time (s)': elapsed_time
}

testing_results = {
    'Model': 'Linear Regression',
    'Data Split': 'Testing',
    'Accuracy': test_accuracy,
    'F1 Score': test_f1,
    'Recall': test_recall,
    'Precision': test_precision,
    'Execution Time (s)': elapsed_time
}

# Combine the dictionaries into a list and convert it to a DataFrame
results_df = pd.DataFrame([training_results, testing_results])


In [None]:
results_df.to_excel('data2_results/LinearReg_results.xlsx', index=False)
results_df.head()