# Ballanced dataset model implementation

## conventional techniques:
    
    Random Forest
    K-nearest neighbors
    Logistic regression
    Support vector machine (SVM) techniques
    linear regression


## Ballanced dataset Models

In [1]:
# Importing necessary libraries

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option('float_format', '{:f}'.format)

import matplotlib.pyplot as plt
import matplotlib.colors as colors  
import matplotlib.patches as mpatches
%matplotlib inline
import seaborn as sns

import time

import warnings
warnings.filterwarnings('ignore')


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, recall_score, precision_score

In [2]:
df = pd.read_csv('datasets/data3_synthetic.csv')

In [3]:
df.head()

Unnamed: 0,Class,V1,V2,V3,V4,V5
0,0,0.223811,-0.997202,-0.760048,-0.368115,-0.98913
1,0,0.659099,-0.997189,0.97692,-0.316844,-0.98913
2,0,1.811941,-0.997146,-0.996419,-0.726818,-0.98913
3,0,0.354844,-0.997134,0.645444,0.74965,-0.98913
4,0,0.113514,-0.997126,-0.628333,0.983777,-0.98913


In [4]:
# Spliting Training and testing data
X = df.drop('Class', axis=1)
y = df['Class']


In [5]:
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [6]:
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

# Check the Distribution of the labels
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

# Check to see if the distribution of the test and train labels is similar.
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)

print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))

Train: [ 350197  350198  350199 ... 1754152 1754153 1754154] Test: [     0      1      2 ... 424727 425176 425184]
Train: [      0       1       2 ... 1754152 1754153 1754154] Test: [350197 350198 350199 ... 768566 768653 768685]
Train: [      0       1       2 ... 1754152 1754153 1754154] Test: [ 701101  701102  701103 ... 1095208 1095343 1095476]
Train: [      0       1       2 ... 1754152 1754153 1754154] Test: [1052163 1052164 1052165 ... 1424801 1424859 1425018]
Train: [      0       1       2 ... 1424801 1424859 1425018] Test: [1403110 1403111 1403112 ... 1754152 1754153 1754154]
----------------------------------------------------------------------------------------------------
Label Distributions: 

[0.9916313 0.0083687]
[0.99162845 0.00837155]


In [7]:
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Random Forest

In [8]:
import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define different values of n_estimators to test
n_estimators_list = [10, 20, 100]

# Initialize a list to store the results
results = []

# Loop over the n_estimators_list
for n_estimators in n_estimators_list:
    start_time = time.time()  # Start timing
    
    # Instantiate the Random Forest classifier with the current number of estimators
    rf_classifier = RandomForestClassifier(n_estimators=n_estimators, class_weight='balanced', random_state=42)
    
    # Fit the classifier to the training data
    rf_classifier.fit(X_train, y_train)
    
    # Predict and evaluate on training data
    y_train_pred = rf_classifier.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    
    # Predict and evaluate on testing data
    y_test_pred = rf_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    
    # Calculate elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    print(f'{n_estimators} DONE : ', elapsed_time)
    
    # Append training and testing results separately, including n_estimators value and execution time
    results.append({
        'Parameters': f'n_estimators: {n_estimators}',
        'Data Split': 'Training',
        'Accuracy': train_accuracy,
        'F1 Score': train_f1,
        'Recall': train_recall,
        'Precision': train_precision,
        'Execution Time (s)': elapsed_time
    })
    results.append({
        'Parameters': f'n_estimators: {n_estimators}',
        'Data Split': 'Testing',
        'Accuracy': test_accuracy,
        'F1 Score': test_f1,
        'Recall': test_recall,
        'Precision': test_precision,
        'Execution Time (s)': elapsed_time
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

10 DONE :  114.1181435585022
20 DONE :  229.83098244667053
100 DONE :  1723.5130016803741


In [9]:
results_df.to_excel('data3_results/RF_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Parameters,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,n_estimators: 10,Training,0.998653,0.913191,0.84025,1.0,114.118144
1,n_estimators: 10,Testing,0.993815,0.387352,0.240702,0.991329,114.118144
2,n_estimators: 20,Training,0.999369,0.961145,0.925197,1.0,229.830982
3,n_estimators: 20,Testing,0.993772,0.379438,0.234386,0.995529,229.830982
4,n_estimators: 100,Training,0.999995,0.999704,0.999408,1.0,1723.513002


## K-Nearest Neighbors (KNN)

In [15]:
import time
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define the different values of neighbors to test
neighbors_list = [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]

# Initialize a list to store the results
results = []

# Loop over the neighbors_list
for n_neighbors in neighbors_list:
    start_time = time.time()  # Start timing
    
    # Instantiate and fit the KNN classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
    knn_classifier.fit(X_train, y_train)
    
    # Predict and evaluate on training data
    y_train_pred = knn_classifier.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    
    # Predict and evaluate on testing data
    y_test_pred = knn_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    
    # Calculate elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    print(f'{n_neighbors} DONE : ', elapsed_time)
    
    # Append training and testing results separately, repeating execution time for both
    results.append({
        'Parameters': f'n_neighbors: {n_neighbors}',
        'Data Split': 'Training',
        'Accuracy': train_accuracy,
        'F1 Score': train_f1,
        'Recall': train_recall,
        'Precision': train_precision,
        'Execution Time (s)': elapsed_time
    })
    results.append({
        'Parameters': f'n_neighbors: {n_neighbors}',
        'Data Split': 'Testing',
        'Accuracy': test_accuracy,
        'F1 Score': test_f1,
        'Recall': test_recall,
        'Precision': test_precision,
        'Execution Time (s)': elapsed_time
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

2 DONE :  36.16287851333618
3 DONE :  46.462634563446045
4 DONE :  53.399858474731445
5 DONE :  62.09785580635071
6 DONE :  66.4122326374054
7 DONE :  71.04027795791626
8 DONE :  69.66048812866211
9 DONE :  64.19036197662354
10 DONE :  190.02127385139465
15 DONE :  101.13436818122864
20 DONE :  125.3592882156372


In [16]:
results_df.to_excel('data3_results/KNN_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Parameters,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,n_neighbors: 2,Training,1.0,1.0,1.0,1.0,36.162879
1,n_neighbors: 2,Testing,0.987322,0.229383,0.232281,0.226557,36.162879
2,n_neighbors: 3,Training,1.0,1.0,1.0,1.0,46.462635
3,n_neighbors: 3,Testing,0.992851,0.337559,0.224211,0.682692,46.462635
4,n_neighbors: 4,Training,1.0,1.0,1.0,1.0,53.399858


## Logistic Regression LR

While Logistic Regression doesn't have a parameter like the number of neighbors in KNN, there are still several hyperparameters you can adjust to observe different performance outcomes. Some of the commonly tweaked hyperparameters in Logistic Regression include:

    C (Inverse of regularization strength): Smaller values specify stronger regularization. Regularization can help prevent overfitting by penalizing larger coefficients.

    Solver: The algorithm to use for optimization ('liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'). Different solvers are suitable for different types of data and different penalty configurations.

    Penalty: Specifies the norm used in the penalization ('l1', 'l2', 'elasticnet', 'none'). Different penalties can lead to different decision boundaries.

In [17]:
import time
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define different values of C to test
C_values = [0.01, 0.1, 1, 10, 100]

# Initialize a list to store the results
results = []

# Loop over the C_values
for C in C_values:
    start_time = time.time()  # Start timing
    
    # Instantiate and fit the Logistic Regression classifier
    logreg_classifier = LogisticRegression(C=C, solver='liblinear')
    logreg_classifier.fit(X_train, y_train)
    
    # Predict and evaluate on training data
    y_train_pred = logreg_classifier.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    
    # Predict and evaluate on testing data
    y_test_pred = logreg_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    
    # Calculate elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Append training and testing results separately, including C value and execution time
    results.append({
        'Parameters': f'C: {C}',
        'Data Split': 'Training',
        'Accuracy': train_accuracy,
        'F1 Score': train_f1,
        'Recall': train_recall,
        'Precision': train_precision,
        'Execution Time (s)': elapsed_time
    })
    results.append({
        'Parameters': f'C: {C}',
        'Data Split': 'Testing',
        'Accuracy': test_accuracy,
        'F1 Score': test_f1,
        'Recall': test_recall,
        'Precision': test_precision,
        'Execution Time (s)': elapsed_time
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

In [18]:
results_df.to_excel('data3_results/LogisticReg_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Parameters,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,C: 0.01,Training,0.992354,0.170275,0.093061,1.0,4.851651
1,C: 0.01,Testing,0.992675,0.178914,0.098246,1.0,4.851651
2,C: 0.1,Training,0.992384,0.176199,0.096611,1.0,4.428366
3,C: 0.1,Testing,0.992703,0.184713,0.101754,1.0,4.428366
4,C: 1,Training,0.992385,0.17648,0.09678,1.0,4.321384


## SVM - Linear
    One often adjusted hyperparameter for SVM is the C parameter, which manages the trade-off between lowering the weights' norm and obtaining a low error on the training set. This effectively regulates the margin size and, consequently, the model's tolerance for incorrectly categorized points. 
    
    The LinearSVC class is optimized for linear SVMs and can handle large datasets more efficiently than SVC with a linear kernel.    

In [19]:
import time
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define different values of C to test
C_values = [0.01, 0.1, 1, 10] #, 100]

# Initialize a list to store the results
results = []

# Loop over the C_values
for C in C_values:
    start_time = time.time()  # Start timing
    
    # Instantiate and fit the LinearSVC classifier
    svm_classifier = LinearSVC(C=C, max_iter=10000)  # Adjust max_iter as needed
    svm_classifier.fit(X_train, y_train)
    
    # Predict and evaluate on training data
    y_train_pred = svm_classifier.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    
    # Predict and evaluate on testing data
    y_test_pred = svm_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    
    # Calculate elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    print(f'{C} DONE : ', elapsed_time)
    
    # Append training and testing results separately, including C value and execution time
    results.append({
        'Parameters': f'C: {C}',
        'Data Split': 'Training',
        'Accuracy': train_accuracy,
        'F1 Score': train_f1,
        'Recall': train_recall,
        'Precision': train_precision,
        'Execution Time (s)': elapsed_time
    })
    results.append({
        'Parameters': f'C: {C}',
        'Data Split': 'Testing',
        'Accuracy': test_accuracy,
        'F1 Score': test_f1,
        'Recall': test_recall,
        'Precision': test_precision,
        'Execution Time (s)': elapsed_time
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

0.01 DONE :  19.222820520401
0.1 DONE :  151.3875241279602
1 DONE :  1453.6152141094208
10 DONE :  5998.4648649692535


In [20]:
results_df.to_excel('data3_results/SVM_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Parameters,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,C: 0.01,Training,0.991579,0.002195,0.001099,1.0,19.222821
1,C: 0.01,Testing,0.991885,0.002103,0.001053,1.0,19.222821
2,C: 0.1,Training,0.991579,0.002364,0.001183,1.0,151.387524
3,C: 0.1,Testing,0.991885,0.002103,0.001053,1.0,151.387524
4,C: 1,Training,0.991579,0.002364,0.001183,1.0,1453.615214


## Linear Regression
    While Linear Regression isn't typically used for classification tasks, it's possible to apply it for binary classification (e.g., predicting 1/0) by setting a threshold on the regression output. A common approach is to interpret the regression output as a probability (after some transformation, if necessary) and classify outputs greater than 0.5 as 1 and less than or equal to 0.5 as 0.

    

In [21]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

start_time = time.time()  # Start timing

# Instantiate and fit the Linear Regression model
linreg_classifier = LinearRegression()
linreg_classifier.fit(X_train, y_train)

# Predict on training data
y_train_pred_raw = linreg_classifier.predict(X_train)
# Convert predictions to binary outcomes
y_train_pred = np.where(y_train_pred_raw > 0.5, 1, 0)

# Calculate metrics for training data
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)

# Predict on testing data
y_test_pred_raw = linreg_classifier.predict(X_test)
# Convert predictions to binary outcomes
y_test_pred = np.where(y_test_pred_raw > 0.5, 1, 0)

# Calculate metrics for testing data
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)

# Calculate elapsed time
end_time = time.time()
elapsed_time = end_time - start_time

# Create a results dictionary and convert it to a DataFrame
# Create two separate dictionaries for training and testing results
training_results = {
    'Model': 'Linear Regression',
    'Data Split': 'Training',
    'Accuracy': train_accuracy,
    'F1 Score': train_f1,
    'Recall': train_recall,
    'Precision': train_precision,
    'Execution Time (s)': elapsed_time
}

testing_results = {
    'Model': 'Linear Regression',
    'Data Split': 'Testing',
    'Accuracy': test_accuracy,
    'F1 Score': test_f1,
    'Recall': test_recall,
    'Precision': test_precision,
    'Execution Time (s)': elapsed_time
}

# Combine the dictionaries into a list and convert it to a DataFrame
results_df = pd.DataFrame([training_results, testing_results])


In [22]:
results_df.to_excel('data3_results/LinearReg_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Model,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,Linear Regression,Training,0.991571,0.000338,0.000169,1.0,3.168684
1,Linear Regression,Testing,0.991879,0.000702,0.000351,1.0,3.168684
