In [1]:
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
from sklearn import metrics
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [2]:
data = pd.read_csv("data_cleaned.csv")
data

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q91,Q92,Q93,Q94,Q95,Q96,Q97,Q98,Q99,DLEAVING
0,4,3,5,4,4,4,5,4,4,4,...,3,2,1,3,1,4,4,4,4,1
1,5,5,5,5,5,5,5,4,4,5,...,3,2,1,4,1,4,4,4,4,1
2,5,5,5,5,5,5,5,5,5,5,...,2,1,1,5,2,5,5,5,5,0
3,2,2,5,4,3,2,4,4,4,4,...,3,2,1,1,2,3,1,3,4,0
4,5,5,4,5,5,5,5,5,5,5,...,1,2,1,4,2,5,5,5,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111128,4,4,5,5,2,5,5,4,5,4,...,4,2,1,4,1,5,5,5,5,1
111129,4,4,4,4,2,3,4,4,4,3,...,3,2,2,4,1,4,4,4,4,1
111130,3,3,3,4,3,4,4,4,4,3,...,2,1,1,3,2,3,3,3,3,1
111131,5,4,5,5,4,5,5,5,4,4,...,2,1,1,5,1,5,4,5,5,1


In [3]:
np.random.seed(42)
# Exclude "DLEAVING" from the list of columns to choose from
columns_list = data.columns.tolist()
columns_list.remove("DLEAVING")

# Randomly select 14 unique column names since "DLEAVING" will be included
selected_columns = np.random.choice(columns_list, size=14, replace=False)

# Add "DLEAVING" back to the list of selected columns
selected_columns = np.append(selected_columns, "DLEAVING")

# Create a new DataFrame with just the selected columns
data = data[selected_columns]
data

Unnamed: 0,Q26,Q61,Q60,Q49,Q41,Q89,Q87,Q43,Q11,Q1,Q15_5,Q27,Q84,Q91,DLEAVING
0,4,3,2,5,4,5,4,4,4,4,0,4,3,3,1
1,4,4,4,5,4,5,4,4,4,5,0,4,4,3,1
2,5,5,5,5,5,5,5,5,5,5,0,5,5,2,0
3,3,2,2,4,4,5,5,3,4,2,0,3,3,3,0
4,3,5,5,5,5,5,5,4,5,5,0,4,5,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111128,5,3,5,5,5,5,5,5,5,4,0,4,3,4,1
111129,4,3,3,3,4,3,3,4,5,4,0,4,4,3,1
111130,3,4,3,3,3,3,4,3,4,3,0,3,4,2,1
111131,3,5,5,5,4,5,5,5,4,5,0,5,4,2,1


In [4]:
from sklearn.model_selection import train_test_split
y=data['DLEAVING']
X=data.iloc[:, :-1]
X_train, X_test, y_train, y_test= train_test_split(
                                                X, y,
                                                test_size=0.2,
                                                random_state=53)



In [5]:
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# For AUC-ROC, you need the probabilities of the positive class
y_score = reg.predict_proba(X_test)[:, 1]
# Calculate precision, recall, f1-score, and support for each class
metrics = precision_recall_fscore_support(y_test, y_pred)

# Calculate macro and weighted averages
macro_avg = precision_recall_fscore_support(y_test, y_pred, average='macro')
weighted_avg = precision_recall_fscore_support(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

# Calculate AUC score, using the scores for the positive class
# If y_score is 2-dimensional (e.g., [n_samples, 2] for binary outcomes), use the second column: y_score[:, 1]
# If y_score is 1-dimensional (e.g., [n_samples]), use it directly
auc_score = roc_auc_score(y_test, y_score[:, 1]) if len(y_score.shape) > 1 else roc_auc_score(y_test, y_score)

# Prepare metrics for DataFrame creation, including AUC
metrics_list = list(metrics) + [[auc_score] * len(metrics[0])]  # Replicate AUC score for each class

# Create DataFrame
df_metrics = pd.DataFrame(metrics_list, 
                          index=['Precision', 'Recall', 'F1-score', 'Support', 'AUC']).T

# Initialize Accuracy column with NaNs or blanks
df_metrics['Accuracy'] = ' '  # This avoids repeating the accuracy for each class

# Adjust additional metrics dictionary to include the AUC and place Accuracy only in the macro average row
additional_rows = [
    {'Class': 'macro avg', 'Precision': macro_avg[0], 'Recall': macro_avg[1], 'F1-score': macro_avg[2], 'Support': ' ', 'AUC': auc_score, 'Accuracy': accuracy},
    {'Class': 'weighted avg', 'Precision': weighted_avg[0], 'Recall': weighted_avg[1], 'F1-score': weighted_avg[2], 'Support': ' ', 'AUC': auc_score, 'Accuracy': ' '}
]

# Convert dictionary to DataFrame and concatenate with the existing one
df_additional = pd.DataFrame(additional_rows)
df_metrics_final = pd.concat([df_metrics, df_additional], ignore_index=True)

# Adjust 'Class' column
df_metrics_final['Class'] = df_metrics_final.index.map(lambda x: f'Class {x}' if x < len(df_metrics) else df_metrics_final.loc[x, 'Class'])

# Rearrange columns to match desired output, including AUC and selectively showing Accuracy
df_metrics_final = df_metrics_final[['Class', 'Precision', 'Recall', 'F1-score', 'AUC', 'Accuracy', 'Support']]


print(df_metrics_final)
df_metrics_final.to_csv("logi40.csv", index=False)

          Class  Precision    Recall  F1-score       AUC  Accuracy  Support
0       Class 0   0.704590  0.401838  0.511793  0.759399             7182.0
1       Class 1   0.763058  0.919575  0.834037  0.759399            15045.0
2     macro avg   0.733824  0.660706  0.672915  0.759399  0.752283         
3  weighted avg   0.744166  0.752283  0.729913  0.759399                   


In [6]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb



param_grid = {'max_depth':range(1, 5),
                'learning_rate':(0.005, 0.05, 0.5)}
 
grid = GridSearchCV(xgb.XGBClassifier(random_state=0), 
                    param_grid=param_grid, cv=10, verbose=1, scoring='accuracy')
    
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)


# For AUC-ROC, you need the probabilities of the positive class
y_score = grid.predict_proba(X_test)[:, 1]
# Calculate precision, recall, f1-score, and support for each class
metrics = precision_recall_fscore_support(y_test, y_pred)

# Calculate macro and weighted averages
macro_avg = precision_recall_fscore_support(y_test, y_pred, average='macro')
weighted_avg = precision_recall_fscore_support(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

# Calculate AUC score, using the scores for the positive class
# If y_score is 2-dimensional (e.g., [n_samples, 2] for binary outcomes), use the second column: y_score[:, 1]
# If y_score is 1-dimensional (e.g., [n_samples]), use it directly
auc_score = roc_auc_score(y_test, y_score[:, 1]) if len(y_score.shape) > 1 else roc_auc_score(y_test, y_score)

# Prepare metrics for DataFrame creation, including AUC
metrics_list = list(metrics) + [[auc_score] * len(metrics[0])]  # Replicate AUC score for each class

# Create DataFrame
df_metrics = pd.DataFrame(metrics_list, 
                          index=['Precision', 'Recall', 'F1-score', 'Support', 'AUC']).T

# Initialize Accuracy column with NaNs or blanks
df_metrics['Accuracy'] = ' '  # This avoids repeating the accuracy for each class

# Adjust additional metrics dictionary to include the AUC and place Accuracy only in the macro average row
additional_rows = [
    {'Class': 'macro avg', 'Precision': macro_avg[0], 'Recall': macro_avg[1], 'F1-score': macro_avg[2], 'Support': ' ', 'AUC': auc_score, 'Accuracy': accuracy},
    {'Class': 'weighted avg', 'Precision': weighted_avg[0], 'Recall': weighted_avg[1], 'F1-score': weighted_avg[2], 'Support': ' ', 'AUC': auc_score, 'Accuracy': ' '}
]

# Convert dictionary to DataFrame and concatenate with the existing one
df_additional = pd.DataFrame(additional_rows)
df_metrics_final = pd.concat([df_metrics, df_additional], ignore_index=True)

# Adjust 'Class' column
df_metrics_final['Class'] = df_metrics_final.index.map(lambda x: f'Class {x}' if x < len(df_metrics) else df_metrics_final.loc[x, 'Class'])

# Rearrange columns to match desired output, including AUC and selectively showing Accuracy
df_metrics_final = df_metrics_final[['Class', 'Precision', 'Recall', 'F1-score', 'AUC', 'Accuracy', 'Support']]

print(df_metrics_final)
df_metrics_final.to_csv("xgb40.csv", index=False)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
          Class  Precision    Recall  F1-score      AUC  Accuracy  Support
0       Class 0   0.702059  0.427179  0.531163  0.76616             7182.0
1       Class 1   0.769614  0.913460  0.835390  0.76616            15045.0
2     macro avg   0.735837  0.670319  0.683277  0.76616  0.756332         
3  weighted avg   0.747786  0.756332  0.737088  0.76616                   


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score, classification_report
import pandas as pd
import numpy as np

# Define a simple neural network model for binary classification
model = Sequential([
    Dense(128, input_shape=(X_train.shape[1],), activation='relu'),  # Input layer
    Dense(64, activation='relu'),  # Hidden layer
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

# Make predictions on the testing dataset
y_pred = model.predict(X_test).ravel()  # Flatten the array to 1D if it's not already

# Convert probabilities to binary class labels
pred_nn = np.round(y_pred).astype(int)

# Calculate precision, recall, f1-score, and support for each class
metrics = precision_recall_fscore_support(y_test, pred_nn)

# Calculate macro and weighted averages
macro_avg = precision_recall_fscore_support(y_test, pred_nn, average='macro')
weighted_avg = precision_recall_fscore_support(y_test, pred_nn, average='weighted')
accuracy = accuracy_score(y_test, pred_nn)

# Calculate AUC score directly using y_pred
auc_score = roc_auc_score(y_test, y_pred)

# Create DataFrame
df_metrics = pd.DataFrame(metrics_list, 
                          index=['Precision', 'Recall', 'F1-score', 'Support', 'AUC']).T

# Initialize Accuracy column with NaNs or blanks
df_metrics['Accuracy'] = ' '  # This avoids repeating the accuracy for each class

# Adjust additional metrics dictionary to include the AUC and place Accuracy only in the macro average row
additional_rows = [
    {'Class': 'macro avg', 'Precision': macro_avg[0], 'Recall': macro_avg[1], 'F1-score': macro_avg[2], 'Support': ' ', 'AUC': auc_score, 'Accuracy': accuracy},
    {'Class': 'weighted avg', 'Precision': weighted_avg[0], 'Recall': weighted_avg[1], 'F1-score': weighted_avg[2], 'Support': ' ', 'AUC': auc_score, 'Accuracy': ' '}
]

# Convert dictionary to DataFrame and concatenate with the existing one
df_additional = pd.DataFrame(additional_rows)
df_metrics_final = pd.concat([df_metrics, df_additional], ignore_index=True)

# Adjust 'Class' column
df_metrics_final['Class'] = df_metrics_final.index.map(lambda x: f'Class {x}' if x < len(df_metrics) else df_metrics_final.loc[x, 'Class'])

# Rearrange columns to match desired output, including AUC and selectively showing Accuracy
df_metrics_final = df_metrics_final[['Class', 'Precision', 'Recall', 'F1-score', 'AUC', 'Accuracy', 'Support']]

print(df_metrics_final)
df_metrics_final.to_csv("nn40.csv", index=False)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2779/2779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 876us/step - accuracy: 0.7496 - loss: 0.5376
Epoch 2/10
[1m2779/2779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 886us/step - accuracy: 0.7573 - loss: 0.5253
Epoch 3/10
[1m2779/2779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 893us/step - accuracy: 0.7556 - loss: 0.5243
Epoch 4/10
[1m2779/2779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 896us/step - accuracy: 0.7559 - loss: 0.5233
Epoch 5/10
[1m2779/2779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 914us/step - accuracy: 0.7578 - loss: 0.5214
Epoch 6/10
[1m2779/2779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 895us/step - accuracy: 0.7591 - loss: 0.5210
Epoch 7/10
[1m2779/2779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 889us/step - accuracy: 0.7583 - loss: 0.5208
Epoch 8/10
[1m2779/2779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 913us/step - accuracy: 0.7555 - loss: 0.5249
Epoch 9/10
[1m2779

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
pipeline = Pipeline([
    ('clf', LinearSVC(dual=False))  # 'clf' is the name we give to the LinearSVC step
])



# Define a very simple parameter grid, could be expanded based on need
param_grid = {
    'clf__penalty': ['l1', 'l2'],
    'clf__loss': ['squared_hinge'],
    'clf__tol': [1e-4, 1e-3, 1e-2],
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [1000, 2000, 3000],
    'clf__intercept_scaling': [1, 10, 100]
}


# Initialize GridSearchCV with the SVC estimator and the simple param grid
grid = RandomizedSearchCV(pipeline, param_grid, verbose=3, cv=10)

# Train the model on the training dataset
grid.fit(X_train, y_train)

# Make predictions on the testing dataset using the best model found
y_pred = grid.predict(X_test)

# For AUC-ROC, you need the probabilities of the positive class
y_scores = grid.decision_function(X_test)
# Calculate precision, recall, f1-score, and support for each class
metrics = precision_recall_fscore_support(y_test, y_pred)

# Calculate macro and weighted averages
macro_avg = precision_recall_fscore_support(y_test, y_pred, average='macro')
weighted_avg = precision_recall_fscore_support(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

# Calculate AUC score, using the scores for the positive class
# If y_score is 2-dimensional (e.g., [n_samples, 2] for binary outcomes), use the second column: y_score[:, 1]
# If y_score is 1-dimensional (e.g., [n_samples]), use it directly
auc_score = roc_auc_score(y_test, y_score[:, 1]) if len(y_score.shape) > 1 else roc_auc_score(y_test, y_score)

# Create DataFrame
df_metrics = pd.DataFrame(metrics_list, 
                          index=['Precision', 'Recall', 'F1-score', 'Support', 'AUC']).T

# Initialize Accuracy column with NaNs or blanks
df_metrics['Accuracy'] = ' '  # This avoids repeating the accuracy for each class

# Adjust additional metrics dictionary to include the AUC and place Accuracy only in the macro average row
additional_rows = [
    {'Class': 'macro avg', 'Precision': macro_avg[0], 'Recall': macro_avg[1], 'F1-score': macro_avg[2], 'Support': ' ', 'AUC': auc_score, 'Accuracy': accuracy},
    {'Class': 'weighted avg', 'Precision': weighted_avg[0], 'Recall': weighted_avg[1], 'F1-score': weighted_avg[2], 'Support': ' ', 'AUC': auc_score, 'Accuracy': ' '}
]

# Convert dictionary to DataFrame and concatenate with the existing one
df_additional = pd.DataFrame(additional_rows)
df_metrics_final = pd.concat([df_metrics, df_additional], ignore_index=True)

# Adjust 'Class' column
df_metrics_final['Class'] = df_metrics_final.index.map(lambda x: f'Class {x}' if x < len(df_metrics) else df_metrics_final.loc[x, 'Class'])

# Rearrange columns to match desired output, including AUC and selectively showing Accuracy
df_metrics_final = df_metrics_final[['Class', 'Precision', 'Recall', 'F1-score', 'AUC', 'Accuracy', 'Support']]

print(df_metrics_final)
df_metrics_final.to_csv("svm40.csv", index=False)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV 1/10] END clf__C=100, clf__intercept_scaling=1, clf__loss=squared_hinge, clf__max_iter=1000, clf__penalty=l2, clf__tol=0.001;, score=0.755 total time=   0.2s
[CV 2/10] END clf__C=100, clf__intercept_scaling=1, clf__loss=squared_hinge, clf__max_iter=1000, clf__penalty=l2, clf__tol=0.001;, score=0.754 total time=   0.2s
[CV 3/10] END clf__C=100, clf__intercept_scaling=1, clf__loss=squared_hinge, clf__max_iter=1000, clf__penalty=l2, clf__tol=0.001;, score=0.755 total time=   0.2s
[CV 4/10] END clf__C=100, clf__intercept_scaling=1, clf__loss=squared_hinge, clf__max_iter=1000, clf__penalty=l2, clf__tol=0.001;, score=0.758 total time=   0.2s
[CV 5/10] END clf__C=100, clf__intercept_scaling=1, clf__loss=squared_hinge, clf__max_iter=1000, clf__penalty=l2, clf__tol=0.001;, score=0.758 total time=   0.2s
[CV 6/10] END clf__C=100, clf__intercept_scaling=1, clf__loss=squared_hinge, clf__max_iter=1000, clf__penalty=l2, clf__tol=0.00

[CV 2/10] END clf__C=1, clf__intercept_scaling=10, clf__loss=squared_hinge, clf__max_iter=2000, clf__penalty=l2, clf__tol=0.0001;, score=0.754 total time=   0.1s
[CV 3/10] END clf__C=1, clf__intercept_scaling=10, clf__loss=squared_hinge, clf__max_iter=2000, clf__penalty=l2, clf__tol=0.0001;, score=0.754 total time=   0.1s
[CV 4/10] END clf__C=1, clf__intercept_scaling=10, clf__loss=squared_hinge, clf__max_iter=2000, clf__penalty=l2, clf__tol=0.0001;, score=0.758 total time=   0.1s
[CV 5/10] END clf__C=1, clf__intercept_scaling=10, clf__loss=squared_hinge, clf__max_iter=2000, clf__penalty=l2, clf__tol=0.0001;, score=0.758 total time=   0.1s
[CV 6/10] END clf__C=1, clf__intercept_scaling=10, clf__loss=squared_hinge, clf__max_iter=2000, clf__penalty=l2, clf__tol=0.0001;, score=0.757 total time=   0.1s
[CV 7/10] END clf__C=1, clf__intercept_scaling=10, clf__loss=squared_hinge, clf__max_iter=2000, clf__penalty=l2, clf__tol=0.0001;, score=0.744 total time=   0.1s
[CV 8/10] END clf__C=1, clf_

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Adjusting the parameter grid for RandomForestClassifier
param_grid = {
    'n_estimators': [10, 50, 100, 200],  # Number of trees in the forest
    'max_depth': range(1, 5)  # Maximum depth of the tree
}

grid = GridSearchCV(RandomForestClassifier(random_state=0),
                    param_grid=param_grid, cv=10, verbose=1, scoring='accuracy')

# Fit to the training data
grid.fit(X_train, y_train)

# Predict on the test data
y_pred = grid.predict(X_test)

# For AUC-ROC, you need the probabilities of the positive class
y_score = grid.predict_proba(X_test)[:, 1]
# Calculate precision, recall, f1-score, and support for each class
metrics = precision_recall_fscore_support(y_test, y_pred)

# Calculate macro and weighted averages
macro_avg = precision_recall_fscore_support(y_test, y_pred, average='macro')
weighted_avg = precision_recall_fscore_support(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

# Calculate AUC score, using the scores for the positive class
# If y_score is 2-dimensional (e.g., [n_samples, 2] for binary outcomes), use the second column: y_score[:, 1]
# If y_score is 1-dimensional (e.g., [n_samples]), use it directly
auc_score = roc_auc_score(y_test, y_score[:, 1]) if len(y_score.shape) > 1 else roc_auc_score(y_test, y_score)

# Create DataFrame
df_metrics = pd.DataFrame(metrics_list, 
                          index=['Precision', 'Recall', 'F1-score', 'Support', 'AUC']).T

# Initialize Accuracy column with NaNs or blanks
df_metrics['Accuracy'] = ' '  # This avoids repeating the accuracy for each class

# Adjust additional metrics dictionary to include the AUC and place Accuracy only in the macro average row
additional_rows = [
    {'Class': 'macro avg', 'Precision': macro_avg[0], 'Recall': macro_avg[1], 'F1-score': macro_avg[2], 'Support': ' ', 'AUC': auc_score, 'Accuracy': accuracy},
    {'Class': 'weighted avg', 'Precision': weighted_avg[0], 'Recall': weighted_avg[1], 'F1-score': weighted_avg[2], 'Support': ' ', 'AUC': auc_score, 'Accuracy': ' '}
]

# Convert dictionary to DataFrame and concatenate with the existing one
df_additional = pd.DataFrame(additional_rows)
df_metrics_final = pd.concat([df_metrics, df_additional], ignore_index=True)

# Adjust 'Class' column
df_metrics_final['Class'] = df_metrics_final.index.map(lambda x: f'Class {x}' if x < len(df_metrics) else df_metrics_final.loc[x, 'Class'])

# Rearrange columns to match desired output, including AUC and selectively showing Accuracy
df_metrics_final = df_metrics_final[['Class', 'Precision', 'Recall', 'F1-score', 'AUC', 'Accuracy', 'Support']]

print(df_metrics_final)
df_metrics_final.to_csv("rf40.csv", index=False)

Fitting 10 folds for each of 16 candidates, totalling 160 fits
          Class  Precision    Recall  F1-score       AUC  Accuracy  Support
0       Class 0   0.702059  0.427179  0.531163  0.766160             7182.0
1       Class 1   0.769614  0.913460  0.835390  0.766160            15045.0
2     macro avg   0.738448  0.658397  0.670523  0.758918  0.753048         
3  weighted avg   0.746429  0.753048  0.728856  0.758918                   
