In [1]:
import pandas as pd

#dataset division
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

#Logistic Regression
from sklearn.linear_model import LogisticRegression

#KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

#Classification Reports
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer, accuracy_score
from sklearn.model_selection import cross_val_score

#Stratified Kfolds
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import ParameterGrid

#Decision Tree
from sklearn.tree import DecisionTreeClassifier

#Ensemble
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier, AdaBoostClassifier

#importing the GaussianNB classification algorithm
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB

#MLP CLassifier, neural network
from sklearn.neural_network import MLPClassifier

#Best decision tree calculator
from sklearn.model_selection import GridSearchCV

#Import tqdm
from tqdm.notebook import tqdm

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

Info:
The confusion matrix in sklearn is presented in the following format: <br>
[ [ TN  FP  ] <br>
    [ FN  TP ] ]

# Metrics Calculation Function

In [2]:
def metrics(y_train, pred_train , y_test, pred_test):
    '''Takes, as arguments, the labels and predictions for both the training and the test data
    Returns the classification_report and confusion matrix for training and test data'''
    print('_____________________________________')
    print('                                                     TRAIN                                                 ')
    print('-----------------------------------------------------------------------------------------------------------')
    print(classification_report(y_train, pred_train))
    print(confusion_matrix(y_train, pred_train))


    print('_____________________________________')
    print('                                                     TEST                                                  ')
    print('-----------------------------------------------------------------------------------------------------------')
    print(classification_report(y_test, pred_test))
    print(confusion_matrix(y_test, pred_test))

This function will be used to check the values of the metrics for the models that we will try

# Importing data

In [3]:
train = pd.read_csv("Data_after_feature_selection/train_data_scalled.csv", index_col = 0)
val = pd.read_csv("Data_after_feature_selection/val_data_scalled.csv", index_col = 0)

In [4]:
train["Admitted in School"].value_counts()

Admitted in School
0.0    323
1.0    176
Name: count, dtype: int64

In [5]:
323/(323+176)

0.6472945891783567

As the number of 0 is 65% and the number of 1 is 35% we conclude that the dataset is unbalanced

In [6]:
X_train = train.drop(columns = "Admitted in School")
y_train = train[["Admitted in School"]]
X_val = val.drop(columns = "Admitted in School")
y_val = val[["Admitted in School"]]

In [7]:
X = pd.concat([X_train, X_val], axis = 0)
y = pd.concat([y_train, y_val], axis = 0)

# Logistic Regression

In [8]:
def find_best_logistic_regression_params(X_train, y_train):

    # Define the parameter grid to search
    param_grid = {
        'penalty': ['l1', 'l2', "elasticnet"],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'], 
        "l1_ratio" : [0.1, 0.3, 0.5, 0.7]
    }

    # Create a Logistic Regression model
    log_reg = LogisticRegression()
    
    #Creates RepeatedStratifiedKFold
    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)

    # Use GridSearchCV to find the best parameters
    grid_search = GridSearchCV(log_reg, param_grid, cv=rskf, scoring='f1')
    grid_search.fit(X_train, y_train)

    # Print the best parameters
    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)


In [9]:
#find_best_logistic_regression_params(X_train, y_train)

Best Parameters: {'C': 4, 'penalty': 'l2', 'solver': 'liblinear', 'l1_ratio' = 0.1}

In [10]:
logreg = LogisticRegression(C = 4, penalty = 'l2', solver = 'liblinear', l1_ratio = 0.1).fit(X_train, y_train)

In [11]:
f1_score(y_val, logreg.predict(X_val), average='weighted')

0.7726295467509104

In [12]:
metrics(y_train = y_train, pred_train = logreg.predict(X_train) , y_test = y_val, pred_test = logreg.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.85      0.89      0.87       323
         1.0       0.78      0.72      0.75       176

    accuracy                           0.83       499
   macro avg       0.82      0.81      0.81       499
weighted avg       0.83      0.83      0.83       499

[[288  35]
 [ 49 127]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.81      0.86      0.83       138
         1.0       0.71      0.63

# KNN Classifier

In [13]:
"""numberK_list=np.arange(1,21)
high_score=0
nof=0           
score_list_train =[]
score_list_val =[]
for n in numberK_list:
    model = KNeighborsClassifier(n_neighbors = n).fit(X_train, y_train)
    score_train = model.score(X_train, y_train)
    score_val = model.score(X_val, y_val)
    score_list_train.append(score_train)
    score_list_val.append(score_val)
    
    if(score_val>high_score):
        high_score = score_val
        nof = numberK_list[n-1]
print("Best number of neighbors: %d" %nof)
print("Mean accuracy in train with %d neighbors: %f" % (nof, score_list_train[nof-1]))
print("Mean accuracy in validation with %d neighbors: %f" % (nof, high_score))"""

'numberK_list=np.arange(1,21)\nhigh_score=0\nnof=0           \nscore_list_train =[]\nscore_list_val =[]\nfor n in numberK_list:\n    model = KNeighborsClassifier(n_neighbors = n).fit(X_train, y_train)\n    score_train = model.score(X_train, y_train)\n    score_val = model.score(X_val, y_val)\n    score_list_train.append(score_train)\n    score_list_val.append(score_val)\n    \n    if(score_val>high_score):\n        high_score = score_val\n        nof = numberK_list[n-1]\nprint("Best number of neighbors: %d" %nof)\nprint("Mean accuracy in train with %d neighbors: %f" % (nof, score_list_train[nof-1]))\nprint("Mean accuracy in validation with %d neighbors: %f" % (nof, high_score))'

Best number of neighbors: 6

In [14]:
"""plt.plot(numberK_list, score_list_train, label='Train')
plt.plot(numberK_list, score_list_val, label = 'Validation')
plt.vlines(x=nof,ymax=high_score,ymin=min(score_list_val),ls='--',colors='g')
plt.xticks(numberK_list)
plt.xlabel('k')
plt.ylabel('score')
plt.legend()

plt.show()"""

"plt.plot(numberK_list, score_list_train, label='Train')\nplt.plot(numberK_list, score_list_val, label = 'Validation')\nplt.vlines(x=nof,ymax=high_score,ymin=min(score_list_val),ls='--',colors='g')\nplt.xticks(numberK_list)\nplt.xlabel('k')\nplt.ylabel('score')\nplt.legend()\n\nplt.show()"

With this last 2 cells we conclude that the best number of neighbors is 6

In [15]:
"""opt_method = ["auto", "ball_tree", "kd_tree"]
high_score=0
nof=0           
score_list_train =[]
score_list_val =[]
for method in opt_method:
    
    model = KNeighborsClassifier(n_neighbors = 6, algorithm = method ).fit(X_train, y_train)
    score_train = model.score(X_train, y_train)
    score_val = model.score(X_val, y_val)
    score_list_train.append(score_train)
    score_list_val.append(score_val)
for n in range(len(score_list_val)):   
    if score_list_val[n-1]> high_score:
        high_score = score_list_val[n-1]
        nof = opt_method[n-1]
        
print(f"Best method: {nof}")
print(f"Mean accuracy in train with algorithm =  {nof}: {score_list_train[n-1]}")
print(f"Mean accuracy in validation with algorithm = {nof}: {high_score}")"""

'opt_method = ["auto", "ball_tree", "kd_tree"]\nhigh_score=0\nnof=0           \nscore_list_train =[]\nscore_list_val =[]\nfor method in opt_method:\n    \n    model = KNeighborsClassifier(n_neighbors = 6, algorithm = method ).fit(X_train, y_train)\n    score_train = model.score(X_train, y_train)\n    score_val = model.score(X_val, y_val)\n    score_list_train.append(score_train)\n    score_list_val.append(score_val)\nfor n in range(len(score_list_val)):   \n    if score_list_val[n-1]> high_score:\n        high_score = score_list_val[n-1]\n        nof = opt_method[n-1]\n        \nprint(f"Best method: {nof}")\nprint(f"Mean accuracy in train with algorithm =  {nof}: {score_list_train[n-1]}")\nprint(f"Mean accuracy in validation with algorithm = {nof}: {high_score}")'

With this last cell we conclude that the best method is kd_tree

In [16]:
"""opt_dist = ["euclidean", "manhattan", "minkowski"]
high_score=0
nof=0           
score_list_train =[]
score_list_val =[]
for dist in opt_dist:
    
    model = KNeighborsClassifier(n_neighbors = 6, algorithm = "kd_tree", metric = dist).fit(X_train, y_train)
    score_train = model.score(X_train, y_train)
    score_val = model.score(X_val, y_val)
    score_list_train.append(score_train)
    score_list_val.append(score_val)
for n in range(len(score_list_val)):   
    if score_list_val[n-1]> high_score:
        high_score = score_list_val[n-1]
        nof = opt_dist[n-1]
        
print(f"Best method: {nof}")
print(f"Mean accuracy in train with algorithm =  {nof}: {score_list_train[n-1]}")
print(f"Mean accuracy in validation with algorithm = {nof}: {high_score}")"""

'opt_dist = ["euclidean", "manhattan", "minkowski"]\nhigh_score=0\nnof=0           \nscore_list_train =[]\nscore_list_val =[]\nfor dist in opt_dist:\n    \n    model = KNeighborsClassifier(n_neighbors = 6, algorithm = "kd_tree", metric = dist).fit(X_train, y_train)\n    score_train = model.score(X_train, y_train)\n    score_val = model.score(X_val, y_val)\n    score_list_train.append(score_train)\n    score_list_val.append(score_val)\nfor n in range(len(score_list_val)):   \n    if score_list_val[n-1]> high_score:\n        high_score = score_list_val[n-1]\n        nof = opt_dist[n-1]\n        \nprint(f"Best method: {nof}")\nprint(f"Mean accuracy in train with algorithm =  {nof}: {score_list_train[n-1]}")\nprint(f"Mean accuracy in validation with algorithm = {nof}: {high_score}")'

With this last cell we conclude that the mest metric is minkowski

In [17]:
knc = KNeighborsClassifier(n_neighbors = 6, algorithm = "kd_tree", metric = "minkowski").fit(X_train, y_train)

In [18]:
f1_score(y_val, knc.predict(X_val), average='weighted')

0.7911837251929126

In [19]:
metrics(y_train = y_train, pred_train = knc.predict(X_train) , y_test = y_val, pred_test = knc.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.85      0.94      0.90       323
         1.0       0.87      0.70      0.78       176

    accuracy                           0.86       499
   macro avg       0.86      0.82      0.84       499
weighted avg       0.86      0.86      0.86       499

[[305  18]
 [ 52 124]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.80      0.91      0.85       138
         1.0       0.79      0.59

# Naive Bayes

## Gaussian Naive Bayes

In [20]:
#We don't have a function to study the best parameter as this functions doesn't take significant parameters

In [21]:
gaussian_nb = GaussianNB().fit(X_train, y_train)

In [22]:
f1_score(y_val, gaussian_nb.predict(X_val), average='weighted')

0.7872445336725317

In [23]:
metrics(y_train = y_train, pred_train = gaussian_nb.predict(X_train) , y_test = y_val, pred_test = gaussian_nb.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.83      0.86      0.85       323
         1.0       0.73      0.68      0.70       176

    accuracy                           0.80       499
   macro avg       0.78      0.77      0.78       499
weighted avg       0.80      0.80      0.80       499

[[278  45]
 [ 56 120]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.82      0.86      0.84       138
         1.0       0.72      0.66

# Decision Tree

In [24]:
dt = DecisionTreeClassifier().fit(X_train, y_train)

In [25]:
"""parameter_space = {
    'max_depth': [3,4,5,6,7,8,9],
    'criterion': ['entropy', 'log_loss','gini'],
    'splitter': ['random', 'best'],
    'max_features': [2,4,6,8,10,None],
    'max_leaf_nodes': [3,6,9,12, None]
}

clf = GridSearchCV(dt, parameter_space, scoring = 'neg_mean_squared_error')
clf.fit(X_train,y_train)
"""

"parameter_space = {\n    'max_depth': [3,4,5,6,7,8,9],\n    'criterion': ['entropy', 'log_loss','gini'],\n    'splitter': ['random', 'best'],\n    'max_features': [2,4,6,8,10,None],\n    'max_leaf_nodes': [3,6,9,12, None]\n}\n\nclf = GridSearchCV(dt, parameter_space, scoring = 'neg_mean_squared_error')\nclf.fit(X_train,y_train)\n"

In [26]:
"""# Best parameter set
print('------------------------------------------------------------------------------------------------------------------------')
print('Best parameters found:\n', clf.best_params_)
print('------------------------------------------------------------------------------------------------------------------------')

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std , params))
    
# Best results
print("BEST RESULTS: %0.5f (+/-%0.05f) for %r" % (clf.best_score_, \
                                    clf.cv_results_['std_test_score'][clf.best_index_], clf.best_params_))"""

'# Best parameter set\nprint(\'------------------------------------------------------------------------------------------------------------------------\')\nprint(\'Best parameters found:\n\', clf.best_params_)\nprint(\'------------------------------------------------------------------------------------------------------------------------\')\n\n# All results\nmeans = clf.cv_results_[\'mean_test_score\']\nstds = clf.cv_results_[\'std_test_score\']\nfor mean, std, params in zip(means, stds, clf.cv_results_[\'params\']):\n    print("%0.3f (+/-%0.03f) for %r" % (mean, std , params))\n    \n# Best results\nprint("BEST RESULTS: %0.5f (+/-%0.05f) for %r" % (clf.best_score_,                                     clf.cv_results_[\'std_test_score\'][clf.best_index_], clf.best_params_))'

This tells us that the best results using the Decision Tree Classifier are obtained when the parameters are:
`criterion = 'gini'`, `max_depth = 4`, `max_features: 8`, `max_leaf_nodes: 10`, `splitter: 'random'`

In [27]:
dt_best = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, max_features = 8, max_leaf_nodes = 10, splitter = 'random', random_state = 8).fit(X_train, y_train)

In [28]:
f1_score(y_val, dt_best.predict(X_val), average='weighted')

0.7967003321759747

In [29]:
metrics(y_train = y_train, pred_train = dt_best.predict(X_train) , y_test = y_val, pred_test = dt_best.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.87      0.90      0.88       323
         1.0       0.80      0.75      0.77       176

    accuracy                           0.85       499
   macro avg       0.83      0.82      0.83       499
weighted avg       0.84      0.85      0.84       499

[[290  33]
 [ 44 132]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.83      0.87      0.85       138
         1.0       0.74      0.67

# Bagging Classifier

In [30]:
def find_best_bagging_classifier_params(X, y, base_estimator = None):
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=15)

    # Create a BaggingClassifier
    bagging_clf = BaggingClassifier(estimator = base_estimator)

    # Define the parameter grid to search
    param_grid = {'n_estimators': [10, 50, 100, 150],
                  'max_samples': [0.3, 0.5, 0.7, 1.0],
                  'max_features': [0.5, 0.7, 1.0],
                  "bootstrap" : [True, False],
                  'bootstrap_features' : [True, False]}

    # Use F1 score as the scoring metric for GridSearchCV
    f1_scorer = make_scorer(f1_score)

    # Create the RepeatedStratifiedKFold object
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

    # Create the GridSearchCV object with RepeatedStratifiedKFold
    grid_search = GridSearchCV(bagging_clf, param_grid, scoring=f1_scorer, cv=cv)

    # Fit the model to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_

    # Print the best parameters
    print("Best Parameters:", best_params)

## Bagging Classifier with Decision Tree

In [31]:
#Coded because it takes a long time running
#find_best_bagging_classifier_params(X, y)

Best Parameters: {'bootstrap': False, 'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 0.1, 'n_estimators': 100}

In [32]:
bcdt = BaggingClassifier(bootstrap = False, bootstrap_features = False, max_features = 0.5, max_samples = 0.1, n_estimators = 100, random_state = 8)
bcdt = bcdt.fit(X_train, y_train)

In [33]:
f1_score(y_val, bcdt.predict(X_val), average='weighted')

0.8007532730314232

In [34]:
metrics(y_train = y_train, pred_train = bcdt.predict(X_train) , y_test = y_val, pred_test = bcdt.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.85      0.97      0.91       323
         1.0       0.92      0.69      0.79       176

    accuracy                           0.87       499
   macro avg       0.88      0.83      0.85       499
weighted avg       0.88      0.87      0.86       499

[[312  11]
 [ 54 122]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.79      0.99      0.87       138
         1.0       0.95      0.51

## Bagging Classifier with KNN

In [35]:
#Coded because it takes a long time running
#find_best_bagging_classifier_params(X, y, base_estimator = KNeighborsClassifier())

Best Parameters: {'bootstrap': True, 'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 0.2, 'n_estimators': 100}

In [36]:
bcknc = BaggingClassifier(bootstrap = True, bootstrap_features = False, max_features = 0.5, max_samples = 0.2, n_estimators = 100, random_state = 8)
bcknc = bcknc.fit(X_train, y_train)

In [37]:
f1_score(y_val, bcknc.predict(X_val), average = "weighted")

0.811752928358734

In [38]:
metrics(y_train = y_train, pred_train = bcknc.predict(X_train) , y_test = y_val, pred_test = bcknc.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92       323
         1.0       0.94      0.72      0.82       176

    accuracy                           0.89       499
   macro avg       0.90      0.85      0.87       499
weighted avg       0.89      0.89      0.88       499

[[315   8]
 [ 49 127]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.80      0.96      0.87       138
         1.0       0.88      0.58

# RandomForestClassifier

In [39]:
def find_best_RF_Classifier(X, y):
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=15)

    # Create a BaggingClassifier
    rf_classifier = RandomForestClassifier()

    # Define the parameter grid to search
    param_grid_rf = {
                    'n_estimators': [10, 50, 100],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': [None, 'sqrt', 'log2']}
    
    # Use F1 score as the scoring metric for GridSearchCV
    f1_scorer = make_scorer(f1_score)

    # Create the RepeatedStratifiedKFold object
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=15)

    # Create the GridSearchCV object with RepeatedStratifiedKFold
    grid_search = GridSearchCV(rf_classifier, param_grid_rf, scoring=f1_scorer, cv=cv)

    # Fit the model to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_

    # Print the best parameters
    print("Best Parameters:", best_params)

In [40]:
#Coded because this cell takes a long time running
#find_best_RF_Classifier(X, y)

Best Parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 100}

In [41]:
rfc = RandomForestClassifier(max_depth = None, max_features = None, min_samples_leaf = 1, min_samples_split = 6, n_estimators = 100, random_state = 8)
rfc = rfc.fit(X_train, y_train)

In [42]:
f1_score(y_val, rfc.predict(X_val), average = "weighted")

0.8115693633731336

In [43]:
metrics(y_train = y_train, pred_train = rfc.predict(X_train) , y_test = y_val, pred_test = rfc.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96       323
         1.0       0.96      0.89      0.92       176

    accuracy                           0.95       499
   macro avg       0.95      0.93      0.94       499
weighted avg       0.95      0.95      0.95       499

[[317   6]
 [ 20 156]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.82      0.92      0.87       138
         1.0       0.81      0.63

# Stacking Classifier

In [44]:
def return_f1score(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=15)
    n_scores = cross_val_score(model, X, y, scoring = 'f1', cv = cv)
    return n_scores

In [45]:
def return_results(models):
    results, names = [],[]
    for name, model in tqdm(models.items()):
        scores = return_f1score(model, X, y)
        results.append(scores)
        names.append(name)
        print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))
    
    plt.figure(figsize=(15,7))
    plt.boxplot(results, labels = names, showmeans = True)
    plt.xticks(fontsize=14)
    plt.show()

In [46]:
def return_results_bar(models):
    results, names, mean, std = [],[],[],[]
    for name, model in tqdm(models.items()):
        scores = return_f1score(model, X, y)
        results.append(scores)
        names.append(name)
        print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))
        mean.append(scores.mean())
        std.append(scores.std())
    
    #creates a dataset
    data = pd.DataFrame({'Model' : names, 'F1_Mean': mean, 'F1_std': std})
    
    plt.figure(figsize=(15,7))
    sns.barplot(x = 'Model', y = 'F1_Mean', data = data, 
               color = 'skyblue')
    plt.xticks(fontsize=16)
    plt.show()

In [47]:
estimators =[('lr',  LogisticRegression(random_state = 8)),
              ('kn', KNeighborsClassifier())]
dt = DecisionTreeClassifier(max_depth=3,random_state = 8).fit(X_train,y_train)
bg = BaggingClassifier(random_state = 8).fit(X_train,y_train)
rf = RandomForestClassifier(random_state = 8).fit(X_train,y_train)
lr = LogisticRegression(random_state = 8).fit(X_train,y_train)
nb = GaussianNB().fit(X_train,y_train)
kn = KNeighborsClassifier().fit(X_train,y_train)
estimators_2 = [('rf', RandomForestClassifier(random_state = 8)),
              ('dt', DecisionTreeClassifier()),
               ('bg', BaggingClassifier())]
estimators_3 = [('rf', RandomForestClassifier(random_state = 8)),
              ('nb', GaussianNB())]
estimators_4 = [('rf', RandomForestClassifier(random_state = 8)),
              ('dt', DecisionTreeClassifier(max_depth = 1))]

In [48]:
stc = StackingClassifier(estimators=estimators,final_estimator=LogisticRegression()).fit(X_train,y_train)
stc_2 = StackingClassifier(estimators=estimators_2,final_estimator=LogisticRegression()).fit(X_train,y_train)
stc_3 = StackingClassifier(estimators=estimators_3,final_estimator=LogisticRegression()).fit(X_train,y_train)
stc_4 = StackingClassifier(estimators=estimators_4,final_estimator=LogisticRegression()).fit(X_train,y_train)

In [49]:
#return_results_bar(models={'lr':lr,'nb':nb,'kn':kn,'dt':dt,'bg':bg,'rf':rf, 'stc':stc, 'stc_2': stc_2, "stc_3": stc_3, "stc_4": stc_4})

In [50]:
def optimize_stacking_classifier(X, y, n_splits=5, n_repeats=2):
    """
    Optimize StackingClassifier parameters to maximize F1 score using RepeatedStratifiedKFold.

    Parameters:
    - X: Features
    - y: Target variable
    - n_splits: Number of splits for RepeatedStratifiedKFold
    - n_repeats: Number of repeats for RepeatedStratifiedKFold

    Returns:
    - best_params: Best hyperparameters that maximize F1 score
    - best_f1_score: Maximum F1 score achieved
    """

    # Create RepeatedStratifiedKFold
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

    # Define the hyperparameter grid to search
    param_grid = {
        'stack_method': ['auto', 'predict_proba'],  # Choose the stack method based on your use case
        'final_estimator': [LogisticRegression(), DecisionTreeClassifier(max_depth=1)],
        'cv': [2, 5],  # Number of cross-validation folds for the stacking meta-estimator
        # Add other hyperparameters as needed
    }

    base_estimators = [('rf', RandomForestClassifier(random_state = 8)),
              ('dt', DecisionTreeClassifier(max_depth = 1))]

    best_f1_score = 0
    best_params = None

    # Iterate through hyperparameter combinations
    for params in ParameterGrid(param_grid):
        f1_scores = []

        # Perform cross-validation
        for train_index, val_index in rskf.split(X, y):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            # Initialize StackingClassifier with current parameters
            stacking_classifier = StackingClassifier(estimators=base_estimators, **params)

            # Fit the model
            stacking_classifier.fit(X_train, y_train)

            # Predict on validation data
            y_pred = stacking_classifier.predict(X_val)

            # Calculate F1 score and append to list
            f1_scores.append(f1_score(y_val, y_pred))

        # Calculate mean F1 score for the current parameter combination
        mean_f1_score = np.mean(f1_scores)

        # Update best parameters if the current combination has a higher F1 score
        if mean_f1_score > best_f1_score:
            best_f1_score = mean_f1_score
            best_params = params

    # Print the best parameters
    print("Best Parameters:", best_params)

In [51]:
#Coded because this cell takes a long time running
#optimize_stacking_classifier(X, y, n_splits=5, n_repeats=2)

Best Parameters: {"estimators" : estimators, 'cv': 3, 'final_estimator': LogisticRegression(), 'stack_method': 'auto'}

In [52]:
stc_final = StackingClassifier(estimators=[('rf', RandomForestClassifier(random_state = 8)), ('dt', DecisionTreeClassifier(max_depth = 1))], cv = 3, final_estimator = LogisticRegression(), stack_method = 'auto').fit(X_train,y_train)

In [53]:
f1_score(y_val, stc_final.predict(X_val), average = "weighted")

0.7975705453094053

In [54]:
metrics(y_train = y_train, pred_train = stc_final.predict(X_train) , y_test = y_val, pred_test = stc_final.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99       323
         1.0       1.00      0.95      0.97       176

    accuracy                           0.98       499
   macro avg       0.99      0.97      0.98       499
weighted avg       0.98      0.98      0.98       499

[[323   0]
 [  9 167]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.81      0.91      0.86       138
         1.0       0.78      0.62

# AdaBoost Classifier

This will be the choice of algorithm

In [55]:
def optimize_adaboost_classifier(X, y, n_splits=5, n_repeats=2):
    """
    Optimize AdaBoostClassifier parameters to maximize F1 score using RepeatedStratifiedKFold.

    Parameters:
    - X: Features
    - y: Target variable
    - n_splits: Number of splits for RepeatedStratifiedKFold
    - n_repeats: Number of repeats for RepeatedStratifiedKFold

    Returns:
    - best_params: Best hyperparameters that maximize F1 score
    - best_f1_score: Maximum F1 score achieved
    """

    # Create RepeatedStratifiedKFold
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=15)

    # Define the hyperparameter grid to search
    param_grid = {
        'n_estimators': [10, 50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2, 0.5, 1.0],
        'base_estimator': [None, DecisionTreeClassifier(max_depth=1), LogisticRegression()],
        'algorithm': ['SAMME', 'SAMME.R']
    }

    best_f1_score = 0
    best_params = None

    # Iterate through hyperparameter combinations
    for params in ParameterGrid(param_grid):
        f1_scores = []

        # Perform cross-validation
        for train_index, val_index in rskf.split(X, y):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            # Initialize AdaBoostClassifier with current parameters
            adaboost_classifier = AdaBoostClassifier(**params)

            # Fit the model
            adaboost_classifier.fit(X_train, y_train)

            # Predict on validation data
            y_pred = adaboost_classifier.predict(X_val)

            # Calculate F1 score and append to list
            f1_scores.append(f1_score(y_val, y_pred))

        # Calculate mean F1 score for the current parameter combination
        mean_f1_score = np.mean(f1_scores)

        # Update best parameters if the current combination has a higher F1 score
        if mean_f1_score > best_f1_score:
            best_f1_score = mean_f1_score
            best_params = params

    # Print the best parameters
    print("Best Parameters:", best_params)

In [56]:
#Coded because this cell takes a long time running
#optimize_adaboost_classifier(X, y, n_splits=5, n_repeats=2)

In [57]:
abc_1 = AdaBoostClassifier(algorithm = 'SAMME', base_estimator = None, learning_rate = 0.5, n_estimators = 150).fit(X_train,y_train)

In [58]:
metrics(y_train = y_train, pred_train = abc_1.predict(X_train) , y_test = y_val, pred_test = abc_1.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.87      0.89      0.88       323
         1.0       0.79      0.75      0.77       176

    accuracy                           0.84       499
   macro avg       0.83      0.82      0.82       499
weighted avg       0.84      0.84      0.84       499

[[288  35]
 [ 44 132]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.81      0.84      0.83       138
         1.0       0.69      0.64

In [59]:
abc_2 = AdaBoostClassifier(algorithm = 'SAMME', base_estimator = LogisticRegression(), learning_rate = 0.5, n_estimators = 150).fit(X_train,y_train)

In [60]:
metrics(y_train = y_train, pred_train = abc_2.predict(X_train) , y_test = y_val, pred_test = abc_2.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.85      0.87      0.86       323
         1.0       0.75      0.73      0.74       176

    accuracy                           0.82       499
   macro avg       0.80      0.80      0.80       499
weighted avg       0.82      0.82      0.82       499

[[281  42]
 [ 48 128]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.81      0.86      0.83       138
         1.0       0.71      0.63

In [61]:
abc_3 = AdaBoostClassifier(algorithm = 'SAMME', base_estimator = LogisticRegression(), learning_rate = 0.3, n_estimators = 100).fit(X_train,y_train)

In [62]:
metrics(y_train = y_train, pred_train = abc_3.predict(X_train) , y_test = y_val, pred_test = abc_3.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.86      0.83      0.84       323
         1.0       0.70      0.74      0.72       176

    accuracy                           0.80       499
   macro avg       0.78      0.79      0.78       499
weighted avg       0.80      0.80      0.80       499

[[267  56]
 [ 45 131]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.84      0.81      0.82       138
         1.0       0.68      0.71

Best Parameters: {'algorithm': 'SAMME', 'base_estimator': LogisticRegression(), 'learning_rate': 0.1, 'n_estimators': 100}

In [63]:
abc_best = AdaBoostClassifier(algorithm = 'SAMME', base_estimator = LogisticRegression(), learning_rate = 0.1, n_estimators = 100).fit(X_train,y_train)

In [64]:
f1_score(y_val, abc_best.predict(X_val), average = "weighted")

0.7949772445986377

In [65]:
metrics(y_train = y_train, pred_train = abc_best.predict(X_train) , y_test = y_val, pred_test = abc_best.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.86      0.85      0.85       323
         1.0       0.73      0.75      0.74       176

    accuracy                           0.81       499
   macro avg       0.80      0.80      0.80       499
weighted avg       0.81      0.81      0.81       499

[[274  49]
 [ 44 132]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.85      0.83      0.84       138
         1.0       0.71      0.72

# GradientBoosting Classifier

In [66]:
def optimize_gradient_boosting_classifier(X, y, n_splits=5, n_repeats=2):
    """
    Optimize GradientBoostingClassifier parameters to maximize F1 score using RepeatedStratifiedKFold.

    Parameters:
    - X: Features
    - y: Target variable
    - n_splits: Number of splits for RepeatedStratifiedKFold
    - n_repeats: Number of repeats for RepeatedStratifiedKFold

    Returns:
    - best_params: Best hyperparameters that maximize F1 score
    - best_f1_score: Maximum F1 score achieved
    """

    # Create RepeatedStratifiedKFold
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

    # Define the hyperparameter grid to search
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'max_features': ['sqrt', 'log2', None],
    }

    best_f1_score = 0
    best_params = None

    # Iterate through hyperparameter combinations
    for params in ParameterGrid(param_grid):
        f1_scores = []

        # Perform cross-validation
        for train_index, val_index in rskf.split(X, y):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            # Initialize GradientBoostingClassifier with current parameters
            gradient_boosting_classifier = GradientBoostingClassifier(**params)

            # Fit the model
            gradient_boosting_classifier.fit(X_train, y_train)

            # Predict on validation data
            y_pred = gradient_boosting_classifier.predict(X_val)

            # Calculate F1 score and append to list
            f1_scores.append(f1_score(y_val, y_pred))

        # Calculate mean F1 score for the current parameter combination
        mean_f1_score = np.mean(f1_scores)

        # Update best parameters if the current combination has a higher F1 score
        if mean_f1_score > best_f1_score:
            best_f1_score = mean_f1_score
            best_params = params

    # Print the best parameters
    print("Best Parameters:", best_params)

In [67]:
#Coded because this cell takes a long time running
#optimize_gradient_boosting_classifier(X, y, n_splits=5, n_repeats=2)

Best Parameters: {'learning_rate': 0.2, 'max_depth': 6, 'max_features': None, 'n_estimators': 100, 'subsample': 0.7}

In [68]:
gbc = GradientBoostingClassifier(learning_rate = 0.2, max_depth = 6, max_features = None, n_estimators = 100, subsample = 0.7, random_state = 8)
gbc = gbc.fit(X_train,y_train)

In [69]:
f1_score(y_val, gbc.predict(X_val), average = "weighted")

0.8054072833216956

In [70]:
metrics(y_train = y_train, pred_train = gbc.predict(X_train) , y_test = y_val, pred_test = gbc.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       323
         1.0       1.00      1.00      1.00       176

    accuracy                           1.00       499
   macro avg       1.00      1.00      1.00       499
weighted avg       1.00      1.00      1.00       499

[[323   0]
 [  0 176]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.83      0.88      0.86       138
         1.0       0.76      0.67

# MLP Classifier

In [71]:
def optimize_mlp_classifier(X, y, n_splits=5, n_repeats=2):
    """
    Optimize MLPClassifier parameters to maximize F1 score using RepeatedStratifiedKFold.

    Parameters:
    - X: Features
    - y: Target variable
    - n_splits: Number of splits for RepeatedStratifiedKFold
    - n_repeats: Number of repeats for RepeatedStratifiedKFold

    Returns:
    - best_params: Best hyperparameters that maximize F1 score
    - best_f1_score: Maximum F1 score achieved
    """

    # Create RepeatedStratifiedKFold
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

    # Define the hyperparameter grid to search
    param_grid = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'logistic', 'tanh'],
        'solver': ['adam', 'lbfgs'],
        'alpha': [0.0001, 0.001, 0.01],
    }

    best_f1_score = 0
    best_params = None

    # Iterate through hyperparameter combinations
    for params in ParameterGrid(param_grid):
        f1_scores = []

        # Perform cross-validation
        for train_index, val_index in rskf.split(X, y):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            # Initialize MLPClassifier with current parameters
            mlp_classifier = MLPClassifier(**params)

            # Fit the model
            mlp_classifier.fit(X_train, y_train)

            # Predict on validation data
            y_pred = mlp_classifier.predict(X_val)

            # Calculate F1 score and append to list
            f1_scores.append(f1_score(y_val, y_pred))

        # Calculate mean F1 score for the current parameter combination
        mean_f1_score = np.mean(f1_scores)

        # Update best parameters if the current combination has a higher F1 score
        if mean_f1_score > best_f1_score:
            best_f1_score = mean_f1_score
            best_params = params

    # Print the best parameters
    print("Best Parameters:", best_params)

In [72]:
#Coded because this cell takes a long time running
#optimize_mlp_classifier(X, y, n_splits=5, n_repeats=2)

Best Parameters: {activation: 'relu', alpha: 0.3, hidden_layer_sizes: (50, 75), solver: 'adam'}

In [73]:
mlp = MLPClassifier(activation = 'relu', alpha = 0.3, hidden_layer_sizes = (50, 75), solver = 'adam', random_state = 8).fit(X_train, y_train)

In [74]:
f1_score(y_val, mlp.predict(X_val), average = "weighted")

0.7998282028388328

In [75]:
metrics(y_train = y_train, pred_train = mlp.predict(X_train) , y_test = y_val, pred_test = mlp.predict(X_val))

_____________________________________
                                                     TRAIN                                                 
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.87      0.93      0.90       323
         1.0       0.85      0.74      0.79       176

    accuracy                           0.86       499
   macro avg       0.86      0.83      0.84       499
weighted avg       0.86      0.86      0.86       499

[[300  23]
 [ 46 130]]
_____________________________________
                                                     TEST                                                  
-----------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

         0.0       0.81      0.93      0.86       138
         1.0       0.82      0.59