# Classification Models

In [1]:
#Import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
from pprint import pprint
import pickle 
import mglearn
import time
import os
root_dir = os.path.abspath(os.curdir)
feat_dir = os.path.dirname(root_dir)+"/Features/"
import nltk
from nltk import Text
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from imblearn.combine import SMOTETomek 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import SelectFromModel
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
SEED=13

In [2]:
#Parameters
LRparams = dict()
#1e-5, 1e-4, 1e-3, 1e-2,
LRparams['C'] = [1, 10, 100]
LRparams['dual'] = [True,False]
LRparams['fit_intercept'] = [True,False]
LRparams['penalty'] = ['none', 'l1', 'l2']
LRparams['solver'] = ['newton-cg', 'lbfgs', 'liblinear','sag', 'saga']
LRparams['max_iter'] = [10000]

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

RFparams = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

DTparams = {
     'criterion' : ['gini', 'entropy'],
     'max_depth' : range(2, 32, 1),
     'min_samples_leaf' : range(1, 10, 1),
     'min_samples_split' : range(2, 10, 1),
     'splitter' : ['best', 'random']}

MNparams = dict()
MNparams['alpha'] = np.linspace(0.5, 1.5, 6)
MNparams['fit_prior'] = [True, False]

XGBparams = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2),
    'colsample_bytree': [1.0,0.9,0.8],
#     'max_depth': range(2, 10, 1),
    'n_estimators': [300],
#     'learning_rate': [0.1, 0.01, 0.05],
    'learning_rate': [0.3],
    'tree_method' : ['gpu_hist'],
    'subsample': [1]}

classifierInfo = [("Logistic Regression", LogisticRegression(random_state=SEED),LRparams),\
                  ("Random Forest", RandomForestClassifier(random_state=SEED),RFparams),\
                  ("Decision Tree",DecisionTreeClassifier(random_state=SEED),DTparams),\
                 ("MultinomialNB",MultinomialNB(),MNparams),\
                ("XGBoost",XGBClassifier(random_state=SEED,objective='multi:softmax',nthread=4,\
                use_label_encoder=False,gpu_id=-1,eval_metric='merror'),XGBparams)]

In [3]:
def main():
    #Read targets and get class distributions 
    with open(feat_dir+'target.pkl','rb') as f: y = pickle.load(f)
    y = y.astype(int)
    y = y.flatten()
    nontox, slightly, highly = np.bincount(y)
    total = nontox + slightly + highly
    print('Total: {}\n    Non toxic: {} ({:.2f}% of total)\n'.format(
        total, nontox, 100 * nontox / total))
    print('Total: {}\n    Slightly toxic: {} ({:.2f}% of total)\n'.format(
        total, slightly, 100 * slightly / total))
    print('Total: {}\n    Highly toxic: {} ({:.2f}% of total)\n'.format(
        total, highly, 100 * highly / total))
    print('―' * 100)
    
    featureFiles = {'tf.pkl':'Unigram','tfbig.pkl':'Bigram','tfn.pkl':'Ngram','tfidf.pkl':'TFIDF',\
    'hatefulFeats.pkl':'Hateful','word2vec-features.pkl':'Word2Vec','doc2vec-features.pkl':'Doc2Vec',\
                    'allFeatures.pkl':'All'}


    for filename, title in featureFiles.items(): 
        with open(feat_dir+filename,'rb') as f: train = pickle.load(f)
        x = train
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2,stratify=y,random_state=SEED)
        print(title+" features\n")
        X_data_transformed, Y_data_transformed = performStandardization(X_train,X_test)
        train_fs, test_fs = performFeatureSelection(X_data_transformed,Y_data_transformed,y_train)
#         X_resampled,y_resampled = performClassImbalanceHandling(train_fs, y_train)
        for classifier in classifierInfo:
            clfname = classifier[0]
            clf_model = classifier[1]
            clf_params = classifier[2]
            gridResult = runClassification(clfname,clf_model,clf_params,train_fs, y_train)
            performPrediction(gridResult,test_fs,y_test)
        print('―' * 100)
    print("\nDone!")
            
def performStandardization(train,test):
    #Perform standardization for the training and testing sets (step 1)
    scaler = preprocessing.MinMaxScaler().fit(train) #MaxAbsScaler
    X_data_transformed = scaler.transform(train)

    # #Do the same for testing
    Y_data_transformed = scaler.transform(test)
    return X_data_transformed, Y_data_transformed

def performFeatureSelection(train,test,y):
    # Feature Selection Method 1: Random Forest Based feature selection (step 2)
    print("Shape before feature selection:",train.shape)
    clf = RandomForestClassifier(random_state=SEED)
    clf = clf.fit(train,y)
    #print(clf.feature_importances_ ) 
    model = SelectFromModel(clf, prefit=True)
    train_fs = model.transform(train)

    test_fs = model.transform(test)
    print("Shape after feature selection:",train_fs.shape) 
    return train_fs, test_fs

def performClassImbalanceHandling(train,y):
    #Apply sampling on training set only (step 3)
    imbalanceHandeler = SMOTETomek(random_state=SEED,n_jobs=-1) #,sampling_strategy=strategy
    X_resampled, y_resampled = imbalanceHandeler.fit_resample(train, y)
    #Counts
    print("Number of training instances before sampling:",train.shape[0])
    print("Distribution in each class before sampling:")
    print(pd.Series(y).value_counts())
    print()
    print("Number of training instances after sampling:",X_resampled.shape[0])
    print("Distribution in each class after sampling:")
    print(pd.Series(y_resampled).value_counts())
    return X_resampled, y_resampled

def runClassification(clfname,clf_model,clf_params,X_resampled, y_resampled):
    scoring = ['roc_auc_ovr','f1_macro']
    rnd_clf = clf_model
    print('─' * 100) 
    print(clfname)
    print('_' * 100)
#     print('Parameters currently in use:\n')
#     pprint(rnd_clf.get_params())
    
    #Perform random grid search
    rf_random = RandomizedSearchCV(estimator=rnd_clf, param_distributions=clf_params,cv = 3, verbose=2,random_state=SEED,scoring=scoring, 
                          refit='roc_auc_ovr', n_jobs = -1)
    # Fit the random search model
    rf_random.fit(X_resampled[:1000], y_resampled[:1000])
    #Get best parameters from random search
    best = rf_random.best_params_
    # param_grid = {}
    keys = []
    vals = []
    for k, v in best.items():
        _k = 'clf_cv__'+k
        _v = [v]
        keys.append(_k)
        vals.append(_v)

    param_grid = {key: value for key, value in zip(keys, vals)}
    
    #Perform classification
    class_clf = clf_model #, max_iter=4000
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=SEED)

#     steps = [('clf_cv',class_clf)]
#     pipe = Pipeline(steps=steps)
    pipeline = imbpipeline(steps = [['smotetomek', SMOTETomek(random_state=SEED,n_jobs=-1)],
                                ['clf_cv', class_clf]])
    grid = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, 
                              refit='roc_auc_ovr', return_train_score=True,verbose=2, n_jobs=-1)#roc_auc_ovr

    gridResult = grid.fit(X_resampled , y_resampled)
    print("Best parameters:\n")
    pprint(gridResult.best_params_)
    #Evaluate
    print("Training ROC AUC score: {a} ({b})"\
          .format(a=gridResult.best_score_,b=gridResult.cv_results_['std_test_roc_auc_ovr'].max()))
    return gridResult

def performPrediction(gridResult,test_fs,y_test):
    predicted_y_test = gridResult.predict(test_fs)
    proba_y_test = gridResult.predict_proba(test_fs)
    print("Confusion Matrix: \n{}".format(confusion_matrix(y_test, predicted_y_test)))
    print("\nClassification report: \n{}".format(classification_report(y_test, predicted_y_test)))

    print("Accuracy: ",accuracy_score(y_test, predicted_y_test))
    f1 = f1_score(y_test, predicted_y_test, average='macro')
    pre = precision_score(y_test, predicted_y_test, average='macro')
    rec = recall_score(y_test, predicted_y_test, average='macro')
    acc = accuracy_score(y_test, predicted_y_test)
    print('Precision:',pre)
    print('Recall:',rec)
    print('F1:',f1)
    print("ROC AUC Score:{}".format(roc_auc_score(y_test,proba_y_test,multi_class='ovr')))

In [None]:
main()

Total: 10065
    Non toxic: 8210 (81.57% of total)

Total: 10065
    Slightly toxic: 1189 (11.81% of total)

Total: 10065
    Highly toxic: 666 (6.62% of total)

――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
Unigram features

Shape before feature selection: (8052, 3000)
Shape after feature selection: (8052, 521)
────────────────────────────────────────────────────────────────────────────────────────────────────
Logistic Regression
____________________________________________________________________________________________________
Fitting 3 folds for each of 10 candidates, totalling 30 fits


18 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Fitting 15 folds for each of 1 candidates, totalling 15 fits
Best parameters:

{'clf_cv__C': 10,
 'clf_cv__dual': False,
 'clf_cv__fit_intercept': True,
 'clf_cv__max_iter': 10000,
 'clf_cv__penalty': 'l2',
 'clf_cv__solver': 'lbfgs'}
Training ROC AUC score: 0.8549902207563967 (0.011711018628895406)
Confusion Matrix: 
[[1500  119   23]
 [  61  132   45]
 [   8   44   81]]

Classification report: 
              precision    recall  f1-score   support

           0       0.96      0.91      0.93      1642
           1       0.45      0.55      0.50       238
           2       0.54      0.61      0.57       133

    accuracy                           0.85      2013
   macro avg       0.65      0.69      0.67      2013
weighted avg       0.87      0.85      0.86      2013

Accuracy:  0.8509687034277198
Precision: 0.6490349109143796
Recall: 0.6923881675242057
F1: 0.668022012422593
ROC AUC Score:0.87812379186431
───────────────────────────────────────────────────────────────────────────────



Fitting 15 folds for each of 1 candidates, totalling 15 fits
Best parameters:

{'clf_cv__colsample_bytree': 0.8,
 'clf_cv__learning_rate': 0.3,
 'clf_cv__max_depth': 3,
 'clf_cv__min_child_weight': 1,
 'clf_cv__n_estimators': 300,
 'clf_cv__subsample': 1,
 'clf_cv__tree_method': 'gpu_hist'}
Training ROC AUC score: 0.9087429082727249 (0.007509481347334376)
Confusion Matrix: 
[[1599   38    5]
 [ 104  100   34]
 [  17   41   75]]

Classification report: 
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1642
           1       0.56      0.42      0.48       238
           2       0.66      0.56      0.61       133

    accuracy                           0.88      2013
   macro avg       0.72      0.65      0.68      2013
weighted avg       0.87      0.88      0.87      2013

Accuracy:  0.8812717337307501
Precision: 0.7154017058366327
Recall: 0.652630088512102
F1: 0.6793744228474238
ROC AUC Score:0.9132860279675379
――――――――――――――――――――

18 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Fitting 15 folds for each of 1 candidates, totalling 15 fits
Best parameters:

{'clf_cv__C': 10,
 'clf_cv__dual': False,
 'clf_cv__fit_intercept': True,
 'clf_cv__max_iter': 10000,
 'clf_cv__penalty': 'l2',
 'clf_cv__solver': 'lbfgs'}
Training ROC AUC score: 0.5913163857395156 (0.012069426861702549)
Confusion Matrix: 
[[1025  341  276]
 [ 105   81   52]
 [  47   41   45]]

Classification report: 
              precision    recall  f1-score   support

           0       0.87      0.62      0.73      1642
           1       0.17      0.34      0.23       238
           2       0.12      0.34      0.18       133

    accuracy                           0.57      2013
   macro avg       0.39      0.43      0.38      2013
weighted avg       0.74      0.57      0.63      2013

Accuracy:  0.5717834078489816
Precision: 0.3888158499346038
Recall: 0.43430691078918904
F1: 0.3787240911100311
ROC AUC Score:0.6252891508318585
───────────────────────────────────────────────────────────────────────────



Fitting 15 folds for each of 1 candidates, totalling 15 fits
Best parameters:

{'clf_cv__colsample_bytree': 0.9,
 'clf_cv__learning_rate': 0.3,
 'clf_cv__max_depth': 9,
 'clf_cv__min_child_weight': 1,
 'clf_cv__n_estimators': 300,
 'clf_cv__subsample': 1,
 'clf_cv__tree_method': 'gpu_hist'}
Training ROC AUC score: 0.6257442268706818 (0.017098073376946823)
Confusion Matrix: 
[[1596   42    4]
 [ 200   29    9]
 [  90   29   14]]

Classification report: 
              precision    recall  f1-score   support

           0       0.85      0.97      0.90      1642
           1       0.29      0.12      0.17       238
           2       0.52      0.11      0.17       133

    accuracy                           0.81      2013
   macro avg       0.55      0.40      0.42      2013
weighted avg       0.76      0.81      0.77      2013

Accuracy:  0.8142076502732241
Precision: 0.5515846457981488
Recall: 0.3996990936896587
F1: 0.41711984596599977
ROC AUC Score:0.668377580874885
―――――――――――――――――――

18 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Fitting 15 folds for each of 1 candidates, totalling 15 fits
Best parameters:

{'clf_cv__C': 10,
 'clf_cv__dual': False,
 'clf_cv__fit_intercept': True,
 'clf_cv__max_iter': 10000,
 'clf_cv__penalty': 'l2',
 'clf_cv__solver': 'lbfgs'}
Training ROC AUC score: 0.5001262216043725 (0.017488896814259483)
Confusion Matrix: 
[[659 343 640]
 [ 90  62  86]
 [ 29  35  69]]

Classification report: 
              precision    recall  f1-score   support

           0       0.85      0.40      0.54      1642
           1       0.14      0.26      0.18       238
           2       0.09      0.52      0.15       133

    accuracy                           0.39      2013
   macro avg       0.36      0.39      0.29      2013
weighted avg       0.71      0.39      0.48      2013

Accuracy:  0.39244908097367115
Precision: 0.35824841517958844
Recall: 0.3935470078793746
F1: 0.29207528372750163
ROC AUC Score:0.5579097051938696
──────────────────────────────────────────────────────────────────────────────────



Fitting 15 folds for each of 1 candidates, totalling 15 fits
Best parameters:

{'clf_cv__colsample_bytree': 0.8,
 'clf_cv__learning_rate': 0.3,
 'clf_cv__max_depth': 5,
 'clf_cv__min_child_weight': 1,
 'clf_cv__n_estimators': 300,
 'clf_cv__subsample': 1,
 'clf_cv__tree_method': 'gpu_hist'}
Training ROC AUC score: 0.5359788971605318 (0.01432245570805022)
Confusion Matrix: 
[[1505   82   55]
 [ 195   31   12]
 [ 107   14   12]]

Classification report: 
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1642
           1       0.24      0.13      0.17       238
           2       0.15      0.09      0.11       133

    accuracy                           0.77      2013
   macro avg       0.41      0.38      0.39      2013
weighted avg       0.72      0.77      0.74      2013

Accuracy:  0.7690014903129657
Precision: 0.4096217953912024
Recall: 0.3790142763945761
F1: 0.38526243011841643
ROC AUC Score:0.5931635157805815
―――――――――――――――――――

18 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Fitting 15 folds for each of 1 candidates, totalling 15 fits
Best parameters:

{'clf_cv__C': 10,
 'clf_cv__dual': False,
 'clf_cv__fit_intercept': True,
 'clf_cv__max_iter': 10000,
 'clf_cv__penalty': 'l2',
 'clf_cv__solver': 'lbfgs'}
Training ROC AUC score: 0.8734349037239646 (0.010514900614072684)
Confusion Matrix: 
[[1502  106   34]
 [  60  127   51]
 [   7   51   75]]

Classification report: 
              precision    recall  f1-score   support

           0       0.96      0.91      0.94      1642
           1       0.45      0.53      0.49       238
           2       0.47      0.56      0.51       133

    accuracy                           0.85      2013
   macro avg       0.62      0.67      0.64      2013
weighted avg       0.86      0.85      0.85      2013

Accuracy:  0.8464977645305514
Precision: 0.6244102468005398
Recall: 0.6707537813509915
F1: 0.6446898441105494
ROC AUC Score:0.8924118883582515
────────────────────────────────────────────────────────────────────────────

18 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Fitting 15 folds for each of 1 candidates, totalling 15 fits
Best parameters:

{'clf_cv__C': 100,
 'clf_cv__dual': False,
 'clf_cv__fit_intercept': False,
 'clf_cv__max_iter': 10000,
 'clf_cv__penalty': 'l1',
 'clf_cv__solver': 'saga'}
Training ROC AUC score: 0.6656813759601732 (0.013944080529916636)
Confusion Matrix: 
[[983 240 419]
 [ 76  55 107]
 [ 41  21  71]]

Classification report: 
              precision    recall  f1-score   support

           0       0.89      0.60      0.72      1642
           1       0.17      0.23      0.20       238
           2       0.12      0.53      0.19       133

    accuracy                           0.55      2013
   macro avg       0.40      0.45      0.37      2013
weighted avg       0.76      0.55      0.62      2013

Accuracy:  0.5509190263288624
Precision: 0.39553832324902866
Recall: 0.4545290646549023
F1: 0.37002379962056287
ROC AUC Score:0.6597016282965323
──────────────────────────────────────────────────────────────────────────────────

18 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Hind\anaconda3\envs\tf-gpu\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Fitting 15 folds for each of 1 candidates, totalling 15 fits
Best parameters:

{'clf_cv__C': 10,
 'clf_cv__dual': False,
 'clf_cv__fit_intercept': False,
 'clf_cv__max_iter': 10000,
 'clf_cv__penalty': 'l2',
 'clf_cv__solver': 'newton-cg'}
Training ROC AUC score: 0.7992874897003693 (0.007855075296594174)
Confusion Matrix: 
[[1234  292  116]
 [  66  103   69]
 [  19   24   90]]

Classification report: 
              precision    recall  f1-score   support

           0       0.94      0.75      0.83      1642
           1       0.25      0.43      0.31       238
           2       0.33      0.68      0.44       133

    accuracy                           0.71      2013
   macro avg       0.50      0.62      0.53      2013
weighted avg       0.81      0.71      0.75      2013

Accuracy:  0.7088922006954794
Precision: 0.5028844522092643
Recall: 0.6203291240209142
F1: 0.5294083629760075
ROC AUC Score:0.806477277387498
────────────────────────────────────────────────────────────────────────