# Tuning All the Models

In [32]:
# Restore the X_train variable from IPython's %store database into this kernel.
%store -r X_train

In [34]:
# Restore the y_train variable from IPython's %store database into this kernel.
%store -r y_train

In [35]:
# Restore default_models from IPython's %store database; grid_search looks up
# each estimator in it via models[<key>]['classifier'].
%store -r default_models

## Grid Search Setup and Per-Model Tuning

In [37]:
import time
import numpy as np

from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [38]:
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared TF-IDF step used as the first stage of every pipeline: English stop
# words are dropped and the input casing is left untouched (lowercase=False).
tfidf = TfidfVectorizer(
    lowercase=False,
    stop_words='english',
)

# Best parameters found by each grid search, keyed by the name passed to grid_search.
tuned_params = dict()

def grid_search(params, name, models=default_models):
    '''
    Run a 5-fold grid search for each model in ``params`` and record the best parameters.

    For every ``model -> grid`` entry a TF-IDF + classifier pipeline is built and
    searched with ``GridSearchCV`` (scoring ``'recall_micro'``) on the notebook-level
    ``X_train`` / ``y_train``. The start/finish times, the best cross-validation
    score and the best parameters are printed, and the best parameters are saved
    in the notebook-level ``tuned_params`` dict.

    Parameters
    ----------
    params : dict
        Maps a key of ``models`` to a ``param_grid`` accepted by ``GridSearchCV``.
        Grid keys address the pipeline steps, e.g. ``'classifier__C'``.
    name : str
        Key under which the best parameters are stored in ``tuned_params`` when
        ``params`` contains a single model (the way this notebook calls it).
        When ``params`` contains several models, each result is stored under its
        own model key instead so that no result is overwritten.
    models : dict, optional
        Maps model keys to dicts holding the estimator under ``'classifier'``.
        Defaults to ``default_models``.

    Returns
    -------
    None
        Results are printed and written to ``tuned_params``.

    Raises
    ------
    KeyError
        If a key of ``params`` is missing from ``models``.
    '''
    for model, grid in params.items():
        print(f'Running... {model} GridSearch')
        print(f'Time Started: {time.asctime()}')

        pipe = Pipeline(steps=[('tfidf', tfidf), ('classifier', models[model]['classifier'])])

        # The pipeline is deliberately not fitted on its own first: GridSearchCV
        # clones the estimator for every candidate, so a standalone fit would only
        # cost time and mutate the shared tfidf / classifier objects in place.
        # refit=False because only the best score and parameters are needed here.
        gridsearch = GridSearchCV(estimator=pipe, param_grid=grid, scoring='recall_micro', cv=5, refit=False)
        gridsearch.fit(X_train, y_train)

        print(f'Time Finished: {time.asctime()}\n')
        print(f'Best cross validation score: {gridsearch.best_score_ :.2%}')
        print(f'Optimal parameters: {gridsearch.best_params_}')

        # A single ``name`` cannot label several models; fall back to the model
        # key in that case so earlier results are not clobbered by later ones.
        result_key = name if len(params) == 1 else model
        tuned_params[result_key] = gridsearch.best_params_

# Silence every warning for the rest of the session (the imports cell already
# does the same). NOTE(review): a blanket 'ignore' also hides any warnings the
# estimators raise during the searches below — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

### Logistic Regression

In [39]:
# Grid for the 'LogisticRegression' entry: regularisation strength C, solver,
# and whether an intercept is fitted (3 x 2 x 2 = 12 candidates).
params_lr1 = {
    'LogisticRegression': [
        {
            'classifier__C': [0.001, 0.1, 1],
            'classifier__solver': ['lbfgs', 'saga'],
            'classifier__fit_intercept': [True, False],
        },
    ],
}

grid_search(params=params_lr1, name='LogisticRegression')

Running... LogisticRegression GridSearch
Time Started: Tue Oct  8 20:25:05 2024
Time Finished: Tue Oct  8 20:25:23 2024

Best cross validation score: 67.76%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__solver': 'lbfgs'}


In [40]:
# Keep the best 'LogisticRegression' parameters under their own name and persist
# them with %store so they can be restored with `%store -r` in another kernel.
lr_best_params = tuned_params['LogisticRegression']
%store lr_best_params

Stored 'lr_best_params' (dict)


### MultinomialNB

In [41]:
# Grid for the 'MultinomialNB' entry: nine alpha values between 0.001 and 1.
params_mn = {
    'MultinomialNB': [
        {'classifier__alpha': [0.001, 0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1]},
    ],
}

grid_search(params=params_mn, name='MultinomialNB')

Running... MultinomialNB GridSearch
Time Started: Tue Oct  8 20:28:13 2024
Time Finished: Tue Oct  8 20:28:18 2024

Best cross validation score: 66.24%
Optimal parameters: {'classifier__alpha': 0.2}


In [42]:
# Keep the best 'MultinomialNB' parameters under their own name and persist
# them with %store so they can be restored with `%store -r` in another kernel.
mn_best_params = tuned_params['MultinomialNB']
%store mn_best_params

Stored 'mn_best_params' (dict)


### Decision Tree

In [43]:
# Grid for the 'DecisionTree' entry: split criterion, splitter strategy, depth
# limit and the two minimum-sample constraints (2 x 2 x 4 x 3 x 3 = 144 candidates).
params_dt1 = {
    'DecisionTree': [
        {
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__splitter': ['best', 'random'],
            'classifier__max_depth': [None, 1, 2, 5],
            'classifier__min_samples_split': [2, 3, 5],
            'classifier__min_samples_leaf': [1, 2, 5],
        },
    ],
}

grid_search(params=params_dt1, name='DecisionTree')

Running... DecisionTree GridSearch
Time Started: Tue Oct  8 20:28:29 2024
Time Finished: Tue Oct  8 20:46:53 2024

Best cross validation score: 64.67%
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__splitter': 'random'}


In [44]:
# Keep the best 'DecisionTree' parameters under their own name and persist
# them with %store so they can be restored with `%store -r` in another kernel.
dt_best_params = tuned_params['DecisionTree']
%store dt_best_params

Stored 'dt_best_params' (dict)


### GradientBoost

In [45]:
# Grid for the 'GradientBoost' entry: tree depth and the two minimum-sample
# constraints (2 x 2 x 2 = 8 candidates).
params_gb1 = {
    'GradientBoost': [
        {
            'classifier__max_depth': [3, 2],
            'classifier__min_samples_split': [2, 3],
            'classifier__min_samples_leaf': [1, 2],
        },
    ],
}

grid_search(params=params_gb1, name='GradientBoost')

Running... GradientBoost GridSearch
Time Started: Tue Oct  8 20:50:32 2024
Time Finished: Tue Oct  8 21:04:57 2024

Best cross validation score: 65.91%
Optimal parameters: {'classifier__max_depth': 3, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}


In [46]:
# Keep the best 'GradientBoost' parameters under their own name and persist
# them with %store so they can be restored with `%store -r` in another kernel.
gb_best_params = tuned_params['GradientBoost']
%store gb_best_params

Stored 'gb_best_params' (dict)


### Support Vector Classifier (VectorClass)

In [47]:
# Grid for the 'VectorClass' entry (3 x 3 x 2 x 2 x 2 = 72 candidates).
# NOTE(review): in scikit-learn's SVC, `degree` is only used by the 'poly'
# kernel. If the classifier stored under default_models['VectorClass'] uses a
# different kernel, the degree axis triples the search time without changing
# any score — confirm the kernel before re-running this cell.
params_svc1 = {
    'VectorClass': [
        {
            'classifier__C': [1, 2, 3],
            'classifier__degree': [1, 2, 3],
            'classifier__gamma': ['scale', 'auto'],
            'classifier__shrinking': [True, False],
            'classifier__class_weight': ['balanced', None],
        },
    ],
}

grid_search(params=params_svc1, name='VectorClass')

Running... VectorClass GridSearch
Time Started: Tue Oct  8 21:17:20 2024
Time Finished: Tue Oct  8 22:56:53 2024

Best cross validation score: 68.71%
Optimal parameters: {'classifier__C': 1, 'classifier__class_weight': None, 'classifier__degree': 1, 'classifier__gamma': 'scale', 'classifier__shrinking': True}


In [48]:
# Keep the best 'VectorClass' parameters under their own name and persist
# them with %store so they can be restored with `%store -r` in another kernel.
svc_best_params = tuned_params['VectorClass']
%store svc_best_params

Stored 'svc_best_params' (dict)


### SGD Classifier

In [49]:
# Grid for the 'SGDClassifier' entry: penalty type, regularisation strength
# alpha and class weighting (3 x 3 x 2 = 18 candidates).
params_sgd3 = {
    'SGDClassifier': [
        {
            'classifier__penalty': ['l1', 'l2', 'elasticnet'],
            'classifier__alpha': [1e-6, 1e-5, 1e-4],
            'classifier__class_weight': ['balanced', None],
        },
    ],
}

grid_search(params=params_sgd3, name='SGDClassifier')

Running... SGDClassifier GridSearch
Time Started: Wed Oct  9 07:06:53 2024
Time Finished: Wed Oct  9 07:07:15 2024

Best cross validation score: 67.79%
Optimal parameters: {'classifier__alpha': 0.0001, 'classifier__class_weight': None, 'classifier__penalty': 'l2'}


In [51]:
# Keep the best 'SGDClassifier' parameters under their own name and persist
# them with %store so they can be restored with `%store -r` in another kernel.
sgd_best_params = tuned_params['SGDClassifier']
%store sgd_best_params

Stored 'sgd_best_params' (dict)
