# This lab deals with **GridSearchCV** for tuning the hyper-parameters of an estimator and applying vectorization techniques to the **movie reviews dataset** for classification task. 

*   **Deadline: 23:59, 17/4/2023**



In [4]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/ML'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/ML


# Import libraries

In [5]:
# code
import numpy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from sklearn import datasets
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy.cluster.hierarchy as shc
from scipy.stats import mode
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

#Task 1. With **iris** dataset
*  1.1. Apply **GridSearchCV** for **SVM** to find the best hyperparameters using the following param_grid.

```
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
```




In [3]:
#code
iris = datasets.load_iris()

In [4]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}

In [5]:
sorted(metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',

In [6]:
grid_rf_class = GridSearchCV(estimator=SVC(),
             param_grid= {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']},
             scoring='accuracy', n_jobs=4, cv =10, refit = True, return_train_score=True)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=42)

In [8]:
grid_rf_class.fit(X_train, y_train)

In [9]:
y_predict= grid_rf_class.predict(X_test)

In [10]:
accuracy_score(y_test, y_predict)

1.0

In [11]:
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)
print(cv_results_df.shape)

(50, 33)


In [26]:
hyperparameters = print("Best hyperparameters: ", grid_rf_class.best_params_)
score = print("Best score: ", grid_rf_class.best_score_)

Best hyperparameters:  {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best score:  0.9727272727272727


*  1.2. Apply **GridSearchCV** for **kNN** to find the best hyperparameters using the following param_grid.

```
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
```
where

    *  **n_neighbors**: Decide the best k based on the values we have computed earlier.
    *  **weights**: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
    *  **metric**: The distance metric to be used will calculating the similarity.


In [13]:
#code
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

In [14]:
accuracy_list = []

In [15]:
neighbors_list = [5,7,9,11,13,15]
for test_number in neighbors_list:
  model = KNeighborsClassifier(n_neighbors=test_number)
  predictions = model.fit(X_train, y_train).predict(X_test)
  accuracy = np.mean(cross_val_score(model, iris.data, iris.target, cv=5))
  accuracy_list.append(accuracy)

In [28]:
grid_rf_class1 = GridSearchCV(estimator =model, param_grid = grid_params, scoring='accuracy', n_jobs=4, cv =10, refit = True, return_train_score=True)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

In [30]:
grid_rf_class1.fit(X_train, y_train)

In [31]:
y_predict1 = grid_rf_class1.predict(X_test)

In [32]:
accuracy_score(y_test, y_predict1)

1.0

In [33]:
cv_results_df = pd.DataFrame(grid_rf_class1.cv_results_)
print(cv_results_df.shape)

(36, 33)


In [16]:
results_df = pd.DataFrame({'neighbors' : neighbors_list, 'accuracy' : accuracy_list})
print(results_df)

   neighbors  accuracy
0          5  0.973333
1          7  0.980000
2          9  0.973333
3         11  0.980000
4         13  0.973333
5         15  0.966667


In [45]:
hyperparameters1 = print("Best hyperparameters: ", grid_rf_class1.best_params_)
score1 = print("Best score: ", grid_rf_class1.best_score_)

Best hyperparameters:  {'metric': 'minkowski', 'n_neighbors': 11, 'weights': 'uniform'}
Best score:  0.9583333333333334


*  1.3. Apply **GridSearchCV** for **Random Forest** to find the best hyperparameters using the following param_grid.

```
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
```

In [35]:
#code
param_grid1 = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [36]:
rf_class = RandomForestClassifier(criterion='entropy', max_features='auto')

In [37]:
grid_rf_class2 = GridSearchCV(estimator =rf_class, param_grid = param_grid1, scoring='accuracy', n_jobs=4, cv =10, refit = True, return_train_score=True)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

In [39]:
grid_rf_class2.fit(X_train, y_train)

In [40]:
y_predict2 = grid_rf_class2.predict(X_test)

In [41]:
accuracy_score(y_test, y_predict2)

1.0

In [42]:
cv_results_df = pd.DataFrame(grid_rf_class2.cv_results_)
print(cv_results_df.shape)

(108, 34)


In [43]:
hyperparameters2 = print("Best hyperparameters: ", grid_rf_class2.best_params_)
score2 = print("Best score: ", grid_rf_class2.best_score_)

Best hyperparameters:  {'max_depth': 3, 'max_features': 'log2', 'max_leaf_nodes': 9, 'n_estimators': 100}
Best score:  0.9499999999999998


*   1.4 Compare the best obtained results from 1.1 to 1.3 (use PrettyTable to dispaly the results)

In [49]:
from prettytable import PrettyTable

t = PrettyTable(['Compare','Best hyperparameters','Best score'])

t.add_row(['GridSearchCV for SVM',grid_rf_class.best_params_,grid_rf_class.best_score_])
t.add_row(['GridSearchCV for kNN',grid_rf_class1.best_params_,grid_rf_class1.best_score_])
t.add_row(['GridSearchCV for Random Forest',grid_rf_class2.best_params_,grid_rf_class2.best_score_])
print(t)

+--------------------------------+------------------------------------------------------------------------------------+--------------------+
|            Compare             |                                Best hyperparameters                                |     Best score     |
+--------------------------------+------------------------------------------------------------------------------------+--------------------+
|      GridSearchCV for SVM      |                      {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}                      | 0.9727272727272727 |
|      GridSearchCV for kNN      |          {'metric': 'minkowski', 'n_neighbors': 11, 'weights': 'uniform'}          | 0.9583333333333334 |
| GridSearchCV for Random Forest | {'max_depth': 3, 'max_features': 'log2', 'max_leaf_nodes': 9, 'n_estimators': 100} | 0.9499999999999998 |
+--------------------------------+------------------------------------------------------------------------------------+--------------------+


#Task 2. 
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   Apply **GridSearchCV** to different classification algorithms such as **SVM, kNN, LogisticRegression, RandomForest**.
*   Compare the results obtained by the best hyperparameters among classification algorithms.

*   2.1. Apply **GridSearchCV** to **SVM** 


In [51]:
# code
cancer = datasets.load_breast_cancer()

In [52]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}

In [53]:
sorted(metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',

In [55]:
grid_rf_class3 = GridSearchCV(estimator=SVC(),
             param_grid= {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']},
             scoring='accuracy', n_jobs=4, cv =10, refit = True, return_train_score=True)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=42)

In [57]:
grid_rf_class3.fit(X_train, y_train)

In [58]:
y_predict3= grid_rf_class3.predict(X_test)

In [59]:
accuracy_score(y_test, y_predict3)

0.9473684210526315

In [60]:
cv_results_df = pd.DataFrame(grid_rf_class3.cv_results_)
print(cv_results_df.shape)

(50, 33)


In [61]:
hyperparameters3 = print("Best hyperparameters: ", grid_rf_class3.best_params_)
score3 = print("Best score: ", grid_rf_class3.best_score_)

Best hyperparameters:  {'C': 100, 'gamma': 1, 'kernel': 'linear'}
Best score:  0.9671153846153846


*   2.2. Apply **GridSearchCV** to **kNN** 

In [62]:
#code
grid_params4 = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

In [63]:
accuracy_list4 = []

In [64]:
neighbors_list = [5,7,9,11,13,15]
for test_number in neighbors_list:
  model4 = KNeighborsClassifier(n_neighbors=test_number)
  predictions4 = model.fit(X_train, y_train).predict(X_test)
  accuracy4 = np.mean(cross_val_score(model4, cancer.data, cancer.target, cv=5))
  accuracy_list4.append(accuracy)

In [65]:
grid_rf_class4 = GridSearchCV(estimator =model4, param_grid = grid_params4, scoring='accuracy', n_jobs=4, cv =10, refit = True, return_train_score=True)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2, random_state=42)

In [67]:
grid_rf_class4.fit(X_train, y_train)

In [68]:
y_predict4 = grid_rf_class4.predict(X_test)

In [69]:
accuracy_score(y_test, y_predict4)

0.9385964912280702

In [70]:
cv_results_df = pd.DataFrame(grid_rf_class4.cv_results_)
print(cv_results_df.shape)

(36, 33)


In [71]:
results_df = pd.DataFrame({'neighbors' : neighbors_list, 'accuracy' : accuracy_list4})
print(results_df)

   neighbors  accuracy
0          5  0.966667
1          7  0.966667
2          9  0.966667
3         11  0.966667
4         13  0.966667
5         15  0.966667


In [72]:
hyperparameters4 = print("Best hyperparameters: ", grid_rf_class4.best_params_)
score4 = print("Best score: ", grid_rf_class4.best_score_)

Best hyperparameters:  {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
Best score:  0.9384057971014492


*   2.3. Apply **GridSearchCV** to **LogisticRegression** 

In [138]:
#code
parameters5 = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7),
    'solver'  : ['liblinear'],
}

In [139]:
logisticRegression= LogisticRegression(C = 0.1, penalty = 'l2', solver = 'liblinear', max_iter=5000)
grid_rf_class5 = GridSearchCV(logisticRegression,                 
                   param_grid = parameters5, 
                   scoring='accuracy',       
                   cv=10) 

In [140]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2, random_state=42)

In [141]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [142]:
grid_rf_class5.fit(X_train_scaled, y_train)

In [143]:
y_predict5 = grid_rf_class5.predict(X_test)

In [144]:
accuracy_score(y_test, y_predict5)

0.37719298245614036

In [145]:
cv_results_df = pd.DataFrame(grid_rf_class5.cv_results_)
print(cv_results_df.shape)

(14, 21)


In [146]:
hyperparamparameters5 = print("Best hyperparameters: ", grid_rf_class5.best_params_)
score5score5 = print("Best score: ", grid_rf_class5.best_score_)

Best hyperparameters:  {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best score:  0.9802415458937197


*   2.4. Apply **GridSearchCV** to **RandomForest** 

In [147]:
#code
param_grid6 = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [148]:
rf_class = RandomForestClassifier(criterion='entropy', max_features='auto')

In [149]:
grid_rf_class6 = GridSearchCV(estimator =rf_class, param_grid = param_grid6, scoring='accuracy', n_jobs=4, cv =10, refit = True, return_train_score=True)

In [150]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2, random_state=42)

In [154]:
grid_rf_class6.fit(X_train, y_train)

In [155]:
y_predict6 = grid_rf_class6.predict(X_test)

In [156]:
accuracy_score(y_test, y_predict6)

0.956140350877193

In [157]:
cv_results_df = pd.DataFrame(grid_rf_class6.cv_results_)
print(cv_results_df.shape)

(108, 34)


In [158]:
hyperparameters6 = print("Best hyperparameters: ", grid_rf_class6.best_params_)
score6 = print("Best score: ", grid_rf_class6.best_score_)

Best hyperparameters:  {'max_depth': 9, 'max_features': None, 'max_leaf_nodes': 9, 'n_estimators': 50}
Best score:  0.9670048309178745


*   2.5. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results) 

In [160]:
#code
from prettytable import PrettyTable

t = PrettyTable(['Compare','Best hyperparameters','Best score'])

t.add_row(['GridSearchCV for SVM',grid_rf_class3.best_params_,grid_rf_class3.best_score_])
t.add_row(['GridSearchCV for kNN',grid_rf_class4.best_params_,grid_rf_class4.best_score_])
t.add_row(['GridSearchCV for LogisticRegression',grid_rf_class5.best_params_,grid_rf_class5.best_score_])
t.add_row(['GridSearchCV for Random Forest',grid_rf_class6.best_params_,grid_rf_class6.best_score_])
print(t)

+-------------------------------------+---------------------------------------------------------------------------------+--------------------+
|               Compare               |                               Best hyperparameters                              |     Best score     |
+-------------------------------------+---------------------------------------------------------------------------------+--------------------+
|         GridSearchCV for SVM        |                    {'C': 100, 'gamma': 1, 'kernel': 'linear'}                   | 0.9671153846153846 |
|         GridSearchCV for kNN        |         {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}        | 0.9384057971014492 |
| GridSearchCV for LogisticRegression |                {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}               | 0.9802415458937197 |
|    GridSearchCV for Random Forest   | {'max_depth': 9, 'max_features': None, 'max_leaf_nodes': 9, 'n_estimators': 50} | 0.9670048309178745 |

#Task 3. 
The dataset consists of **2000 user-created movie reviews** archived on the IMDb(Internet Movie Database). The reviews are equally partitioned into a positive set and a negative set (1000+1000). Each review consists of a plain text file (.txt) and a class label representing the overall user opinion. 
The class attribute has only two values: **pos** (positive) or **neg** (negative).


*   3.1 Importing additional libraries

In [6]:
import nltk, random
nltk.download('movie_reviews')#download movie reviews dataset
from nltk.corpus import movie_reviews
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


*   3.2. Movie reviews information

In [7]:
#code
print(len(movie_reviews.fileids()))
print(movie_reviews.categories())
print(movie_reviews.words()[:100])
print(movie_reviews.fileids()[:10])

2000
['neg', 'pos']
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


*   3.3. Create dataset from movie reviews

In [8]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.seed(123)
random.shuffle(documents)

In [9]:
print('Number of Reviews/Documents: {}'.format(len(documents)))
print('Corpus Size (words): {}'.format(np.sum([len(d) for (d,l) in documents])))
print('Sample Text of Doc 1:')
print('-'*30)
print(' '.join(documents[0][0][:50])) # first 50 words of the first document

Number of Reviews/Documents: 2000
Corpus Size (words): 1583820
Sample Text of Doc 1:
------------------------------
most movies seem to release a third movie just so it can be called a trilogy . rocky iii seems to kind of fit in that category , but manages to be slightly unique . the rocky formula of " rocky loses fight / rocky trains / rocky wins fight


In [10]:
sentiment_distr = Counter([label for (words, label) in documents])
print(sentiment_distr)

Counter({'pos': 1000, 'neg': 1000})


*   3.4. Train test split

In [11]:
train, test = train_test_split(documents, test_size = 0.33, random_state=42)

In [12]:
## Sentiment Distrubtion for Train and Test
print(Counter([label for (words, label) in train]))
print(Counter([label for (words, label) in test]))

Counter({'neg': 674, 'pos': 666})
Counter({'pos': 334, 'neg': 326})


In [13]:
X_train = [' '.join(words) for (words, label) in train]
X_test = [' '.join(words) for (words, label) in test]
y_train = [label for (words, label) in train]
y_test = [label for (words, label) in test]

*   3.5. Text Vectorization

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf_vec = TfidfVectorizer(min_df = 10, token_pattern = r'[a-zA-Z]+')
X_train_bow = tfidf_vec.fit_transform(X_train) # fit train
X_test_bow = tfidf_vec.transform(X_test) # transform test

*   3.6. Apply **SVM** with **GridSearchCV** 

In [31]:
#code
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}

In [32]:
sorted(metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',

In [33]:
grid_rf_class_SVM = GridSearchCV(estimator=SVC(),
             param_grid= {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']},
             scoring='accuracy', n_jobs=4, cv =10, refit = True, return_train_score=True)

In [34]:
grid_rf_class_SVM.fit(X_train_bow, y_train)

In [35]:
y_predict_SVM= grid_rf_class_SVM.predict(X_test_bow)

In [36]:
cv_results_df = pd.DataFrame(grid_rf_class_SVM.cv_results_)
print(cv_results_df.shape)

(50, 33)


In [37]:
hyperparameters = print("Best hyperparameters: ", grid_rf_class_SVM.best_params_)
score = print("Best score: ", grid_rf_class_SVM.best_score_)

Best hyperparameters:  {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best score:  0.858955223880597


*   3.7. Apply **RandomForest** with **GridSearchCV** 

In [39]:
#code
param_grid_RF = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [40]:
rf_class = RandomForestClassifier(criterion='entropy', max_features='auto')


In [41]:
grid_rf_class_RF = GridSearchCV(estimator =rf_class, param_grid = param_grid_RF, scoring='accuracy', n_jobs=4, cv =10, refit = True, return_train_score=True)

In [42]:
grid_rf_class_RF.fit(X_train_bow, y_train)

In [43]:
y_predict_RF= grid_rf_class_RF.predict(X_test_bow)

In [44]:
cv_results_df = pd.DataFrame(grid_rf_class_RF.cv_results_)
print(cv_results_df.shape)

(108, 34)


In [45]:
hyperparameters = print("Best hyperparameters: ", grid_rf_class_RF.best_params_)
score = print("Best score: ", grid_rf_class_RF.best_score_)

Best hyperparameters:  {'max_depth': 6, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 100}
Best score:  0.8


*   3.8. Apply **kNN** with **GridSearchCV** 

In [15]:
#code
knn = KNeighborsClassifier()

In [16]:
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}

In [17]:
grid_rf_class_kNN = GridSearchCV(knn, param_grid, cv=5)

In [18]:
grid_rf_class_kNN.fit(X_train_bow, y_train)

In [19]:
y_predict_kNN= grid_rf_class_kNN.predict(X_test_bow)

In [20]:
cv_results_df = pd.DataFrame(grid_rf_class_kNN.cv_results_)
print(cv_results_df.shape)

(5, 14)


In [21]:
hyperparameters = print("Best hyperparameters: ", grid_rf_class_kNN.best_params_)
score = print("Best score: ", grid_rf_class_kNN.best_score_)

Best hyperparameters:  {'n_neighbors': 7}
Best score:  0.6231343283582089


*   3.9. Apply **LogisticRegression** with **GridSearchCV** 

In [22]:
#code
parameters_LR = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7),
    'solver'  : ['liblinear'],
}

In [23]:
logisticRegression = LogisticRegression(C = 0.1, penalty = 'l2', solver = 'liblinear', max_iter=5000)
grid_rf_class_LR = GridSearchCV(logisticRegression,                 
                   param_grid = parameters_LR, 
                   scoring='accuracy',       
                   cv=10) 

In [24]:
grid_rf_class_LR.fit(X_train_bow, y_train)

In [25]:
y_predict_LR= grid_rf_class_LR.predict(X_test_bow)

In [26]:
cv_results_df = pd.DataFrame(grid_rf_class_LR.cv_results_)
print(cv_results_df.shape)

(14, 21)


In [27]:
hyperparameters = print("Best hyperparameters: ", grid_rf_class_LR.best_params_)
score = print("Best score: ", grid_rf_class_LR.best_score_)

Best hyperparameters:  {'C': 10.0, 'penalty': 'l2', 'solver': 'liblinear'}
Best score:  0.8529850746268657


*   3.10. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results) 

In [46]:
from prettytable import PrettyTable

t = PrettyTable(['Compare','Best hyperparameters','Best score'])

t.add_row(['GridSearchCV for SVM',grid_rf_class_SVM.best_params_,grid_rf_class_SVM.best_score_])
t.add_row(['GridSearchCV for kNN',grid_rf_class_kNN.best_params_,grid_rf_class_kNN.best_score_])
t.add_row(['GridSearchCV for LogisticRegression',grid_rf_class_LR.best_params_,grid_rf_class_LR.best_score_])
t.add_row(['GridSearchCV for Random Forest',grid_rf_class_RF.best_params_,grid_rf_class_RF.best_score_])
print(t)

+-------------------------------------+------------------------------------------------------------------------------------+--------------------+
|               Compare               |                                Best hyperparameters                                |     Best score     |
+-------------------------------------+------------------------------------------------------------------------------------+--------------------+
|         GridSearchCV for SVM        |                      {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}                      | 0.858955223880597  |
|         GridSearchCV for kNN        |                                 {'n_neighbors': 7}                                 | 0.6231343283582089 |
| GridSearchCV for LogisticRegression |                {'C': 10.0, 'penalty': 'l2', 'solver': 'liblinear'}                 | 0.8529850746268657 |
|    GridSearchCV for Random Forest   | {'max_depth': 6, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 100} |

#Finally,
Save a copy in your Github. Remember renaming the notebook.