# This lab deals with **GridSearchCV** for tuning the hyper-parameters of an estimator and applying vectorization techniques to the **movie reviews dataset** for classification task. 

*   **Deadline: 23:59, 17/4/2023**



# Import libraries

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/Classroom'

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from prettytable import PrettyTable

Mounted at /content/gdrive
/content/gdrive/MyDrive/Classroom


#Task 1. With **iris** dataset
*  1.1. Apply **GridSearchCV** for **SVM** to find the best hyperparameters using the following param_grid.

```
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
```




In [51]:
dataset = datasets.load_iris()

x = dataset['data']
y = dataset['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [52]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
grid.fit(x_train, y_train)
yred = grid.predict(x_test)

print(classification_report(y_test, yred))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.917 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=1.000 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.958 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.958 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.958 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.917 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=1.000 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.958 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.958 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.958 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.958 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf

In [53]:
bestSVM = grid.best_score_
print(bestSVM)

0.9833333333333334


*  1.2. Apply **GridSearchCV** for **kNN** to find the best hyperparameters using the following param_grid.

```
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
```
where

    *  **n_neighbors**: Decide the best k based on the values we have computed earlier.
    *  **weights**: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
    *  **metric**: The distance metric to be used will calculating the similarity.


In [54]:
knn = KNeighborsClassifier(n_neighbors=5)

In [55]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
grid = GridSearchCV(knn, grid_params, refit = True, verbose = 3)
grid.fit(x_train, y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=0.917 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=1.000 total time=   0.0s
[CV 3/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=0.958 total time=   0.0s
[CV 4/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=0.958 total time=   0.0s
[CV 5/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=0.958 total time=   0.0s
[CV 1/5] END metric=minkowski, n_neighbors=5, weights=distance;, score=0.917 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=5, weights=distance;, score=1.000 total time=   0.0s
[CV 3/5] END metric=minkowski, n_neighbors=5, weights=distance;, score=1.000 total time=   0.0s
[CV 4/5] END metric=minkowski, n_neighbors=5, weights=distance;, score=0.958 total time=   0.0s
[CV 5/5] END metric=minkowski, n_neighbors=5, weights=distance;, score=0.958 to

In [56]:
yred = grid.predict(x_test)

print(classification_report(y_test, yred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.88      1.00      0.93         7
           2       1.00      0.91      0.95        11

    accuracy                           0.97        30
   macro avg       0.96      0.97      0.96        30
weighted avg       0.97      0.97      0.97        30



In [57]:
bestKNN = grid.best_score_
print(bestKNN)

0.975


*  1.3. Apply **GridSearchCV** for **Random Forest** to find the best hyperparameters using the following param_grid.

```
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
```

In [58]:
rfc=RandomForestClassifier(random_state=42)

In [59]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
grid = GridSearchCV(rfc, param_grid, refit = True, verbose = 3)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.875 total time=   0.1s
[CV 2/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=1.000 total time=   0.1s
[CV 3/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.958 total time=   0.1s
[CV 4/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.958 total time=   0.1s
[CV 5/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.958 total time=   0.1s
[CV 1/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=50;, score=0.875 total time=   0.1s
[CV 2/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=50;, score=1.000 total time=   0.1s
[CV 3/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=50;, score=0.958 total time=   0.1s
[CV 4/5] END max_depth=3, max_features=sq

In [60]:
yred = grid.predict(x_test)

print(classification_report(y_test, yred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.70      1.00      0.82         7
           2       1.00      0.73      0.84        11

    accuracy                           0.90        30
   macro avg       0.90      0.91      0.89        30
weighted avg       0.93      0.90      0.90        30



In [61]:
bestRandom = grid.best_score_
print(bestRandom)

0.95


*   1.4 Compare the best obtained results from 1.1 to 1.3 (use PrettyTable to dispaly the results)

In [62]:
columns = ["Name", "Best"]
myTable = PrettyTable()

myTable.add_column(columns[0], ["SVM", "KNN", "RandomForest"])
myTable.add_column(columns[1],[bestSVM, bestKNN,  bestRandom])

print(myTable)

+--------------+--------------------+
|     Name     |        Best        |
+--------------+--------------------+
|     SVM      | 0.9833333333333334 |
|     KNN      |       0.975        |
| RandomForest |        0.95        |
+--------------+--------------------+


#Task 2. 
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   Apply **GridSearchCV** to different classification algorithms such as **SVM, kNN, LogisticRegression, RandomForest**.
*   Compare the results obtained by the best hyperparameters among classification algorithms.

In [2]:
cancer = datasets.load_breast_cancer()
x = cancer['data']
y = cancer['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [3]:
dataframe = pd.DataFrame(cancer.data, columns = cancer.feature_names)

In [4]:
dataframe.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
dataframe['label'] = cancer.target

In [6]:
dataframe.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1


In [7]:
x = dataframe.drop(columns = 'label', axis=1)
y = dataframe['label']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=2)

*   2.1. Apply **GridSearchCV** to **SVM** 


In [9]:
model = SVC()

In [33]:
parameters = {
          'kernel':['linear','poly','rbf','sigmoid'],
          'C':[1, 5, 10, 20]
}

In [34]:
grid = GridSearchCV(SVC(), parameters, refit = True, verbose = 3)
grid.fit(x_train, y_train)
yred = grid.predict(x_test)

print(classification_report(y_test, yred))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ................C=1, kernel=linear;, score=0.945 total time=   0.2s
[CV 2/5] END ................C=1, kernel=linear;, score=0.923 total time=   0.2s
[CV 3/5] END ................C=1, kernel=linear;, score=0.978 total time=   0.6s
[CV 4/5] END ................C=1, kernel=linear;, score=0.956 total time=   0.7s
[CV 5/5] END ................C=1, kernel=linear;, score=0.945 total time=   0.5s
[CV 1/5] END ..................C=1, kernel=poly;, score=0.912 total time=   0.0s
[CV 2/5] END ..................C=1, kernel=poly;, score=0.923 total time=   0.0s
[CV 3/5] END ..................C=1, kernel=poly;, score=0.901 total time=   0.0s
[CV 4/5] END ..................C=1, kernel=poly;, score=0.934 total time=   0.0s
[CV 5/5] END ..................C=1, kernel=poly;, score=0.912 total time=   0.0s
[CV 1/5] END ...................C=1, kernel=rbf;, score=0.912 total time=   0.0s
[CV 2/5] END ...................C=1, kernel=rbf;

In [35]:
bestSVM = grid.best_score_
print(bestSVM)

0.9538461538461538


*   2.2. Apply **GridSearchCV** to **kNN** 

In [36]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

In [37]:
knn = KNeighborsClassifier(n_neighbors=5)
grid = GridSearchCV(knn, grid_params, refit = True, verbose = 3)
grid.fit(x_train, y_train)
yred = grid.predict(x_test)

print(classification_report(y_test, yred))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=0.934 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=0.923 total time=   0.0s
[CV 3/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=0.912 total time=   0.0s
[CV 4/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=0.912 total time=   0.0s
[CV 5/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=0.967 total time=   0.0s
[CV 1/5] END metric=minkowski, n_neighbors=5, weights=distance;, score=0.934 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=5, weights=distance;, score=0.934 total time=   0.0s
[CV 3/5] END metric=minkowski, n_neighbors=5, weights=distance;, score=0.912 total time=   0.0s
[CV 4/5] END metric=minkowski, n_neighbors=5, weights=distance;, score=0.912 total time=   0.0s
[CV 5/5] END metric=minkowski, n_neighbors=5, weights=distance;, score=0.967 to

In [38]:
bestKNN = grid.best_score_
print(bestKNN)

0.9516483516483516


*   2.3. Apply **GridSearchCV** to **LogisticRegression** 

In [43]:
parameters = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
}

In [44]:
logistic = LogisticRegression()
grid = GridSearchCV(logistic, parameters, refit = True, scoring='accuracy', verbose = 3)
grid.fit(x_train, y_train)
yred = grid.predict(x_test)

print(classification_report(y_test, yred))

Fitting 5 folds for each of 42 candidates, totalling 210 fits
[CV 1/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END C=0.001, penalty=l1, solver=liblinear;, score=0.923 total time=   0.0s
[CV 2/5] END C=0.001, penalty=l1, solve

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 4/5] END C=0.001, penalty=l2, solver=liblinear;, score=0.912 total time=   0.0s
[CV 5/5] END C=0.001, penalty=l2, solver=liblinear;, score=0.945 total time=   0.0s
[CV 1/5] END C=0.01, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.01, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=0.01, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=0.01, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C=0.01, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END C=0.01, p

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 3/5] END ..C=0.01, penalty=l2, solver=lbfgs;, score=0.934 total time=   0.0s
[CV 4/5] END ..C=0.01, penalty=l2, solver=lbfgs;, score=0.912 total time=   0.0s
[CV 5/5] END ..C=0.01, penalty=l2, solver=lbfgs;, score=0.945 total time=   0.0s
[CV 1/5] END C=0.01, penalty=l2, solver=liblinear;, score=0.923 total time=   0.0s
[CV 2/5] END C=0.01, penalty=l2, solver=liblinear;, score=0.923 total time=   0.0s
[CV 3/5] END C=0.01, penalty=l2, solver=liblinear;, score=0.934 total time=   0.0s
[CV 4/5] END C=0.01, penalty=l2, solver=liblinear;, score=0.912 total time=   0.0s
[CV 5/5] END C=0.01, penalty=l2, solver=liblinear;, score=0.945 total time=   0.0s
[CV 1/5] END .C=0.1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END .C=0.1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END .C=0.1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END .C=0.1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END .C=0.



[CV 5/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.956 total time=   0.1s
[CV 1/5] END C=0.1, penalty=l2, solver=newton-cg;, score=0.945 total time=   0.1s
[CV 2/5] END C=0.1, penalty=l2, solver=newton-cg;, score=0.912 total time=   0.1s
[CV 3/5] END C=0.1, penalty=l2, solver=newton-cg;, score=0.967 total time=   0.1s
[CV 4/5] END C=0.1, penalty=l2, solver=newton-cg;, score=0.934 total time=   0.1s
[CV 5/5] END C=0.1, penalty=l2, solver=newton-cg;, score=0.967 total time=   0.1s
[CV 1/5] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.923 total time=   0.0s
[CV 2/5] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.901 total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 3/5] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.1s
[CV 4/5] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.945 total time=   0.0s
[CV 5/5] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.945 total time=   0.0s
[CV 1/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.923 total time=   0.0s
[CV 2/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.901 total time=   0.0s
[CV 3/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.956 total time=   0.0s
[CV 4/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.945 total time=   0.0s
[CV 5/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.945 total time=   0.0s
[CV 1/5] END .C=1.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END .C=1.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END .C=1.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END .C=1.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END .C=1.0, pe



[CV 1/5] END C=1.0, penalty=l2, solver=newton-cg;, score=0.945 total time=   0.2s
[CV 2/5] END C=1.0, penalty=l2, solver=newton-cg;, score=0.934 total time=   0.1s
[CV 3/5] END C=1.0, penalty=l2, solver=newton-cg;, score=0.967 total time=   0.1s
[CV 4/5] END C=1.0, penalty=l2, solver=newton-cg;, score=0.923 total time=   0.1s
[CV 5/5] END C=1.0, penalty=l2, solver=newton-cg;, score=0.967 total time=   0.1s
[CV 1/5] END ...C=1.0, penalty=l2, solver=lbfgs;, score=0.934 total time=   0.0s
[CV 2/5] END ...C=1.0, penalty=l2, solver=lbfgs;, score=0.923 total time=   0.0s
[CV 3/5] END ...C=1.0, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.1s
[CV 4/5] END ...C=1.0, penalty=l2, solver=lbfgs;, score=0.923 total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 5/5] END ...C=1.0, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.0s
[CV 1/5] END C=1.0, penalty=l2, solver=liblinear;, score=0.934 total time=   0.0s
[CV 2/5] END C=1.0, penalty=l2, solver=liblinear;, score=0.923 total time=   0.0s
[CV 3/5] END C=1.0, penalty=l2, solver=liblinear;, score=0.967 total time=   0.0s
[CV 4/5] END C=1.0, penalty=l2, solver=liblinear;, score=0.923 total time=   0.0s
[CV 5/5] END C=1.0, penalty=l2, solver=liblinear;, score=0.956 total time=   0.0s
[CV 1/5] END C=10.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=10.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=10.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=10.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C=10.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END ....C=10.0, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ....C=10.0



[CV 4/5] END C=10.0, penalty=l1, solver=liblinear;, score=0.923 total time=   0.8s
[CV 5/5] END C=10.0, penalty=l1, solver=liblinear;, score=0.978 total time=   0.4s
[CV 1/5] END C=10.0, penalty=l2, solver=newton-cg;, score=0.967 total time=   0.1s
[CV 2/5] END C=10.0, penalty=l2, solver=newton-cg;, score=0.945 total time=   0.1s
[CV 3/5] END C=10.0, penalty=l2, solver=newton-cg;, score=0.967 total time=   0.1s
[CV 4/5] END C=10.0, penalty=l2, solver=newton-cg;, score=0.923 total time=   0.1s
[CV 5/5] END C=10.0, penalty=l2, solver=newton-cg;, score=0.967 total time=   0.1s
[CV 1/5] END ..C=10.0, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.0s
[CV 2/5] END ..C=10.0, penalty=l2, solver=lbfgs;, score=0.923 total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 3/5] END ..C=10.0, penalty=l2, solver=lbfgs;, score=0.967 total time=   0.0s
[CV 4/5] END ..C=10.0, penalty=l2, solver=lbfgs;, score=0.923 total time=   0.1s
[CV 5/5] END ..C=10.0, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.0s
[CV 1/5] END C=10.0, penalty=l2, solver=liblinear;, score=0.956 total time=   0.0s
[CV 2/5] END C=10.0, penalty=l2, solver=liblinear;, score=0.934 total time=   0.0s
[CV 3/5] END C=10.0, penalty=l2, solver=liblinear;, score=0.967 total time=   0.0s
[CV 4/5] END C=10.0, penalty=l2, solver=liblinear;, score=0.923 total time=   0.0s
[CV 5/5] END C=10.0, penalty=l2, solver=liblinear;, score=0.967 total time=   0.0s
[CV 1/5] END C=100.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=100.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=100.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=100.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C



[CV 3/5] END C=100.0, penalty=l1, solver=liblinear;, score=0.978 total time=   0.2s




[CV 4/5] END C=100.0, penalty=l1, solver=liblinear;, score=0.934 total time=   1.0s




[CV 5/5] END C=100.0, penalty=l1, solver=liblinear;, score=0.989 total time=   1.0s
[CV 1/5] END C=100.0, penalty=l2, solver=newton-cg;, score=0.956 total time=   0.4s
[CV 2/5] END C=100.0, penalty=l2, solver=newton-cg;, score=0.956 total time=   0.4s
[CV 3/5] END C=100.0, penalty=l2, solver=newton-cg;, score=0.978 total time=   0.4s
[CV 4/5] END C=100.0, penalty=l2, solver=newton-cg;, score=0.934 total time=   0.3s
[CV 5/5] END C=100.0, penalty=l2, solver=newton-cg;, score=0.978 total time=   0.4s
[CV 1/5] END .C=100.0, penalty=l2, solver=lbfgs;, score=0.923 total time=   0.1s
[CV 2/5] END .C=100.0, penalty=l2, solver=lbfgs;, score=0.923 total time=   0.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 3/5] END .C=100.0, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.1s
[CV 4/5] END .C=100.0, penalty=l2, solver=lbfgs;, score=0.934 total time=   0.1s
[CV 5/5] END .C=100.0, penalty=l2, solver=lbfgs;, score=0.967 total time=   0.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=100.0, penalty=l2, solver=liblinear;, score=0.956 total time=   0.1s
[CV 2/5] END C=100.0, penalty=l2, solver=liblinear;, score=0.934 total time=   0.0s
[CV 3/5] END C=100.0, penalty=l2, solver=liblinear;, score=0.967 total time=   0.0s
[CV 4/5] END C=100.0, penalty=l2, solver=liblinear;, score=0.934 total time=   0.0s
[CV 5/5] END C=100.0, penalty=l2, solver=liblinear;, score=0.967 total time=   0.0s
[CV 1/5] END C=1000.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=1000.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=1000.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=1000.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C=1000.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END ..C=1000.0, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ..C=1000.0, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV



[CV 2/5] END C=1000.0, penalty=l1, solver=liblinear;, score=0.967 total time=   0.3s




[CV 3/5] END C=1000.0, penalty=l1, solver=liblinear;, score=0.967 total time=   1.0s




[CV 4/5] END C=1000.0, penalty=l1, solver=liblinear;, score=0.945 total time=   0.3s




[CV 5/5] END C=1000.0, penalty=l1, solver=liblinear;, score=0.967 total time=   0.7s
[CV 1/5] END C=1000.0, penalty=l2, solver=newton-cg;, score=0.956 total time=   0.2s




[CV 2/5] END C=1000.0, penalty=l2, solver=newton-cg;, score=0.967 total time=   0.2s
[CV 3/5] END C=1000.0, penalty=l2, solver=newton-cg;, score=0.978 total time=   0.2s




[CV 4/5] END C=1000.0, penalty=l2, solver=newton-cg;, score=0.923 total time=   0.2s
[CV 5/5] END C=1000.0, penalty=l2, solver=newton-cg;, score=0.989 total time=   0.2s
[CV 1/5] END C=1000.0, penalty=l2, solver=lbfgs;, score=0.923 total time=   0.0s
[CV 2/5] END C=1000.0, penalty=l2, solver=lbfgs;, score=0.912 total time=   0.0s
[CV 3/5] END C=1000.0, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.0s
[CV 4/5] END C=1000.0, penalty=l2, solver=lbfgs;, score=0.923 total time=   0.0s
[CV 5/5] END C=1000.0, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.0s
[CV 1/5] END C=1000.0, penalty=l2, solver=liblinear;, score=0.956 total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 2/5] END C=1000.0, penalty=l2, solver=liblinear;, score=0.967 total time=   0.0s
[CV 3/5] END C=1000.0, penalty=l2, solver=liblinear;, score=0.967 total time=   0.0s
[CV 4/5] END C=1000.0, penalty=l2, solver=liblinear;, score=0.934 total time=   0.0s
[CV 5/5] END C=1000.0, penalty=l2, solver=liblinear;, score=0.978 total time=   0.0s


70 fits failed out of a total of 210.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

------------------------------------

              precision    recall  f1-score   support

           0       0.88      0.98      0.93        45
           1       0.98      0.91      0.95        69

    accuracy                           0.94       114
   macro avg       0.93      0.95      0.94       114
weighted avg       0.94      0.94      0.94       114





In [45]:
bestLogic = grid.best_score_
print(bestLogic)

0.9626373626373625


*   2.4. Apply **GridSearchCV** to **RandomForest** 

In [46]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [47]:
rfc=RandomForestClassifier(random_state=42)
grid = GridSearchCV(rfc, param_grid, refit = True, verbose = 3)
grid.fit(x_train, y_train)
yred = grid.predict(x_test)

print(classification_report(y_test, yred))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.912 total time=   0.1s
[CV 2/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.945 total time=   0.1s
[CV 3/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.967 total time=   0.1s
[CV 4/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.934 total time=   0.1s
[CV 5/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.923 total time=   0.1s
[CV 1/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=50;, score=0.901 total time=   0.1s
[CV 2/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=50;, score=0.967 total time=   0.1s
[CV 3/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=50;, score=0.967 total time=   0.1s
[CV 4/5] END max_depth=3, max_features=sq

In [48]:
bestRandom = grid.best_score_
print(bestRandom)

0.9604395604395604


*   2.5. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results) 

In [49]:
columns = ["Name", "Best"]
myTable = PrettyTable()

myTable.add_column(columns[0], ["SVM", "KNN", "LogicticRegression", "RandomForest"])
myTable.add_column(columns[1],[bestSVM, bestKNN, bestLogic, bestRandom])

print(myTable)

+--------------------+--------------------+
|        Name        |        Best        |
+--------------------+--------------------+
|        SVM         | 0.9538461538461538 |
|        KNN         | 0.9516483516483516 |
| LogicticRegression | 0.9626373626373625 |
|    RandomForest    | 0.9604395604395604 |
+--------------------+--------------------+


#Task 3. 
The dataset consists of **2000 user-created movie reviews** archived on the IMDb(Internet Movie Database). The reviews are equally partitioned into a positive set and a negative set (1000+1000). Each review consists of a plain text file (.txt) and a class label representing the overall user opinion. 
The class attribute has only two values: **pos** (positive) or **neg** (negative).


*   3.1 Importing additional libraries

In [2]:
import nltk, random
nltk.download('movie_reviews')#download movie reviews dataset
from nltk.corpus import movie_reviews
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


*   3.2. Movie reviews information

In [3]:
#code
print(len(movie_reviews.fileids()))
print(movie_reviews.categories())
print(movie_reviews.words()[:100])
print(movie_reviews.fileids()[:10])

2000
['neg', 'pos']
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


*   3.3. Create dataset from movie reviews

In [4]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.seed(123)
random.shuffle(documents)

In [5]:
print('Number of Reviews/Documents: {}'.format(len(documents)))
print('Corpus Size (words): {}'.format(np.sum([len(d) for (d,l) in documents])))
print('Sample Text of Doc 1:')
print('-'*30)
print(' '.join(documents[0][0][:50])) # first 50 words of the first document

Number of Reviews/Documents: 2000
Corpus Size (words): 1583820
Sample Text of Doc 1:
------------------------------
most movies seem to release a third movie just so it can be called a trilogy . rocky iii seems to kind of fit in that category , but manages to be slightly unique . the rocky formula of " rocky loses fight / rocky trains / rocky wins fight


In [6]:
sentiment_distr = Counter([label for (words, label) in documents])
print(sentiment_distr)

Counter({'pos': 1000, 'neg': 1000})


*   3.4. Train test split

In [5]:
train, test = train_test_split(documents, test_size = 0.33, random_state=42)

In [6]:
## Sentiment Distrubtion for Train and Test
print(Counter([label for (words, label) in train]))
print(Counter([label for (words, label) in test]))

Counter({'neg': 674, 'pos': 666})
Counter({'pos': 334, 'neg': 326})


In [7]:
x_train = [' '.join(words) for (words, label) in train]
x_test = [' '.join(words) for (words, label) in test]
y_train = [label for (words, label) in train]
y_test = [label for (words, label) in test]

*   3.5. Text Vectorization

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf_vec = TfidfVectorizer(min_df = 10, token_pattern = r'[a-zA-Z]+')
X_train_bow = tfidf_vec.fit_transform(x_train) # fit train
X_test_bow = tfidf_vec.transform(x_test) # transform test

*   3.6. Apply **SVM** with **GridSearchCV** 

In [9]:
parameters = {
          'kernel':['linear','poly','rbf','sigmoid'],
          'C':[1, 5, 10, 20]
}

In [10]:
grid = GridSearchCV(SVC(), parameters, refit = True, cv=5)
grid.fit(X_train_bow, y_train)

bestSVM = grid.best_score_
print(bestSVM)

0.8477611940298507


*   3.7. Apply **RandomForest** with **GridSearchCV** 

In [11]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [None]:
rfc=RandomForestClassifier(random_state=42)
grid = GridSearchCV(rfc, param_grid, refit = True, verbose = 3)
grid.fit(X_train_bow, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.743 total time=   0.1s
[CV 2/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.735 total time=   0.1s
[CV 3/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.754 total time=   0.1s
[CV 4/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.675 total time=   0.1s
[CV 5/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=25;, score=0.735 total time=   0.1s
[CV 1/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=50;, score=0.739 total time=   0.2s
[CV 2/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=50;, score=0.746 total time=   0.2s
[CV 3/5] END max_depth=3, max_features=sqrt, max_leaf_nodes=3, n_estimators=50;, score=0.776 total time=   0.2s
[CV 4/5] END max_depth=3, max_features=sq

In [14]:
bestRandom = grid.best_score_
print(bestRandom)

0.7977611940298507


*   3.8. Apply **kNN** with **GridSearchCV** 

In [15]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

In [16]:
knn = KNeighborsClassifier(n_neighbors=5)
grid = GridSearchCV(knn, grid_params, refit = True, cv=5)
grid.fit(X_train_bow, y_train)


bestKNN = grid.best_score_
print(bestKNN)

0.6462686567164179


*   3.9. Apply **LogisticRegression** with **GridSearchCV** 

In [17]:
parameters = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
}

In [18]:
logistic = LogisticRegression()
grid = GridSearchCV(logistic, parameters, refit = True, scoring='accuracy', cv=5)
grid.fit(X_train_bow, y_train)

bestLogic = grid.best_score_
print(bestLogic)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8544776119402986


*   3.10. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results) 

In [19]:
columns = ["Name", "Best"]
myTable = PrettyTable()

myTable.add_column(columns[0], ["SVM", "KNN", "LogicticRegression", "RandomForest"])
myTable.add_column(columns[1],[bestSVM, bestKNN, bestLogic, bestRandom])

print(myTable)

+--------------------+--------------------+
|        Name        |        Best        |
+--------------------+--------------------+
|        SVM         | 0.8477611940298507 |
|        KNN         | 0.6462686567164179 |
| LogicticRegression | 0.8544776119402986 |
|    RandomForest    | 0.7977611940298507 |
+--------------------+--------------------+


#Finally,
Save a copy in your Github. Remember renaming the notebook.