In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
shuffle = False
import pandas as pd
from sklearn.model_selection import train_test_split


dataset = pd.read_csv('/content/drive/MyDrive/datasets/movie.csv', sep=",")
X = dataset["text"]
y = dataset["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, shuffle=shuffle)

X_train, y_train

(0       I grew up (b. 1965) watching and loving the Th...
 1       When I put this movie in my DVD player, and sa...
 2       Why do people who do not know what a particula...
 3       Even though I have great interest in Biblical ...
 4       Im a die hard Dads Army fan and nothing will e...
                               ...                        
 3995    It aired on TV yesterday, so I decided to chec...
 3996    The only reason I bought the DVD was to satisf...
 3997    I ordered this extremely rare and highly overr...
 3998    A few weeks ago the German broadcaster "SAT1" ...
 3999    Every second of the film is gorgeous. And that...
 Name: text, Length: 4000, dtype: object, 0       0
 1       0
 2       0
 3       0
 4       1
        ..
 3995    1
 3996    0
 3997    0
 3998    0
 3999    1
 Name: label, Length: 4000, dtype: int64)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

steps = [('TF', TfidfVectorizer()), ('SVM', SVC())]

clf = Pipeline(steps)
parameters={
    'SVM__C':[0.001, 10, 1000],
    'SVM__gamma':[0.001, 0.1, 0.01, 0.001],
    'SVM__kernel': ["rbf", "linear"],
    'TF__stop_words': ['english']
}
kfold = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(clf, refit=True, cv=kfold, param_grid=parameters, verbose=10)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START SVM__C=0.001, SVM__gamma=0.001, SVM__kernel=rbf, TF__stop_words=english
[CV 1/5; 1/24] END SVM__C=0.001, SVM__gamma=0.001, SVM__kernel=rbf, TF__stop_words=english;, score=0.516 total time=  11.9s
[CV 2/5; 1/24] START SVM__C=0.001, SVM__gamma=0.001, SVM__kernel=rbf, TF__stop_words=english
[CV 2/5; 1/24] END SVM__C=0.001, SVM__gamma=0.001, SVM__kernel=rbf, TF__stop_words=english;, score=0.516 total time=  15.3s
[CV 3/5; 1/24] START SVM__C=0.001, SVM__gamma=0.001, SVM__kernel=rbf, TF__stop_words=english
[CV 3/5; 1/24] END SVM__C=0.001, SVM__gamma=0.001, SVM__kernel=rbf, TF__stop_words=english;, score=0.515 total time=  19.1s
[CV 4/5; 1/24] START SVM__C=0.001, SVM__gamma=0.001, SVM__kernel=rbf, TF__stop_words=english
[CV 4/5; 1/24] END SVM__C=0.001, SVM__gamma=0.001, SVM__kernel=rbf, TF__stop_words=english;, score=0.515 total time=  14.2s
[CV 5/5; 1/24] START SVM__C=0.001, SVM__gamma=0.001, SVM__kernel=rbf, 

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('TF', TfidfVectorizer()),
                                       ('SVM', SVC())]),
             param_grid={'SVM__C': [0.001, 10, 1000],
                         'SVM__gamma': [0.001, 0.1, 0.01, 0.001],
                         'SVM__kernel': ['rbf', 'linear'],
                         'TF__stop_words': ['english']},
             verbose=10)

In [5]:
grid_search.best_params_

{'SVM__C': 10,
 'SVM__gamma': 0.1,
 'SVM__kernel': 'rbf',
 'TF__stop_words': 'english'}

In [7]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

for name, score in [("accuracy", accuracy_score), ("recall", recall_score), ("precision", precision_score), ("f1", f1_score)]:
    print(name, score(grid_search.predict(X_test), y_test))


accuracy 0.8531388888888889
recall 0.8511341114292006
precision 0.8568419885828299
f1 0.8539785124423453
