In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# train

In [10]:
df_train = pd.read_csv('./train.csv')
df_train.head()

Unnamed: 0,text,sentiment
0,"""Her Cardboard Lover"" is Norma Shearer's last ...",0
1,I cannot believe that this movie was ever crea...,0
2,"After ""Beau travail"", everybody was waiting fo...",1
3,Daniel Day-Lewis is the most versatile actor a...,1
4,I had started to lose my faith in films of rec...,1


In [11]:
df_train.columns.tolist()

['text', 'sentiment']

In [12]:
df_train.isnull().sum()

text         0
sentiment    0
dtype: int64

In [13]:
df_train.shape

(25000, 2)

In [14]:
df_train['sentiment'].value_counts()

sentiment
0    12500
1    12500
Name: count, dtype: int64

In [15]:
X_train = df_train['text']
y_train = df_train['sentiment']

# test

In [17]:
df_test = pd.read_csv('./test.csv')
df_test.head()

Unnamed: 0,text,sentiment
0,this movie takes the voice of terror and makes...,1
1,This is the most frightening film ever made in...,0
2,From the beginning of the film I found myself ...,1
3,"Rachel and Chuck Yoman (Valerie Harper, Gerald...",1
4,This is hands down the most annoying and frust...,0


In [18]:
df_test.shape

(25000, 2)

In [19]:
df_test['sentiment'].value_counts()

sentiment
1    12500
0    12500
Name: count, dtype: int64

In [20]:
X_test = df_test['text']
y_test = df_test['sentiment']

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [22]:
text_clf = Pipeline([('tfid',TfidfVectorizer()),('clf',LinearSVC())])

In [23]:
%%time
text_clf.fit(X_train,y_train)

CPU times: user 4.97 s, sys: 193 ms, total: 5.16 s
Wall time: 6.11 s


In [24]:
predictions = text_clf.predict(X_test)

In [25]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [26]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test,predictions))

[[11104  1396]
 [ 1674 10826]]
              precision    recall  f1-score   support

           0       0.87      0.89      0.88     12500
           1       0.89      0.87      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

0.8772


# support vector classifier

In [28]:
%%time
from sklearn.svm import SVC
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',SVC())])
text_clf.fit(X_train,y_train)

CPU times: user 23min 57s, sys: 12.5 s, total: 24min 9s
Wall time: 25min 27s


In [29]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test,predictions))

[[11104  1396]
 [ 1674 10826]]
              precision    recall  f1-score   support

           0       0.87      0.89      0.88     12500
           1       0.89      0.87      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

0.8772


# logistic regression

In [31]:
%%time
from sklearn.linear_model import LogisticRegression 
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LogisticRegression())])
text_clf.fit(X_train,y_train)
predictions = text_clf.predict(X_test)

CPU times: user 14.4 s, sys: 523 ms, total: 14.9 s
Wall time: 27.7 s


In [32]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test,predictions))

[[11055  1445]
 [ 1482 11018]]
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

0.88292


# random forest

In [34]:
%%time
from sklearn.ensemble import RandomForestClassifier
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',RandomForestClassifier())])
text_clf.fit(X_train,y_train)

CPU times: user 1min 38s, sys: 1.79 s, total: 1min 40s
Wall time: 2min 21s


In [35]:
predictions = text_clf.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test,predictions))

[[10621  1879]
 [ 2240 10260]]
              precision    recall  f1-score   support

           0       0.83      0.85      0.84     12500
           1       0.85      0.82      0.83     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

0.83524


# xgboost

In [37]:
%%time
from xgboost import XGBClassifier
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',XGBClassifier())])
text_clf.fit(X_train,y_train)

CPU times: user 4min 28s, sys: 12 s, total: 4min 40s
Wall time: 2min 6s


In [38]:
predictions = text_clf.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test,predictions))

[[10465  2035]
 [ 1558 10942]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.85     12500
           1       0.84      0.88      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

0.85628
