In [27]:
# IMPORT LIBRARIES
import numpy as np
import pandas as pd

In [28]:
# LOAD THE TAB-SEPARATED MOVIE REVIEWS FILE INTO A DATAFRAME
df = pd.read_csv(
    '../../Datasets/moviereviews2.tsv',
    sep='\t',
)

In [29]:
# EXPLORE FEATURES OF THE DATASET
# preview the first rows: each row holds a label and the review text
df.head()

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


In [30]:
# preview the last rows of the dataset
df.tail()

Unnamed: 0,label,review
5995,pos,"Of the three remakes of this plot, I like them..."
5996,neg,Poor Whoopi Goldberg. Imagine her at a friend'...
5997,neg,"Honestly before I watched this movie, I had he..."
5998,pos,This movie is essentially shot on a hand held ...
5999,pos,It has singing. It has drama. It has comedy. I...


In [31]:
# number of rows before any cleaning
len(df)

6000

In [32]:
# CHECK FOR MISSING VALUES (count of nulls per column)
df.isnull().sum()

label      0
review    20
dtype: int64

In [33]:
# DROP EVERY ROW THAT CONTAINS A MISSING VALUE
df = df.dropna()

In [34]:
# confirm that no missing values remain
df.isnull().sum()

label     0
review    0
dtype: int64

In [35]:
# check for empty or whitespace-only reviews
#
# ''.isspace() is False, so an isspace() test would let truly empty strings
# through; comparing the stripped text with '' flags both cases.
blank_mask = df['review'].str.strip().eq('')

# index labels of the blank reviews, ready to pass to df.drop
blanks = df.index[blank_mask].tolist()

len(blanks)

0

In [36]:
# remove the rows whose index labels were collected in `blanks`
df = df.drop(index=blanks)

In [37]:
# number of rows left after cleaning
len(df)

5980

In [38]:
# class balance: how many reviews carry each label
df['label'].value_counts()

pos    2990
neg    2990
Name: label, dtype: int64

---

In [14]:
# SPLIT THE DATA -- TRAINING vs TEST

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# features are the raw review texts, targets are their labels
X, y = df['review'], df['label']

# hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [41]:
# build a pipeline to vectorize the data, then train and fit the model

# PIPELINE
from sklearn.pipeline import Pipeline
# FEATURE EXTRACTION
from sklearn.feature_extraction.text import TfidfVectorizer
# CLASSIFIER
from sklearn.svm import LinearSVC

In [42]:
# assemble the pipeline: TF-IDF vectorization followed by a linear SVM classifier
pipeline_model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('classifier', LinearSVC()),
    ]
)

In [43]:
# train the model: fit the vectorizer and the classifier on the training reviews
pipeline_model.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('classifier', LinearSVC())])

In [44]:
# predictions: label every held-out review with the fitted pipeline

predictions = pipeline_model.predict(X_test)

In [45]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [46]:
# confusion matrix: rows are the true labels, columns the predicted labels
print(confusion_matrix(y_test,predictions))

[[900  91]
 [ 63 920]]


In [47]:
# per-class precision, recall and f1-score on the test set
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.93      0.91      0.92       991
         pos       0.91      0.94      0.92       983

    accuracy                           0.92      1974
   macro avg       0.92      0.92      0.92      1974
weighted avg       0.92      0.92      0.92      1974



In [48]:
# overall test accuracy, rounded to two decimals
print(f"{accuracy_score(y_test, predictions):.2f}")

0.92


In [25]:
# sanity check: classify a hand-written negative review
pipeline_model.predict(['This movie sucks!'])

array(['neg'], dtype=object)

In [26]:
# sanity check: classify a hand-written positive review
pipeline_model.predict(['I really enjoyed this movie'])

array(['pos'], dtype=object)