In [8]:
#Import pandas and numpy

import pandas as pd
import numpy as np


In [4]:
# Load the tab-separated movie-review file into a DataFrame.

reviews_path = '../TextFiles/moviereviews.tsv'
df = pd.read_csv(reviews_path, sep='\t')

In [5]:
# Peek at the first ten rows.
df.iloc[:10]

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
5,neg,"to put it bluntly , ed wood would have been pr..."
6,neg,"synopsis : melissa , a mentally-disturbed woma..."
7,neg,tim robbins and martin lawernce team up in thi...
8,neg,"in "" gia "" , angelina jolie plays the titular ..."
9,neg,"in 1990 , the surprise success an unheralded l..."


In [6]:
# Count the reviews before looking for data-quality problems.

df.shape[0]

2000

In [13]:
# Missing values per column.
df.isna().sum()

label      0
review    35
dtype: int64

In [14]:
# 35 reviews are null; remove every row that has a missing value.

df.dropna(axis=0, how='any', inplace=True)



In [15]:
# Confirm that no missing values remain.
df.isna().sum()

label     0
review    0
dtype: int64

In [18]:
# Nulls are gone; next find the reviews that contain no actual text.
# A review counts as blank when nothing is left after stripping whitespace,
# so empty strings are flagged as well (str.isspace() is False for '').
# The check is vectorised over the 'review' column, so it does not depend on
# how many columns the DataFrame has.

blank_mask = df['review'].str.strip() == ''

# Index labels (not positions) of the blank reviews, as a plain list.
blanks = df.index[blank_mask].tolist()


In [20]:
# Show the index labels of the reviews that were flagged as whitespace-only.

blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [21]:
# Remove the whitespace-only reviews collected in `blanks`.
# errors='ignore' keeps the cell safe to re-run: once the rows are gone,
# dropping the same labels again would otherwise raise a KeyError.

df.drop(index=blanks, inplace=True, errors='ignore')

In [22]:
# Row count of the dataset after the cleaning steps.

df.shape[0]

1938

In [23]:
# Hold out 30% of the reviews for testing; the rest is used for training.

from sklearn.model_selection import train_test_split

# Features are the raw review texts, targets are the labels.
X, y = df['review'], df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [24]:
# Chain TF-IDF vectorisation and a linear SVM so both steps are fitted together.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


# random_state pins the classifier's random number generator so that a
# Restart & Run All reproduces the same fitted model.
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC(random_state=42))
                     ])


In [25]:
# Train the whole pipeline on the training split.

text_clf.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [26]:
# Label the held-out reviews with the fitted pipeline.

predictions = text_clf.predict(X=X_test)

In [27]:
# Now check the model's evaluation metrics

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [28]:
# Confusion matrix for the held-out predictions.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[235  47]
 [ 41 259]]


In [29]:
# Per-class precision, recall and F1 for the held-out predictions.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

   micro avg       0.85      0.85      0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582



In [30]:
# Overall accuracy on the held-out reviews.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.8487972508591065


In [31]:
# The result is quite good, given that this model was built from the raw review text alone

In [39]:
# Class balance of the cleaned dataset.
df.loc[:, 'label'].value_counts()

pos    969
neg    969
Name: label, dtype: int64