In [1]:
import pandas as pd
import numpy as np

# Read the tab-separated movie-review file into a DataFrame and preview the
# first ten rows.
reviews_path = '../TextFiles/moviereviews2.tsv'
df = pd.read_csv(reviews_path, sep='\t')
df.head(10)


Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...
5,neg,Lately they have been trying to hock this film...
6,neg,This is without a doubt the worst movie I have...
7,neg,"PLAN B has the appearance of a quickly made, u..."
8,pos,At least something good came out of Damon Runy...
9,pos,The story of Cinderella is one of my favorites...


In [2]:
# Remove rows that have a missing value (NaN) in any column, then show how
# many rows are left.
# The result is rebound to `df` rather than using inplace=True, so the step
# reads as a plain expression; later cells still find the cleaned frame under
# the same name.
df = df.dropna()

len(df)

5980

In [3]:
# Collect the index labels of reviews that consist only of whitespace.
# A vectorized string test replaces the original row loop, which unpacked
# every row into exactly three fields and so would break if the frame ever
# gained or lost a column.
# NOTE: like str.isspace(), this does not flag empty strings ('').
is_blank = df['review'].str.isspace()

blanks = df.index[is_blank].tolist()

In [4]:
blanks # There are no whitespace-only strings (the list is empty)

[]

In [5]:
# Hold out part of the reviews for evaluation.


from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are their labels.
X, y = df['review'], df['label']

# test_size=0.33 reserves 33% of the rows for testing; the fixed random_state
# makes the split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [6]:
# Build a two-step text-classification pipeline and train it.


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


# Step 1 ('tfidf') turns each review into a TF-IDF feature vector;
# step 2 ('clf') is a linear support vector classifier fitted on those vectors.
# NOTE(review): LinearSVC() is created without an explicit random_state --
# confirm whether run-to-run reproducibility matters here.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
model = Pipeline(steps=pipeline_steps)


# Fitting on the raw training texts trains both steps; the fitted pipeline is
# the cell's displayed value.
model.fit(X_train, y_train)





Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [7]:
# Predict a label for every held-out review in X_test using the fitted
# pipeline; the result is compared against y_test in the cells below.


predictions = model.predict(X_test)

In [8]:
# Confusion matrix for the held-out reviews

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Rows correspond to the true labels and columns to the predicted labels.
conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)

[[900  91]
 [ 63 920]]


In [9]:
# Per-class precision, recall and F1 on the held-out reviews

report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

         neg       0.93      0.91      0.92       991
         pos       0.91      0.94      0.92       983

    accuracy                           0.92      1974
   macro avg       0.92      0.92      0.92      1974
weighted avg       0.92      0.92      0.92      1974



In [10]:
# Fraction of held-out reviews whose predicted label matches the true label

accuracy = accuracy_score(y_test, predictions)
print(accuracy)

0.9219858156028369


In [11]:
# Try the fitted pipeline on a few hand-written reviews.
# The sample sentences are kept exactly as written (including their spelling),
# because changing them would change what the model is asked to classify.

my_text = [
    "I didn't enjoy the movie. The photography was aweful",
    "I thought that the movie was simply amazing. Espetacular acting",
    "just so-so",
]

model.predict(my_text)

array(['neg', 'pos', 'neg'], dtype=object)