In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the labelled movie-review training set from the local dataset folder.
reviews_csv = 'dataset/MovieReviewTrainingDatabase.csv'
data = pd.read_csv(reviews_csv)

In [3]:
# Peek at the first five rows to see which columns the CSV provides.
data.head(5)

Unnamed: 0,sentiment,review
0,Positive,With all this stuff going down at the moment w...
1,Positive,'The Classic War of the Worlds' by Timothy Hin...
2,Negative,The film starts with a manager (Nicholas Bell)...
3,Negative,It must be assumed that those who praised this...
4,Positive,Superbly trashy and wondrously unpretentious 8...


The data has two columns of interest (plus an unnamed index column carried over from the CSV):
* **sentiment** : one of two labels [Positive, Negative] representing the sentiment expressed by the review.
* **review** : the text of the review itself.

We shall proceed by preparing the data to be fed to our model. In other words, we have to turn this string data into numerical data. The class column is simple: we can map Positive to 1 and Negative to 0.

In [4]:
# List the distinct label values before encoding them.
data['sentiment'].unique()

array(['Positive', 'Negative'], dtype=object)

In [6]:
# Encode the sentiment labels as integers: Positive -> 1, Negative -> 0.
# Values that are already 0/1 pass through the lookup unchanged, so re-running
# this cell is harmless (the previous `1 if x == 'Positive' else 0` turned
# every label into 0 on a second run).
# Any label outside the mapping is left as-is and makes the integer cast fail
# loudly instead of being silently counted as negative.
sentiment_codes = {'Positive': 1, 'Negative': 0}
data['sentiment'] = (
    data['sentiment']
    .map(lambda label: sentiment_codes.get(label, label))
    .astype('int64')
)

In [8]:
# Count rows that exactly repeat an earlier row (all columns compared).
int(data.duplicated().sum())

96

In [9]:
# Drop the repeated rows in place, keeping the first occurrence of each.
# The index is not reset, so it has gaps afterwards.
data.drop_duplicates(keep='first', inplace=True)

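For the review column we should do further preprocessing:
* Remove stop words
* Remove punctuation
* Lowercase words
* Encode the text into vectors

Note: the `preprocess` helper defined below covers the first three steps, but the encoding cell further down passes the raw `review` text to the sentence encoder rather than the preprocessed text.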

In [88]:
import re 
from stop_words import get_stop_words
from string import punctuation
# English stop-word list used by `preprocess` to filter tokens.
stops = get_stop_words('en')
def preprocess(text, stop_words=None):
    """Lowercase ``text``, strip punctuation and drop stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``stops`` list.

    Returns
    -------
    str
        The remaining lowercase tokens joined by single spaces. Empty input
        (or input made only of stop words / punctuation) yields ``""``.
    """
    if stop_words is None:
        stop_words = stops

    strip_punctuation = str.maketrans('', '', punctuation)

    # The text loses its punctuation before filtering, so the stop words must
    # lose theirs as well; otherwise entries containing an apostrophe
    # (e.g. "don't") could never match the stripped token "dont".
    # A set also makes each membership test constant-time.
    blocked = {entry.lower().translate(strip_punctuation) for entry in stop_words}

    # split() without an argument breaks on any whitespace run, so newlines,
    # tabs and the gaps left by removed punctuation do not produce empty or
    # multi-line tokens.
    tokens = text.translate(strip_punctuation).lower().split()
    return " ".join(token for token in tokens if token not in blocked)

_________

In [21]:
from sklearn.svm import SVC

In [68]:
from sklearn.metrics import accuracy_score, classification_report

In [38]:
from sentence_transformers import SentenceTransformer
# Pretrained sentence encoder used to turn each review into a vector.
encoder_name = 'paraphrase-distilroberta-base-v1'
model = SentenceTransformer(encoder_name)

In [44]:
from tqdm.notebook import tqdm
tqdm.pandas()  # still registers `progress_apply` for any cell that relies on it

# Encode every review with a single call instead of one `encode` call per row:
# passing the whole list lets the library batch the texts internally, which is
# far faster than row-by-row encoding. Results should match per-review
# encoding up to floating-point noise.
review_embeddings = model.encode(data['review'].tolist(), show_progress_bar=True)
assert len(review_embeddings) == len(data), "expected one embedding per review"

# Store one vector per row. A plain list is assigned positionally, so the gaps
# left in the index by the earlier drop_duplicates do not matter.
data['encoding'] = list(review_embeddings)

  0%|          | 0/24904 [00:00<?, ?it/s]

In [46]:
# `train_test_split` was never imported in this notebook, so this cell raised
# NameError on a fresh kernel; import it here next to its only use.
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['encoding'],
    data.sentiment,
    random_state=42,
    test_size=0.25,
)

In [61]:
# Turn the Series of per-review vectors into a 2-D feature matrix.
train_matrix = np.array(X_train.tolist())

# Fit a support-vector classifier with default settings.
# NOTE: `fit` returns the estimator itself, so `model` is rebound here from
# the sentence encoder to the trained classifier.
svm = SVC()
model = svm.fit(train_matrix, y_train)

In [62]:
# Predict on the training reviews (used below to gauge how well the model fits).
X_train_matrix = np.array(X_train.tolist())
pred = model.predict(X_train_matrix)

In [63]:
# Accuracy on the training split.
accuracy_score(y_true=y_train, y_pred=pred)

0.9166934361280651

In [64]:
# Predict on the held-out reviews.
X_test_matrix = np.array(X_test.tolist())
pred_test = model.predict(X_test_matrix)

In [66]:
# Accuracy on the held-out split.
accuracy_score(y_true=y_test, y_pred=pred_test)

0.8355284291680052

In [70]:
# Per-class precision / recall / F1 on the training split.
train_report = classification_report(y_true=y_train, y_pred=pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      9343
           1       0.91      0.92      0.92      9335

    accuracy                           0.92     18678
   macro avg       0.92      0.92      0.92     18678
weighted avg       0.92      0.92      0.92     18678



In [71]:
# Per-class precision / recall / F1 on the held-out split.
test_report = classification_report(y_true=y_test, y_pred=pred_test)
print(test_report)

              precision    recall  f1-score   support

           0       0.84      0.82      0.83      3089
           1       0.83      0.85      0.84      3137

    accuracy                           0.84      6226
   macro avg       0.84      0.84      0.84      6226
weighted avg       0.84      0.84      0.84      6226

