In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("movies_sentiment_data.csv")
df.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [13]:
df.shape

(19000, 3)

In [14]:
df.sentiment.value_counts()

positive    9500
negative    9500
Name: sentiment, dtype: int64

In [15]:
df['positive_sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df.head()

Unnamed: 0,review,sentiment,positive_sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1
1,I enjoyed the movie and the story immensely! I...,positive,1
2,I had a hard time sitting through this. Every ...,negative,0
3,It's hard to imagine that anyone could find th...,negative,0
4,This is one military drama I like a lot! Tom B...,positive,1


## Train Test Split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df.review, df.positive_sentiment, test_size=0.2)

In [17]:
X_train.shape

(15200,)

In [18]:
X_test.shape

(3800,)

# Exrecise 1

## Training model using sklearn pipeline using Random Forest

In [32]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=50, criterion='entropy'))
])

In [33]:
clf.fit(X_train, y_train)

In [34]:
y_pred = clf.predict(X_test)

In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      1859
           1       0.84      0.82      0.83      1941

    accuracy                           0.83      3800
   macro avg       0.83      0.83      0.83      3800
weighted avg       0.83      0.83      0.83      3800



# Exercise 2

## Training model using sklearn pipeline using KNN

In [36]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=10, metric='euclidean'))
])

In [37]:
clf.fit(X_train, y_train)

In [39]:
y_pred = clf.predict(X_test)

In [40]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.64      0.64      1859
           1       0.65      0.64      0.65      1941

    accuracy                           0.64      3800
   macro avg       0.64      0.64      0.64      3800
weighted avg       0.64      0.64      0.64      3800



# Exercise 3

## Training model using sklearn pipeline using Multinomial Naive Bayes 

In [41]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [42]:
clf.fit(X_train, y_train)

In [43]:
y_pred = clf.predict(X_test)

In [44]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85      1859
           1       0.88      0.80      0.84      1941

    accuracy                           0.84      3800
   macro avg       0.85      0.85      0.84      3800
weighted avg       0.85      0.84      0.84      3800

