# MOVIE REVIEWS CLASSIFICATION
- Objective: To develop a sentiment analysis model to classify movies as either positive or negative
- Data: IMDB movie reviews data

### 1. Import required modules

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import f1_score, precision_score, recall_score

### 2. Import raw data and define X, y variables

In [2]:
raw_data = pd.read_csv('Data/imdb_reviews_combined.csv')
raw_data.head()

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,positive
1,Homelessness (or Houselessness as George Carli...,positive
2,Brilliant over-acting by Lesley Ann Warren. Be...,positive
3,This is easily the most underrated film inn th...,positive
4,This is not the typical Mel Brooks film. It wa...,positive


In [3]:
raw_data['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [4]:
raw_data.shape

(50000, 2)

In [5]:
X = raw_data['review']
y = raw_data['sentiment']

### 3. Data pre-processing
- Encode the target variable
- Vectorize the reviews
- Split data into train and test sets

In [6]:
y = LabelEncoder().fit_transform(y)
X = TfidfVectorizer(strip_accents = 'ascii', stop_words='english').fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### 4. Model fitting and evaluation

In [7]:
lr = LogisticRegression().fit(X_train, y_train)
print(f'Train Score: {lr.score(X_train, y_train):.2%}')
print(f'Test Score: {lr.score(X_test, y_test):.2%}')
print(f'Precision: {precision_score(y_test, lr.predict(X_test)):.2%}')
print(f'Recall: {recall_score(y_test, lr.predict(X_test)):.2%}')
print(f'F1: {f1_score(y_test, lr.predict(X_test)):.2%}')

Train Score: 93.38%
Test Score: 89.29%
Precision: 88.34%
Recall: 90.57%
F1: 89.44%


In [8]:
cnb = ComplementNB().fit(X_train, y_train)
print(f'Train Score:{cnb.score(X_train, y_train):.2%}')
print(f'Test Score:{cnb.score(X_test, y_test):.2%}')
print(f'Precision: {precision_score(y_test, cnb.predict(X_test)):.2%}')
print(f'Recall: {recall_score(y_test, cnb.predict(X_test)):.2%}')
print(f'F1: {f1_score(y_test, cnb.predict(X_test)):.2%}')

Train Score:91.07%
Test Score:86.03%
Precision: 87.41%
Recall: 84.24%
F1: 85.80%
