## 1. Import necassary libraries

In [18]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

## 2. Load and Inspect the Dataset

In [7]:
path = r'imdb-50K-movie-reviews'
train_path = os.path.join(path, 'IMDB Dataset.csv')

In [8]:

movies = pd.read_csv(train_path)

In [9]:
movies.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
movies['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [11]:
movies['sentiment'] = movies['sentiment'].map({'positive':1, 'negative':0})

In [12]:
movies['sentiment'].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

## 3. Data Preprocessing

In [13]:
lemmat = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

In [32]:
def preprocessing(text):
    
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    #tokens = word_tokenize(text)
    tokens = text.split()
    tokens = [lemmat.lemmatize(word) for word in tokens if word not in stopwords]
    tokens = ' '.join(tokens)
    return tokens

In [15]:
movies['clean'] = movies['review'].apply(preprocessing)

In [16]:
X, Y = movies['clean'], movies['sentiment']

## 4. Split Data into Training and Testing Sets


In [19]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size= 0.2, random_state= 42)

## 5. Feature Extraction using TF-IDF

In [21]:
vectorizer = TfidfVectorizer(max_features= 5000)


In [22]:
train_vec = vectorizer.fit_transform(xtrain)
test_vec = vectorizer.transform(xtest)

## 6. Define Evaluation Metrics

In [23]:
from sklearn.metrics import accuracy_score, classification_report

In [24]:
def metric(y, ypred):
    acc = accuracy_score(y, ypred)
    report = classification_report(y, ypred)
    return acc, report

## 7. Build and Train Models

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [26]:
naive = MultinomialNB()
naive.fit(train_vec, ytrain)

In [27]:
svm = SVC(kernel= 'linear')
svm.fit(train_vec, ytrain)

## 8. Evaluate Models

In [40]:
def predict_sentiment(text, model):
    #text = preprocessing(text)
    text_vect = vectorizer.transform(text)
    prediction = model.predict(text_vect)
    return prediction

In [46]:
print('naive Bayes performance: ')
acc, rep = metric(ytest, predict_sentiment(xtest, naive))
print(acc, rep)


naive Bayes performance: 
0.8552               precision    recall  f1-score   support

           0       0.86      0.85      0.85      4961
           1       0.85      0.86      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [49]:
print('SVM performance: ')
acc_s, rep_s = metric(ytest, predict_sentiment(xtest, svm))
print(acc_s, rep_s)

SVM performance: 
0.8866               precision    recall  f1-score   support

           0       0.89      0.87      0.88      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

