# Sentiment Classification Movie Review

The goal is to predict the sentiment (positive or negative) of a given movie review. 

Naive Bayes

SVM

Logistic Regression

Decision Tree

Neural Networks

Random Forest

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
data = pd.read_csv('labeledTrainData.tsv', sep='\t',usecols=['review','sentiment'])

In [None]:
data

In [None]:
sns.barplot(data['sentiment'].value_counts())

## Text Preprocessing

#### Removing Stop Words

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:


def remove_stopwords(text):
    x = []
    for i in text.split():
        if i not in stopwords.words('english'):
            x.append(i)
    return x


In [None]:
data['cleaned_review'] = data['review'].apply(remove_stopwords)

#### Lemmatization or Stemming

In [None]:
import nltk

# Specify the data path where NLTK should look for resources
nltk.data.path.append("/path/to/nltk_data")

# Download the "wordnet" resource
nltk.download('wordnet')

In [None]:
from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in text.split()]
    return ' '.join(lemmatized_text)


In [None]:
data['cleaned_review'] = data['review'].apply(lemmatize_text)

#### Removing special characters

In [None]:
import re

def remove_special_characters(text):
    # Remove special characters and punctuation
    clean_text = re.sub('[^A-Za-z\s]', '', text)
    return clean_text


In [None]:
data['cleaned_review'] = data['review'].apply(remove_special_characters)

#### Converting to lower case

In [None]:
def convert_to_lowercase(text):
    # Convert text to lowercase
    lowercase_text = text.lower()
    return lowercase_text


In [None]:
data['cleaned_review'] = data['review'].apply(convert_to_lowercase)

#### Removing Numerical digits

In [None]:
def remove_numeric(text):
    # Remove numeric values
    clean_text = re.sub(r'\d+', '', text)
    return clean_text


In [None]:
data['cleaned_review'] = data['review'].apply(remove_numeric)

#### Removing symbols

In [None]:
import re

def remove_symbols(text):
    # Remove symbols using regular expression
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    return cleaned_text

In [None]:
data['cleaned_review'] = data['review'].apply(remove_symbols)

#### Text Processing Completed

In [None]:
data['cleaned_review'].sample(1)

Exporting cleaned data set to csv file and importing and splitting the data set 

data.to_csv('cleaned.csv',index=False)

In [2]:
df = pd.read_csv('cleaned.csv',usecols=['cleaned_review','sentiment'])

In [3]:
import re

def removing_specific_symbols(text):
    # Remove symbols using regular expression
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    return cleaned_text

df['review'] = df['cleaned_review'].apply(removing_specific_symbols)

In [4]:
df = df[['review','sentiment']]

#### Splitting Data set

In [5]:
X = df[['review']]
y = df['sentiment']

In [6]:
from sklearn.model_selection import train_test_split

# Split the data into training (80%) and temporary (20%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temporary data into training (80%) and validation (20%)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

In [7]:
print(X_train.shape,y_train.shape,X_valid.shape,y_valid.shape,X_test.shape,y_test.shape)

(15000, 1) (15000,) (5000, 1) (5000,) (5000, 1) (5000,)


#### Vectorizing Features

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1000)

In [9]:
X_train = cv.fit_transform(X_train['review']).toarray()

In [10]:
X_valid = cv.transform(X_valid['review']).toarray()

In [11]:
X_test = cv.transform(X_test['review']).toarray()

In [12]:
X_train.shape

(15000, 1000)

In [13]:
print(X_train.shape,y_train.shape,X_valid.shape,y_valid.shape,X_test.shape,y_test.shape)

(15000, 1000) (15000,) (5000, 1000) (5000,) (5000, 1000) (5000,)


In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [16]:
def train_model(models, X_train, y_train,X_valid,y_valid):
    for model in models:
        model.fit(X_train,y_train)
        
        # Evaluate the model on the validation set
        accuracy = model.score(X_valid, y_valid)
        
        print(f'Accuracy % of {model.__class__.__name__}: {accuracy}')
        print('----------------------------------------------')


# Define the list of models
models = [GaussianNB(), MultinomialNB(), BernoulliNB()]

# Train and evaluate the models
train_model(models, X_train, y_train,X_valid,y_valid)

Accuracy % of GaussianNB: 0.792
----------------------------------------------
Accuracy % of MultinomialNB: 0.82
----------------------------------------------
Accuracy % of BernoulliNB: 0.8262
----------------------------------------------


In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}
bernoulli_nb = BernoulliNB()
grid_search = GridSearchCV(bernoulli_nb, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best Parameters: ", grid_search.best_params_)
print("Best Cross-Validation Score: {:.2f}".format(grid_search.best_score_))

Best Parameters:  {'alpha': 0.5}
Best Cross-Validation Score: 0.82


In [19]:
model = BernoulliNB(alpha=0.5)
model.fit(X_train,y_train)

In [20]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print("Test Accuracy: {:.2f}".format(accuracy))


Test Accuracy: 0.83


In [21]:
from sklearn.metrics import recall_score,precision_score,f1_score

prediction = model.predict(X_valid)
print("Precision = ",format(precision_score(y_valid,prediction)))
print("Recall = ",format(recall_score(y_valid,prediction)))
print("F-1 Score = ",format(f1_score(y_valid,prediction)))

Precision =  0.8121442125237192
Recall =  0.8515718265021887
F-1 Score =  0.8313908313908314
