# **Sentiment Analysis on Movie Reviews**

# Step 1.Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df=pd.read_csv("/content/labeledTrainData.tsv",sep='\t')


In [4]:
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
df.shape

(25000, 3)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [7]:
df.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

# Step 2.Data Preprocessing



1.   Removal of punctuations














In [8]:
PUNCH_TO_REMOVE=string.punctuation
def remove_punctuation(text):
  return text.translate(str.maketrans('','',PUNCH_TO_REMOVE))

df['review'] = df['review'].apply(lambda text: remove_punctuation(text))
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,The Classic War of the Worlds by Timothy Hines...
2,7759_3,0,The film starts with a manager Nicholas Bell g...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...





2.   Removal of Stop words



In [9]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords
",".join(stopwords.words("english"))

"i,me,my,myself,we,our,ours,ourselves,you,you're,you've,you'll,you'd,your,yours,yourself,yourselves,he,him,his,himself,she,she's,her,hers,herself,it,it's,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,that'll,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,don't,should,should've,now,d,ll,m,o,re,ve,y,ain,aren,aren't,couldn,couldn't,didn,didn't,doesn,doesn't,hadn,hadn't,hasn,hasn't,haven,haven't,isn,isn't,ma,mightn,mightn't,mustn,mustn't,needn,needn't,shan,shan't,shouldn,shouldn't,wasn,wasn't,weren,weren't,won,won't,wouldn,wouldn't"

In [11]:
STOP_WORDS=set(stopwords.words("english"))
def remove_stopwords(text):
  return " ".join([word for word in str(text).split() if word not in STOP_WORDS])

df['review']=df['review'].apply(lambda text:remove_stopwords(text))
df.head()


Unnamed: 0,id,sentiment,review
0,5814_8,1,With stuff going moment MJ ive started listeni...
1,2381_9,1,The Classic War Worlds Timothy Hines entertain...
2,7759_3,0,The film starts manager Nicholas Bell giving w...
3,3630_4,0,It must assumed praised film greatest filmed o...
4,9495_8,1,Superbly trashy wondrously unpretentious 80s e...





3. Removal of contractions



In [12]:
#contractions
!pip install contractions



In [13]:
import contractions

In [14]:
def remove_contractions(text):
  return contractions.fix(text)


df['review']=df['review'].apply(lambda text:remove_contractions(text))
df.head()


Unnamed: 0,id,sentiment,review
0,5814_8,1,With stuff going moment MJ i have started list...
1,2381_9,1,The Classic War Worlds Timothy Hines entertain...
2,7759_3,0,The film starts manager Nicholas Bell giving w...
3,3630_4,0,It must assumed praised film greatest filmed o...
4,9495_8,1,Superbly trashy wondrously unpretentious 80s e...






4.   Lemmatization






In [15]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
from nltk.stem import WordNetLemmatizer

lemmatizer=WordNetLemmatizer()
def lemmatize_words(text):
  return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df['review']=df['review'].apply(lambda text:lemmatize_words(text))
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With stuff going moment MJ i have started list...
1,2381_9,1,The Classic War Worlds Timothy Hines entertain...
2,7759_3,0,The film start manager Nicholas Bell giving we...
3,3630_4,0,It must assumed praised film greatest filmed o...
4,9495_8,1,Superbly trashy wondrously unpretentious 80 ex...


## Step 3.Feature Extraction

In [17]:
#Feature Extraction - Bag of Words (BoW)
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

## Step 4: Model Building and Evaluation

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Spliting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Building and training the Logistic Regression model
from sklearn.linear_model import LogisticRegression

In [22]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [23]:
# Predict sentiment on the testing set
y_pred = model.predict(X_test)

In [24]:
print(y_pred)

[0 0 0 ... 0 0 0]


In [25]:
# Evaluating the model's performance
from sklearn.metrics import accuracy_score, classification_report

In [26]:
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [27]:
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Accuracy: 0.8582
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.86      2481
           1       0.85      0.87      0.86      2519

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



In [28]:
# Building and training the MultinomialNB model
from sklearn.naive_bayes import MultinomialNB
naivebayes = MultinomialNB()
naivebayes.fit(X_train, y_train)

In [29]:
# Evaluating the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [30]:
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Accuracy: 0.8582
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.86      2481
           1       0.85      0.87      0.86      2519

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



Both models are giving same accuracy
