# Day-1 Understanding the data

In [1]:
# Import Required Libraries
import numpy as np
import pandas as pd

In [2]:
# Load the Dataset
df= pd.read_csv('IMDB Dataset.csv')

In [3]:
# First 5 Rows
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Dataset Shape
df.shape

(50000, 2)

In [5]:
# Column Names
df.columns

Index(['review', 'sentiment'], dtype='object')

In [6]:
# Data Types
df.dtypes

review       object
sentiment    object
dtype: object

In [7]:
# Class Distribution
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [8]:
# Missing Values
df.isnull().sum()
# here ve dont have null values in dataset

review       0
sentiment    0
dtype: int64

In [9]:
df['review'][0] # full review of 0th row

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [10]:
# here we are mapping 1 and 0 for pos and neg.
df['sentiment'] = df['sentiment'].map({'positive': 1,'negative': 0})


In [11]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [12]:
df.info()
# review is object and sentiment is int64 now.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


# Day-2 Text Preprocessing

In [13]:
import re # Regular Expression:- Used for pattern matching and text cleaning,To remove:HTML tags (<br />),Numbers,Special characters.
import nltk # NLTK = Natural Language Toolkit : for text and language processing,What does it provide:-Stopwords,Tokenizers,Lemmatizers,NLP datasets
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# These all are needed to clean data of reviews and remove unwanted words which are not useful for sentiment analysis.

In [14]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4') # Open Multilingual WordNet:-Required dependency

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
# Initialize Stopwords & Lemmatizer
stop_words=set(stopwords.words('english'))
lemmatizer=WordNetLemmatizer()

In [16]:
# Text Cleaning Function
def clean_text(text):
    # convert to lower-case.
    text=text.lower()

    # Remove html tags
    text=re.sub(r'<.*?>','',text)

    # Remove punctuations and letters
    text=re.sub(r'[^a-z\s]','',text)

    # Tokenize(split into words)
    words=text.split()

    # Remove stopwords and lemmatizers from word
    words=[lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return " ".join(words)

In [17]:
df['clean_review']=df['review'].apply(clean_text)

In [18]:
# Compare Raw vs Cleaned Review
df[['review','clean_review']].head()

Unnamed: 0,review,clean_review
0,One of the other reviewers has mentioned that ...,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love time money visually stunni...


In [19]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [20]:
df['clean_review'][0]

'one reviewer mentioned watching oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home manyaryans muslim gangsta latino christian italian irish moreso scuffle death stare dodgy dealing shady agreement never far awayi would say main appeal show due fact go show wouldnt dare forget pretty picture painted mainstream audience forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard wholl sold nickel inmate wholl kill order get away well mannered middle class inmate turned prison bit

In [21]:
df.columns

Index(['review', 'sentiment', 'clean_review'], dtype='object')

# Day-3 (Feature extraction -BagOfWords,Tf-IDF)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [23]:
# Prepare Input Text
X_text=df['clean_review']
y=df['sentiment']

In [24]:
bow_vectorizer=CountVectorizer()

In [25]:
X_bow=bow_vectorizer.fit_transform(X_text)

In [26]:
X_bow.shape

(50000, 203415)

In [27]:
# View Vocabulary (Sample)
bow_vectorizer.get_feature_names_out()[:20]

array(['aa', 'aaa', 'aaaaaaaaaaaahhhhhhhhhhhhhh', 'aaaaaaaargh',
       'aaaaaaah', 'aaaaaaahhhhhhggg', 'aaaaagh', 'aaaaah', 'aaaaargh',
       'aaaaarrrrrrgggggghhhhhh', 'aaaaatchkah', 'aaaaaw', 'aaaahhhhhh',
       'aaaahhhhhhh', 'aaaand', 'aaaarrgh', 'aaaawwwwww', 'aaaggghhhhhhh',
       'aaaghi', 'aaah'], dtype=object)

In [28]:
# Create TF-IDF Vectorizer
tfidf_vectorizer=TfidfVectorizer()

In [29]:
X_tfidf=tfidf_vectorizer.fit_transform(X_text)

In [30]:
X_tfidf.shape

(50000, 203415)

In [31]:
# Understand the Output Type
type(X_tfidf)

scipy.sparse._csr.csr_matrix

In [32]:
# Convert One Review to See Values
X_tfidf[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 140 stored elements and shape (1, 203415)>

In [33]:
# Day 3 converts cleaned text into numerical feature vectors so that machine learning models can learn sentiment patterns from reviews.

# Day-4 (Model Training)

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [35]:
X_train,X_test,y_train,y_test=train_test_split(X_tfidf,y,test_size=0.2,random_state=42)

In [36]:
# Naive Bayes Model
nb_model=MultinomialNB()

In [37]:
nb_model.fit(X_train,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [38]:
# Logistic Regression Model
lr_model=LogisticRegression(max_iter=1000)

In [39]:
lr_model.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [40]:
# Make Predictions
nb_preds=nb_model.predict(X_test)
lr_preds=lr_model.predict(X_test)

In [41]:
# Quick Sanity Check
nb_preds[:10],lr_preds[:10]

(array([1, 1, 0, 1, 0, 1, 1, 0, 0, 0]), array([0, 1, 0, 1, 0, 1, 1, 0, 0, 0]))

# Day-5 Model evaluation and comparision.

In [42]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [43]:
# Accuracy Score
nb_accuracy=accuracy_score(y_test,nb_preds)
print('Navie Bayes Accuracy: ',nb_accuracy)

lr_accuracy=accuracy_score(y_test,lr_preds)
print('Logistic Regression Accuracy: ',lr_accuracy)

Navie Bayes Accuracy:  0.8663
Logistic Regression Accuracy:  0.8943


In [44]:
# Classification Report
print("Naive Bayes Classification Report:\n")
print(classification_report(y_test,nb_preds))

print("Logistic Regression Classification Report:\n")
print(classification_report(y_test,lr_preds))

Naive Bayes Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.88      0.87      4961
           1       0.88      0.85      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Logistic Regression Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.88      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [45]:
# Confusion Matrix
print('Navie Bayes:')
print(confusion_matrix(y_test,nb_preds))

print('\nLogistic Regression:')
print(confusion_matrix(y_test,lr_preds))

Navie Bayes:
[[4371  590]
 [ 747 4292]]

Logistic Regression:
[[4364  597]
 [ 460 4579]]


# Day-6 Sentiment → Recommendation Logic

In [46]:
# Assume We Have Reviews of One Movie
movie_reviews=[
    "The movie was amazing and emotionally powerful",
    "Great acting but the story was slow",
    "Boring movie, I almost slept",
    "Excellent direction and background music",
    "Not good, very disappointing"
]


In [47]:
cleaned_reviews=[clean_text(review) for review in movie_reviews]

In [48]:
# Convert ALL Reviews to TF-IDF
review_vectors=tfidf_vectorizer.transform(cleaned_reviews)

In [49]:
review_vectors.shape

(5, 203415)

In [50]:
# Predict Sentiment for EACH Review
prediction=lr_model.predict(review_vectors)

In [51]:
prediction

array([1, 1, 0, 1, 0])

In [52]:
# Count Positive vs Negative Reviews
positive_count=prediction.sum()
negative_count=len(prediction)-positive_count

In [53]:
# Decide Watch / Not Watch
if positive_count>negative_count:
    recommendation="Recommended to Watch"
else:
    recommendation="Not Recommended"

In [54]:
# Generate IMDB style rating out of 10:-

In [55]:
# Get Positive Probabilities for Each Review.
probabilities=lr_model.predict_proba(review_vectors)

In [56]:
# [
#  [0.08, 0.92],
#  [0.30, 0.70],
#  [0.85, 0.15],
#  [0.10, 0.90],
#  [0.60, 0.40]
# ]
# [negative_probability, positive_probability]

In [57]:
# Extract Positive Sentiment Scores
positive_scores=probabilities[:,1]

In [58]:
avg_positive_scores=positive_scores.mean()

In [59]:
rating=round(avg_positive_scores*10,1)

In [60]:
print("Total Reviews:",len(movie_reviews))
print("Positive Reviews:",positive_count)
print("Negative Reviews:",negative_count)
print("Recommendation:",recommendation)
print("Rating:",rating,)

Total Reviews: 5
Positive Reviews: 3
Negative Reviews: 2
Recommendation: Recommended to Watch
Rating: 5.8


# Exporting .pkl for the model trained so that can be used in the application

In [61]:
import pickle

In [65]:
# Save TF-IDF vectorizer
with open("tfidf_vectorizer.pkl","wb") as f:
    pickle.dump(tfidf_vectorizer,f)

In [63]:
with open("lr_model.pkl","wb") as f:
    pickle.dump(lr_model,f)

In [64]:
print("Model and Vectorizer saved Successfully")

Model and Vectorizer saved Successfully
