# Notebook Set-up

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Raw Data

In [3]:
raw_data = pd.read_csv(r'C:\Users\ibrah\Desktop\Projects\review-sentiment-analysis\data\IMDB Dataset.csv')

In [4]:
raw_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Exploratory analysis

In [5]:
# Get info about the dataset
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
# Check for missing values
raw_data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
# Get basic statistics
raw_data.describe(include='all')

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [8]:
# Check sentiment distribution
raw_data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Cleaning

In [17]:
import logging

# Set up basic logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [18]:
# Function to clean the text data (removing punctiation, lowercase, lemmaisation and removing stopwords)
nltk.download("stopwords")
nltk.download("wordnet")

def preprocess(text):
    text = re.sub(r"[^a-zA-Z]", " ", text).lower()
    logging.info(f"Lowercase done")
    tokens = text.split()
    tokens = [t for t in tokens if t not in stopwords.words("english")]
    logging.info(f"Lowercase done")
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    logging.info(f"lemmarisation done")
    
    return " ".join(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ibrah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ibrah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Apply the cleaning function to the review column
# cleaned_data = raw_data.copy()
# cleaned_data['review'] = cleaned_data['review'].apply(preprocess)
# cleaned_data

2025-05-24 22:38:59,415 - INFO - Lowercase done
2025-05-24 22:38:59,457 - INFO - Lowercase done
2025-05-24 22:38:59,458 - INFO - lemmarisation done
2025-05-24 22:38:59,458 - INFO - Lowercase done
2025-05-24 22:38:59,480 - INFO - Lowercase done
2025-05-24 22:38:59,481 - INFO - lemmarisation done
2025-05-24 22:38:59,481 - INFO - Lowercase done
2025-05-24 22:38:59,503 - INFO - Lowercase done
2025-05-24 22:38:59,504 - INFO - lemmarisation done
2025-05-24 22:38:59,504 - INFO - Lowercase done
2025-05-24 22:38:59,523 - INFO - Lowercase done
2025-05-24 22:38:59,523 - INFO - lemmarisation done
2025-05-24 22:38:59,524 - INFO - Lowercase done
2025-05-24 22:38:59,555 - INFO - Lowercase done
2025-05-24 22:38:59,556 - INFO - lemmarisation done
2025-05-24 22:38:59,556 - INFO - Lowercase done
2025-05-24 22:38:59,573 - INFO - Lowercase done
2025-05-24 22:38:59,574 - INFO - lemmarisation done
2025-05-24 22:38:59,574 - INFO - Lowercase done
2025-05-24 22:38:59,597 - INFO - Lowercase done
2025-05-24 22:38

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake think zombie ...,negative
4,petter mattei love time money visually stunnin...,positive
...,...,...
49995,thought movie right good job creative original...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,going disagree previous comment side maltin on...,negative


In [24]:
cleaned_data = cleaned_data.drop_duplicates()

In [25]:
cleaned_data.to_pickle('cleaned_data.pkl')

In [26]:
cleaned_data = pd.read_pickle('cleaned_data.pkl')
cleaned_data

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake think zombie ...,negative
4,petter mattei love time money visually stunnin...,positive
...,...,...
49995,thought movie right good job creative original...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,going disagree previous comment side maltin on...,negative


# Exploratary analysis - clean version

In [27]:
# Get info about the dataset
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49576 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     49576 non-null  object
 1   sentiment  49576 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [28]:
# Check for missing values
cleaned_data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [29]:
# Get basic statistics
cleaned_data.describe(include='all')

Unnamed: 0,review,sentiment
count,49576,49576
unique,49576,2
top,one expects star trek movie high art fan expec...,positive
freq,1,24880


In [30]:
# Check sentiment distribution
cleaned_data['sentiment'].value_counts()

sentiment
positive    24880
negative    24696
Name: count, dtype: int64