# 🎬 IMDB Movie Review Sentiment Analysis – Preprocessing

In [2]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m 

In [3]:
import kagglehub

# Fetch the latest version of the IMDB 50k movie reviews dataset via kagglehub.
DATASET_HANDLE = "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
path = kagglehub.dataset_download(DATASET_HANDLE)

# Report the local directory; later cells build file paths from `path`.
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [4]:
import re
import string
import contractions

In [5]:
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet

# NLTK resources used by this notebook, downloaded in the order listed:
#   - punkt_tab, stopwords: tokenization and stop-word removal
#   - wordnet, omw-1.4, averaged_perceptron_tagger_eng: lemmatization and POS tagging
for resource in (
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Set (rather than list) so per-token membership checks are cheap.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [7]:
import pandas as pd

In [8]:
# Load the raw reviews CSV from the downloaded dataset directory.
csv_file = path + "/IMDB Dataset.csv"
df = pd.read_csv(csv_file)

In [9]:
def clean_text(text):
    """Normalize a raw review string into lowercase, space-separated words.

    Steps, in order: lowercase, drop HTML tags, drop URLs, drop punctuation,
    drop digits, then trim and collapse whitespace.

    Removed tags, URLs and punctuation are replaced by a space instead of
    being deleted outright, so neighbouring words are not fused together
    (e.g. "great.<br /><br />Loved" -> "great loved", not "greatloved").
    Apostrophes are the one exception: they are deleted, so "don't" becomes
    the single token "dont".

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned text, with single spaces between words and no leading
        or trailing whitespace.
    """
    # 1. Lowercase
    text = text.lower()

    # 2. Remove HTML tags (replace with a space so adjacent words stay separate)
    text = re.sub(r'<.*?>', ' ', text)

    # 3. Remove URLs. The dot after "www" is escaped and anchored at a word
    #    boundary; an unescaped "www." also matches "www " inside words such
    #    as "awww that", which would delete ordinary text.
    text = re.sub(r'https?://\S+|\bwww\.\S+', ' ', text)

    # 4. Remove punctuation: apostrophes are deleted, everything else becomes
    #    a space so "bad...really" and "well-made" do not collapse into one word.
    punctuation_table = str.maketrans(
        {char: (None if char == "'" else ' ') for char in string.punctuation}
    )
    text = text.translate(punctuation_table)

    # 5. Remove numbers
    text = re.sub(r'\d+', '', text)

    # 6. Remove extra whitespaces
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)

    return text

In [10]:
def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

In [11]:
def tokenize_and_remove_stopwords(text):
    """Split `text` into word tokens and drop stop words, keeping negation cues.

    Negation words are kept even when they appear in `stop_words`: the NLTK
    English stop-word list contains words such as "no" and "not", and removing
    them here would leave nothing for a later negation-marking step to act on.

    Args:
        text: The (already cleaned) text to tokenize.

    Returns:
        A list of tokens in their original order, without stop words.
    """
    # Tokens that signal negation and therefore must survive stop-word removal.
    negation_words = {"not", "no", "never", "none", "cannot", "n't"}

    # Tokenize
    words = word_tokenize(text)

    # Remove stop words (but never the negation cues)
    words = [
        word for word in words
        if word in negation_words or word not in stop_words
    ]

    return words

In [12]:
# Single WordNetLemmatizer instance, created once and reused by lemmatize_tokens.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate a POS tag string into the corresponding WordNet POS constant.

    The tag's leading letter picks the category: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos

    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN

def lemmatize_tokens(tokens):
    """Lemmatize each token using the POS tag assigned by `pos_tag`.

    Args:
        tokens: A list of word tokens.

    Returns:
        A list of lemmas, one per input token, in the same order.
    """
    lemmas = []
    for token, tag in pos_tag(tokens):
        # Map the tagger's tag to a WordNet POS before lemmatizing.
        wordnet_pos = get_wordnet_pos(tag)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

In [13]:
def handle_negations(tokens):
    """Fold negation words into the token that follows them.

    A negation word is removed from the output and the next non-negation
    token is emitted with a "NOT_" prefix (["not", "good"] -> ["NOT_good"]).
    Several negation words in a row count as one; a negation word with no
    following token is simply dropped.

    Args:
        tokens: An iterable of word tokens.

    Returns:
        A new list of tokens with negations applied.
    """
    negation_words = {"not", "no", "never", "none", "cannot", "n't"}
    marked = []
    pending_negation = False

    for token in tokens:
        if token in negation_words:
            # Remember the negation; the word itself is not emitted.
            pending_negation = True
        elif pending_negation:
            marked.append("NOT_" + token)
            pending_negation = False
        else:
            marked.append(token)

    return marked

In [14]:
# Run the preprocessing stages in order. Each stage reads one column and writes
# its result to a new column, so every intermediate form stays inspectable.
for target_col, source_col, stage in (
    # Lowercase and strip HTML tags, URLs, punctuation, digits and extra whitespace.
    ('cleaned_review', 'review', clean_text),
    # Expand contractions.
    # NOTE(review): clean_text has already removed apostrophes at this point, so the
    # expander sees "cant" rather than "can't" — confirm these forms are still expanded.
    ('expanded_review', 'cleaned_review', expand_contractions),
    # Split into word tokens and drop stop words.
    ('tokens', 'expanded_review', tokenize_and_remove_stopwords),
    # Reduce each token to its lemma (e.g., "running" → "run").
    ('lemmatized_tokens', 'tokens', lemmatize_tokens),
    # Merge a negation word into the following token (e.g., "not good" → "NOT_good").
    ('final_tokens', 'lemmatized_tokens', handle_negations),
):
    df[target_col] = df[source_col].apply(stage)

# Keep the label as the last column, after all derived feature columns.
if 'sentiment' in df.columns:
    df = df[[col for col in df.columns if col != 'sentiment'] + ['sentiment']]

In [15]:
# Save the processed DataFrame to a CSV file in the working directory, without the row index.
# NOTE(review): the token columns hold Python lists; presumably they are written as their
# string representation, so a consumer would have to parse them back into lists — confirm.
df.to_csv('processed_reviews.csv', index=False)