In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import numpy as np

In [2]:
df = pd.read_csv("imdb62.csv")
df.head()

Unnamed: 0,reviewId,userId,itemId,rating,title,content
0,490485,33913,444584,1.0,Somebody call PETA !,I caught glimpses of this show which feature a...
1,490043,33913,123366,1.0,Just Awful ! Mail Rosie a mussel to shut her b...,"Believe it or not , I watched this show in the..."
2,490772,33913,117509,1.0,It's not for me ! I just hated it !,After only watching a few minutes of this adap...
3,490415,33913,169455,2.0,"Rosie O'Donnell , Barbara Walters , What has t...",I'm sorry but Rosie O'Donnell talking about he...
4,490180,33913,1191056,2.0,I thought the Kardashians were bad !,"Okay , the reality show premise is five housew..."


In [3]:
def remove_unicode(text):
    if isinstance(text, str):
        # Remove non-ASCII characters
        text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

# Step 2: Lowercasing
def to_lowercase(text):
    if isinstance(text, str):
        return text.lower()
    return text

# Apply to both columns
for col in ['title', 'content']:
    df[col] = df[col].apply(remove_unicode)
    df[col] = df[col].apply(to_lowercase)

# DATASET 1: BOTH STOPWORDS AND PUNCTUATION INCLUDED 

In [6]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/parth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/parth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/parth/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /home/parth/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
lemmatizer = WordNetLemmatizer()

# Helper function to map POS tags to WordNet format
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

# Tokenisation + Lemmatization
def tokenize_and_lemmatize(text):
    if isinstance(text, str):
        # Step 1: Tokenize
        tokens = word_tokenize(text)
        
        # Step 2: POS tagging for better lemmatization
        pos_tags = nltk.pos_tag(tokens)
        
        # Step 3: Lemmatize using POS tags
        lemmatized = [
            lemmatizer.lemmatize(word, get_wordnet_pos(tag))
            for word, tag in pos_tags
        ]
        
        return lemmatized  # returns a list of tokens
    return text

# Apply to both columns
for col in ['title', 'content']:
    df[col] = df[col].apply(tokenize_and_lemmatize)

In [8]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/parth/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [10]:
df.head(10)

Unnamed: 0,reviewId,userId,itemId,rating,title,content
0,490485,33913,444584,1.0,"[somebody, call, peta, !]","[i, catch, glimpse, of, this, show, which, fea..."
1,490043,33913,123366,1.0,"[just, awful, !, mail, rosie, a, mussel, to, s...","[believe, it, or, not, ,, i, watch, this, show..."
2,490772,33913,117509,1.0,"[it, 's, not, for, me, !, i, just, hat, it, !]","[after, only, watch, a, few, minute, of, this,..."
3,490415,33913,169455,2.0,"[rosie, o'donnell, ,, barbara, walter, ,, what...","[i, 'm, sorry, but, rosie, o'donnell, talk, ab..."
4,490180,33913,1191056,2.0,"[i, think, the, kardashians, be, bad, !]","[okay, ,, the, reality, show, premise, be, fiv..."
5,490230,33913,388795,2.0,"[why, brokeback, lose, the, oscar, ?]","[do, you, really, want, to, know, why, it, do,..."
6,489825,33913,257295,2.0,"[orwell, would, have, be, shock, !]","[i, do, n't, get, big, brother, ., i, read, ge..."
7,490100,33913,1086761,2.0,"[the, guide, to, parent, not, !]","[remember, robert, kardashian, ,, he, be, the,..."
8,490337,33913,157246,2.0,"[just, a, disgrace, !]","[i, try, to, like, will, and, grace, ., believ..."
9,490338,33913,461721,2.0,"[sorry, ,, this, be, n't, it, ,, seth, ?]","[i, love, seth, green, ., his, appearance, on,..."


In [11]:
df.to_csv('imdb62_A.csv', index=False)

# DATASET 2: STOPWORDS REMOVED

In [13]:
df2 = pd.read_csv("imdb62.csv")
df2.head(10)

Unnamed: 0,reviewId,userId,itemId,rating,title,content
0,490485,33913,444584,1.0,Somebody call PETA !,I caught glimpses of this show which feature a...
1,490043,33913,123366,1.0,Just Awful ! Mail Rosie a mussel to shut her b...,"Believe it or not , I watched this show in the..."
2,490772,33913,117509,1.0,It's not for me ! I just hated it !,After only watching a few minutes of this adap...
3,490415,33913,169455,2.0,"Rosie O'Donnell , Barbara Walters , What has t...",I'm sorry but Rosie O'Donnell talking about he...
4,490180,33913,1191056,2.0,I thought the Kardashians were bad !,"Okay , the reality show premise is five housew..."
5,490230,33913,388795,2.0,Why Brokeback Lost the Oscar ?,Do you really want to know why it didn't win t...
6,489825,33913,257295,2.0,Orwell would have been shocked !,I don't get Big Brother . I read George Orwell...
7,490100,33913,1086761,2.0,The Guide to Parenting Not !,"Remember Robert Kardashian , he was the attorn..."
8,490337,33913,157246,2.0,Just a Disgrace !,I tried to like Will and Grace . Believe me I ...
9,490338,33913,461721,2.0,"Sorry , this isn't it , Seth ?",I love Seth Green . His appearances on THat 70...


In [16]:
from nltk.corpus import stopwords

# Initialize
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    else:
        return 'n'

def preprocess(text):
    if isinstance(text, str):
        # Step 1: Remove Unicode strings
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        
        # Step 2: Lowercase
        text = text.lower()
        
        # Step 3: Tokenize
        tokens = word_tokenize(text)
        
        # Step 4: Remove stopwords
        tokens = [t for t in tokens if t not in stop_words]
        
        # Step 5: Lemmatize
        pos_tags = nltk.pos_tag(tokens)
        tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
        
        return ' '.join(tokens)
    return text

df2['title'] = df2['title'].apply(preprocess)
df2['content'] = df2['content'].apply(preprocess)

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/parth/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
df2.head(10)

Unnamed: 0,reviewId,userId,itemId,rating,title,content
0,490485,33913,444584,1.0,somebody call peta !,caught glimpse show feature gay male couple do...
1,490043,33913,123366,1.0,awful ! mail rosie mussel shut big mouth !,"believe , watch show beginning 1996 fan . anym..."
2,490772,33913,117509,1.0,'s ! hat !,watch minute adaptation william shakespeare 's...
3,490415,33913,169455,2.0,"rosie o'donnell , barbara walter , show turn ?",'m sorry rosie o'donnell talk mother 's death ...
4,490180,33913,1191056,2.0,thought kardashians bad !,"okay , reality show premise five housewife new..."
5,490230,33913,388795,2.0,brokeback lose oscar ?,really want know n't win oscar ? well maybe pr...
6,489825,33913,257295,2.0,orwell would shock !,n't get big brother . read george orwell 's no...
7,490100,33913,1086761,2.0,guide parenting !,"remember robert kardashian , attorney best fri..."
8,490337,33913,157246,2.0,disgrace !,try like grace . believe tried . always get wr...
9,490338,33913,461721,2.0,"sorry , n't , seth ?",love seth green . appearance 70 ' show always ...


In [19]:
df2.to_csv('imdb62_B.csv', index=False)

# DATASET 3: PUNTUATION REMOVED

In [20]:
df3 = pd.read_csv("imdb62.csv")

In [21]:
df3.head(10)

Unnamed: 0,reviewId,userId,itemId,rating,title,content
0,490485,33913,444584,1.0,Somebody call PETA !,I caught glimpses of this show which feature a...
1,490043,33913,123366,1.0,Just Awful ! Mail Rosie a mussel to shut her b...,"Believe it or not , I watched this show in the..."
2,490772,33913,117509,1.0,It's not for me ! I just hated it !,After only watching a few minutes of this adap...
3,490415,33913,169455,2.0,"Rosie O'Donnell , Barbara Walters , What has t...",I'm sorry but Rosie O'Donnell talking about he...
4,490180,33913,1191056,2.0,I thought the Kardashians were bad !,"Okay , the reality show premise is five housew..."
5,490230,33913,388795,2.0,Why Brokeback Lost the Oscar ?,Do you really want to know why it didn't win t...
6,489825,33913,257295,2.0,Orwell would have been shocked !,I don't get Big Brother . I read George Orwell...
7,490100,33913,1086761,2.0,The Guide to Parenting Not !,"Remember Robert Kardashian , he was the attorn..."
8,490337,33913,157246,2.0,Just a Disgrace !,I tried to like Will and Grace . Believe me I ...
9,490338,33913,461721,2.0,"Sorry , this isn't it , Seth ?",I love Seth Green . His appearances on THat 70...


In [22]:
import string

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    else:
        return 'n'

def preprocess(text):
    if isinstance(text, str):
        # Step 1: Remove Unicode strings
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        
        # Step 2: Lowercase
        text = text.lower()
        
        # Step 3: Tokenize
        tokens = word_tokenize(text)
        
        # Step 4: Remove punctuation
        tokens = [t for t in tokens if t not in string.punctuation]
        
        # Step 5: Lemmatize
        pos_tags = nltk.pos_tag(tokens)
        tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
        
        return ' '.join(tokens)
    return text

# Apply to both columns
df3['title'] = df3['title'].apply(preprocess)
df3['content'] = df3['content'].apply(preprocess)

In [23]:
df3.head(10)

Unnamed: 0,reviewId,userId,itemId,rating,title,content
0,490485,33913,444584,1.0,somebody call peta,i catch glimpse of this show which feature a g...
1,490043,33913,123366,1.0,just awful mail rosie a mussel to shut her big...,believe it or not i watch this show in the beg...
2,490772,33913,117509,1.0,it 's not for me i just hat it,after only watch a few minute of this adaptati...
3,490415,33913,169455,2.0,rosie o'donnell barbara walter what have this ...,i 'm sorry but rosie o'donnell talk about her ...
4,490180,33913,1191056,2.0,i think the kardashians be bad,okay the reality show premise be five housewif...
5,490230,33913,388795,2.0,why brokeback lose the oscar,do you really want to know why it do n't win t...
6,489825,33913,257295,2.0,orwell would have be shock,i do n't get big brother i read george orwell ...
7,490100,33913,1086761,2.0,the guide to parent not,remember robert kardashian he be the attorney ...
8,490337,33913,157246,2.0,just a disgrace,i try to like will and grace believe me i try ...
9,490338,33913,461721,2.0,sorry this be n't it seth,i love seth green his appearance on that 70 sh...


In [24]:
df3.to_csv('imdb62_C.csv', index=False)

# DATASET 4: NO STOPWORDS AND PUNCTUATION

In [25]:
df4 = pd.read_csv("imdb62.csv")

In [26]:
df4.head(10)

Unnamed: 0,reviewId,userId,itemId,rating,title,content
0,490485,33913,444584,1.0,Somebody call PETA !,I caught glimpses of this show which feature a...
1,490043,33913,123366,1.0,Just Awful ! Mail Rosie a mussel to shut her b...,"Believe it or not , I watched this show in the..."
2,490772,33913,117509,1.0,It's not for me ! I just hated it !,After only watching a few minutes of this adap...
3,490415,33913,169455,2.0,"Rosie O'Donnell , Barbara Walters , What has t...",I'm sorry but Rosie O'Donnell talking about he...
4,490180,33913,1191056,2.0,I thought the Kardashians were bad !,"Okay , the reality show premise is five housew..."
5,490230,33913,388795,2.0,Why Brokeback Lost the Oscar ?,Do you really want to know why it didn't win t...
6,489825,33913,257295,2.0,Orwell would have been shocked !,I don't get Big Brother . I read George Orwell...
7,490100,33913,1086761,2.0,The Guide to Parenting Not !,"Remember Robert Kardashian , he was the attorn..."
8,490337,33913,157246,2.0,Just a Disgrace !,I tried to like Will and Grace . Believe me I ...
9,490338,33913,461721,2.0,"Sorry , this isn't it , Seth ?",I love Seth Green . His appearances on THat 70...


In [27]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    else:
        return 'n'

def preprocess(text):
    if isinstance(text, str):
        # Step 1: Remove Unicode strings
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        
        # Step 2: Lowercase
        text = text.lower()
        
        # Step 3: Tokenize
        tokens = word_tokenize(text)
        
        # Step 4: Remove punctuation and stopwords
        tokens = [t for t in tokens if t not in string.punctuation and t not in stop_words]
        
        # Step 5: Lemmatize
        pos_tags = nltk.pos_tag(tokens)
        tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
        
        return ' '.join(tokens)
    return text

# Apply to both columns
df4['title'] = df4['title'].apply(preprocess)
df4['content'] = df4['content'].apply(preprocess)

In [28]:
df4.head(10)

Unnamed: 0,reviewId,userId,itemId,rating,title,content
0,490485,33913,444584,1.0,somebody call peta,caught glimpse show feature gay male couple do...
1,490043,33913,123366,1.0,awful mail rosie mussel shut big mouth,believe watch show beginning 1996 fan anymore ...
2,490772,33913,117509,1.0,'s hat,watch minute adaptation william shakespeare 's...
3,490415,33913,169455,2.0,rosie o'donnell barbara walter show turn,'m sorry rosie o'donnell talk mother 's death ...
4,490180,33913,1191056,2.0,thought kardashians bad,okay reality show premise five housewife new y...
5,490230,33913,388795,2.0,brokeback lose oscar,really want know n't win oscar well maybe prio...
6,489825,33913,257295,2.0,orwell would shock,n't get big brother read george orwell 's nove...
7,490100,33913,1086761,2.0,guide parenting,remember robert kardashian attorney best frien...
8,490337,33913,157246,2.0,disgrace,try like grace believe try always get wrong im...
9,490338,33913,461721,2.0,sorry n't seth,love seth green appearance 70 show always wort...


In [29]:
df4.to_csv('imdb62_D.csv', index=False)