In [4]:
import pandas as pd

# Load the file from your current folder
df = pd.read_csv("IMDB Dataset.csv")

# Display the first few rows
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
import pandas as pd
import string
import re
import emoji
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from autocorrect import Speller
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load sample data (first 5 rows)
sample = df.head(5).copy()

# Tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
spell = Speller(lang='en')

# Chat words (expandable)
chat_words_dict = {
    "u": "you", "ur": "your", "r": "are", "luv": "love", "omg": "oh my god", "idk": "i don't know"
}


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
#Lowercasing
sample['step1_lower'] = sample['review'].apply(lambda x: x.lower())
sample[['review', 'step1_lower']]


Unnamed: 0,review,step1_lower
0,One of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,a wonderful little production. <br /><br />the...
2,I thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...","petter mattei's ""love in the time of money"" is..."


In [10]:
#Removal HTML Tags
sample['step2_no_html'] = sample['step1_lower'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
sample[['step1_lower', 'step2_no_html']]


Unnamed: 0,step1_lower,step2_no_html
0,one of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...
1,a wonderful little production. <br /><br />the...,a wonderful little production. the filming tec...
2,i thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is...","petter mattei's ""love in the time of money"" is..."


In [14]:
#Remove Punctuation
sample['step3_no_punct'] = sample['step2_no_html'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
sample[['step2_no_html', 'step3_no_punct']]


Unnamed: 0,step2_no_html,step3_no_punct
0,one of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...
1,a wonderful little production. the filming tec...,a wonderful little production the filming tech...
2,i thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...,basically theres a family where a little boy j...
4,"petter mattei's ""love in the time of money"" is...",petter matteis love in the time of money is a ...


In [16]:
#Handle Chat Words
def replace_chat_words(text):
    words = word_tokenize(text)
    return " ".join([chat_words_dict.get(w, w) for w in words])

sample['step4_chat_fixed'] = sample['step3_no_punct'].apply(replace_chat_words)
sample[['step3_no_punct', 'step4_chat_fixed']]


Unnamed: 0,step3_no_punct,step4_chat_fixed
0,one of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...
1,a wonderful little production the filming tech...,a wonderful little production the filming tech...
2,i thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...
3,basically theres a family where a little boy j...,basically theres a family where a little boy j...
4,petter matteis love in the time of money is a ...,petter matteis love in the time of money is a ...


In [18]:
#Spelling Correction
def correct_spelling(text):
    words = word_tokenize(text)
    return " ".join([spell(w) for w in words])

# Only correct 1 row for now
print("Before:", sample['step4_chat_fixed'][0])
print("After :", correct_spelling(sample['step4_chat_fixed'][0]))


Before: one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with methe first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordit is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayi would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty

In [20]:
#Remove Stopwords
def remove_stopwords(text):
    words = word_tokenize(text)
    return " ".join([w for w in words if w not in stop_words])

sample['step6_no_stopwords'] = sample['step4_chat_fixed'].apply(remove_stopwords)
sample[['step4_chat_fixed', 'step6_no_stopwords']]


Unnamed: 0,step4_chat_fixed,step6_no_stopwords
0,one of the other reviewers has mentioned that ...,one reviewers mentioned watching 1 oz episode ...
1,a wonderful little production the filming tech...,wonderful little production filming technique ...
2,i thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,basically theres a family where a little boy j...,basically theres family little boy jake thinks...
4,petter matteis love in the time of money is a ...,petter matteis love time money visually stunni...


In [22]:
#Remove Emojis
sample['step7_no_emoji'] = sample['step6_no_stopwords'].apply(lambda x: emoji.replace_emoji(x, replace=''))
sample[['step6_no_stopwords', 'step7_no_emoji']]


Unnamed: 0,step6_no_stopwords,step7_no_emoji
0,one reviewers mentioned watching 1 oz episode ...,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production filming technique ...,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...,petter matteis love time money visually stunni...


In [24]:
#Tokenization
sample['step8_tokens'] = sample['step7_no_emoji'].apply(word_tokenize)
sample[['step7_no_emoji', 'step8_tokens']]


Unnamed: 0,step7_no_emoji,step8_tokens
0,one reviewers mentioned watching 1 oz episode ...,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn..."
2,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su..."
3,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,..."
4,petter matteis love time money visually stunni...,"[petter, matteis, love, time, money, visually,..."


In [26]:
#Stemming
def stem_words(tokens):
    return [stemmer.stem(w) for w in tokens]

sample['step9_stemmed'] = sample['step8_tokens'].apply(stem_words)
sample[['step8_tokens', 'step9_stemmed']]


Unnamed: 0,step8_tokens,step9_stemmed
0,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, review, mention, watch, 1, oz, episod, y..."
1,"[wonderful, little, production, filming, techn...","[wonder, littl, product, film, techniqu, unass..."
2,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
3,"[basically, theres, family, little, boy, jake,...","[basic, there, famili, littl, boy, jake, think..."
4,"[petter, matteis, love, time, money, visually,...","[petter, mattei, love, time, money, visual, st..."


In [28]:
#Lemmatization
def lemmatize_words(tokens):
    return [lemmatizer.lemmatize(w) for w in tokens]

sample['step10_lemmatized'] = sample['step9_stemmed'].apply(lemmatize_words)
sample[['step9_stemmed', 'step10_lemmatized']]


Unnamed: 0,step9_stemmed,step10_lemmatized
0,"[one, review, mention, watch, 1, oz, episod, y...","[one, review, mention, watch, 1, oz, episod, y..."
1,"[wonder, littl, product, film, techniqu, unass...","[wonder, littl, product, film, techniqu, unass..."
2,"[thought, wonder, way, spend, time, hot, summe...","[thought, wonder, way, spend, time, hot, summe..."
3,"[basic, there, famili, littl, boy, jake, think...","[basic, there, famili, littl, boy, jake, think..."
4,"[petter, mattei, love, time, money, visual, st...","[petter, mattei, love, time, money, visual, st..."
