In [2]:
import kagglehub

# Download the latest version of the IMDB 50k movie-review dataset through kagglehub
# and keep whatever location dataset_download() returns in `path`.
# NOTE(review): a later cell loads "review.csv" from the working directory instead of a
# file under `path` -- confirm that file was produced from this download.
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\HP\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("review.csv")

In [5]:
df.shape

(50000, 2)

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Select the first 100 rows of the DataFrame

In [7]:
# Take an independent copy of the 100-row sample: head() alone returns a slice of the
# full frame, and the column assignments in later cells (df['review'] = ...) would then
# raise SettingWithCopyWarning.
df = df.head(100).copy()

In [8]:
df.shape

(100, 2)

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

# Lower case conversion

In [11]:
df['review'] = df['review'].str.lower()

# Remove html tags

In [12]:
import re

In [13]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` tag deleted.

    Tags are removed outright (nothing is inserted in their place), so the
    text on either side of a tag such as ``<br />`` ends up adjacent.

    Args:
        text: The string to clean.

    Returns:
        A new string with all tag-like spans removed.
    """
    # `[^>]*` instead of `.*?`: `.` does not match a newline by default, so a tag whose
    # attributes wrap onto the next line would otherwise be left in the text.
    pattern = re.compile(r'<[^>]*>')
    return pattern.sub('', text)

In [14]:
text = "<html><body><h1>This is sample text.<br>This is just for an example</h1></body></html>"

In [15]:
remove_html_tags(text)

'This is sample text.This is just for an example'

In [16]:
df['review'] = df['review'].apply(remove_html_tags)

In [17]:
df['review'][13]

"the cast played shakespeare.shakespeare lost.i appreciate that this is trying to bring shakespeare to the masses, but why ruin something so good.is it because 'the scottish play' is my favorite shakespeare? i do not know. what i do know is that a certain rev bowdler (hence bowdlerization) tried to do something similar in the victorian era.in other words, you cannot improve perfection.i have no more to write but as i have to write at least ten lines of text (and english composition was never my forte i will just have to keep going and say that this movie, as the saying goes, just does not cut it."

# Remove URLs from a string

In [18]:
def remove_url(text):
    """Delete URLs from ``text``.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``; each match is replaced by the
    empty string, so surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [19]:
text_url = 'This is my youtube page https://www.youtube.com/@hnasr/search'

In [20]:
remove_url(text_url)

'This is my youtube page '

# Remove punctuation

In [21]:
import string,time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [22]:
exclude = string.punctuation
exclude


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [23]:
def remove_punc_loop(text):
    """Return ``text`` without any character that appears in ``exclude``.

    Pure-Python, character-by-character variant of punctuation removal;
    ``exclude`` is the module-level string of characters to drop.
    """
    # Keeping only the characters not listed in `exclude` yields the same string as
    # deleting each listed character in turn.
    return ''.join(ch for ch in text if ch not in exclude)

In [24]:
text_punc = 'string. with | punctuation%'

In [25]:
def remove_punc(series):
    """Strip the characters in ``exclude`` from every string in ``series``.

    Builds a deletion-only translation table and applies it through the
    ``.str.translate`` accessor, returning the translated result.
    """
    deletion_table = str.maketrans('', '', exclude)
    return series.str.translate(deletion_table)

In [26]:
# Strip URLs *before* punctuation. Punctuation removal deletes ':', '/' and '.', after
# which the URL pattern (which needs '://' or 'www.') can no longer recognise a link,
# and the leftover fragments such as "httpswwwyoutubecom" would stay in the text.
df['review'] = df['review'].apply(remove_url)
df['review'] = remove_punc(df['review'])

In [27]:
df['review']

0     one of the other reviewers has mentioned that ...
1     a wonderful little production the filming tech...
2     i thought this was a wonderful way to spend ti...
3     basically theres a family where a little boy j...
4     petter matteis love in the time of money is a ...
                            ...                        
95    daniel daylewis is the most versatile actor al...
96    my guess would be this was originally going to...
97    well i like to watch bad horror bmovies cause ...
98    this is the worst movie i have ever seen as we...
99    i have been a mario fan for as long as i can r...
Name: review, Length: 100, dtype: object

# Remove url

In [28]:
# Replace every http(s):// or www. URL in the review column with the empty string.
# NOTE(review): an earlier cell already stripped string.punctuation (which contains ':',
# '/' and '.') from df['review'], so by this point neither 'https?://' nor 'www\.' can
# match anything. URL removal only has an effect when it runs before punctuation removal.
df['review'] = df['review'].str.replace(r'https?://\S+|www\.\S+', '', regex=True)

# Handling chat words (expanding abbreviations)

In [29]:
chat_words = {"ASAP":"as soon as possible","AFK":"away from keyboard"}

In [30]:
sample_text = 'Those who are AFK please get back to your workstation ASAP'

In [31]:
def chat_conversion(text):
    """Expand chat abbreviations found in ``text``.

    The text is split on whitespace; each token whose upper-cased form is a
    key of the module-level ``chat_words`` mapping is replaced by the mapped
    phrase, every other token is kept as is. Tokens are re-joined with a
    single space.
    """
    expanded = (chat_words.get(token.upper(), token) for token in text.split())
    return " ".join(expanded)

In [32]:
chat_conversion(sample_text)
        

'Those who are away from keyboard please get back to your workstation as soon as possible'

In [58]:
# Store the expanded text back on the frame: apply() returns a new Series, so without the
# assignment this step is computed, displayed and then thrown away, unlike the other
# preprocessing steps in this notebook.
df['review'] = df['review'].apply(chat_conversion)
df['review']

0     one reviewers mentioned watching 1 oz episode ...
1     wonderful little production filming technique ...
2     thought wonderful way spend time hot summer we...
3     basically theres family little boy jake thinks...
4     petter matteis love time money visually stunni...
                            ...                        
95    daniel daylewis versatile actor alive english ...
96    guess would originally going least two parts t...
97    well like watch bad horror bmovies cause think...
98    worst movie ever seen well worst probably ever...
99    mario fan long remember fond memories playing ...
Name: review, Length: 100, dtype: object

# Handling misspelled text (spelling correction)

In [34]:
from textblob import TextBlob

In [42]:
incorrect_text = 'helloo world howw are youu'
textblb = TextBlob(incorrect_text)
textblb.correct().string

'hello world how are you'

# Stopwords

In [43]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [46]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [47]:
len(stopwords.words('english'))

198

In [48]:
def remove_stopwords(text):
    """Blank out English stopwords in ``text``.

    The text is split on whitespace and each token that exactly matches an
    NLTK English stopword is replaced by an empty string; the tokens are then
    joined with single spaces. Because a removed word leaves an empty slot,
    the result can contain doubled or trailing spaces.

    Matching is case-sensitive, so capitalised tokens such as ``'I'`` or
    ``'It'`` are kept; lowercase the text first if they should be removed.

    Args:
        text: The string to filter.

    Returns:
        The filtered string.
    """
    # Build the lookup once per call. The previous version called
    # stopwords.words('english') and scanned the resulting list for every single token.
    stop_words = set(stopwords.words('english'))
    kept = ['' if word in stop_words else word for word in text.split()]
    return " ".join(kept)
    

In [51]:
sample_review = 'I watched the movie and I loved it. It was really awesome a love above all'

In [52]:
remove_stopwords(sample_review)

'I watched  movie  I loved it. It  really awesome  love  '

In [55]:
df['review'] = df['review'].apply(remove_stopwords)

# Handling emoji

In [59]:
pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
   ---------------------------------------- 0.0/590.6 kB ? eta -:--:--
   --------------------------------------- 590.6/590.6 kB 10.6 MB/s eta 0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.14.1
Note: you may need to restart the kernel to use updated packages.


In [60]:
import emoji

In [61]:
print(emoji.demojize('Python is emoticon'))

Python is emoticon


In [62]:
def remove_emoji(text):
    """Return ``text`` with emoji characters removed.

    The previous pattern covered only the Emoticons block (U+1F600-U+1F64F),
    so common emoji such as U+1F525 (fire), U+1F680 (rocket) or U+2764
    (heart) were left in the text. The character class now spans the main
    Unicode emoji blocks listed below.

    Args:
        text: The string to clean.

    Returns:
        A new string with every matched emoji character deleted; nothing is
        inserted in its place.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
        "\U0001F600-\U0001F64F"  # emoticons (the only range handled before)
        "\U0001F680-\U0001F6FF"  # transport and map symbols
        "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
        "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
        "\U0001F1E6-\U0001F1FF"  # regional indicators (flag pairs)
        "\u2600-\u26FF"          # miscellaneous symbols
        "\u2700-\u27BF"          # dingbats
        "\uFE0F"                 # variation selector-16 that trails many emoji
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', text)

In [63]:
remove_emoji('')

''

# Tokenization

Using split()

In [64]:
# word tokenization
sample_token = 'I am going to delhi'
sample_token.split()

['I', 'am', 'going', 'to', 'delhi']

In [67]:
# sentence tokenization
sample_token_sent = 'I am going to delhi. I will be staying there for a month'
sample_token_sent.split('.')

['I am going to delhi', ' I will be staying there for a month']