## Library

In [2]:
import pandas as pd
import numpy as np
from IPython.display import display

## Preview Data

In [3]:
data = pd.read_csv('IMDB Dataset.csv')
display(data.head(10))

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


### Check for null

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


## Data Preprocess

### Lower Case

In [5]:
data['review'] = data['review'].str.lower()
display(data.head())

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Remove HTML tags

In [6]:
import re 

def remove_html_tag(text):
    pattern = re.compile('<.*?>')  # Pattern matches anything inside <...>
    return pattern.sub(r'', text)  # Replace the HTML tags with an empty string

data['review'] = data['review'].apply(remove_html_tag)

data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Remove URLs

In [7]:
import re

def remove_urls(text):
    # This regex pattern matches:
    # - Any URL starting with http:// or https:// followed by any non-space characters (\S+)
    # - OR any URL starting with www. followed by any non-space characters
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)  # Replace matched URLs with an empty string

text4 = 'For notebook click https://www.kaggle.com/campusx/notebook8223fc1abb to search check www.google.com'
print(remove_urls(text4))  # Output: 'For notebook click  to search check '

data['review'] = data['review'].apply(remove_urls)

For notebook click  to search check 


### Remove Punctuations

In [8]:
import re
import string

def remove_punctuation(text):
    # This pattern matches any character in the punctuation set
    pattern = re.compile(f"[{re.escape(string.punctuation)}]")
    return pattern.sub("", text)  # Replace punctuation with empty string

text5 = "Wow!! This product is amazing... Right?"
print(remove_punctuation(text5))  # Output: 'Wow This product is amazing Right'

data['review'] = data['review'].apply(remove_punctuation)

Wow This product is amazing Right


### Slang Replacement

In [9]:
slang_dict = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'B4N': 'Bye For Now',
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FWIW': 'For What It\'s Worth',
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you (also a chat program)',
    'ILU': 'I Love You',
    'IMHO': 'In My Honest/Humble Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My A.. Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'OIC': 'Oh I See',
    'PITA': 'Pain In The A..',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'QPSA': 'Que Pasa?',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My A.. Off',
    'SK8': 'Skate',
    'STATS': 'Your sex and age',
    'ASL': 'Age, Sex, Location',
    'THX': 'Thank You',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'WB': 'Welcome Back',
    'WTF': 'What The F...',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'W8': 'Wait...',
    '7K': 'Sick:-D Laughter',
    'TFW': 'That feeling when',
    'MFW': 'My face when',
    'MRW': 'My reaction when',
    'IFYP': 'I feel your pain',
    'LOL': 'Laughing out loud',
    'TNTL': 'Trying not to laugh',
    'JK': 'Just kidding',
    'IDC': 'I donâ€™t care',
    'ILY': 'I love you',
    'IMU': 'I miss you',
    'ADIH': 'Another day in hell',
    'IDC': 'I donâ€™t care',
    'ZZZ': 'Sleeping, bored, tired',
    'WYWH': 'Wish you were here',
    'TIME': 'Tears in my eyes',
    'BAE': 'Before anyone else',
    'FIMH': 'Forever in my heart',
    'BSAAW': 'Big smile and a wink',
    'BWL': 'Bursting with laughter',
    'LMAO': 'Laughing my a** off',
    'BFF': 'Best friends forever',
    'CSL': 'Canâ€™t stop laughing'
}

In [12]:
# Function to convert slang to full form
def chat_conversion(text):
    # Split text into words, convert slang to full form if it exists
    new_text = [slang_dict.get(w.upper(), w) for w in text.split()]
    return ' '.join(new_text)

text_slang = "FYI I already knew"
print(chat_conversion(text_slang))  # Output: "For Your Information I already knew"

data['review'] = data['review'].apply(chat_conversion)

For Your Information I already knew


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend Te...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the Tears in my eyes of...,positive


### Remove Emoji

In [17]:
emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                             "]+", flags=re.UNICODE)
def remove_emoji(text):
    return emoji_pattern.sub(r'', text)

# Example
remove_emoji("You are very funny ðŸ˜‚ðŸ˜‚ðŸ˜‚")

data['review'] = data['review'].apply(remove_emoji)

### Remove Stopwords

In [21]:
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()

stopwords = stopwords.words('english')

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stopwords])

# Example
text = 'This is a really great time for the field of AI. It is advancing exponentially'
remove_stopwords(text)

data['review2'] = data['review'].progress_apply(remove_stopwords)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50000/50000 [00:15<00:00, 3298.43it/s]


In [22]:
data.head()

Unnamed: 0,review,sentiment,review2
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...
1,a wonderful little production the filming tech...,positive,wonderful little production filming technique ...
2,i thought this was a wonderful way to spend Te...,positive,thought wonderful way spend Tears eyes hot sum...
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...
4,petter matteis love in the Tears in my eyes of...,positive,petter matteis love Tears eyes money visually ...


### Tokenize

In [24]:
from nltk.tokenize import word_tokenize, sent_tokenize

sent1= 'I am going to Mumbai'

print(word_tokenize(sent1))
print(sent_tokenize(sent1))

def word_tokens(text):
    text_list = word_tokenize(text)
    return text_list
data['review_list'] = data['review2'].progress_apply(word_tokens)

['I', 'am', 'going', 'to', 'Mumbai']
['I am going to Mumbai']


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50000/50000 [00:12<00:00, 3911.23it/s]


In [25]:
data.head()

Unnamed: 0,review,sentiment,review2,review_list
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,a wonderful little production the filming tech...,positive,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend Te...,positive,thought wonderful way spend Tears eyes hot sum...,"[thought, wonderful, way, spend, Tears, eyes, ..."
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,..."
4,petter matteis love in the Tears in my eyes of...,positive,petter matteis love Tears eyes money visually ...,"[petter, matteis, love, Tears, eyes, money, vi..."


### Lemmatization

In [38]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_review(text):
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in text]
    return ' '.join(lemmatized_words)  # Return lemmatized text as a string

# Example review
review = "The cats are running quickly towards the fish"
review = word_tokenize(review)
lemmatized_review = lemmatize_review(review)
print(lemmatized_review)

The cat be run quickly towards the fish


In [40]:
data['review_lemma'] = data['review_list'].progress_apply(lemmatize_review)
data.head()

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50000/50000 [00:18<00:00, 2759.29it/s]


Unnamed: 0,review,sentiment,review2,review_list,review_lemma
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...,"[one, reviewers, mentioned, watching, 1, oz, e...",one reviewers mention watch 1 oz episode youll...
1,a wonderful little production the filming tech...,positive,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn...",wonderful little production film technique una...
2,i thought this was a wonderful way to spend Te...,positive,thought wonderful way spend Tears eyes hot sum...,"[thought, wonderful, way, spend, Tears, eyes, ...",think wonderful way spend Tears eye hot summer...
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,...",basically theres family little boy jake think ...
4,petter matteis love in the Tears in my eyes of...,positive,petter matteis love Tears eyes money visually ...,"[petter, matteis, love, Tears, eyes, money, vi...",petter matteis love Tears eye money visually s...


## Sentiment Analysis with HuggingFace Transformer

### With Lemmatization and Stop Word Removal

In [50]:
df = data.loc[:, ['review_lemma', 'sentiment']] 
df.head()

Unnamed: 0,review_lemma,sentiment
0,one reviewers mention watch 1 oz episode youll...,positive
1,wonderful little production film technique una...,positive
2,think wonderful way spend Tears eye hot summer...,positive
3,basically theres family little boy jake think ...,negative
4,petter matteis love Tears eye money visually s...,positive


In [51]:
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df.head()

Unnamed: 0,review_lemma,sentiment
0,one reviewers mention watch 1 oz episode youll...,1
1,wonderful little production film technique una...,1
2,think wonderful way spend Tears eye hot summer...,1
3,basically theres family little boy jake think ...,0
4,petter matteis love Tears eye money visually s...,1


In [53]:
from transformers import pipeline
from tqdm import tqdm  # progress bar

# Call the Hugging Face model
classifier = pipeline("sentiment-analysis", model='distilbert-base-uncased-finetuned-sst-2-english')

# Classify in batches
batch_size = 32 
batch_results = []

# Initialize the progress bar
for i in tqdm(range(0, len(df), batch_size), desc="Processing batches", unit="batch"):
    batch = df['review_lemma'][i:i+batch_size].tolist()  # Get the current batch
    batch_results.extend(classifier(batch, truncation=True))  # Classify the batch

# Update the dataframe with predictions
df['sentiment_pred'] = [result['label'] for result in batch_results]
df['sentiment_pred'] = df['sentiment_pred'].apply(lambda x: 1 if x == 'POSITIVE' else 0)  # Convert labels to 1/0

# Display the results
df.head()

Device set to use cuda:0
Processing batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1563/1563 [07:24<00:00,  3.51batch/s]


Unnamed: 0,review_lemma,sentiment,sentiment_pred
0,one reviewers mention watch 1 oz episode youll...,1,0
1,wonderful little production film technique una...,1,1
2,think wonderful way spend Tears eye hot summer...,1,1
3,basically theres family little boy jake think ...,0,0
4,petter matteis love Tears eye money visually s...,1,1


In [56]:
from sklearn.metrics import accuracy_score

# Calculate the accuracy
accuracy = accuracy_score(df['sentiment'], df['sentiment_pred'])
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.7888


### Without Lemma and Stop Words Removal

In [60]:
df2 = data.loc[:, ['review', 'sentiment']]
df2['sentiment'] = df2['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df2.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend Te...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the Tears in my eyes of...,1


### Create a function for ease

In [62]:
from tqdm import tqdm
from transformers import pipeline

def classify_sentiment(df, classifier, batch_size=32):
    """
    Classifies sentiment of reviews in a dataframe using a Hugging Face model in batches.
    
    Parameters:
    - df: pandas DataFrame containing the reviews (expects the first column to be the reviews).
    - classifier: Hugging Face sentiment-analysis pipeline.
    - batch_size: Number of reviews to process in each batch (default is 32).
    
    Returns:
    - df: The updated DataFrame with the sentiment predictions ('sentiment_pred' column).
    """
    batch_results = []

    # Get the name of the first column
    review_column = df.columns[0]

    # Initialize the progress bar
    for i in tqdm(range(0, len(df), batch_size), desc="Processing batches", unit="batch"):
        batch = df[review_column][i:i+batch_size].tolist()  # Get the current batch using the first column
        batch_results.extend(classifier(batch, truncation=True))  # Classify the batch

    # Update the dataframe with predictions
    df['sentiment_pred'] = [result['label'] for result in batch_results]
    df['sentiment_pred'] = df['sentiment_pred'].apply(lambda x: 1 if x == 'POSITIVE' else 0)  # Convert labels to 1/0

    return df

# Call the Hugging Face model
classifier = pipeline("sentiment-analysis", model='distilbert-base-uncased-finetuned-sst-2-english')

# Apply sentiment classification to the dataframe
df2 = classify_sentiment(df2, classifier)

Device set to use cuda:0
Processing batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1563/1563 [09:31<00:00,  2.74batch/s]


In [65]:
# Calculate the accuracy
accuracy = accuracy_score(df2['sentiment'], df2['sentiment_pred'])
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.8611


### Without Lemma and With Stop Words Removal

In [64]:
df3 = data.loc[:, ['review2', 'sentiment']]
df3['sentiment'] = df3['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df3.head()

Unnamed: 0,review2,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend Tears eyes hot sum...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love Tears eyes money visually ...,1


In [66]:
# Apply sentiment classification to the dataframe
df3 = classify_sentiment(df3, classifier)

Processing batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1563/1563 [08:28<00:00,  3.07batch/s]


In [67]:
# Calculate the accuracy
accuracy = accuracy_score(df3['sentiment'], df3['sentiment_pred'])
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.8050
