## Sentiment Analysis with IMDB Moives Reviews

### 🎬 Exploring Sentiment Analysis with IMDB Reviews and Machine Learning 🤖

### 1. Importing Libraries:


In [1]:
# Importing pandas for data manipulation and analysis
import pandas as pd  

# Importing NumPy for numerical operations
import numpy as np  

# Importing Matplotlib for data visualization
import matplotlib.pyplot as plt  

# Importing warnings module to manage warning messages
import warnings  

# Importing re for regular expression operations
import re  

# Suppress all warnings to keep the output clean
warnings.filterwarnings('ignore')


In [2]:
import sklearn  # Import the scikit-learn library

# Print the version of scikit-learn
sklearn.__version__

'1.5.1'

In [3]:
 # Importing defaultdict for dictionary with default values
from collections import defaultdict 

# Importing sentence tokenizer from NLTK
from nltk.tokenize import sent_tokenize  

# Importing word tokenizer from NLTK
from nltk.tokenize import word_tokenize 

# Importing tokenizer that splits words and punctuation
from nltk.tokenize import WordPunctTokenizer

# Importing tokenizer using regular expressions
from nltk.tokenize import regexp_tokenize

# Importing part-of-speech tagger from NLTK
from nltk import pos_tag  

# Importing stopwords from NLTK
from nltk.corpus import stopwords  

# Importing WordNet lexical database from NLTK
from nltk.corpus import wordnet 

# Importing lemmatizer from NLTK for word normalization
from nltk.stem import WordNetLemmatizer  

# Importing LabelEncoder for encoding labels as numbers
from sklearn.preprocessing import LabelEncoder  

# Importing TF-IDF Vectorizer for text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer  

# Importing Naive Bayes, SVM, and Linear models from scikit-learn
from sklearn import naive_bayes, svm, linear_model  

# Importing train-test split function for splitting datasets
from sklearn.model_selection import train_test_split  

# Importing metrics for model evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  

### 2. Loading and Inspecting Data:



In [4]:
# Importing pandas for data manipulation
import pandas as pd  

# Import the Data
df = pd.read_csv('IMDB Dataset.csv')  # Reading the CSV file into a DataFrame

# Display the first few rows of the DataFrame to verify the data has been loaded correctly
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
df.head(10)  # Displays the first 10 rows of the DataFrame

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


#### Display the dimensions of the DataFrame `df`

In [6]:
# The `shape` attribute returns a tuple (number of rows, number of columns)
df.shape

(50000, 2)

#### Check for missing values in the DataFrame `df`


In [7]:
# The `isnull()` function returns a DataFrame of the same shape as `df`, where each cell contains a boolean value:
# `True` if the value is missing, and `False` otherwise.
# The `sum()` function then sums up the number of `True` values for each column, giving the total number of missing values per column.
df.isnull().sum()

review       0
sentiment    0
dtype: int64

#### Get a concise summary of the DataFrame `df`

In [8]:
# The `info()` method provides essential information about the DataFrame, including:
# - The number of entries (rows)
# - The number of columns
# - The names of the columns
# - The data type of each column
# - The number of non-null values in each column (helpful for identifying columns with missing data)
# - The memory usage of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


#### Generate descriptive statistics of the DataFrame `df`


In [9]:
# The `describe()` method provides a summary of statistical measures for each numerical column, including:
# - `count`: The number of non-null entries
# - `mean`: The average value
# - `std`: The standard deviation (a measure of the amount of variation or dispersion)
# - `min`: The minimum value
# - `25%`: The 25th percentile (the value below which 25% of the data falls)
# - `50%`: The 50th percentile (also known as the median, the value below which 50% of the data falls)
# - `75%`: The 75th percentile (the value below which 75% of the data falls)
# - `max`: The maximum value
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


#### Count the occurrences of each unique value in the 'sentiment' column


In [10]:
# The `value_counts()` method returns a Series containing counts of unique values.
# This is useful for understanding the distribution of categorical data, like sentiments in this case.
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

#### Count the number of duplicate rows in the DataFrame

In [11]:
# The `duplicated()` method returns a boolean Series where True indicates a duplicate row.
# The `len()` function then counts how many True values are in this Series, which gives the total number of duplicate rows.
num_duplicates = len(df[df.duplicated()])
num_duplicates

418

#### Drop duplicate rows from the DataFrame


In [12]:
# The `drop_duplicates()` method removes duplicate rows from the DataFrame.
# The `inplace=True` parameter modifies the original DataFrame in place, meaning no new DataFrame is returned.
df.drop_duplicates(inplace=True)

# Display the DataFrame after removing duplicates
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### 3. Data Preprocessing:



#### Convert Text to Lowercase


In [13]:
''' Define a function to convert a word to lowercase '''
def lower_word(word):
    return word.lower()

# Apply the `lower_word` function to each element in the 'review' column
# The `map()` function applies the function `lower_word` to every element in the 'review' column.
# This converts all the text in the 'review' column to lowercase.
df['review'] = df['review'].map(lower_word)

In [14]:
# Display the 'review' column to check the text after converting to lowercase
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 49582, dtype: object

#### Clean HTML Tags



In [15]:
import re  # Import the regular expression module

# Define a function to clean HTML tags from text
def regex_(raw_text):
    # Compile a regular expression pattern to find HTML tags
    find_html = re.compile('<.*?>')
    
    # Remove HTML tags using the regular expression pattern
    clean_text = re.sub(find_html, '', raw_text)
    return clean_text

# Apply the regex_ function to each review in the DataFrame
df.review = df.review.apply(lambda x: regex_(x))

# Display the cleaned 'review' column
df.review

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 49582, dtype: object

#### Tokenization

In [16]:
from nltk.tokenize import WordPunctTokenizer  # Import WordPunctTokenizer from nltk

# Create an instance of WordPunctTokenizer
w_token = WordPunctTokenizer()

# Tokenize each review in the DataFrame using the WordPunctTokenizer
df['review_tokenized'] = [w_token.tokenize(t) for t in df['review']]

# Display the updated DataFrame with tokenized reviews
df

Unnamed: 0,review,sentiment,review_tokenized
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production. the filming tec...,positive,"[a, wonderful, little, production, ., the, fil..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there's a family where a little boy ...,negative,"[basically, there, ', s, a, family, where, a, ..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, ', s, "", love, in, the, time,..."
...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,..."
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative,"[bad, plot, ,, bad, dialogue, ,, bad, acting, ..."
49997,i am a catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el..."
49998,i'm going to have to disagree with the previou...,negative,"[i, ', m, going, to, have, to, disagree, with,..."


In [17]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,review,sentiment,review_tokenized
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production. the filming tec...,positive,"[a, wonderful, little, production, ., the, fil..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there's a family where a little boy ...,negative,"[basically, there, ', s, a, family, where, a, ..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, ', s, "", love, in, the, time,..."


#### Lemmatization


In [18]:
from collections import defaultdict
from nltk.corpus import wordnet

# Create a default dictionary mapping POS tags to WordNet POS tags
tag_map = defaultdict(lambda: wordnet.NOUN)
tag_map['J'] = wordnet.ADJ
tag_map['V'] = wordnet.VERB
tag_map['R'] = wordnet.ADV

In [19]:
tag_map

defaultdict(<function __main__.<lambda>()>, {'J': 'a', 'V': 'v', 'R': 'r'})

### 4. Text Processing:



#### Iterate over each tokenized review in the DataFrame


In [20]:
for idx, t in enumerate(df.review_tokenized):
    # Print progress every 100 reviews to monitor the process
    if idx % 100 == 0:
        print(idx)
    
    # Initialize an empty list to store the cleaned and lemmatized words
    word_ls = []
    
    # Initialize the WordNetLemmatizer, which will be used to reduce words to their base forms
    wordnet_lemmatizer = WordNetLemmatizer()
    
    # Iterate over each word and its corresponding part-of-speech (POS) tag in the tokenized review
    for word, tag in pos_tag(t):
        # Check if the word is not a stopword and consists only of alphabetic characters
        if word not in stopwords.words("english") and word.isalpha():
            # Lemmatize the word using the appropriate POS tag from tag_map
            word_p = wordnet_lemmatizer.lemmatize(word, tag_map[tag[0]])
            # Add the lemmatized word to the word list
            word_ls.append(word_p)
    
    # Convert the list of cleaned words to a string and store it in a new column 'review_tokenized_cleaned'
    df.loc[idx, "review_tokenized_cleaned"] = str(word_ls)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

#### Display the first 5 rows of the DataFrame


In [21]:
# df.head() returns the first 5 rows by default, which is useful for quickly inspecting the data.
df.head()

Unnamed: 0,review,sentiment,review_tokenized,review_tokenized_cleaned
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","['one', 'reviewer', 'mention', 'watch', 'oz', ..."
1,a wonderful little production. the filming tec...,positive,"[a, wonderful, little, production, ., the, fil...","['wonderful', 'little', 'production', 'filming..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","['think', 'wonderful', 'way', 'spend', 'time',..."
3,basically there's a family where a little boy ...,negative,"[basically, there, ', s, a, family, where, a, ...","['basically', 'family', 'little', 'boy', 'jake..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, ', s, "", love, in, the, time,...","['petter', 'mattei', 'love', 'time', 'money', ..."


#### Check for missing values (Not a Number [NaN]) in the 'review_tokenized_cleaned' column


In [22]:
missing_values_count = df['review_tokenized_cleaned'].isna().sum()

# Display the count of missing values
missing_values_count

409

#### Drop rows with any missing values (NaN) in the DataFrame


In [23]:
# 'inplace=True' modifies the DataFrame in place, meaning it directly alters the original DataFrame without returning a new one.
df.dropna(inplace=True)

### 4. Data Preparation



#### Split the Data


###### Split the dataset into training and testing sets


In [24]:
# 'df['review_tokenized_cleaned']' contains the cleaned and tokenized reviews.
# 'df['sentiment']' contains the target labels (sentiments).
# 'test_size=0.25' means 25% of the data will be used for testing, and 75% for training.
# 'random_state=0' ensures reproducibility by providing a seed for the random number generator.
X_train, X_test, y_train, y_test = train_test_split(df['review_tokenized_cleaned'], df['sentiment'], test_size=0.25, random_state=0)


In [25]:
# Print the shape of the training set (number of samples and features)
print("X_train shape:", X_train.shape)

# Print the shape of the testing set (number of samples and features)
print("X_test shape:", X_test.shape)

X_train shape: (36879,)
X_test shape: (12294,)


In [26]:
# Print the first few values of the training labels to inspect them
print(y_train.head())

44934    positive
9456     positive
7708     positive
9458     negative
38835    negative
Name: sentiment, dtype: object


In [27]:
# Find and print the unique sentiment values in the DataFrame
unique_sentiments = df['sentiment'].unique()
print("Unique sentiment values:", unique_sentiments)

Unique sentiment values: ['positive' 'negative']


#### Encode Labels

In [28]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
enc = LabelEncoder()

# Fit the encoder on the training data and transform the training labels
# The fit_transform method both fits the encoder to the labels and encodes them
y_train = enc.fit_transform(y_train)

# Transform the test labels using the fitted encoder
# The transform method encodes the test labels based on the labels seen during training
y_test = enc.transform(y_test)

#### Transforming Text Data with 'TfidfVectorizer'

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer with a maximum of 5000 features
# This means the vectorizer will only consider the top 5000 terms based on their TF-IDF scores
tfidf = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the cleaned and tokenized reviews
# This step learns the vocabulary and IDF (Inverse Document Frequency) from the provided text data
tfidf.fit(df.review_tokenized_cleaned)

# Transform the training text data into TF-IDF feature vectors
# The transform method converts the text data into a matrix of TF-IDF features based on the learned vocabulary
X_train = tfidf.transform(X_train)

# Transform the test text data into TF-IDF feature vectors
# This step uses the same vocabulary learned from the training data to convert the test data
X_test = tfidf.transform(X_test)

#### Convert sparse matrices to dense matrices


In [30]:
X_train_dense = X_train.todense()
X_test_dense = X_test.todense()

### 5. Model Training and Evaluation:



#### Gaussian Naive Bayes



In [31]:
from sklearn import naive_bayes

# Initialize the Naive Bayes classifier
nb = naive_bayes.GaussianNB()

# Convert sparse data to dense data
X_train_dense = X_train.toarray() if hasattr(X_train, 'toarray') else X_train

# Fit the model on the training data
nb.fit(X_train_dense, y_train)

#### Predict on the test data using the trained Naive Bayes classifier


In [32]:
from sklearn.preprocessing import LabelBinarizer

# If X_test is a sparse matrix, convert it to a dense format
if hasattr(X_test, 'toarray'):
    X_test = X_test.toarray()

# Ensure that X_test is now a dense numpy array
print("Type of X_test after conversion:", type(X_test))

Type of X_test after conversion: <class 'numpy.ndarray'>


In [33]:
predictions = nb.predict(X_test)
predictions

array([1, 0, 1, ..., 0, 0, 0])

#### Calculate and print the accuracy score & confusion matrix

In [34]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Calculate and print the accuracy score
# The accuracy score measures the proportion of correctly classified instances
print("Accuracy Score: ", accuracy_score(y_test, predictions))

# Calculate and print the confusion matrix
# The confusion matrix provides insight into the performance of the classification model
print("Confusion Matrix: \n", confusion_matrix(y_test, predictions))

Accuracy Score:  0.5063445583211322
Confusion Matrix: 
 [[3216 2879]
 [3190 3009]]


#### Calculate and print the classification report


In [35]:
# The classification report provides detailed metrics including precision, recall, and F1-score for each class
print("Classification Report: \n", classification_report(y_test, predictions))

Classification Report: 
               precision    recall  f1-score   support

           0       0.50      0.53      0.51      6095
           1       0.51      0.49      0.50      6199

    accuracy                           0.51     12294
   macro avg       0.51      0.51      0.51     12294
weighted avg       0.51      0.51      0.51     12294



### Support Vector Machine


#### Initialize the Support Vector Machine model with specified hyperparameters


In [36]:
from sklearn import svm

# C: Regularization parameter, which controls the trade-off between achieving a low training error and a low testing error
# kernel: Specifies the kernel type to be used in the algorithm (here, 'linear' means a linear kernel)
# degree: Degree of the polynomial kernel function (ignored by 'linear' kernel)
# gamma: Kernel coefficient for 'rbf', 'poly', and 'sigmoid' kernels; 'auto' uses 1 / n_features
svm_model = svm.SVC(C=1.0, kernel="linear", degree=3, gamma="auto")

# Fit the SVM model on the training data
# This step trains the SVM classifier using the training data (X_train) and corresponding labels (y_train)
svm_model.fit(X_train, y_train)

#### Predict on the test data using the trained SVM model


In [37]:
pred_svm = svm_model.predict(X_test)

#### Calculate and print the accuracy score, confusion matrix & classification report

In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Calculate and print the accuracy score
# Accuracy Score measures the proportion of correctly classified instances
print("Accuracy Score: ", accuracy_score(y_test, pred_svm))

# Calculate and print the confusion matrix
# The Confusion Matrix provides insight into the number of true positives, true negatives, false positives, and false negatives
print("Confusion Matrix: \n", confusion_matrix(y_test, pred_svm))

# Calculate and print the classification report
# The Classification Report provides detailed metrics including precision, recall, and F1-score for each class
print("Classification Report: \n", classification_report(y_test, pred_svm))

Accuracy Score:  0.5015454693346347
Confusion Matrix: 
 [[2964 3131]
 [2997 3202]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.50      0.49      0.49      6095
           1       0.51      0.52      0.51      6199

    accuracy                           0.50     12294
   macro avg       0.50      0.50      0.50     12294
weighted avg       0.50      0.50      0.50     12294

