In [24]:
%pip install -r ../requirements.txt
# %pip install kaggle
# !kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews -p ../Dataset/

In [25]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import zipfile

from bs4 import BeautifulSoup
from langdetect import detect
from nltk.corpus import stopwords
from nltk.corpus import wordnet

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from urllib.parse import urlsplit
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mahesh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mahesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mahesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mahesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
# Specify the path to the ZIP file
zip_file_path = r"../Dataset/imdb-dataset-of-50k-movie-reviews.zip"

# Open the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Find the first file with a .csv extension (assuming it's the one you want)
    csv_file = [name for name in zip_ref.namelist() if name.endswith('.csv')][0]
    
    # Read the CSV file directly from the ZIP archive into a DataFrame
    df = pd.read_csv(zip_ref.open(csv_file))

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [27]:
df_subset = df.sample(n=5000).reset_index(drop=True)
df_subset.head()

Unnamed: 0,review,sentiment
0,'R Xmas is one of the only films I've seen whe...,negative
1,My life is about saving animals. I do voluntee...,negative
2,Cant believe it.... after all these years fina...,positive
3,This movie is horrendous. Decent fight scenes ...,negative
4,If you overlook the fact that the plot has bee...,positive


In [28]:
df_subset['sentiment'].value_counts()

sentiment
positive    2516
negative    2484
Name: count, dtype: int64

In [29]:
#lowercase
df_subset["review"]=df_subset["review"].apply(lambda x:x.lower())

#Remove punctuations
df_subset['review'] = df_subset['review'].str.replace('[{}]'.format(string.punctuation), ' ')

# Remove numbers from the 'reviews' column
df_subset['review'] = df_subset['review'].str.replace(r'\d+', '')

# Stopwords

In [30]:
# Function to remove stopwords from a text
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

# Apply the remove_stopwords function to the 'review' column
df_subset['review'] = df_subset['review'].apply(remove_stopwords)

# Url

In [31]:
def remove_urls(text):
    # Define a regular expression pattern to match URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    # Find all matches in the text
    urls = re.findall(url_pattern, text)

    # Remove URLs from the text
    text_without_urls = re.sub(url_pattern, '', text)

    return text_without_urls

# Example usage
df_subset['review'] = df_subset['review'].apply(remove_urls)

# HTML

In [32]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

df_subset["review"] = df_subset["review"].apply(remove_html_tags)

  soup = BeautifulSoup(text, 'html.parser')


# Non alphanumeric

In [33]:
def clean_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

df_subset['review'] = df_subset['review'].apply(clean_text)

# Extra spaces

In [34]:
def remove_extra_whitespaces(text):
    # Use regular expression to replace multiple whitespaces with a single space
    return re.sub(r'\s+', ' ', text).strip()

df_subset['review'] = df_subset['review'].apply(remove_extra_whitespaces)

# Filter non-English comments

In [35]:
def filter_non_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

# Create a boolean mask for non-English reviews
mask = df_subset['review'].apply(filter_non_english)

# Create a new DataFrame containing only English reviews
df_subset = df_subset[mask]

# Lemma

In [36]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get the part of speech for WordNet lemmatizer
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if the part of speech is not found

# Function to lemmatize a text
def lemmatize_text(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
    return ' '.join(lemmatized_tokens)

# Apply lemmatization to the 'text' column
df_subset['review'] = df_subset['review'].apply(lemmatize_text)

In [37]:
df_subset.head()

Unnamed: 0,review,sentiment
0,r xmas one film ve see almost say simply nothi...,negative
1,life save animal volunteer work cat rescue org...,negative
2,cant believe year finally track meant name the...,positive
3,movie horrendous decent fight scene act really...,negative
4,overlook fact plot do many time hilarious glee...,positive


# LabelEncoding

In [38]:
label = LabelEncoder()
df_subset['sentiment'] = label.fit_transform(df_subset['sentiment'])

In [39]:
df_subset.head()

Unnamed: 0,review,sentiment
0,r xmas one film ve see almost say simply nothi...,0
1,life save animal volunteer work cat rescue org...,0
2,cant believe year finally track meant name the...,1
3,movie horrendous decent fight scene act really...,0
4,overlook fact plot do many time hilarious glee...,1


# Feature Extraction

In [40]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
totalFeatures = vectorizer.fit_transform(df_subset['review'])
column_names = vectorizer.get_feature_names_out()
totalFeatures.shape

(4995, 37301)

In [41]:
final_features = pd.DataFrame(totalFeatures.toarray(), columns=column_names)
final_features.head()

Unnamed: 0,aa,aaa,aaargh,aachen,aag,aaliyah,aames,aamir,aan,aankhen,...,zucco,zues,zugsmith,zula,zulu,zuniga,zurer,zurlini,zwick,zzvorkov
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature Selection

# Classification