# Import Necessary Libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns


# Load and Explore Dataset

We are using the IMDb 50K movie reviews dataset to classify reviews as positive or negative.

In [2]:
# Load dataset
data = pd.read_csv('IMDB Dataset.csv')

# Display first few rows
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Data cleaning

In [12]:
from nltk.corpus import stopwords

# download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Define stopwords
stop_words = stopwords.words('english')

print(stop_words[:10])  # Just to check the first few stopwords

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']


In [18]:
# Initialize stemmer
stemmer = PorterStemmer()

# Function to clean text
def clean_text(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(r'<br />', ' ', text)  # Remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text) # Remove special characters
    text = text.split() # Tokenize words
    text = [stemmer.stem(word) for word in text if word not in stop_words]  # Stemming & stopword removal
    
    return ' '.join(text)

# Apply text cleaning to reviews
data['cleaned_review'] = data['review'].apply(clean_text)

# Show cleaned data
data.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod youll hook ...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product film techniqu unassum old...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...
