## Preprocess the Data

pandas as pd: For data manipulation and analysis (e.g., load, clean, and analyze tabular data).

re: For text pattern matching using regular expressions (e.g., find emails, replace text).

nltk.corpus import stopwords: Removes common words (e.g., "the", "is") during text preprocessing in NLP tasks.

TfidfVectorizer: Converts text to numerical features for machine learning by measuring word importance.

os: Interacts with the operating system (e.g., file handling, environment variables).

nltk: For NLP tasks like tokenization, stemming, and lemmatization.

In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import nltk

# Download stopwords if not already downloaded
nltk.download('stopwords')

# File path for the dataset
file_path = 'IMDB Dataset.csv'  # File is in the same directory as the notebook

# Load the dataset
data = pd.read_csv(file_path)

# Create a directory for processed data if it doesn't exist
os.makedirs('processed_data', exist_ok=True)

# Function to clean text
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stop words (optional)
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply cleaning to the review column
data['cleaned_review'] = data['review'].apply(clean_text)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 words
X = vectorizer.fit_transform(data['cleaned_review']).toarray()

# Save processed data
processed_data = pd.DataFrame(X, columns=vectorizer.get_feature_names_out())
processed_data['sentiment'] = data['sentiment']

# Save processed data to CSV
output_path = 'processed_data/imdb_processed.csv'
processed_data.to_csv(output_path, index=False)

print(f"Preprocessed data saved to: {output_path}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mayeshamalihaproma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessed data saved to: processed_data/imdb_processed.csv


## Load the Preprocessed Data

In [7]:
pd.read_csv('processed_data/imdb_processed.csv').head()

Unnamed: 0,aaron,abandoned,abc,abilities,ability,able,absence,absent,absolute,absolutely,...,young,younger,youre,youth,youve,zero,zombie,zombies,zone,sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.079299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.079664,0.0,0.0,0.0,0.117844,0.0,0.0,negative
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive


In [9]:
from sklearn.model_selection import train_test_split

# Features (X) and Labels (y)
X = processed_data.drop(columns=['sentiment'])
y = processed_data['sentiment']

# Encode labels (convert positive/negative to 1/0)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8867
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.88      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

