In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import gensim.downloader as api

In [2]:
# --- NLTK Setup ---
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NAVADHESH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NAVADHESH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NAVADHESH\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
df = pd.read_csv("IMDB Dataset.csv")  
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [5]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # remove HTML
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # remove numbers/symbols
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])

In [6]:
df['clean_review'] = df['review'].apply(clean_text)

In [7]:
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['clean_review'])
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['sentiment'], test_size=0.2, random_state=42)

In [8]:
model1 = LogisticRegression()
model1.fit(X_train, y_train)
preds1 = model1.predict(X_test)

In [9]:
print("TF-IDF + Logistic Regression")
print("Accuracy:", accuracy_score(y_test, preds1))
print("Confusion Matrix:\n", confusion_matrix(y_test, preds1))

TF-IDF + Logistic Regression
Accuracy: 0.8882
Confusion Matrix:
 [[4334  627]
 [ 491 4548]]
