<a href="https://colab.research.google.com/github/Kushagra651/IMDB-reviews/blob/main/imdb_reviews_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('IMDB_dataset.csv',encoding='utf-8', on_bad_lines='skip', engine='python')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2417 entries, 0 to 2416
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     2417 non-null   object
 1   sentiment  2417 non-null   object
dtypes: object(2)
memory usage: 37.9+ KB


In [6]:
df['sentiment'] = df['sentiment'].map({'positive':1,'negative':0})

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2417 entries, 0 to 2416
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     2417 non-null   object
 1   sentiment  2417 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 37.9+ KB


In [10]:
df['review'] = df['review'].str.lower()

In [11]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

df['review'] = df['review'].apply(remove_tags)


In [12]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


In [13]:
import nltk
nltk.download('stopwords')
nltk.corpus.stopwords.words('english')
stop_word  = set[nltk.corpus.stopwords.words('english')]

df['review'] = df['review'].apply(lambda x:' '.join([word for word in x.split() if word not in (stop_word)]))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
df['review'] = df['review'].str.replace('http\S+\www.\S+','',case=False)

In [15]:
X = df['review']
y = df['sentiment']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [19]:
# STEP 1: Quick CV baselines for multiple classifiers
# Run this now to compare families before tuning

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer, f1_score
import warnings
warnings.filterwarnings("ignore")

# ASSUMPTION: You already have X (array/Series of preprocessed text) and y (0/1 sentiment)
# If not, set:
# X = df['review'].astype(str)
# y = df['sentiment'].astype(int)

# Quick check (uncomment if needed)
# print("Samples:", len(X), "Positive:", y.sum(), "Negative:", len(y)-y.sum())

models = {
    "LogisticRegression": LogisticRegression(max_iter=100),
    "LinearSVC": LinearSVC(max_iter=500),
    "MultinomialNB": MultinomialNB(),
    "RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42),
    "KNN": KNeighborsClassifier(n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=42)
}

# CV strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring_acc = "accuracy"
scoring_f1_macro = make_scorer(f1_score, average="macro")

results = []
for name, clf in models.items():
    pipe = make_pipeline(
        TfidfVectorizer(max_features=30000, ngram_range=(1,2)),
        clf
    )
    print(f"Running CV for: {name} ...")
    acc_scores = cross_val_score(pipe, X, y, cv=cv, scoring=scoring_acc, n_jobs=-1)
    f1_scores = cross_val_score(pipe, X, y, cv=cv, scoring=scoring_f1_macro, n_jobs=-1)
    results.append({
        "model": name,
        "acc_mean": acc_scores.mean(),
        "acc_std": acc_scores.std(),
        "f1_mean": f1_scores.mean(),
        "f1_std": f1_scores.std()
    })

res_df = pd.DataFrame(results).sort_values("f1_mean", ascending=False).reset_index(drop=True)
print("\nCV baseline results (sorted by F1 macro):")
print(res_df.round(4))


Running CV for: LogisticRegression ...
Running CV for: LinearSVC ...
Running CV for: MultinomialNB ...
Running CV for: RandomForest ...
Running CV for: KNN ...
Running CV for: GradientBoosting ...

CV baseline results (sorted by F1 macro):
                model  acc_mean  acc_std  f1_mean  f1_std
0       MultinomialNB    0.8610   0.0122   0.8609  0.0123
1           LinearSVC    0.8593   0.0134   0.8591  0.0135
2  LogisticRegression    0.8448   0.0047   0.8445  0.0047
3    GradientBoosting    0.7936   0.0078   0.7927  0.0075
4        RandomForest    0.7894   0.0124   0.7893  0.0124
5                 KNN    0.6905   0.0149   0.6902  0.0144


In [17]:
from sklearn.

SyntaxError: invalid syntax (ipython-input-2184803493.py, line 1)