In [41]:
import pandas as pd
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [43]:
df.duplicated().sum()

418

In [44]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [45]:
import re

def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"http\S+|www.\S+","",t)
    t = re.sub(r"@[A-Za-z0-9_]+","",t)
    t = re.sub(r"[^a-z\s]","",t)
    t = re.sub(r"\s+"," ",t).strip()
    return t

df["review"] = df["review"].apply(clean_text)
df["sentiment"] = df["sentiment"].map({"negative":0, "positive":1})


In [46]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df["review"], df["sentiment"], test_size=0.2, random_state=11
)

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [49]:
X_train.shape

(39665, 155153)

# SVM Classifier

In [50]:
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train, y_train)

# Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=120,max_depth=9,oob_score=True)
rf.fit(X_train, y_train)

# Result


In [52]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# SVM
print("SVM Accuracy:", accuracy_score(y_test, svm.predict(X_test)))
print(confusion_matrix(y_test, svm.predict(X_test)))
print(classification_report(y_test, svm.predict(X_test)))

# Random Forest
print("RF Accuracy:", accuracy_score(y_test, rf.predict(X_test)))
print(confusion_matrix(y_test, rf.predict(X_test)))
print(classification_report(y_test, rf.predict(X_test)))

SVM Accuracy: 0.8989613794494302
[[4445  534]
 [ 468 4470]]
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      4979
           1       0.89      0.91      0.90      4938

    accuracy                           0.90      9917
   macro avg       0.90      0.90      0.90      9917
weighted avg       0.90      0.90      0.90      9917

RF Accuracy: 0.8126449531108199
[[3759 1220]
 [ 638 4300]]
              precision    recall  f1-score   support

           0       0.85      0.75      0.80      4979
           1       0.78      0.87      0.82      4938

    accuracy                           0.81      9917
   macro avg       0.82      0.81      0.81      9917
weighted avg       0.82      0.81      0.81      9917

