In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import joblib  #for save model


In [6]:
df = pd.read_csv("../data/IMDB_Dataset_cleaned.csv")

df.head()

Unnamed: 0,review,sentiment,review_length,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,1761,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,positive,998,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,926,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,748,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1317,petter matteis love time money visually stunni...


## let's convert the clean texts (cleaned_review) into a numerical representation that the model understands (TF-IDF).

In [7]:
# TF-IDF

tfidf = TfidfVectorizer(max_features=5000)  # Use 5000 feature
X = tfidf.fit_transform(df['cleaned_review'])
y = df['sentiment'].map({'positive':1, 'negative':0})  #encode label to 0 or 1


## let's train our model

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
lr = LogisticRegression()
lr.fit(X_train, y_train)


y_pred = lr.predict(X_test)
print("F1 Score:", f1_score(y_test, y_pred))


F1 Score: 0.8873653281096964


In [10]:
import os

os.makedirs("../models/Sentiment_Classifier_v1", exist_ok=True)
joblib.dump(lr, "../models/Sentiment_Classifier_v1/lr_model.joblib")
joblib.dump(tfidf, "../models/Sentiment_Classifier_v1/tfidf_vectorizer.joblib")


['../models/Sentiment_Classifier_v1/tfidf_vectorizer.joblib']