In [4]:
import os
import re
import string

import joblib  #
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from pipeline_preprocessor import preprocess_for_ml

In [5]:
# Location of the IMDB reviews CSV. The machine-specific path remains the
# default; set the IMDB_DATASET_CSV environment variable to load the file from
# somewhere else without editing the notebook.
DATA_PATH = os.environ.get(
    "IMDB_DATASET_CSV",
    "D:/Users/Bolaji/Downloads/archive/IMDB Dataset.csv",
)
df = pd.read_csv(DATA_PATH)
# Show the first rows of the freshly loaded frame as the cell output.
df.head(50)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [6]:
# Tally how many reviews carry each sentiment value.
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [7]:
# Encode the text sentiment as an integer target column: negative -> 0,
# positive -> 1. Series.map yields NaN for any value missing from the mapping.
df["label"] = df["sentiment"].map({"positive": 1, "negative": 0})

In [8]:
# Preview the first four rows to confirm the new `label` column.
df.head(n=4)

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0


In [9]:
# Run every review through the project's preprocess_for_ml helper and write the
# result back over the `review` column.
# NOTE(review): this overwrites the raw text in place, so re-running this cell
# feeds already-processed text through the helper again.
df["review"] = df["review"].apply(preprocess_for_ml)

In [10]:
# Preview the first four rows to inspect the preprocessed review text.
df.head(n=4)

Unnamed: 0,review,sentiment,label
0,one review mention watch 1 oz episod youll hoo...,positive,1
1,wonder littl product film techniqu unassum old...,positive,1
2,thought wonder way spend time hot summer weeke...,positive,1
3,basic there famili littl boy jake think there ...,negative,0


In [11]:
# Features are the (preprocessed) review text; the target is the 0/1 label.
X, y = df["review"], df["label"]



In [12]:
# Hold out 20% of the reviews for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [34]:
# Learn the TF-IDF vocabulary and weights from the training reviews only, then
# reuse the fitted vectorizer to encode the held-out reviews.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

# Fit a logistic-regression classifier with library defaults on the training
# features (fit returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out split and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88      5000
           1       0.88      0.89      0.88      5000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [35]:
# Fraction of held-out reviews whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)

# Format the message as a single string. Wrapping comma-separated values in
# {...} builds a set, and a set prints its members in arbitrary order
# (e.g. {88.33, '%', 'Accuracy:'}), so an f-string is used instead.
print(f"Accuracy: {accuracy * 100:.2f}%")

{88.33, '%', 'Accuracy:'}


In [57]:
def predict_sentiment(text):
    """Predict the sentiment label for one raw review string.

    The text is run through ``preprocess_for_ml`` before vectorization because
    the notebook's ``vectorizer`` and ``model`` were fitted on reviews that had
    already been through that helper; feeding raw text would look tokens up in
    a vocabulary built from preprocessed ones.

    Parameters
    ----------
    text : str
        Raw, unprocessed review text.

    Returns
    -------
    The first (only) element of ``model.predict`` for this text; with the label
    mapping used in this notebook that is 0 for negative and 1 for positive.
    """
    processed_text = preprocess_for_ml(text)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    text_vector = vectorizer.transform([processed_text])
    prediction = model.predict(text_vector)
    return prediction[0]

# Spot-check two negated phrases and print the predicted label for each.
for sample in ("Stranger things was not terrible", "It was not a good movie"):
    print(predict_sentiment(sample))


0
1


In [25]:
# The classifier only accepts the TF-IDF features produced by the fitted
# `vectorizer`, so persist the vectorizer alongside it; a reloaded model on its
# own has no way to encode new text.
joblib.dump(vectorizer, "imbd_sentiment_vectorizer.pkl")
# Persist the classifier itself (kept last so the cell still displays this path).
joblib.dump(model, "imbd_sentiment_model.pkl")

['imbd_sentiment_model.pkl']

In [46]:
# Bundle vectorization and classification into a single estimator so both are
# fitted, saved, and applied together.
pipeline = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(
                ngram_range=(1, 2),   # unigrams and bigrams
                min_df=5,             # drop terms seen in fewer than 5 documents
                max_df=0.9,           # drop terms seen in more than 90% of documents
                max_features=50000,   # cap the vocabulary size
            ),
        ),
        (
            "clf",
            LogisticRegression(
                solver="liblinear",
                # C is the inverse of regularization strength, so 2.0 regularizes
                # less than the library default of 1.0.
                C=2.0,
                # The two classes are evenly split in this dataset and the split
                # is stratified, so balancing should have little effect here.
                class_weight="balanced",
                max_iter=1000,
            ),
        ),
    ]
)

# Fit both steps on the (already preprocessed) training reviews.
pipeline.fit(x_train, y_train)

# Persist the whole fitted pipeline: vectorizer and classifier in one file.
joblib.dump(pipeline, "imbd_sentiment_pipeline.pkl")

['imbd_sentiment_pipeline.pkl']

In [51]:
def predict_sentiment_2(text):
    """Predict the sentiment label for one raw review using the fitted pipeline.

    The text goes through ``preprocess_for_ml`` first; the pipeline then handles
    vectorization and classification in a single ``predict`` call.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    int
        The predicted label converted to a plain Python ``int``.
    """
    cleaned = preprocess_for_ml(text)
    # predict expects an iterable of documents, so wrap the single review.
    labels = pipeline.predict([cleaned])
    return int(labels[0])

In [53]:
# Spot-check the pipeline-based predictor on a short positive phrase.
predict_sentiment_2(text="It was a good movie")

1