In [6]:
pip install datasets


Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd
import joblib
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [8]:
ds = load_dataset("imdb")
train_df = pd.DataFrame(ds['train'])
test_df = pd.DataFrame(ds['test'])


In [9]:
import re
import string

def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)  # remove extra whitespace
    return text.strip()

train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)


In [10]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])


In [11]:
y_train = train_df['label']
y_test = test_df['label']

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [12]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.87888

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [13]:
print("Number of documents in training set:", len(train_df))
print("Number of documents in test set:", len(test_df))
print("Vocabulary size (TF-IDF):", len(vectorizer.vocabulary_))


Number of documents in training set: 25000
Number of documents in test set: 25000
Vocabulary size (TF-IDF): 5000


- Accuracy: ~88% on IMDb test set with just TF-IDF + Logistic Regression.

**Limitations:**
- BoW/TF-IDF ignore word order and context.
- Can't capture sarcasm or subtle tone (e.g., "so bad it's good").
- Vocabulary fixed during training; can't handle OOV (out-of-vocabulary) words well.

This simple model acts as a baseline to compare later LLM-based models (like DistilBERT).


In [None]:
joblib.dump(model, 'baseline_logreg_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']