# 🎯 Sentiment Analysis Project (IMDb Reviews)
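This notebook trains a TF-IDF + logistic regression classifier that labels IMDb movie reviews as Positive or Negative, evaluates it on the IMDb test split, and saves the fitted model and vectorizer to disk.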

In [None]:
# ✅ Step 1: Install and Import Libraries
!pip install datasets --quiet
import pandas as pd
import re
import string
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [None]:
# ✅ Step 2: Load IMDb Dataset
dataset = load_dataset('imdb')
# Dataset.to_pandas() is the library's own conversion to a DataFrame; it yields
# the same columns as pd.DataFrame(dataset[...]) without building the frame from
# a Python-level iteration over every row.
train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()
train_df.head()

In [None]:
# ✅ Step 3: Clean Text Data
def clean_text(text):
    """Normalize a raw review string before vectorization.

    Steps, in order: lowercase the text, replace HTML tags with a space,
    delete ASCII punctuation, delete runs of digits, then collapse all
    whitespace runs to single spaces and trim the ends.

    Args:
        text: Raw review text as a ``str``.

    Returns:
        The cleaned, lowercase string with words separated by single spaces.
    """
    text = text.lower()
    # Replace tags with a space, not the empty string: markup such as a
    # "<br />" line break often sits directly between two words, and deleting
    # it outright would fuse them into one token ("film.<br />loved" -> "filmloved").
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    text = re.sub(r"\d+", '', text)
    # Removing tags and digits leaves runs of spaces behind; normalize them.
    return ' '.join(text.split())

# Add the normalized column to both splits; the raw `text` column is kept so
# the two can be compared side by side below.
for split_df in (train_df, test_df):
    split_df['clean_text'] = split_df['text'].map(clean_text)
train_df[['text', 'clean_text']].head()

In [None]:
# ✅ Step 4: TF-IDF Vectorization
# Targets are the `label` column of each split, taken as-is.
y_train, y_test = train_df['label'], test_df['label']

# The vectorizer is fitted on the training split only; the test split is
# projected with the already-fitted vectorizer (transform, not fit_transform).
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['clean_text'])
X_test = vectorizer.transform(test_df['clean_text'])

In [None]:
# ✅ Step 5: Train Logistic Regression Model
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)
# Keep the fitted estimator as the cell's last expression so it is still displayed.
model

In [None]:
# ✅ Step 6: Evaluate the Model
y_pred = model.predict(X_test)
# Print the classification report first, then the confusion matrix,
# both computed from the true test labels and the predictions.
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred))

In [None]:
# ✅ Step 7: Predict Custom Input
sample = "The movie was boring and too long."
# Send the sample through the same steps as the training data: clean it, then
# project it with the already-fitted vectorizer (which takes a list of documents).
sample_clean = clean_text(sample)
sample_vector = vectorizer.transform([sample_clean])
result = model.predict(sample_vector)
# A predicted label of 1 is reported as Positive; any other value as Negative.
if result[0] == 1:
    sentiment = "Positive"
else:
    sentiment = "Negative"
print("Sentiment:", sentiment)

In [None]:
# ✅ Step 8: Save the Model and Vectorizer
# Persist both fitted objects: the vectorizer is required alongside the model,
# because new text has to be transformed by it before model.predict can be called.
joblib.dump(value=model, filename='sentiment_model.pkl')
joblib.dump(value=vectorizer, filename='vectorizer.pkl')