# Trained LogisticRegression model 

In [11]:
import pandas as pd
import re
import string
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# === Load dataset ===
df = pd.read_csv(r"C:\Users\Sajjad\Desktop\Universty__Work\IMDB Dataset.csv")

# === Text preprocessing function ===
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r"<.*?>", "", text)  # remove HTML tags
    text = re.sub(r"\d+", "", text)  # remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    return text

df["review"] = df["review"].apply(clean_text)

# === Split dataset ===
X = df["review"]
y = df["sentiment"].map({"positive": 1, "negative": 0})  # encode target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# === TF-IDF Vectorization ===
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# === Train Logistic Regression Model ===
model = LogisticRegression(max_iter=200, solver="liblinear")
model.fit(X_train_tfidf, y_train)

# === Predictions ===
y_pred = model.predict(X_test_tfidf)

# === Evaluation ===
acc = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy: {acc:.4f}\n")
print("📊 Classification Report")
print(classification_report(y_test, y_pred))

print("\n🔹 Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

# === Save model & vectorizer ===
joblib.dump(model, r"C:\Users\Sajjad\Desktop\Universty__Work\models\imdb_logreg_model.pkl")
joblib.dump(tfidf, r"C:\Users\Sajjad\Desktop\Universty__Work\models\tfidf_vectorizer.pkl")

print("\n✅ Model & Vectorizer saved in: C:\\Users\\Sajjad\\Desktop\\Universty__Work\\models")


✅ Accuracy: 0.9017

📊 Classification Report
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000


🔹 Confusion Matrix
[[4410  551]
 [ 432 4607]]

✅ Model & Vectorizer saved in: C:\Users\Sajjad\Desktop\Universty__Work\models


# Load Trained model files l & test 

In [12]:
import pandas as pd
import re
import string
import joblib
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# === Load dataset again ===
df = pd.read_csv(r"C:\Users\Sajjad\Desktop\Universty__Work\IMDB Dataset.csv")

# === Text preprocessing ===
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)  
    text = re.sub(r"\d+", "", text)  
    text = text.translate(str.maketrans("", "", string.punctuation))  
    return text

df["review"] = df["review"].apply(clean_text)

X = df["review"]
y = df["sentiment"].map({"positive": 1, "negative": 0})

# === New split: 70% train / 30% test ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True
)

# === Load saved model & vectorizer ===
model = joblib.load(r"C:\Users\Sajjad\Desktop\Universty__Work\models\imdb_logreg_model.pkl")
tfidf = joblib.load(r"C:\Users\Sajjad\Desktop\Universty__Work\models\tfidf_vectorizer.pkl")

# === Transform test data ===
X_test_tfidf = tfidf.transform(X_test)

# === Predict ===
y_pred = model.predict(X_test_tfidf)

# === Evaluation ===
acc = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy on 30% test set: {acc:.4f}\n")
print("📊 Classification Report")
print(classification_report(y_test, y_pred))

print("\n🔹 Confusion Matrix")
print(confusion_matrix(y_test, y_pred))


✅ Accuracy on 30% test set: 0.9111

📊 Classification Report
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      7411
           1       0.91      0.92      0.91      7589

    accuracy                           0.91     15000
   macro avg       0.91      0.91      0.91     15000
weighted avg       0.91      0.91      0.91     15000


🔹 Confusion Matrix
[[6682  729]
 [ 604 6985]]
