In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Fetch the NLTK stop-word corpus, then build the stop-word lookup set and
# the stemmer used by the text-cleaning helpers further down.
nltk.download('stopwords')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Load the reviews and encode the labels: 'positive' -> 1, 'negative' -> 0.
# Any other label value maps to NaN and so ends up in neither class sample.
df = pd.read_csv("IMDB Dataset.csv")
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Draw the same number of reviews from each class with a fixed seed, then
# shuffle the combined rows and renumber the index from zero.
n_samples_per_class = 10000
pos_samples = df.loc[df['sentiment'] == 1].sample(
    n=n_samples_per_class, random_state=42
)
neg_samples = df.loc[df['sentiment'] == 0].sample(
    n=n_samples_per_class, random_state=42
)
balanced_df = (
    pd.concat([pos_samples, neg_samples])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Chat abbreviations mapped to their spelled-out phrases.
# Keys are upper-case because handle_chatwords looks each token up by its
# upper-cased form; a key in any other casing would never match.
# The phrases keep their capital letters and apostrophes (e.g. "I Don't Know").
chat_words = {
    "LOL": "Laughing Out Loud",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "TTYL": "Talk To You Later",
    "BRB": "Be Right Back",
    "IDK": "I Don't Know",
    "IMHO": "In My Humble Opinion",
    "FYI": "For Your Information",
    "ILY": "I Love You",
    "IMO": "In My Opinion",
    "JK": "Just Kidding",
    "IRL": "In Real Life",
    "SMH": "Shaking My Head",
    "YOLO": "You Only Live Once",
    "AFK": "Away From Keyboard",
    "BFF": "Best Friends Forever",
    "GG": "Good Game",
    "IDC": "I Don't Care",
    "TBH": "To Be Honest"
}

# Text preprocessing functions
def lowercase_text(series):
    """Return *series* with every string element converted to lower case.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series of the same length holding the lower-cased strings.
    """
    lowered = series.str.lower()
    return lowered

def remove_html(series):
    """Replace HTML tags in every string of *series* with a single space.

    Anything matching ``<...>`` (shortest match, within one line) is treated
    as a tag. A space is substituted rather than the empty string so that
    words separated only by markup, e.g. ``"great.<br /><br />The"``, remain
    separate tokens instead of fusing into one. Runs of spaces that this may
    leave behind are not collapsed here.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series of the same length with every tag replaced by a space.
    """
    # Compile once per call instead of handing the pattern string to re.sub
    # for every row.
    tag_pattern = re.compile(r'<.*?>')
    return series.apply(lambda text: tag_pattern.sub(' ', text))

def remove_punctuation(series):
    """Delete every ASCII punctuation character from each string in *series*.

    The characters removed are exactly those in ``string.punctuation``; they
    are dropped outright, not replaced by a space.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series of the same length with the punctuation removed.
    """
    # One translation table serves every row.
    strip_table = str.maketrans('', '', string.punctuation)

    def strip_punctuation(text):
        return text.translate(strip_table)

    return series.apply(strip_punctuation)

def handle_chatwords(series):
    """Expand chat abbreviations in every string of *series*.

    Each string is split on whitespace. A token whose upper-cased form is a
    key of the module-level ``chat_words`` mapping is replaced by the mapped
    phrase; every other token is kept as is. Tokens are re-joined with single
    spaces.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series of the same length with the abbreviations expanded.
    """
    def expand(text):
        return " ".join(
            chat_words.get(token.upper(), token) for token in text.split()
        )

    return series.apply(expand)

def remove_stopwords(series):
    """Drop English stop words from every string in *series*.

    Each string is split on whitespace, tokens found in the module-level
    ``stop_words`` set are discarded, and the remaining tokens are re-joined
    with single spaces.

    The membership test is case-insensitive. The NLTK English stop-word list
    is lower-case, so a case-sensitive comparison lets capitalised tokens
    through, for example "The", or the "I" and "Out" that the chat-word
    expansions introduce.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series of the same length; surviving tokens keep their
        original casing.
    """
    def keep_content_words(text):
        return " ".join(
            word for word in text.split() if word.lower() not in stop_words
        )

    return series.apply(keep_content_words)

def stem_text(series):
    """Stem every whitespace-separated token of each string in *series*.

    Tokens are passed one by one to the module-level ``stemmer`` and the
    stems are re-joined with single spaces.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series of the same length holding the stemmed strings.
    """
    def stem_tokens(text):
        return " ".join(map(stemmer.stem, text.split()))

    return series.apply(stem_tokens)

# Clean the raw reviews by running the preprocessing steps in order, then
# store the result in a new column next to the original text.
data = (
    balanced_df['review']
    .pipe(lowercase_text)
    .pipe(remove_html)
    .pipe(remove_punctuation)
    .pipe(handle_chatwords)
    .pipe(remove_stopwords)
    .pipe(stem_text)
)

balanced_df['clean_text'] = data


# Hold out 20% of the reviews for evaluation. Stratifying on the label keeps
# the equal class counts of balanced_df in both the training and the test
# split; a plain random split lets the per-class counts drift apart.
X_train, X_test, y_train, y_test = train_test_split(
    balanced_df['clean_text'],
    balanced_df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=balanced_df['sentiment'],
)


# Classifier: TF-IDF over uni-, bi- and tri-grams (vocabulary capped at
# 30,000 terms, English stop words dropped) feeding a logistic regression.
model_pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 3),
            max_features=30000,
        ),
    ),
    ('clf', LogisticRegression(max_iter=2000)),
])

# Fit both steps on the cleaned training reviews.
model_pipeline.fit(X_train, y_train)

# Score the held-out reviews, then print the confusion matrix and the
# per-class precision/recall/F1 report.
predictions = model_pipeline.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))

# Persist the fitted pipeline to disk.
# NOTE(review): only the TF-IDF and classifier steps are saved. The
# text-cleaning functions are applied to the data before the pipeline, so raw
# text has to be cleaned the same way before calling predict on a reloaded
# model.
joblib.dump(model_pipeline, 'sentiment_pipeline.pkl')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Confusion Matrix:
 [[1648  297]
 [ 191 1864]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.87      1945
           1       0.86      0.91      0.88      2055

    accuracy                           0.88      4000
   macro avg       0.88      0.88      0.88      4000
weighted avg       0.88      0.88      0.88      4000



['sentiment_pipeline.pkl']