In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from wordcloud import WordCloud

def load_data(filepath):
    """Load the review dataset from a two-column CSV source.

    The source is read with no row treated as a header and the columns are
    named ``review`` and ``sentiment``. If the source carries its own header
    row (its first row is literally ``review,sentiment``), that row is
    dropped so it is not mistaken for a labelled example.

    Args:
        filepath: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with the columns ``review`` and ``sentiment``.
    """
    columns = ['review', 'sentiment']
    df = pd.read_csv(filepath, names=columns, header=None)
    if not df.empty:
        # With header=None a header line in the file lands in row 0 and
        # would otherwise show up as a third, bogus sentiment class.
        first_row = [str(value).strip().lower() for value in df.iloc[0]]
        if first_row == columns:
            df = df.iloc[1:].reset_index(drop=True)
    return df


# Lazily-built NLTK resources shared by every preprocess_text call, so the
# stop-word list is not rebuilt for each review.
_PREPROCESS_CACHE = {}


def preprocess_text(text):
    """Normalise a raw review into a space-separated string of lemmas.

    Steps: lowercase, replace HTML-like tags and every non-letter character
    with a space, split on whitespace, drop English stop words, and
    lemmatize the remaining words.

    Args:
        text: The raw review. ``None`` and NaN are treated as empty text;
            any other non-string value is converted with ``str``.

    Returns:
        The cleaned text, or ``''`` when nothing is left after cleaning.
    """
    # `text != text` is only true for NaN, which is how missing cells in a
    # DataFrame column arrive here.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # Replace tags with a space (not ''), otherwise "great<br />movie"
    # collapses into the single token "greatmovie".
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.split()
    if not words:
        return ''

    if 'resources' not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['resources'] = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    stop_words, lemmatizer = _PREPROCESS_CACHE['resources']

    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

def apply_preprocessing(df):
    """Add a ``clean_review`` column built from ``df['review']``.

    Each review is passed through ``preprocess_text``. The frame is modified
    in place and the same object is returned. Progress messages are printed
    before and after the pass.
    """
    print("Starting text preprocessing...")
    cleaned_reviews = df['review'].apply(preprocess_text)
    df['clean_review'] = cleaned_reviews
    print("Text preprocessing complete.")
    return df

def feature_extraction(df):
    """Fit a TF-IDF vectorizer on ``df['clean_review']`` and transform it.

    The vectorizer is created with ``max_features=5000``.

    Returns:
        A ``(X, vectorizer)`` tuple: the transformed feature matrix and the
        fitted vectorizer that produced it.
    """
    tfidf = TfidfVectorizer(max_features=5000)
    features = tfidf.fit_transform(df['clean_review'])
    return features, tfidf

def split_data(X, y):
    """Split features and labels into train and test parts.

    Uses ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.

    Returns:
        A ``(X_train, X_test, y_train, y_test)`` tuple.
    """
    parts = train_test_split(X, y, test_size=0.2, random_state=42)
    return tuple(parts)

def train_models(X_train, y_train):
    """Fit the two classifiers used by this script on the training data.

    Returns:
        A dict mapping ``'Naive Bayes'`` to a fitted ``MultinomialNB`` and
        ``'SVM'`` to a fitted linear-kernel ``SVC`` created with
        ``probability=True``.
    """
    estimators = {
        'Naive Bayes': MultinomialNB(),
        'SVM': SVC(kernel='linear', probability=True),
    }
    for estimator in estimators.values():
        estimator.fit(X_train, y_train)
    return estimators

def cross_validate_models(models, X_train, y_train):
    """Print the 5-fold cross-validated accuracy of every model.

    For each entry in ``models`` the mean and standard deviation of the
    fold accuracies are printed on one line. Nothing is returned.
    """
    for name in models:
        fold_scores = cross_val_score(
            models[name], X_train, y_train, cv=5, scoring='accuracy'
        )
        mean_acc = fold_scores.mean()
        spread = fold_scores.std()
        print(f"{name} acc: {mean_acc:.4f} (+/- {spread:.4f})")


def evaluate_models(models, X_test, y_test):
    """Report test-set metrics and plot a confusion matrix for each model.

    For every model this prints accuracy, precision, recall and F1 (with
    ``'positive'`` as the positive class), prints the full classification
    report, and shows a confusion-matrix heatmap.

    Args:
        models: Dict mapping a display name to a fitted estimator.
        X_test: Feature matrix to predict on.
        y_test: True labels; the metric calls assume the string labels
            ``'negative'`` and ``'positive'``.
    """
    # Pin the label order so the matrix rows/columns always line up with the
    # hard-coded tick labels below.
    class_labels = ['negative', 'positive']
    tick_labels = ['Negative', 'Positive']

    for name, model in models.items():
        print(f"\nEvaluating model: {name}")
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label='positive')
        recall = recall_score(y_test, y_pred, pos_label='positive')
        f1 = f1_score(y_test, y_pred, pos_label='positive')
        print(f"Accuracy: {acc:.4f}")
        print(f"Precision: {precision:.4f}")
        # Recall was computed but never reported before.
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(classification_report(y_test, y_pred))
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        plt.figure(figsize=(6, 4))
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=tick_labels,
            yticklabels=tick_labels,
        )
        plt.title(f'{name} Confusion Matrix')
        plt.xlabel('Predicted Labels')
        plt.ylabel('True Labels')
        plt.show()

def visualize_wordcloud(df, sentiment):
    """Show a word cloud of the cleaned reviews that carry ``sentiment``.

    Rows whose ``sentiment`` column equals the given value are selected,
    their ``clean_review`` texts are joined with spaces, and the resulting
    word cloud is displayed with matplotlib.
    """
    selected_reviews = df.loc[df['sentiment'] == sentiment, 'clean_review']
    corpus = ' '.join(selected_reviews)
    cloud = WordCloud(
        width=800, height=400, background_color='white'
    ).generate(corpus)

    plt.figure(figsize=(15, 7.5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'{sentiment.capitalize()} Review Word Cloud')
    plt.show()

def save_models(models, vectorizer, model_path='models/'):
    """Persist every model and the vectorizer with joblib.

    Each model is written to ``<model_path>/<name>.joblib`` and the
    vectorizer to ``<model_path>/vectorizer.joblib``. The directory is
    created if it does not exist.

    Args:
        models: Dict mapping a model name to the fitted estimator.
        vectorizer: The fitted vectorizer to store alongside the models.
        model_path: Target directory.
    """
    import os

    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(model_path, exist_ok=True)
    for name, model in models.items():
        joblib.dump(model, os.path.join(model_path, f"{name}.joblib"))
        print(f"Saved model: {name}")
    joblib.dump(vectorizer, os.path.join(model_path, "vectorizer.joblib"))
    print("Saved vectorizer.")

def load_models(model_path='models/'):
    """Load the saved models and vectorizer from ``model_path``.

    Reads ``Naive Bayes.joblib``, ``SVM.joblib`` and ``vectorizer.joblib``
    from the given directory.

    Args:
        model_path: Directory the files were saved to.

    Returns:
        A ``(models, vectorizer)`` tuple, where ``models`` maps the model
        name (file name without ``.joblib``) to the loaded estimator.
    """
    # `os` is not imported at module level in this file, so import it here;
    # without this the function fails with NameError.
    import os

    # NOTE: joblib.load unpickles arbitrary objects. Only load files from a
    # trusted location.
    models = {}
    model_files = ['Naive Bayes.joblib', 'SVM.joblib']
    for file in model_files:
        name = file.replace('.joblib', '')
        models[name] = joblib.load(os.path.join(model_path, file))
    vectorizer = joblib.load(os.path.join(model_path, "vectorizer.joblib"))
    print("Loaded all models and vectorizer.")
    return models, vectorizer

def predict_new_review(review, models, vectorizer):
    """Predict the label of a single raw review with every model.

    The review is cleaned with ``preprocess_text``, vectorized with the
    given fitted vectorizer, and passed to each model's ``predict``.

    Returns:
        A dict mapping each model name to its predicted label.
    """
    cleaned = preprocess_text(review)
    features = vectorizer.transform([cleaned])
    return {
        name: model.predict(features)[0]
        for name, model in models.items()
    }

def main():
    """Run the whole pipeline on ``IMDB_Dataset.csv``.

    Loads and preprocesses the data, extracts TF-IDF features, splits them,
    trains and cross-validates the models, evaluates them on the test split,
    shows a word cloud per sentiment, saves the models, and finally prints
    each model's prediction for a sample review.
    """
    filepath = 'IMDB_Dataset.csv'

    # Data preparation.
    df = apply_preprocessing(load_data(filepath))
    X, vectorizer = feature_extraction(df)
    y = df['sentiment']
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Training and evaluation.
    models = train_models(X_train, y_train)
    cross_validate_models(models, X_train, y_train)
    evaluate_models(models, X_test, y_test)

    # Visualisation and persistence.
    for sentiment in ('positive', 'negative'):
        visualize_wordcloud(df, sentiment)
    save_models(models, vectorizer)

    # Demo prediction on a sample review.
    sample_review = " "
    predictions = predict_new_review(sample_review, models, vectorizer)
    for model, pred in predictions.items():
        print(f"{model}: {pred}")

# Run the pipeline only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()


Starting text preprocessing...
Text preprocessing complete.
