In [None]:
!pip install pandas numpy scikit-learn nltk textblob transformers matplotlib seaborn plotly dash shap wordcloud xgboost torch tensorflow vaderSentiment ipython flask accelerate emoji joblib

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)  # Suppress Keras warnings
warnings.filterwarnings('ignore', category=FutureWarning)  # Suppress FutureWarnings

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
from xgboost import XGBClassifier
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud
import joblib
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import emoji

# Fetch the NLTK data packages this script relies on (tokenizer data,
# stop-word lists and WordNet). Packages are requested in a fixed order.
for _nltk_package in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_nltk_package)

# Human-readable label for each numeric class id (1 = positive, 0 = negative).
# The emoji are spelled as escape sequences so that the file's on-disk
# encoding cannot corrupt them; they had previously been saved as mojibake.
label_mapping = {1: "Positive \U0001F642", 0: "Negative \U0001F61E"}

# English stop words held in a set; preprocess_text tests every token for
# membership in it.
stop_words = set(stopwords.words('english'))
# Single shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    The text is lowercased; HTML tags, URLs, @mentions, #hashtags and
    punctuation are removed; the remainder is tokenized, tokens found in
    ``stop_words`` are dropped and the rest are lemmatized.

    Args:
        text: Raw text. ``None`` and float NaN are treated as empty text;
            any other non-string value is converted with ``str``.

    Returns:
        The cleaned text as one string (empty when nothing survives).
    """
    # Missing values would otherwise fail on ``.lower()``; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Replace HTML markup such as "<br />" with a space. Without this the tag
    # name survives punctuation stripping and becomes a spurious "br" token.
    # Requiring a letter right after "<" keeps plain comparisons ("a < b")
    # from being mistaken for tags.
    text = re.sub(r'</?[a-z][^<>]*>', ' ', text)
    # "http\S+" already covers https URLs.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

def load_and_preprocess_dataset(file_path):
    """Load a labelled text CSV and add cleaned text plus numeric labels.

    Args:
        file_path: Path to a CSV file with a 'text' column and a 'label'
            column whose values are 'positive' or 'negative' (matched
            case-insensitively, ignoring surrounding whitespace).

    Returns:
        The DataFrame with an added 'cleaned_text' column and with 'label'
        replaced by integers (1 for positive, 0 for negative).

    Raises:
        ValueError: If a required column is missing or a label value is not
            recognized.
    """
    df = pd.read_csv(file_path)
    if not {'text', 'label'}.issubset(df.columns):
        raise ValueError("Dataset must have 'text' and 'label' columns")

    # Empty CSV cells are read as NaN; clean them as empty strings instead of
    # failing inside the text preprocessing.
    df['cleaned_text'] = df['text'].fillna('').astype(str).apply(preprocess_text)

    normalized = df['label'].astype(str).str.strip().str.lower()
    encoded = normalized.map({'positive': 1, 'negative': 0})
    unrecognized = encoded.isna()
    if unrecognized.any():
        # Fail here rather than letting NaN labels reach model training.
        examples = sorted(df.loc[unrecognized, 'label'].astype(str).unique())[:5]
        raise ValueError(
            f"Unrecognized label values (expected 'positive' or 'negative'): {examples}"
        )
    df['label'] = encoded.astype('int64')
    return df

def get_sentiment(text):
    """Classify text by the sign of its TextBlob polarity score.

    Returns:
        "Positive" for a polarity above zero, "Negative" for a polarity
        below zero, and "Neutral" otherwise.
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

def get_vader_emotions(text):
    """Return the VADER polarity scores for ``text``.

    Args:
        text: The text to score.

    Returns:
        A dict with the keys "positive", "negative", "neutral" and
        "compound", taken from the analyzer's 'pos', 'neg', 'neu' and
        'compound' scores.
    """
    # Build the analyzer once and keep it on the function object, so that
    # applying this function row by row does not construct a new analyzer
    # for every row.
    analyzer = getattr(get_vader_emotions, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_vader_emotions._analyzer = analyzer

    scores = analyzer.polarity_scores(text)
    return {
        "positive": scores['pos'],
        "negative": scores['neg'],
        "neutral": scores['neu'],
        "compound": scores['compound'],
    }

def apply_sentiment_analysis(df):
    """Add a TextBlob 'sentiment' column and print sample sentiment output.

    TextBlob sentiment is computed for every row of 'cleaned_text' and stored
    in ``df['sentiment']`` (the frame is modified in place). VADER scores are
    computed only for a copy of the first 10 rows and are printed, not stored
    on ``df``.

    Args:
        df: DataFrame with a 'cleaned_text' column.

    Returns:
        The same DataFrame, now carrying the 'sentiment' column.
    """
    df['sentiment'] = df['cleaned_text'].apply(get_sentiment)
    print("TextBlob Sentiment Analysis:")
    print(df[['cleaned_text', 'sentiment']].head())

    # Work on a copy so the VADER column never touches the caller's frame.
    df_subset = df.head(10).copy()
    df_subset['emotions'] = df_subset['cleaned_text'].apply(get_vader_emotions)
    print("\nVADER Sentiment Analysis (first 10 rows):")
    # Print the whole 10-row subset; a further .head() would show only 5 rows
    # and contradict the banner above.
    print(df_subset[['cleaned_text', 'sentiment', 'emotions']])
    return df

def train_traditional_models(X_train, X_test, y_train, y_test, random_state=42):
    """Fit TF-IDF based classifiers and collect their evaluation metrics.

    Args:
        X_train: Training texts.
        X_test: Test texts.
        y_train: Training labels.
        y_test: Test labels.
        random_state: Seed passed to the randomized models (Decision Tree,
            Random Forest, XGBoost) so repeated runs give the same results.

    Returns:
        A tuple ``(models, vectorizer, results)``: the fitted models keyed by
        display name, the fitted TfidfVectorizer, and a dict keyed by the
        same names holding "accuracy", "report", "fpr", "tpr" and "roc_auc".
    """
    vectorizer = TfidfVectorizer(max_features=5000)
    # Fit the vocabulary on the training split only, then reuse it for the
    # test split.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Naive Bayes": MultinomialNB(),
        "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=random_state),
        "Random Forest": RandomForestClassifier(
            n_estimators=100, max_depth=10, random_state=random_state
        ),
        "XGBoost": XGBClassifier(eval_metric='logloss', random_state=random_state),
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        # Column 1 holds the probability of class 1, used for the ROC curve.
        y_prob = model.predict_proba(X_test_vec)[:, 1]
        accuracy = accuracy_score(y_test, y_pred)
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        results[name] = {
            "accuracy": accuracy,
            "report": classification_report(y_test, y_pred, output_dict=True),
            # Plain lists keep the metrics easy to serialize.
            "fpr": fpr.tolist(),
            "tpr": tpr.tolist(),
            "roc_auc": auc(fpr, tpr),
        }
        print(f"{name} Accuracy: {accuracy}")
    return models, vectorizer, results

def train_lstm(X_train, X_test, y_train, y_test, vocab_size=5000, max_length=100,
               train_limit=5000, test_limit=1000, epochs=3):
    """Train a small LSTM classifier and evaluate it on a test subset.

    Args:
        X_train: Training texts.
        X_test: Test texts.
        y_train: Training labels (0/1).
        y_test: Test labels (0/1).
        vocab_size: Number of words kept by the tokenizer; also the embedding
            input dimension.
        max_length: Length every sequence is padded or truncated to.
        train_limit: Number of leading training samples used for fitting
            (``None`` uses all of them).
        test_limit: Number of leading test samples used for validation and
            evaluation (``None`` uses all of them).
        epochs: Number of training epochs.

    Returns:
        A tuple ``(lstm_model, tokenizer, lstm_results)`` where
        ``lstm_results`` has the single key "LSTM" mapping to "accuracy",
        "report", "fpr", "tpr" and "roc_auc".
    """
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(X_train)
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
    X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

    # Only the leading slices are used for training and evaluation.
    X_fit = X_train_pad[:train_limit]
    y_fit = np.array(y_train)[:train_limit]
    X_eval = X_test_pad[:test_limit]
    y_eval = np.array(y_test)[:test_limit]

    lstm_model = Sequential([
        # mask_zero=True marks the zero padding appended by pad_sequences so
        # the LSTM skips it; unmasked, the final state is computed after a
        # run of padding steps for every review shorter than max_length.
        Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True),
        LSTM(64, return_sequences=False),
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    lstm_model.fit(X_fit, y_fit,
                   epochs=epochs, batch_size=32,
                   validation_data=(X_eval, y_eval),
                   verbose=1)

    # predict() returns an (n, 1) column; flatten it so the metric functions
    # receive 1-D arrays matching the labels.
    y_prob_lstm = lstm_model.predict(X_eval).ravel()
    y_pred_lstm = (y_prob_lstm > 0.5).astype(int)
    lstm_accuracy = accuracy_score(y_eval, y_pred_lstm)
    fpr, tpr, _ = roc_curve(y_eval, y_prob_lstm)
    lstm_results = {
        "LSTM": {
            "accuracy": lstm_accuracy,
            "report": classification_report(y_eval, y_pred_lstm, output_dict=True),
            "fpr": fpr.tolist(),
            "tpr": tpr.tolist(),
            "roc_auc": auc(fpr, tpr)
        }
    }
    print(f"LSTM Accuracy: {lstm_accuracy}")
    return lstm_model, tokenizer, lstm_results

def train_bert(X_train, X_test, y_train, y_test):
    """Fine-tune a pretrained DistilBERT sentiment model on a small subset.

    The first 1000 training samples are used for one epoch of fine-tuning and
    the first 200 test samples for evaluation.

    Args:
        X_train: Training texts; must support slicing and ``.tolist()``.
        X_test: Test texts; same requirements.
        y_train: Training labels (0/1); same requirements.
        y_test: Test labels (0/1); same requirements.

    Returns:
        A tuple ``(bert_model, tokenizer_bert, bert_results)`` where
        ``bert_results`` has the single key "BERT" mapping to "accuracy",
        "report", "fpr", "tpr" and "roc_auc".
    """
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer_bert = AutoTokenizer.from_pretrained(model_name)
    bert_model = AutoModelForSequenceClassification.from_pretrained(model_name)

    train_texts = X_train[:1000].tolist()
    test_texts = X_test[:200].tolist()
    train_labels = y_train[:1000].tolist()
    test_labels = y_test[:200].tolist()

    train_encodings = tokenizer_bert(train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
    test_encodings = tokenizer_bert(test_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

    class SentimentDataset(torch.utils.data.Dataset):
        """Pairs the tokenizer encodings with their labels for the Trainer."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            # One row of every encoding tensor plus the label under the
            # 'labels' key.
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = SentimentDataset(train_encodings, train_labels)
    test_dataset = SentimentDataset(test_encodings, test_labels)

    num_epochs = 1
    batch_size = 8
    # Size the warm-up from the actual amount of training. A fixed 500 steps
    # exceeded the whole run (about 125 steps for 1000 samples at batch size
    # 8), so the learning rate never finished warming up.
    # NOTE(review): assumes a single device and no gradient accumulation.
    steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
    warmup_steps = max(1, (steps_per_epoch * num_epochs) // 10)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch"  # Updated from evaluation_strategy
    )

    trainer = Trainer(
        model=bert_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    trainer.train()
    predictions = trainer.predict(test_dataset)
    # predictions.predictions holds raw logits: softmax gives the class-1
    # probability for the ROC curve, argmax gives the predicted class.
    y_prob_bert = torch.softmax(torch.tensor(predictions.predictions), dim=1).numpy()[:, 1]
    y_pred_bert = np.argmax(predictions.predictions, axis=1)
    bert_accuracy = accuracy_score(test_labels, y_pred_bert)
    fpr, tpr, _ = roc_curve(test_labels, y_prob_bert)
    bert_results = {
        "BERT": {
            "accuracy": bert_accuracy,
            "report": classification_report(test_labels, y_pred_bert, output_dict=True),
            "fpr": fpr.tolist(),
            "tpr": tpr.tolist(),
            "roc_auc": auc(fpr, tpr)
        }
    }
    print(f"BERT Accuracy: {bert_accuracy}")
    return bert_model, tokenizer_bert, bert_results

def visualize_model_performance(results):
    """Save a horizontal bar chart of each model's accuracy.

    Args:
        results: Dict keyed by model name; each value is a dict with an
            "accuracy" entry.

    The chart is written to "model_accuracy_bar.png" and the figure is then
    closed.
    """
    labels = [*results]
    scores = [metrics["accuracy"] for metrics in results.values()]

    plt.figure(figsize=(12, 6))
    sns.barplot(x=scores, y=labels, hue=labels, palette="viridis", legend=False)
    plt.title("Model Accuracy Comparison")
    plt.xlabel("Accuracy")
    plt.ylabel("Model")
    plt.savefig("model_accuracy_bar.png")
    plt.close()

def generate_wordclouds(df):
    """Save one word-cloud image per sentiment class.

    Args:
        df: DataFrame with 'sentiment' and 'cleaned_text' columns.

    For the "Positive" rows and then the "Negative" rows, the cleaned texts
    are joined and rendered to "wordcloud_positive.png" and
    "wordcloud_negative.png" respectively. A class whose joined text is
    blank is skipped.
    """
    # (sentiment value, plot title, output file), processed in this order.
    targets = (
        ('Positive', "Word Cloud for Positive Sentiment", "wordcloud_positive.png"),
        ('Negative', "Word Cloud for Negative Sentiment", "wordcloud_negative.png"),
    )
    for sentiment_value, title, output_file in targets:
        selected = df[df['sentiment'] == sentiment_value]
        corpus = ' '.join(selected['cleaned_text'])
        if not corpus.strip():
            continue
        cloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)
        plt.figure(figsize=(10, 5))
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(title)
        plt.savefig(output_file)
        plt.close()

def main(dataset_path='IMDB-Dataset.csv'):
    """Run the full pipeline: load data, train every model, save the best.

    Args:
        dataset_path: CSV file to load; defaults to 'IMDB-Dataset.csv' in the
            current working directory.

    Side effects: prints progress, writes the accuracy chart and word-cloud
    images, saves the best-scoring model with its tokenizer or vectorizer,
    and dumps all metrics to "model_metrics.pkl".
    """
    print(f"Loading dataset from {dataset_path}...")
    df = load_and_preprocess_dataset(dataset_path)
    print("Dataset Shape:", df.shape)
    print(df.head())

    df = apply_sentiment_analysis(df)

    print("\nSplitting data into training and testing sets...")
    X = df['cleaned_text']
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("\nTraining traditional models...")
    models, vectorizer, results = train_traditional_models(X_train, X_test, y_train, y_test)

    print("\nTraining LSTM model...")
    lstm_model, lstm_tokenizer, lstm_results = train_lstm(X_train, X_test, y_train, y_test)
    results.update(lstm_results)

    print("\nTraining BERT model...")
    bert_model, bert_tokenizer, bert_results = train_bert(X_train, X_test, y_train, y_test)
    results.update(bert_results)

    print("\nFinal Model Accuracies:")
    for name in results:
        print(f"{name}: {results[name]['accuracy']}")

    print("\nGenerating visualizations...")
    visualize_model_performance(results)
    generate_wordclouds(df)

    print("\nSaving the best model...")
    # NOTE(review): the traditional models are scored on the full test split,
    # while train_lstm and train_bert score on leading subsets of it, so the
    # accuracies compared here are not measured on identical data.
    best_model_name = max(results, key=lambda k: results[k]["accuracy"])
    print(f"Best Model: {best_model_name} with Accuracy: {results[best_model_name]['accuracy']}")

    # Each model family is persisted together with the artifact needed to
    # turn raw text into its input (tokenizer or vectorizer).
    if best_model_name == "LSTM":
        lstm_model.save("lstm_sentiment_model.h5")
        joblib.dump(lstm_tokenizer, "lstm_tokenizer.pkl")
    elif best_model_name == "BERT":
        bert_model.save_pretrained("bert_sentiment_model")
        bert_tokenizer.save_pretrained("bert_sentiment_model")
    else:
        best_model = models[best_model_name]
        joblib.dump(best_model, "sentiment_model.pkl")
        joblib.dump(vectorizer, "vectorizer.pkl")

    joblib.dump(results, "model_metrics.pkl")
    print("Metrics saved to 'model_metrics.pkl'")
    print("Files saved:", os.listdir('.'))


if __name__ == "__main__":
    main()




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mallu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mallu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mallu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mallu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loading dataset from IMDB-Dataset.csv...
Dataset Shape: (50000, 3)
                                                text  label  \
0  One of the other reviewers has mentioned that ...      1   
1  A wonderful little production. <br /><br />The...      1   
2  I thought this was a wonderful way to spend ti...      1   
3  Basically there's a family where a little boy ...      0   
4  Petter Mattei's "Love in the Time of Money" is...      1   

                                        cleaned_text  
0  one reviewer mentioned watching 1 oz episode y...  
1  wonderful little production br br filming tech...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  
TextBlob Sentiment Analysis:
                                        cleaned_text sentiment
0  one reviewer mentioned watching 1 oz episode y...  Positive
1  wonderful little production br br filming tech...  Positive
2  tho

Epoch,Training Loss,Validation Loss
1,0.3104,0.422123


BERT Accuracy: 0.815

Final Model Accuracies:
Logistic Regression: 0.8871
Naive Bayes: 0.8543
Decision Tree: 0.7295
Random Forest: 0.8272
XGBoost: 0.8608
LSTM: 0.644
BERT: 0.815

Generating visualizations...

Saving the best model...
Best Model: Logistic Regression with Accuracy: 0.8871
Metrics saved to 'model_metrics.pkl'
Files saved: ['.ipynb_checkpoints', 'app.py', 'IMDB-Dataset.csv', 'logs', 'model_accuracy_bar.png', 'model_metrics.pkl', 'results', 'sentiment_model.pkl', 'static', 'templates', 'Untitled.ipynb', 'vectorizer.pkl', 'wordcloud_negative.png', 'wordcloud_positive.png']
