<a href="https://colab.research.google.com/github/MarianVelasquez/Sentiment-Analysis-and-Text-Mining/blob/main/Assigment10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
import string
import nltk
import spacy
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

# Download required NLTK resources.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize looks up on recent
# NLTK releases; 'punkt' is kept for older installations. Without the matching
# one, clean_text() fails with a LookupError at the word_tokenize call.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)


In [None]:
from tensorflow.keras.datasets import imdb

# Load dataset with top 10,000 words
num_words = 10_000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)

print(f"Training samples: {len(x_train)}, Test samples: {len(x_test)}")

# Get the word index mapping (word -> integer id) and build its inverse
# (integer id -> word) so encoded reviews can be turned back into text.
word_index = imdb.get_word_index()
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

def decode_review(text_ids):
    """Convert a sequence of encoded token ids back into a readable string.

    Each id is shifted down by 3 before being looked up in
    ``reverse_word_index``; ids with no entry after the shift are rendered
    as "?".

    Args:
        text_ids: Iterable of integer token ids for one review.

    Returns:
        The decoded words joined by single spaces.
    """
    # Note: Keras IMDB reserves indices 0-3 for special tokens
    lookup = reverse_word_index.get
    decoded_words = []
    for token_id in text_ids:
        decoded_words.append(lookup(token_id - 3, "?"))
    return " ".join(decoded_words)

# Show a sample decoded review together with its label
print(
    "Sample review:",
    decode_review(x_train[0]),
)
print(
    "Label:",
    y_train[0],
)

## 2. Text Preprocessing
# Shared resources used by clean_text(): a lemmatizer and the English
# stop-word list (held in a set for fast membership tests).
lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words('english'))

def clean_text(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Processing steps, in order: lowercase the text, replace every character
    that is neither a letter nor whitespace with a space, tokenize with
    ``nltk.word_tokenize``, drop tokens found in ``stop_words``, and
    lemmatize the remaining tokens with ``lemmatizer``.

    Args:
        text: Raw review string.

    Returns:
        The surviving tokens joined by single spaces (may be empty).
    """
    text = text.lower()
    # Replace punctuation and digits with a space instead of deleting them,
    # so words separated only by such characters ("well-known",
    # "great...acting") stay separate tokens rather than fusing into one.
    # The text is already lowercase, so matching a-z is sufficient.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Decode and clean a subset (for speed in demo): the first 5000 training
# reviews and the first 1000 test reviews, with their matching labels.
train_labels = y_train[:5000]
test_labels = y_test[:1000]
train_texts = list(map(clean_text, map(decode_review, x_train[:5000])))
test_texts = list(map(clean_text, map(decode_review, x_test[:1000])))

print("Sample cleaned text:", train_texts[0])


## 3. Feature Extraction using TF-IDF
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

print(
    "TF-IDF matrix shape:",
    X_train.shape,
)

## 4. Model Training and Evaluation

# The three classifiers being compared
lr = LogisticRegression(max_iter=1000)  # Logistic Regression
nb = MultinomialNB()                    # Naive Bayes
svm = LinearSVC()                       # SVM

# Fit each classifier on the training matrix, then predict the test matrix.
# Fitting order is unchanged: Logistic Regression, Naive Bayes, SVM.
lr_preds, nb_preds, svm_preds = [
    model.fit(X_train, train_labels).predict(X_test)
    for model in (lr, nb, svm)
]

# Evaluate
lr_accuracy = accuracy_score(test_labels, lr_preds)
print("Logistic Regression Accuracy:", lr_accuracy)
nb_accuracy = accuracy_score(test_labels, nb_preds)
print("Naive Bayes Accuracy:", nb_accuracy)
svm_accuracy = accuracy_score(test_labels, svm_preds)
print("SVM Accuracy:", svm_accuracy)

# Detailed per-class metrics are printed for Logistic Regression only
lr_report = classification_report(test_labels, lr_preds)
print("\nLogistic Regression Classification Report:\n", lr_report)

## 5. Pipeline and Hyperparameter Tuning

# Search space: two values for the vectorizer's max_features and three values
# for the classifier's C. Keys use the "<step name>__<parameter>" form.
params = {
    'tfidf__max_features': [3000, 5000],
    'clf__C': [0.1, 1, 10],
}

# Vectorizer and classifier are chained so the search fits both together on
# the raw (already cleaned) training texts.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ('clf', LogisticRegression(max_iter=1000)),
    ]
)

grid = GridSearchCV(
    pipeline,
    param_grid=params,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
grid.fit(train_texts, train_labels)

print("Best Parameters:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

## 6. Inference on New Data

# Best pipeline (vectorizer + classifier) found by the grid search
best_model = grid.best_estimator_

sample_reviews = [
    "I absolutely loved this movie, it was fantastic!",
    "Terrible movie, waste of time.",
    "It was okay, not the best but not the worst.",
    "Brilliant acting and great storyline.",
    "Worst plot ever, very disappointing."
]

# The pipeline was fitted on texts that had been passed through clean_text()
# (stop words removed, tokens lemmatized). New reviews must go through the
# same preprocessing, otherwise the model sees tokens and n-grams that differ
# from what it was trained on.
cleaned_reviews = [clean_text(review) for review in sample_reviews]
predictions = best_model.predict(cleaned_reviews)

# Report against the original, human-readable review text
for review, pred in zip(sample_reviews, predictions):
    print(f"Review: {review}\nPredicted Sentiment: {'Positive' if pred == 1 else 'Negative'}\n")