<a href="https://colab.research.google.com/github/KARTIKSINGH542/project1/blob/main/Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Download the NLTK 'punkt' and 'stopwords' resources used by the nltk
# imports above.
nltk.download('punkt')
nltk.download('stopwords')
import spacy
# Load spaCy's "en_core_web_sm" pipeline once; `nlp` is a notebook-wide global.
nlp = spacy.load("en_core_web_sm")
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:
from google.colab import files

# Open Colab's upload widget. `uploaded` is a mapping keyed by the name each
# uploaded file was stored under.
uploaded = files.upload()

# The " (1)" suffix only appears when a file with the same name already
# existed, so the dataset is not always called 'train (1).csv'. Keep that name
# when it is what was uploaded (or when nothing was uploaded and the file is
# expected to be on disk already); otherwise read the file that was actually
# uploaded instead of failing with FileNotFoundError.
DEFAULT_CSV = 'train (1).csv'
if not uploaded or DEFAULT_CSV in uploaded:
    csv_name = DEFAULT_CSV
else:
    csv_name = next(iter(uploaded))
df = pd.read_csv(csv_name)

# Rows without tweet text cannot be preprocessed, so drop them up front.
df = df.dropna(subset=['text'])
X = df['text']

In [None]:
import spacy
import nltk
from nltk.tokenize import TreebankWordTokenizer

# Treebank word tokenizer for the preprocessing step.
# NOTE: a later cell rebinds the global name `tokenizer` to a BERT tokenizer.
tokenizer = TreebankWordTokenizer()

# force=True re-fetches both resources even if a copy is already present.
# The original cell followed these with a second, unforced download of the
# same two packages; right after a forced download that is redundant, so it
# has been dropped.
nltk.download('punkt', force=True)
nltk.download('stopwords', force=True)

# Shared preprocessing resources (notebook globals).
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
stemmer = PorterStemmer()

# Dedicated tokenizer instance for preprocess_tweet. The notebook-level name
# `tokenizer` is rebound to a BertTokenizer in a later cell, so reading that
# global here would silently switch this function to BERT's sub-word
# tokenization whenever it is called after that cell has run.
_TWEET_TOKENIZER = TreebankWordTokenizer()


def preprocess_tweet(text):
    """Normalize one tweet into a space-separated string of tokens.

    Steps, in order: strip URLs, @mentions and #hashtags (the whole hashtag
    word is removed, not just the '#'), delete every character that is not an
    ASCII letter or whitespace, lowercase, tokenize, drop English stop words
    and one-character tokens, Porter-stem, then pass the stems through the
    spaCy pipeline and keep each token's lemma.

    Args:
        text: The tweet as a ``str``.

    Returns:
        The processed tokens joined by single spaces (may be an empty string).
    """
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()

    words = _TWEET_TOKENIZER.tokenize(text)
    words = [word for word in words if word not in stop_words and len(word) > 1]
    stemmed = [stemmer.stem(word) for word in words]

    # The lemmatizer sees the stemmed text, not the original words.
    doc = nlp(" ".join(stemmed))
    lemmatized = [token.lemma_ for token in doc]

    return " ".join(lemmatized)

# Cast to str first so every cell is a string before preprocess_tweet applies
# its regex substitutions; the result is stored in a new column.
df['normalized_text'] = df['text'].astype(str).apply(preprocess_tweet)

In [None]:
from wordcloud import WordCloud

# Concatenate every normalized tweet into one corpus string for the cloud.
all_text = ' '.join(df['normalized_text'])

# Render the cloud on a white 1000x500 canvas.
cloud_options = dict(width=1000, height=500, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(all_text)

# Show the rendered image without axis ticks.
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Most Frequent Words in Tweets", fontsize=18)
plt.axis('off')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over the normalized tweets, capped at 5000 features.
bow_vectorizer = CountVectorizer(max_features=5000)
X_bow = bow_vectorizer.fit_transform(df['normalized_text'])

# NOTE(review): X_bow is only printed here; the later cells visible in this
# notebook train on the BERT embeddings instead.
print(X_bow.shape)

In [None]:
from transformers import BertTokenizer, BertModel
import torch
# Load the pretrained 'bert-base-uncased' tokenizer and encoder.
# NOTE: this rebinds the global `tokenizer`, which an earlier cell set to a
# TreebankWordTokenizer; re-running cells out of order changes which tokenizer
# the name refers to.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding for ``text`` as a NumPy array.

    The text is encoded with the global ``tokenizer`` (truncated to at most
    128 tokens), passed through the global ``model`` with gradient tracking
    disabled, and the last hidden state is averaged over the token dimension.

    Args:
        text: The input text to embed.

    Returns:
        The pooled embedding with singleton dimensions squeezed out; for a
        single text this is presumably a ``(768,)`` vector, as the original
        inline comment states.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state  # (1, seq_len, hidden_size)

    # Average across the sequence axis, then drop the batch axis.
    pooled = token_states.mean(dim=1)
    return pooled.squeeze().numpy()

In [None]:
import numpy as np
from tqdm import tqdm

# Embed every normalized tweet. Embedding is best-effort: a tweet that fails
# is replaced by a zero vector so that X_bert stays row-aligned with df, but
# each failure is recorded and reported instead of being silently swallowed
# (a systematic error would otherwise yield an all-zero feature matrix with
# no visible symptom).
bert_embeddings = []
failed_embeddings = []  # (row position, error description) for each failure
for position, tweet in enumerate(tqdm(df['normalized_text'])):
    try:
        emb = get_bert_embedding(tweet)
    except Exception as e:
        failed_embeddings.append((position, repr(e)))
        emb = np.zeros(768)
    bert_embeddings.append(emb)

X_bert = np.array(bert_embeddings)
print(X_bert.shape)

if failed_embeddings:
    first_position, first_error = failed_embeddings[0]
    print(
        f"WARNING: {len(failed_embeddings)} of {len(bert_embeddings)} tweets could not be "
        f"embedded and were replaced by zero vectors "
        f"(first failure at row {first_position}: {first_error})"
    )

In [None]:
from sklearn.model_selection import train_test_split

# Use the BERT embeddings as the feature matrix (this rebinds the global X)
# and the 'target' column as the labels.
X = X_bert
y = df['target'].values

# Hold out 20% of the rows for testing; random_state=42 makes the split
# reproducible. The resulting test rows are addressed by position within the
# split, not by their position in df.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def evaluate_model(name, model, X_test, y_test):
    """Print accuracy, precision, recall, F1 and related reports for a model.

    Args:
        name: Label printed in the report header.
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        None. Everything is written to stdout.
    """
    predictions = model.predict(X_test)

    print(f"{name} Performance:")
    # Label text is padded so the values line up in the printed report.
    headline_metrics = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1-score :", f1_score),
    )
    for label, scorer in headline_metrics:
        print(label, scorer(y_test, predictions))

    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
    print("=" * 50)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Three untuned baseline classifiers on the train split.
lr_model = LogisticRegression(max_iter=1000)
svm_model = SVC(kernel='linear')
nb_model = GaussianNB()

# Fit and report each baseline in turn, in the same order as before.
baselines = (
    ("Logistic Regression", lr_model),
    ("SVM", svm_model),
    ("Naive Bayes", nb_model),
)
for baseline_name, baseline in baselines:
    baseline.fit(X_train, y_train)
    evaluate_model(baseline_name, baseline, X_test, y_test)

import seaborn as sns
import matplotlib.pyplot as plt

def plot_conf_matrix(y_true, y_pred, title):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    NOTE: a later cell defines another ``plot_conf_matrix`` with a different
    signature (model, name, X_test, y_test), which replaces this one once run.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        title: Figure title.
    """
    counts = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(6, 4))
    # fmt="d" prints the cell annotations as integers.
    sns.heatmap(counts, annot=True, fmt="d", cmap="Blues")
    plt.title(title)
    plt.ylabel("Actual")
    plt.xlabel("Predicted")
    plt.show()

# Confusion matrix of the baseline logistic regression on the test split
# (uses the (y_true, y_pred, title) variant of plot_conf_matrix defined above).
plot_conf_matrix(y_test, lr_model.predict(X_test), "Logistic Regression Confusion Matrix")


In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def plot_roc(models, X_test, y_test):
    """Overlay the ROC curve of every model in ``models`` on a single figure.

    Args:
        models: Mapping of display name -> fitted classifier.
        X_test: Features to score.
        y_test: True labels for ``X_test``.
    """
    plt.figure(figsize=(8, 6))

    for label, clf in models.items():
        # Use the second probability column when the model offers
        # predict_proba; otherwise fall back to decision_function scores.
        scores = (
            clf.predict_proba(X_test)[:, 1]
            if hasattr(clf, "predict_proba")
            else clf.decision_function(X_test)
        )
        false_pos, true_pos, _thresholds = roc_curve(y_test, scores)
        area = auc(false_pos, true_pos)
        plt.plot(false_pos, true_pos, label=f"{label} (AUC = {area:.2f})")

    # Dashed diagonal for reference.
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.title("ROC Curves")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.grid()
    plt.show()

def plot_precision_recall(models, X_test, y_test):
    """Overlay the precision-recall curve of every model on a single figure.

    Args:
        models: Mapping of display name -> fitted classifier.
        X_test: Features to score.
        y_test: True labels for ``X_test``.
    """
    plt.figure(figsize=(8, 6))

    for label, clf in models.items():
        # Use the second probability column when available, else the
        # decision_function scores.
        if not hasattr(clf, "predict_proba"):
            scores = clf.decision_function(X_test)
        else:
            scores = clf.predict_proba(X_test)[:, 1]

        precisions, recalls, _thresholds = precision_recall_curve(y_test, scores)
        plt.plot(recalls, precisions, label=label)

    plt.title("Precision-Recall Curves")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend()
    plt.grid()
    plt.show()

from sklearn.metrics import confusion_matrix

def plot_conf_matrix(model, name, X_test, y_test):
    """Predict on ``X_test`` and draw the model's confusion matrix as a heatmap.

    NOTE: this replaces the earlier ``plot_conf_matrix(y_true, y_pred, title)``
    defined in a previous cell.

    Args:
        model: Fitted classifier exposing ``predict``.
        name: Display name used in the figure title.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.
    """
    counts = confusion_matrix(y_test, model.predict(X_test))

    plt.figure(figsize=(5, 4))
    # fmt='d' prints the cell annotations as integers.
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{name} Confusion Matrix")
    plt.ylabel("Actual")
    plt.xlabel("Predicted")
    plt.show()

def show_misclassified(model, name, X_test, y_test, raw_texts):
    """Print up to five samples that ``model`` labels incorrectly.

    Args:
        model: Fitted classifier exposing ``predict``.
        name: Display name used in the header line.
        X_test: Features to predict on.
        y_test: True labels, indexable by position.
        raw_texts: Texts indexed with the same positions as ``y_test``; entry
            ``i`` must be the text behind ``X_test[i]`` for the printed tweet
            to match the printed labels.
    """
    predicted = model.predict(X_test)
    wrong_positions = np.nonzero(predicted != y_test)[0]

    print(f"\n🔍 Misclassified by {name}:")
    # Only the first five mismatches are shown.
    for pos in wrong_positions[:5]:
        tweet = raw_texts[pos]
        print(f"\nTweet: {tweet}")
        truth, guess = y_test[pos], predicted[pos]
        print(f"True Label: {truth}, Predicted: {guess}")

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Logistic regression: exhaustive grid over C and solver, 3-fold CV, scored
# by F1.
lr_params = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}
grid_lr = GridSearchCV(LogisticRegression(max_iter=1000), lr_params, cv=3, scoring='f1', n_jobs=-1)
grid_lr.fit(X_train, y_train)
best_lr = grid_lr.best_estimator_

# SVM: 5 randomly sampled configurations, 3-fold CV, scored by F1.
# probability=True makes predict_proba available for the curve plots below.
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
random_svm = RandomizedSearchCV(SVC(probability=True), svm_params, n_iter=5, cv=3, scoring='f1', n_jobs=-1, random_state=42)
random_svm.fit(X_train, y_train)
best_svm = random_svm.best_estimator_

# Gaussian Naive Bayes is fitted as-is, without a search.
nb_model = GaussianNB().fit(X_train, y_train)

models = {
    "Logistic Regression": best_lr,
    "SVM": best_svm,
    "Naive Bayes": nb_model
}

plot_roc(models, X_test, y_test)
plot_precision_recall(models, X_test, y_test)

for name, model in models.items():
    plot_conf_matrix(model, name, X_test, y_test)

raw_texts = df['text'].values  # or df['normalized_text'].values

# X_test / y_test are a shuffled 20% sample, so position i in the test split
# is NOT row i of df. Indexing the full raw_texts array with test-split
# positions printed tweets unrelated to the misclassified samples.
# Splitting the texts with the same test_size and random_state as the feature
# split reproduces the same shuffle (it depends only on the number of samples
# and the seed), giving texts aligned with X_test.
_, raw_texts_test = train_test_split(raw_texts, test_size=0.2, random_state=42)
if len(raw_texts_test) != len(y_test):
    raise ValueError(
        "raw_texts_test is not aligned with y_test; keep the split parameters "
        "here identical to those used to create X_test/y_test"
    )

for name, model in models.items():
    show_misclassified(model, name, X_test, y_test, raw_texts_test)
