<a href="https://colab.research.google.com/github/LAXMIPRASANNA292005/NLP--2025/blob/main/lab_16_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [106]:
import os
import re
import random
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [107]:
import os
import re
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [108]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [109]:
DATA_PATH = "/content/tweets.csv"
TEST_SIZE = 0.2
MAX_NUM_WORDS = 30000
MAX_SEQUENCE_LENGTH = 60
EMBEDDING_DIM = 100
BATCH_SIZE = 64
EPOCHS = 8
USE_GLOVE = False
GLOVE_PATH = "/mnt/data/glove.6B.100d.txt"

In [110]:
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [111]:
def clean_tweet(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # remove urls
    text = re.sub(r"http\S+|www\.[^\s]+", " ", text)
    # remove mentions
    text = re.sub(r"@\w+", " ", text)
    # remove hash symbol only (keep the word), then remove any non-letter
    text = re.sub(r"#", " ", text)
    # remove non-alphabetic characters
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in STOPWORDS]
    return " ".join(tokens)

In [112]:
def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}


In [113]:
# ---------------------------
# Load dataset
# ---------------------------
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH}. Please place tweets.csv there with columns 'text' and 'target'."
    )

print("Loading dataset:", DATA_PATH)
df = pd.read_csv(DATA_PATH)


Loading dataset: /content/tweets.csv


In [114]:
df = df[['text', 'target']].dropna().reset_index(drop=True)
print(f"Loaded {len(df)} rows. Class distribution:\n{df['target'].value_counts().to_dict()}")


Loaded 11370 rows. Class distribution:
{0: 9256, 1: 2114}


In [115]:
print("Cleaning texts...")
df['clean_text'] = df['text'].astype(str).apply(clean_tweet)


Cleaning texts...


In [116]:
X = df['clean_text'].values
y = df['target'].astype(int).values
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y)
print(f"Train: {len(X_train_raw)}, Val: {len(X_val_raw)}")

Train: 9096, Val: 2274


In [117]:
results = []

In [118]:
def train_eval_classical(vectorizer, clf, X_tr, y_tr, X_v, y_v):
    pipe = Pipeline([('vec', vectorizer), ('clf', clf)])
    pipe.fit(X_tr, y_tr)
    preds = pipe.predict(X_v)
    metrics = evaluate_model(y_v, preds)
    return metrics, pipe, preds


In [119]:
print("Training classical baselines...")

Training classical baselines...


In [120]:
count_vec = CountVectorizer(max_features=20000, ngram_range=(1,2))
sv_clf = LogisticRegression(max_iter=1000, random_state=RANDOM_SEED)
metrics_count_lr, model_count_lr, preds_count_lr = train_eval_classical(count_vec, sv_clf, X_train_raw, y_train, X_val_raw, y_val)
metrics_count_lr.update({'model': 'LogisticRegression', 'features': 'CountVectorizer'})
results.append(metrics_count_lr)
print("CountVectorizer + LR:", metrics_count_lr)

CountVectorizer + LR: {'accuracy': 0.8830255057167986, 'precision': 0.7854545454545454, 'recall': 0.5106382978723404, 'f1': 0.6189111747851003, 'model': 'LogisticRegression', 'features': 'CountVectorizer'}


In [121]:
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_SEED)
metrics_tfidf_lr, model_tfidf_lr, preds_tfidf_lr = train_eval_classical(tfidf, lr, X_train_raw, y_train, X_val_raw, y_val)
metrics_tfidf_lr.update({'model': 'LogisticRegression', 'features': 'TF-IDF'})
results.append(metrics_tfidf_lr)
print("TF-IDF + LR:", metrics_tfidf_lr)

TF-IDF + LR: {'accuracy': 0.86103781882146, 'precision': 0.8496732026143791, 'recall': 0.3073286052009456, 'f1': 0.4513888888888889, 'model': 'LogisticRegression', 'features': 'TF-IDF'}


In [122]:
svm = LinearSVC(max_iter=20000, random_state=RANDOM_SEED)
metrics_tfidf_svm, model_tfidf_svm, preds_tfidf_svm = train_eval_classical(tfidf, svm, X_train_raw, y_train, X_val_raw, y_val)
metrics_tfidf_svm.update({'model': 'LinearSVC', 'features': 'TF-IDF'})
results.append(metrics_tfidf_svm)
print("TF-IDF + SVM:", metrics_tfidf_svm)


TF-IDF + SVM: {'accuracy': 0.8834652594547053, 'precision': 0.7408536585365854, 'recall': 0.574468085106383, 'f1': 0.6471371504660453, 'model': 'LinearSVC', 'features': 'TF-IDF'}


In [123]:
print("Tokenizing for deep models...")
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_raw)
word_index = tokenizer.word_index
vocab_size = min(MAX_NUM_WORDS, len(word_index)) + 1
print("Vocab size:", vocab_size)

Tokenizing for deep models...
Vocab size: 18144


In [124]:
X_train_seq = tokenizer.texts_to_sequences(X_train_raw)
X_val_seq = tokenizer.texts_to_sequences(X_val_raw)
X_train = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_val = pad_sequences(X_val_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

In [125]:
embedding_matrix = None
if USE_GLOVE and os.path.exists(GLOVE_PATH):
    print("Building embedding matrix from GloVe...")
    embeddings_index = {}
    with open(GLOVE_PATH, 'r', encoding='utf8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    emb_dim = len(next(iter(embeddings_index.values())))
    EMBEDDING_DIM = emb_dim
    embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= vocab_size:
            continue
        vec = embeddings_index.get(word)
        if vec is not None:
            embedding_matrix[i] = vec
    print("GloVe matrix built. EMBEDDING_DIM=", EMBEDDING_DIM)
else:
    print("No pretrained embeddings used; training embeddings from scratch.")

No pretrained embeddings used; training embeddings from scratch.


In [126]:
def build_mlp_avg(vocab_size, embedding_dim, seq_len, embedding_matrix=None, trainable=True):
    inp = Input(shape=(seq_len,))
    if embedding_matrix is not None:
        emb = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=seq_len, trainable=trainable)(inp)
    else:
        emb = Embedding(vocab_size, embedding_dim, input_length=seq_len)(inp)
    x = GlobalAveragePooling1D()(emb)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    out = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=out)
    model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [127]:
def build_cnn1d(vocab_size, embedding_dim, seq_len, embedding_matrix=None, trainable=True):
    inp = Input(shape=(seq_len,))
    if embedding_matrix is not None:
        emb = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=seq_len, trainable=trainable)(inp)
    else:
        emb = Embedding(vocab_size, embedding_dim, input_length=seq_len)(inp)
    x = Conv1D(128, 5, activation='relu')(emb)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.4)(x)
    out = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=out)
    model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [128]:
def build_lstm(vocab_size, embedding_dim, seq_len, embedding_matrix=None, trainable=True):
    inp = Input(shape=(seq_len,))
    if embedding_matrix is not None:
        emb = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=seq_len, trainable=trainable)(inp)
    else:
        emb = Embedding(vocab_size, embedding_dim, input_length=seq_len)(inp)
    x = Bidirectional(LSTM(128, return_sequences=False))(emb)
    x = Dropout(0.4)(x)
    x = Dense(64, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=out)
    model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [134]:
from tensorflow.keras.callbacks import EarlyStopping

print("Building and training DL models...")


callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True
    )
]


history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)


Building and training DL models...
Epoch 1/10
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - accuracy: 0.7981 - loss: 0.5131 - val_accuracy: 0.8140 - val_loss: 0.4713
Epoch 2/10
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.8118 - loss: 0.4741 - val_accuracy: 0.8201 - val_loss: 0.4243
Epoch 3/10
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - accuracy: 0.8720 - loss: 0.3280 - val_accuracy: 0.8857 - val_loss: 0.3190
Epoch 4/10
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.9302 - loss: 0.1948 - val_accuracy: 0.8799 - val_loss: 0.3009
Epoch 5/10
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - accuracy: 0.9403 - loss: 0.1684 - val_accuracy: 0.8861 - val_loss: 0.3216
Epoch 6/10
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - accuracy: 0.9620 - loss: 0.1091 - val_accuracy: 0.8870 - v

In [136]:
mlp = build_mlp_avg(vocab_size=vocab_size, embedding_dim=EMBEDDING_DIM, seq_len=MAX_SEQUENCE_LENGTH, embedding_matrix=embedding_matrix, trainable=(embedding_matrix is None))
print(mlp.summary())
mlp.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=callbacks, verbose=1)
preds_prob = mlp.predict(X_val).ravel()
preds = (preds_prob >= 0.5).astype(int)
metrics_mlp = evaluate_model(y_val, preds)
metrics_mlp.update({'model': 'MLP_avg', 'features': 'Embeddings'})
results.append(metrics_mlp)
print("MLP metrics:", metrics_mlp)



None
Epoch 1/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.7951 - loss: 0.5134 - val_accuracy: 0.8140 - val_loss: 0.4731
Epoch 2/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.8115 - loss: 0.4785 - val_accuracy: 0.8140 - val_loss: 0.4671
Epoch 3/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.8199 - loss: 0.4376 - val_accuracy: 0.8650 - val_loss: 0.3234
Epoch 4/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9174 - loss: 0.2280 - val_accuracy: 0.8883 - val_loss: 0.2971
Epoch 5/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.9397 - loss: 0.1690 - val_accuracy: 0.8830 - val_loss: 0.3116
Epoch 6/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9595 - loss: 0.1195 - val_accuracy: 0.8883 - val_loss: 0.3665
[1m72/72[0m [32m━━━━

In [138]:
cnn = build_cnn1d(vocab_size=vocab_size, embedding_dim=EMBEDDING_DIM, seq_len=MAX_SEQUENCE_LENGTH, embedding_matrix=embedding_matrix, trainable=(embedding_matrix is None))
print(cnn.summary())
cnn.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=callbacks, verbose=1)
preds_prob = cnn.predict(X_val).ravel()
preds = (preds_prob >= 0.5).astype(int)
metrics_cnn = evaluate_model(y_val, preds)
metrics_cnn.update({'model': 'CNN_1D', 'features': 'Embeddings'})
results.append(metrics_cnn)
print("CNN metrics:", metrics_cnn)



None
Epoch 1/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 73ms/step - accuracy: 0.7966 - loss: 0.4954 - val_accuracy: 0.8663 - val_loss: 0.3557
Epoch 2/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 71ms/step - accuracy: 0.9258 - loss: 0.2085 - val_accuracy: 0.8804 - val_loss: 0.3144
Epoch 3/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 61ms/step - accuracy: 0.9770 - loss: 0.0635 - val_accuracy: 0.8734 - val_loss: 0.4504
Epoch 4/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 70ms/step - accuracy: 0.9942 - loss: 0.0238 - val_accuracy: 0.8782 - val_loss: 0.5663
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
CNN metrics: {'accuracy': 0.8803869832893579, 'precision': 0.7046070460704607, 'recall': 0.6146572104018913, 'f1': 0.6565656565656566, 'model': 'CNN_1D', 'features': 'Embeddings'}


In [140]:
lstm = build_lstm(vocab_size=vocab_size, embedding_dim=EMBEDDING_DIM, seq_len=MAX_SEQUENCE_LENGTH, embedding_matrix=embedding_matrix, trainable=(embedding_matrix is None))
print(lstm.summary())
lstm.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=callbacks, verbose=1)
preds_prob = lstm.predict(X_val).ravel()
preds = (preds_prob >= 0.5).astype(int)
metrics_lstm = evaluate_model(y_val, preds)
metrics_lstm.update({'model': 'BiLSTM', 'features': 'Embeddings'})
results.append(metrics_lstm)
print("LSTM metrics:", metrics_lstm)



None
Epoch 1/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 316ms/step - accuracy: 0.8167 - loss: 0.4619 - val_accuracy: 0.8791 - val_loss: 0.2869
Epoch 2/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 309ms/step - accuracy: 0.9416 - loss: 0.1646 - val_accuracy: 0.8927 - val_loss: 0.3232
Epoch 3/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 322ms/step - accuracy: 0.9787 - loss: 0.0714 - val_accuracy: 0.8848 - val_loss: 0.4121
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 57ms/step
LSTM metrics: {'accuracy': 0.8790677220756377, 'precision': 0.7356687898089171, 'recall': 0.5460992907801419, 'f1': 0.6268656716417911, 'model': 'BiLSTM', 'features': 'Embeddings'}


In [141]:
res_df = pd.DataFrame(results)
res_df = res_df[['model', 'features', 'accuracy', 'precision', 'recall', 'f1']]
res_df = res_df.sort_values(by='f1', ascending=False).reset_index(drop=True)
print('\n===== SUMMARY (sorted by F1) =====')
print(res_df)
res_df.to_csv('experiment_results.csv', index=False)
print('Saved experiment_results.csv')


===== SUMMARY (sorted by F1) =====
                model         features  accuracy  precision    recall  \
0              CNN_1D       Embeddings  0.880387   0.704607  0.614657   
1           LinearSVC           TF-IDF  0.883465   0.740854  0.574468   
2             MLP_avg       Embeddings  0.888303   0.811808  0.520095   
3              BiLSTM       Embeddings  0.879068   0.735669  0.546099   
4  LogisticRegression  CountVectorizer  0.883026   0.785455  0.510638   
5  LogisticRegression           TF-IDF  0.861038   0.849673  0.307329   

         f1  
0  0.656566  
1  0.647137  
2  0.634006  
3  0.626866  
4  0.618911  
5  0.451389  
Saved experiment_results.csv


In [142]:
print('\nTF-IDF + LR classification report:')
print(classification_report(y_val, preds_tfidf_lr, zero_division=0))
print('\nBest DL model classification report (by F1):')
best_dl = res_df[res_df['features']=='Embeddings'].iloc[0]['model']
if best_dl == 'BiLSTM':
    best_preds = (lstm.predict(X_val).ravel() >= 0.5).astype(int)
elif best_dl == 'CNN_1D':
    best_preds = (cnn.predict(X_val).ravel() >= 0.5).astype(int)
else:
    best_preds = (mlp.predict(X_val).ravel() >= 0.5).astype(int)
print(classification_report(y_val, best_preds, zero_division=0))

print('\nDone.')


TF-IDF + LR classification report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      1851
           1       0.85      0.31      0.45       423

    accuracy                           0.86      2274
   macro avg       0.86      0.65      0.69      2274
weighted avg       0.86      0.86      0.83      2274


Best DL model classification report (by F1):
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
              precision    recall  f1-score   support

           0       0.91      0.94      0.93      1851
           1       0.70      0.61      0.66       423

    accuracy                           0.88      2274
   macro avg       0.81      0.78      0.79      2274
weighted avg       0.88      0.88      0.88      2274


Done.
