<a href="https://colab.research.google.com/github/LikhithaBanna/NLP/blob/main/Lab-4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Conv1D, MaxPooling1D, Flatten, LSTM, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
import zipfile # Import the zipfile module

nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

# Corrected line to read 'train.csv' from the zip archive
with zipfile.ZipFile('/content/archive (7) (1) (1).zip', 'r') as z:
    with z.open('train.csv') as f:
        df = pd.read_csv(f)
print(df.head())

# Corrected column names based on df.head() output
texts = df["description_x"].values
labels = df["same_security"].astype(int).values # Convert boolean labels to integers

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in STOPWORDS]
    return " ".join(tokens)

# Apply clean_text to the correct column
df["clean_text"] = df["description_x"].apply(clean_text)

X_train, X_val, y_train, y_val = train_test_split(df["clean_text"], labels, test_size=0.2, random_state=42)

count_vec = CountVectorizer()
X_train_count = count_vec.fit_transform(X_train)
X_val_count = count_vec.transform(X_val)

tfidf_vec = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_val_tfidf = tfidf_vec.transform(X_val)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_val_tfidf)

svm = LinearSVC()
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_val_tfidf)

print("\n--- Logistic Regression (TF-IDF) ---")
print(classification_report(y_val, y_pred_lr))

print("\n--- SVM (TF-IDF) ---")
print(classification_report(y_val, y_pred_svm))

tokenizer = Tokenizer(num_words=10000, oov_token="")
tokenizer.fit_on_texts(df["clean_text"])
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

max_len = 30
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding="post")

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

mlp = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])
mlp.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
mlp.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_val_pad, y_val))

y_pred_mlp = (mlp.predict(X_val_pad) > 0.5).astype("int32")

print("\n--- MLP (Embeddings) ---")
print(classification_report(y_val, y_pred_mlp))

cnn = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    Conv1D(128, 5, activation="relu"),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])
cnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
cnn.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_val_pad, y_val))

y_pred_cnn = (cnn.predict(X_val_pad) > 0.5).astype("int32")

print("\n--- CNN (Embeddings) ---")
print(classification_report(y_val, y_pred_cnn))

lstm = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")
])
lstm.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
lstm.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_val_pad, y_val))

y_pred_lstm = (lstm.predict(X_val_pad) > 0.5).astype("int32")

print("\n--- LSTM (Embeddings) ---")
print(classification_report(y_val, y_pred_lstm))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   Unnamed: 0                                      description_x  \
0           0                     first trust dow jones internet   
1           1                schwab intl large company index etf   
2           2                       vanguard small cap index adm   
3           3  duke energy corp new com new isin #us4 sedol #...   
4           4                                   visa inc class a   

                                       description_y ticker_x ticker_y  \
0                        first trust dj internet idx      FDN      FDN   
1  schwab strategic tr fundamental intl large co ...     FNDF     FNDF   
2                 vanguard small-cap index fund inst    VSMAX    VSCIX   
3  duke energy corp new com new isin #us26441c204...      DUK      DUK   
4                                          visa inc.        V        V   

   same_security  
0           True  
1           True  
2          False  
3           True  
4           True  

--- Logistic Regression (TF-IDF



[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.7146 - loss: 0.6007 - val_accuracy: 0.7436 - val_loss: 0.5548
Epoch 2/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7565 - loss: 0.5412 - val_accuracy: 0.7436 - val_loss: 0.5405
Epoch 3/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7525 - loss: 0.5336 - val_accuracy: 0.7436 - val_loss: 0.5099
Epoch 4/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7575 - loss: 0.4862 - val_accuracy: 0.7739 - val_loss: 0.4379
Epoch 5/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8008 - loss: 0.4299 - val_accuracy: 0.8648 - val_loss: 0.3497
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

--- MLP (Embeddings) ---
              precision    recall  f1-score   support

           0       0.91      0.53      0.67       110
       



[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.7350 - loss: 0.5775 - val_accuracy: 0.8438 - val_loss: 0.3782
Epoch 2/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8609 - loss: 0.3307 - val_accuracy: 0.8765 - val_loss: 0.3150
Epoch 3/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9271 - loss: 0.2127 - val_accuracy: 0.8928 - val_loss: 0.2938
Epoch 4/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9298 - loss: 0.1775 - val_accuracy: 0.8881 - val_loss: 0.3168
Epoch 5/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9532 - loss: 0.1406 - val_accuracy: 0.8881 - val_loss: 0.3203
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step

--- CNN (Embeddings) ---
              precision    recall  f1-score   support

           0       0.78      0.79      0.78       110
 



[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 92ms/step - accuracy: 0.7185 - loss: 0.5990 - val_accuracy: 0.7436 - val_loss: 0.5794
Epoch 2/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 102ms/step - accuracy: 0.7602 - loss: 0.5628 - val_accuracy: 0.7436 - val_loss: 0.5687
Epoch 3/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 78ms/step - accuracy: 0.7731 - loss: 0.5401 - val_accuracy: 0.7436 - val_loss: 0.5637
Epoch 4/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 77ms/step - accuracy: 0.7880 - loss: 0.4880 - val_accuracy: 0.7366 - val_loss: 0.3929
Epoch 5/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 105ms/step - accuracy: 0.7549 - loss: 0.4244 - val_accuracy: 0.8438 - val_loss: 0.3567
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step

--- LSTM (Embeddings) ---
              precision    recall  f1-score   support

           0       0.64      0.88      0.74       1