<a href="https://colab.research.google.com/github/KalusaniLaxman/nlp_lab/blob/main/Untitled58.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas scikit-learn tensorflow

import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# ============================================================
# 1. DATASET PATH
# ============================================================
# Path of the zipped tweets CSV that the rest of the script loads.
# NOTE(review): /content is presumably the Colab runtime's working
# directory — upload the archive there (or change this path) before running.
ZIP_PATH = "/content/tweets.csv.zip"


# ============================================================
# 2. LOAD DATA FROM ZIP
# ============================================================
def load_csv_from_zip(zip_path):
    """Load the first CSV member of a ZIP archive into a DataFrame.

    Member names are matched case-insensitively on the ``.csv`` extension.
    macOS metadata entries (``__MACOSX/...`` and ``._name`` AppleDouble
    files) are ignored: they also end in ``.csv`` but are not CSV data.

    Args:
        zip_path: Path to the ZIP archive.

    Returns:
        pandas.DataFrame parsed from the first matching member.

    Raises:
        ValueError: If the archive contains no CSV member.
    """
    with zipfile.ZipFile(zip_path, "r") as z:
        csv_files = [
            name
            for name in z.namelist()
            if name.lower().endswith(".csv")
            and not name.startswith("__MACOSX/")
            and not name.rsplit("/", 1)[-1].startswith("._")
        ]
        print("Found CSV files:", csv_files)

        # Fail with an explicit message instead of an IndexError on [0].
        if not csv_files:
            raise ValueError(f"No CSV file found in archive: {zip_path}")

        with z.open(csv_files[0]) as f:
            # latin1 decodes every byte value, so files that are not valid
            # UTF-8 do not raise UnicodeDecodeError.
            df = pd.read_csv(f, encoding="latin1")

    return df

# Read the archive, then show a confirmation, the first rows and the column
# index — one item per line.
df = load_csv_from_zip(ZIP_PATH)
print("Dataset loaded successfully!", df.head(), df.columns, sep="\n")


# ============================================================
# 3. AUTO DETECT TEXT + LABEL COLUMNS
# ============================================================
def detect_columns(df):
    """Guess which columns of *df* hold the text and the class label.

    A column is taken as the text column if its lowercased name contains
    "text" or "tweet", and as the label column if it contains "label",
    "sentiment" or "target". When several columns match, the last match wins.

    If no column matches, the text column falls back to the first column and
    the label column to the last column. Each fallback skips the column
    already chosen for the other role where another column exists, so a
    fallback cannot pick the same column for both roles.

    Args:
        df: DataFrame whose column labels are inspected. Labels need not be
            strings; they are converted with ``str`` before matching.

    Returns:
        Tuple ``(text_col, label_col)`` of column labels taken from
        ``df.columns``.
    """
    columns = list(df.columns)
    text_col = None
    label_col = None

    for c in columns:
        # str() so integer/other non-string labels do not crash on .lower().
        low = str(c).lower()
        if "text" in low or "tweet" in low:
            text_col = c
        if "label" in low or "sentiment" in low or "target" in low:
            label_col = c

    # Fallbacks: first column for text, last column for label, avoiding the
    # column already assigned to the other role whenever possible.
    if text_col is None:
        text_col = next((c for c in columns if c != label_col), columns[0])

    if label_col is None:
        label_col = next(
            (c for c in reversed(columns) if c != text_col), columns[-1]
        )

    return text_col, label_col


# Resolve which columns hold the tweet text and the class label.
text_col, label_col = detect_columns(df)
print("Using text column:", text_col)
print("Using label column:", label_col)

# Keep only those two columns, drop rows with a missing value in either,
# and renumber the index from zero.
df = df[[text_col, label_col]]
df = df.dropna()
df = df.reset_index(drop=True)


# ============================================================
# 4. CLEAN TEXT
# ============================================================
def clean_text(s):
    """Normalise a raw tweet string into lowercase alphanumeric words.

    Steps, in order: lowercase; remove URLs (``http...`` / ``www....``);
    remove ``@mentions``; remove ``#`` characters (the tag word is kept);
    replace every character that is not a-z, 0-9 or whitespace with a space;
    collapse whitespace runs to a single space; strip both ends.
    """
    # (pattern, replacement) pairs; the order matters and is applied as listed.
    substitutions = (
        (r"http\S+|www\.\S+", ""),
        (r"@\w+", ""),
        (r"#", ""),
        (r"[^a-z0-9\s]", " "),
        (r"\s+", " "),
    )

    cleaned = s.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean every tweet (cast to str first so non-string cells cannot break
# clean_text) and pull the labels out as a plain Python list.
texts = list(map(clean_text, df[text_col].astype(str).tolist()))
labels = df[label_col].tolist()


# ============================================================
# 5. ENCODE LABELS
# ============================================================
# Map the raw labels to integer class ids.
le = LabelEncoder()
y = le.fit_transform(labels)
num_classes = len(le.classes_)

# Two or fewer classes: keep integer targets and use the binary settings.
# More than two: one-hot encode the targets and use the categorical settings.
if num_classes <= 2:
    loss_fn, final_act = "binary_crossentropy", "sigmoid"
else:
    y = tf.keras.utils.to_categorical(y, num_classes)
    loss_fn, final_act = "categorical_crossentropy", "softmax"


# ============================================================
# 6. TRAIN/VAL SPLIT
# ============================================================
# Hold out 15% of the samples for validation; the fixed random_state keeps
# the split the same between runs.
X_train, X_val, y_train, y_val = train_test_split(texts, y, test_size=0.15, random_state=42)

for caption, subset in (("Train size:", X_train), ("Validation size:", X_val)):
    print(caption, len(subset))


# ============================================================
# 7. TOKENIZATION
# ============================================================
# Vocabulary cap and fixed sequence length used for every sample.
max_words, maxlen = 20000, 100

# Fit the vocabulary on the training texts only; unknown words map to <OOV>.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences -> arrays padded/truncated to maxlen
# (padding is added at the end of each sequence).
train_seq, val_seq = [tokenizer.texts_to_sequences(part) for part in (X_train, X_val)]
X_train_pad, X_val_pad = [
    pad_sequences(seq, maxlen=maxlen, padding="post") for seq in (train_seq, val_seq)
]

# Embedding input size: the number of known words plus one, capped at
# max_words.
vocab_size = min(max_words, len(tokenizer.word_index) + 1)
print("Vocabulary size:", vocab_size)


# ============================================================
# 8. MODEL DEFINITIONS
# ============================================================

# ---------- C-LSTM MODEL ----------
def build_c_lstm():
    """Build and compile the C-LSTM classifier.

    Pipeline: token ids -> 100-d embedding -> Conv1D(128 filters, kernel 5,
    ReLU, same padding) -> MaxPooling1D(2) -> bidirectional LSTM(64)
    -> Dropout(0.5) -> dense output layer.

    Reads the module-level ``maxlen``, ``vocab_size``, ``num_classes`` and
    ``loss_fn``.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, ``loss_fn`` as
        the loss, and accuracy as the metric.
    """
    token_ids = Input(shape=(maxlen,))

    features = layers.Embedding(vocab_size, 100)(token_ids)
    features = layers.Conv1D(128, 5, padding="same", activation="relu")(features)
    features = layers.MaxPooling1D(2)(features)
    features = layers.Bidirectional(layers.LSTM(64))(features)
    features = layers.Dropout(0.5)(features)

    # More than two classes: one softmax unit per class.
    # Otherwise: a single sigmoid unit.
    multiclass = num_classes > 2
    head = layers.Dense(
        num_classes if multiclass else 1,
        activation="softmax" if multiclass else "sigmoid",
    )

    net = Model(token_ids, head(features))
    net.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])
    return net


# ---------- RCNN MODEL ----------
def build_rcnn():
    """Build and compile the RCNN classifier.

    Pipeline: token ids -> 100-d embedding -> bidirectional GRU(64) returning
    the full sequence -> Conv1D(128 filters, kernel 3, ReLU, same padding)
    -> GlobalMaxPooling1D -> Dropout(0.5) -> dense output layer.

    Reads the module-level ``maxlen``, ``vocab_size``, ``num_classes`` and
    ``loss_fn``.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, ``loss_fn`` as
        the loss, and accuracy as the metric.
    """
    sequence_in = Input(shape=(maxlen,))

    hidden = layers.Embedding(vocab_size, 100)(sequence_in)
    hidden = layers.Bidirectional(layers.GRU(64, return_sequences=True))(hidden)
    hidden = layers.Conv1D(128, 3, padding="same", activation="relu")(hidden)
    hidden = layers.GlobalMaxPooling1D()(hidden)
    hidden = layers.Dropout(0.5)(hidden)

    # Single sigmoid unit for up to two classes; softmax over all classes
    # otherwise.
    if num_classes <= 2:
        prediction = layers.Dense(1, activation="sigmoid")(hidden)
    else:
        prediction = layers.Dense(num_classes, activation="softmax")(hidden)

    classifier = Model(sequence_in, prediction)
    classifier.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])
    return classifier


# Instantiate both architectures, then print each model's layer summary.
c_lstm = build_c_lstm()
rcnn = build_rcnn()

for net in (c_lstm, rcnn):
    net.summary()


# ============================================================
# 9. TRAIN MODELS
# ============================================================
epochs = 4
batch_size = 64

# Train the two models one after the other on the same data with the same
# settings, evaluating on the validation split after every epoch.
for banner, net in (("\nTraining C-LSTM...", c_lstm), ("\nTraining RCNN...", rcnn)):
    print(banner)
    net.fit(
        X_train_pad,
        y_train,
        validation_data=(X_val_pad, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=2,
    )


# ============================================================
# 10. SAVE MODELS + TOKENIZER
# ============================================================
# Make sure the output directory exists before anything is written to it.
os.makedirs("/content/models", exist_ok=True)

# Save each trained model to its own HDF5 file.
for net, target in (
    (c_lstm, "/content/models/c_lstm_model.h5"),
    (rcnn, "/content/models/rcnn_model.h5"),
):
    net.save(target)

import json

# Store the fitted tokenizer next to the models, serialised as JSON.
with open("/content/models/tokenizer.json", "w") as f:
    f.write(tokenizer.to_json())

print("\nModels saved in /content/models/")


Found CSV files: ['tweets.csv']
Dataset loaded successfully!
   Target          ID                          Date      flag           User  \
0       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY  scotthamilton   
1       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY       mattycus   
2       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY        ElleCTF   
3       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY         Karoli   
4       0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY       joy_wolf   

                                                Text  
0  is upset that he can't update his Facebook by ...  
1  @Kenichan I dived many times for the ball. Man...  
2    my whole body feels itchy and like its on fire   
3  @nationwideclass no, it's not behaving at all....  
4                      @Kwesidei not the whole crew   
Index(['Target', 'ID', 'Date', 'flag', 'User', 'Text'], dtype='object')
Using text column: Text
Using label column: Target
Train


Training C-LSTM...
Epoch 1/4
