In [16]:
import numpy as np
import pandas as pd
import re
import nltk
from gensim.models import FastText
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Attention, Flatten, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [17]:
df = pd.read_csv("tam_train.csv")
df_val = pd.read_csv("tam_dev.csv")
print(df["Label"].value_counts())
train_len = len(df)
df = pd.concat([df, df_val]).reset_index(drop=True)

Label
Positive          18145
unknown_state      5164
Negative           4151
Mixed_feelings     3662
Name: count, dtype: int64


In [18]:
df["Label"].value_counts()

Label
Positive          20417
unknown_state      5783
Negative           4631
Mixed_feelings     4134
Name: count, dtype: int64

In [19]:
nltk.download('stopwords')
nltk.download('punkt')

english_stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\justa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\justa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
def clean_text(text):
    """
    Removes unwanted characters, URLs, special symbols, and repeated characters.
    """
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-Z\u0B80-\u0BFF\s]", "", text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return text.strip()

def transliterate_to_english(text):
    """
    Transliterates Tamil script to English while preserving English words.
    """
    result = []
    for word in text.split():
        try:
            if re.search(r'[\u0B80-\u0BFF]', word):
                word = transliterate(word, sanscript.TAMIL, sanscript.ITRANS)
        except Exception:
            pass
        result.append(word)
    return " ".join(result)

def preprocess_text(text):
    """
    Cleans, transliterates, tokenizes, and removes stopwords.
    """
    text = clean_text(text)
    text = transliterate_to_english(text)
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in english_stopwords]
    return " ".join(tokens)

In [21]:
label_mapping = {
    "Positive": 0,
    "Mixed_feelings": 1,
    "unknown_state": 2,
    "Negative": 3    
}
df['Label'] = df['Label'].map(label_mapping)
df['cleaned_text'] = df['Text'].apply(preprocess_text)

In [23]:
X_train = df.iloc[:train_len, 2]
y_train = df.iloc[:train_len, 1]
X_test = df.iloc[train_len:, 2]
y_test = df.iloc[train_len:, 1]

In [24]:
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 300

In [25]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [26]:
fasttext_model = FastText(sentences=[text.split() for text in X_train], vector_size=EMBEDDING_DIM, window=5, min_count=2, workers=4, alpha=0.1, min_alpha=0.001)

vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if word in fasttext_model.wv:
        embedding_matrix[i] = fasttext_model.wv[word]

In [28]:
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))

embedding_layer = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, 
                             weights=[embedding_matrix], trainable=True)(input_layer)

bilstm_layer = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(0.01)))(embedding_layer)
bilstm_layer = Dropout(0.5)(bilstm_layer)

attention_layer = Attention()([bilstm_layer, bilstm_layer])

flatten = Flatten()(bilstm_layer)
dense_layer = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(flatten)
dense_layer = Dropout(0.5)(dense_layer)

output_layer = Dense(4, activation='softmax')(dense_layer)

model = Model(inputs=input_layer, outputs=output_layer)

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [29]:
history = model.fit(X_train_padded, np.array(y_train), epochs=7, batch_size=64, validation_split=0.2)

Epoch 1/7
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 806ms/step - accuracy: 0.5818 - loss: 3.3028 - val_accuracy: 0.6039 - val_loss: 1.0805
Epoch 2/7
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 708ms/step - accuracy: 0.6242 - loss: 1.0705 - val_accuracy: 0.6212 - val_loss: 1.0586
Epoch 3/7
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 623ms/step - accuracy: 0.6905 - loss: 0.9424 - val_accuracy: 0.6169 - val_loss: 1.1476
Epoch 4/7
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 515ms/step - accuracy: 0.7764 - loss: 0.7719 - val_accuracy: 0.5709 - val_loss: 1.3229
Epoch 5/7
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 687ms/step - accuracy: 0.8356 - loss: 0.6252 - val_accuracy: 0.5836 - val_loss: 1.4065
Epoch 6/7
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 518ms/step - accuracy: 0.8754 - loss: 0.5215 - val_accuracy: 0.5578 - val_loss: 1.5861
Epoch 7/7


In [30]:
from sklearn.metrics import classification_report

y_pred_probs = model.predict(X_test_padded)
y_pred = np.argmax(y_pred_probs, axis=1)

print(classification_report(y_test, y_pred))

[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 105ms/step
              precision    recall  f1-score   support

           0       0.74      0.76      0.75      2272
           1       0.24      0.19      0.21       472
           2       0.36      0.48      0.41       619
           3       0.44      0.28      0.34       480

    accuracy                           0.58      3843
   macro avg       0.45      0.43      0.43      3843
weighted avg       0.58      0.58      0.58      3843

