In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from gensim.models import FastText
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Attention, Flatten, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [2]:
df = pd.read_csv("tam_train.csv")
df_val = pd.read_csv("tam_dev.csv")
df_test = pd.read_csv("tam_test.csv")
df = pd.concat([df, df_val]).reset_index(drop=True)

In [3]:
df["Label"].value_counts()

Label
Positive          20417
unknown_state      5783
Negative           4631
Mixed_feelings     4134
Name: count, dtype: int64

In [4]:
print(df["Label"].value_counts())
train_len = len(df)
df = pd.concat([df, df_test])

Label
Positive          20417
unknown_state      5783
Negative           4631
Mixed_feelings     4134
Name: count, dtype: int64


In [5]:
nltk.download('stopwords')
nltk.download('punkt')

english_stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\justa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\justa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def clean_text(text):
    """
    Removes unwanted characters, URLs, special symbols, and repeated characters.
    """
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-Z\u0B80-\u0BFF\s]", "", text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return text.strip()

def transliterate_to_english(text):
    """
    Transliterates Tamil script to English while preserving English words.
    """
    result = []
    for word in text.split():
        try:
            if re.search(r'[\u0B80-\u0BFF]', word):
                word = transliterate(word, sanscript.TAMIL, sanscript.ITRANS)
        except Exception:
            pass
        result.append(word)
    return " ".join(result)

def preprocess_text(text):
    """
    Cleans, transliterates, tokenizes, and removes stopwords.
    """
    text = clean_text(text)
    text = transliterate_to_english(text)
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in english_stopwords]
    return " ".join(tokens)

In [7]:
label_mapping = {
    "Positive": 0,
    "Mixed_feelings": 1,
    "unknown_state": 2,
    "Negative": 3    
}
df['Label'] = df['Label'].map(label_mapping)

df['cleaned_text'] = df['Text'].apply(preprocess_text)

In [9]:
X_train = df.iloc[:train_len, 3]
y_train = df.iloc[:train_len, 1]
X_test = df.iloc[train_len:, 3]

In [10]:
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 300

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [12]:
fasttext_model = FastText(sentences=[text.split() for text in X_train], vector_size=EMBEDDING_DIM, window=5, min_count=2, workers=4, alpha=0.1, min_alpha=0.001)

vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if word in fasttext_model.wv:
        embedding_matrix[i] = fasttext_model.wv[word]

In [14]:
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))

embedding_layer = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, 
                             weights=[embedding_matrix], trainable=True)(input_layer)

bilstm_layer = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(0.01)))(embedding_layer)
bilstm_layer = Dropout(0.5)(bilstm_layer) 

attention_layer = Attention()([bilstm_layer, bilstm_layer])

flatten = Flatten()(bilstm_layer)
dense_layer = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(flatten)
dense_layer = Dropout(0.5)(dense_layer)

output_layer = Dense(4, activation='softmax')(dense_layer)

model = Model(inputs=input_layer, outputs=output_layer)

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [15]:
history = model.fit(X_train_padded, np.array(y_train), epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 382ms/step - accuracy: 0.5918 - loss: 3.1479 - val_accuracy: 0.6095 - val_loss: 1.1020
Epoch 2/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 379ms/step - accuracy: 0.6249 - loss: 1.0805 - val_accuracy: 0.6125 - val_loss: 1.0727
Epoch 3/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 348ms/step - accuracy: 0.6938 - loss: 0.9522 - val_accuracy: 0.6321 - val_loss: 1.0869
Epoch 4/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 343ms/step - accuracy: 0.7755 - loss: 0.7653 - val_accuracy: 0.5790 - val_loss: 1.2271
Epoch 5/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 342ms/step - accuracy: 0.8336 - loss: 0.6207 - val_accuracy: 0.5922 - val_loss: 1.2527


In [20]:
y_pred = model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred, axis=1)

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step


In [21]:
reverse_label_mapping = {value: key for key, value in label_mapping.items()}

In [22]:
ans = pd.DataFrame()
ans["Id"] = df.iloc[train_len:]['Id']
labels = [reverse_label_mapping[pred] for pred in y_pred_classes]
ans["Label"] = pd.Series(labels)
ans.head(50)

Unnamed: 0,Id,Label
0,SA_Ta_01,Positive
1,SA_Ta_02,unknown_state
2,SA_Ta_03,Positive
3,SA_Ta_04,Mixed_feelings
4,SA_Ta_05,Negative
5,SA_Ta_06,Positive
6,SA_Ta_07,Negative
7,SA_Ta_08,Positive
8,SA_Ta_09,Positive
9,SA_Ta_10,Positive


In [24]:
ans.to_csv("bi_lstm_tam.csv", index=False)