In [77]:
import numpy as np
import pandas as pd
import re
import nltk
from gensim.models import FastText
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Attention, Flatten, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [78]:
df = pd.read_csv("tulu_train.csv")
df_val = pd.read_csv("tulu_dev.csv")
df_test = pd.read_csv("tulu_test.csv")
df = pd.concat([df, df_val]).reset_index(drop=True)

In [79]:
df["Label"].value_counts()

Label
Not Tulu    4943
Positive    4239
Neutral     3543
Mixed       1257
Negative     961
Name: count, dtype: int64

In [80]:
df = df[df["Label"].isin(["Not Tulu", "Positive", "Neutral", "Mixed", "Negative"])]
df["Label"].isna().sum()

0

In [81]:
print(df["Label"].value_counts())
train_len = len(df)
df = pd.concat([df, df_test])
df

Label
Not Tulu    4943
Positive    4239
Neutral     3543
Mixed       1257
Negative     961
Name: count, dtype: int64


Unnamed: 0,Text,Label,Id
0,Aunty log bohot kadak hai,Not Tulu,
1,Shruthi was awesome... Nice collaboration... H...,Not Tulu,
2,Gol gappadh ammana sajjigene best,Positive,
3,Chaddida brand thojodijji marre😃😃😃,Neutral,
4,Memories just got refreshed...,Not Tulu,
...,...,...,...
1474,"Enchina a avaste marree ck or attavar, edde c...",,SA_TU_1475
1475,Corona apaga itthnda 😂😂 Corona suvarna,,SA_TU_1476
1476,Nishith🔥🔥 perfect acting 😀😀😀 nice story welll ...,,SA_TU_1477
1477,ov nataka full ejja,,SA_TU_1478


In [82]:
nltk.download('stopwords')
nltk.download('punkt')

english_stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\justa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\justa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [83]:
def clean_text(text):
    """
    Removes unwanted characters, URLs, special symbols, and repeated characters.
    """
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-Z\u0C80-\u0CFF\s]", "", text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return text.strip()

def transliterate_to_english(text):
    """
    Transliterates Tulu (Kannada script) to English while preserving English words.
    """
    result = []
    for word in text.split():
        try:
            if re.search(r'[\u0C80-\u0CFF]', word):
                word = transliterate(word, sanscript.KANNADA, sanscript.ITRANS)
        except Exception:
            pass
        result.append(word)
    return " ".join(result)

def preprocess_text(text):
    """
    Cleans, transliterates, tokenizes, and removes stopwords.
    """
    text = clean_text(text)
    text = transliterate_to_english(text)
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in english_stopwords]
    return " ".join(tokens)

In [84]:
label_mapping = {
    "Positive": 0,
    "Not Tulu": 1,
    "Neutral": 2,
    "Mixed": 3,
    "Negative": 4  
}
df['Label'] = df['Label'].map(label_mapping)

df['cleaned_text'] = df['Text'].apply(preprocess_text)

In [85]:
X_train = df.iloc[:train_len, 3]
y_train = df.iloc[:train_len, 1]
X_test = df.iloc[train_len:, 3]

In [86]:
y_train.isna().sum()

0

In [87]:
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 300

In [88]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [89]:
fasttext_model = FastText(sentences=[text.split() for text in X_train], vector_size=EMBEDDING_DIM, window=5, min_count=1, workers=4, alpha=0.1, min_alpha=0.01)

vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if word in fasttext_model.wv:
        embedding_matrix[i] = fasttext_model.wv[word]

In [91]:
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))

embedding_layer = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, 
                             weights=[embedding_matrix], trainable=True)(input_layer)

bilstm_layer = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(0.01)))(embedding_layer)
bilstm_layer = Dropout(0.5)(bilstm_layer)

attention_layer = Attention()([bilstm_layer, bilstm_layer])

flatten = Flatten()(bilstm_layer)
dense_layer = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(flatten) 
dense_layer = Dropout(0.5)(dense_layer)

output_layer = Dense(5, activation='softmax')(dense_layer)

model = Model(inputs=input_layer, outputs=output_layer)

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [92]:
history = model.fit(X_train_padded, np.array(y_train), epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 338ms/step - accuracy: 0.5568 - loss: 4.5918 - val_accuracy: 0.6494 - val_loss: 1.1865
Epoch 2/5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 317ms/step - accuracy: 0.6700 - loss: 1.1366 - val_accuracy: 0.6721 - val_loss: 1.0713
Epoch 3/5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 368ms/step - accuracy: 0.7479 - loss: 0.8805 - val_accuracy: 0.6661 - val_loss: 1.0807
Epoch 4/5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 367ms/step - accuracy: 0.7914 - loss: 0.7356 - val_accuracy: 0.6618 - val_loss: 1.1052
Epoch 5/5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 352ms/step - accuracy: 0.8276 - loss: 0.6168 - val_accuracy: 0.6574 - val_loss: 1.3522


In [93]:
y_pred = model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred, axis=1)

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step


In [94]:
reverse_label_mapping = {value: key for key, value in label_mapping.items()}

In [95]:
ans = pd.DataFrame()
ans["Id"] = df.iloc[train_len:]['Id']
labels = [reverse_label_mapping[pred] for pred in y_pred_classes]
ans["Label"] = pd.Series(labels)
ans.head(50)

Unnamed: 0,Id,Label
0,SA_TU_01,Not Tulu
1,SA_TU_02,Positive
2,SA_TU_03,Not Tulu
3,SA_TU_04,Neutral
4,SA_TU_05,Positive
5,SA_TU_06,Not Tulu
6,SA_TU_07,Not Tulu
7,SA_TU_08,Neutral
8,SA_TU_09,Positive
9,SA_TU_10,Not Tulu


In [97]:
ans.to_csv("bi_lstm_tulu.csv", index=False)