In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import nltk
import re
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

In [None]:
# Read the training and development splits from the working directory.
df_train, df_val = (
    pd.read_csv(csv_path) for csv_path in ("tulu_train.csv", "tulu_dev.csv")
)

In [None]:
# Pool both splits so they go through the same preprocessing. ignore_index
# gives every row a unique label instead of repeating each file's 0..n-1 range.
df = pd.concat([df_train, df_val], ignore_index=True)

# Keep only rows carrying one of the five known labels. The explicit .copy()
# makes `df` an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df = df[df["Label"].isin(["Not Tulu", "Positive", "Neutral", "Mixed", "Negative"])].copy()
print(df["Label"].value_counts())

Label
Not Tulu    4943
Positive    4239
Neutral     3543
Mixed       1257
Negative     961
Name: count, dtype: int64


In [None]:
# NLTK resources needed by preprocess_text: the English stopword list and the
# Punkt tokenizer models used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource rather
# than the pickled 'punkt' one; without it word_tokenize raises LookupError on
# a fresh environment. The recorded output of this cell already shows it being
# fetched, so request it explicitly here.
nltk.download('punkt_tab')
english_stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
def clean_text(text):
    """
    Strips URLs and non-letter characters and squeezes elongated character runs.

    Steps, in order:
      1. Remove every run of non-whitespace characters that begins at
         "http" or "www".
      2. Drop every character that is not an ASCII letter, a code point in
         the Kannada block (U+0C80-U+0CFF), or whitespace.
      3. Collapse any character repeated three or more times in a row down
         to two occurrences (e.g. "sooooo" -> "soo").

    Args:
        text: Raw text. None and NaN (how pandas represents a missing cell)
            are treated as empty text; any other non-string value is
            converted with str() first.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # Missing cells arrive as None or float NaN; re.sub would raise TypeError.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # "http\S+" already covers "https...", so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Z\u0C80-\u0CFF\s]", "", text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return text.strip()

def transliterate_to_english(text):
    """
    Romanizes Kannada-script words to ITRANS, leaving all other words as they are.

    The text is split on whitespace, each word is handled independently, and
    the words are re-joined with single spaces.
    """
    def _romanize(token):
        # Best-effort: if transliteration fails for any reason, fall through
        # and keep the token unchanged.
        try:
            if re.search(r'[\u0C80-\u0CFF]', token):
                return transliterate(token, sanscript.KANNADA, sanscript.ITRANS)
        except Exception:
            pass
        return token

    return " ".join(_romanize(token) for token in text.split())

def preprocess_text(text):
    """
    Runs the full text pipeline: clean, romanize, tokenize, drop English stopwords.

    Returns the surviving tokens joined by single spaces.
    """
    romanized = transliterate_to_english(clean_text(text))
    kept = []
    for token in nltk.word_tokenize(romanized):
        # Stopword comparison is case-insensitive; the token itself keeps its case.
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
# Run every raw text through the preprocessing pipeline, element by element.
df['cleaned_text'] = df['Text'].map(preprocess_text)

In [None]:
# Integer code assigned to each class name.
label_mapping = {
    "Positive": 0,
    "Not Tulu": 1,
    "Neutral": 2,
    "Mixed": 3,
    "Negative": 4
}
# Encode the labels in place. Values that are not keys of the mapping (i.e.
# codes produced by an earlier run of this cell) pass through unchanged, so
# re-running the cell no longer turns the whole column into NaN. The int64 cast
# raises if any value is still not an integer code rather than hiding it.
df['Label'] = df['Label'].map(lambda label: label_mapping.get(label, label)).astype("int64")

In [None]:
# `df` is the concatenation of df_train and df_val with rows of unknown label
# removed, so its training portion can be shorter than len(df_train). Count
# only the training rows that survived that filter; using len(df_train) would
# push validation rows into the training slice whenever a row was dropped.
train_len = int(df_train["Label"].isin(list(label_mapping)).sum())
val_len = int(df_val["Label"].isin(list(label_mapping)).sum())
assert train_len + val_len == len(df), "train/validation boundary does not match the filtered frame"

# Positional split: training rows come first because df_train was concatenated first.
X_train = df.iloc[:train_len]['cleaned_text']
y_train = df.iloc[:train_len]['Label']
X_val = df.iloc[train_len:]['cleaned_text']
y_val = df.iloc[train_len:]['Label']

In [None]:
# Character-level TF-IDF over n-grams of 1 to 4 characters, capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 4), analyzer='char')
# Fit on the training text only; the validation text is transformed with the
# already-fitted vectorizer so it does not influence the learned features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

In [None]:
# Logistic regression on the TF-IDF features. class_weight='balanced' is used
# because the label counts printed earlier are uneven across the five classes;
# max_iter is raised to 2000 iterations.
lr_model = LogisticRegression(max_iter=2000, class_weight='balanced')
lr_model.fit(X_train_tfidf, y_train)

In [None]:
# Predicted integer label codes for the validation features.
y_pred_val = lr_model.predict(X_val_tfidf)

In [None]:
# Pass the label codes and their names explicitly, in matching order. Without
# `labels=`, names are attached to whatever sorted codes happen to occur in
# y_val / y_pred_val, which mislabels rows (or fails) if a class is missing.
label_ids = list(label_mapping.values())
label_names = list(label_mapping.keys())

print("Classification Report:")
print(classification_report(y_val, y_pred_val, labels=label_ids, target_names=label_names))
print("Confusion Matrix:")
# Rows are true classes and columns are predicted classes, both in label_ids order.
print(confusion_matrix(y_val, y_pred_val, labels=label_ids))

Classification Report:
              precision    recall  f1-score   support

    Positive       0.81      0.67      0.73       467
    Not Tulu       0.82      0.88      0.85       541
     Neutral       0.64      0.58      0.61       368
       Mixed       0.30      0.37      0.33       141
    Negative       0.42      0.56      0.48       118

    accuracy                           0.69      1635
   macro avg       0.60      0.61      0.60      1635
weighted avg       0.70      0.69      0.69      1635

Confusion Matrix:
[[313  54  44  41  15]
 [ 28 474  23   7   9]
 [ 25  37 215  52  39]
 [ 17   7  36  52  29]
 [  4   8  18  22  66]]
