In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import nltk
import re
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

In [14]:
# Pool the labelled train and dev splits into one training frame.
df = pd.read_csv("tulu_train.csv")
df_val = pd.read_csv("tulu_dev.csv")
df = pd.concat([df, df_val])

# Keep only rows whose label is one of the five known classes; rows with a
# missing or unexpected label are dropped here.
df = df[df["Label"].isin(["Not Tulu", "Positive", "Neutral", "Mixed", "Negative"])]

# The missing-label count used to be computed and silently discarded;
# make the sanity check explicit so a bad filter fails loudly.
assert df["Label"].notna().all(), "unexpected missing labels after filtering"
print(df["Label"].value_counts())

# Number of labelled rows; later cells slice positionally with
# .iloc[:train_len] / .iloc[train_len:] to separate them from the test rows.
train_len = len(df)

# Append the test rows so they go through the same preprocessing.
# The index is intentionally not reset: the test rows keep the index they
# were read with.
df_test = pd.read_csv("tulu_test.csv")
df = pd.concat([df, df_test])

Label
Not Tulu    4943
Positive    4239
Neutral     3543
Mixed       1257
Negative     961
Name: count, dtype: int64


In [16]:
# Resources needed below: the English stopword list and the Punkt sentence
# tokenizer models used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource instead
# of the pickled 'punkt' models; fetch it too so a fresh environment does not
# fail with LookupError inside word_tokenize.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\justa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\justa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# Stored as a set: the list is only used for per-token membership tests, and a
# set makes each lookup O(1) instead of a linear scan.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

In [18]:
def clean_text(text):
    """
    Strip URLs and non-letter characters from a comment and squeeze repeated characters.

    Steps, in order:
      1. Remove URLs: anything starting with ``http``, or a token that starts
         with ``www.``.
      2. Drop every character that is not an ASCII letter, a code point in the
         Kannada block (U+0C80-U+0CFF), or whitespace.
      3. Collapse runs of three or more identical characters down to two.

    Args:
        text: Raw comment text. Non-string values (``None``, or the float NaN
            that pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    if not isinstance(text, str):
        return ""
    # `www` must begin a token and be followed by a dot; a bare `www\S+` also
    # matched inside elongated words such as "awwww" and deleted most of them.
    text = re.sub(r"http\S+|\bwww\.\S+", "", text)
    text = re.sub(r"[^a-zA-Z\u0C80-\u0CFF\s]", "", text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return text.strip()

def transliterate_to_english(text):
    """
    Romanize Kannada-script words and leave every other word untouched.

    The text is split on whitespace. Each word containing at least one code
    point in the Kannada block (U+0C80-U+0CFF) is passed to ``transliterate``
    (Kannada -> ITRANS); all other words are kept as they are.

    Transliteration is best effort: if it raises, the original word is kept
    and a ``RuntimeWarning`` is emitted so the failure is not hidden.

    Args:
        text: Whitespace-separated text.

    Returns:
        The processed words joined by single spaces.
    """
    import warnings  # local import so this cell does not depend on the import cell

    result = []
    for word in text.split():
        if re.search(r'[\u0C80-\u0CFF]', word):
            # Only the transliteration call itself is guarded.
            try:
                word = transliterate(word, sanscript.KANNADA, sanscript.ITRANS)
            except Exception as exc:
                # The message omits the word so repeated failures of the same
                # kind are de-duplicated by the default warning filter.
                warnings.warn(
                    f"transliteration failed ({type(exc).__name__}); keeping original word",
                    RuntimeWarning,
                    stacklevel=2,
                )
        result.append(word)
    return " ".join(result)

def preprocess_text(text):
    """
    Full text pipeline: clean, romanize, tokenize, then drop English stopwords.

    Returns the surviving tokens joined by single spaces.
    """
    romanized = transliterate_to_english(clean_text(text))
    kept = []
    for token in nltk.word_tokenize(romanized):
        # Stopword comparison is case-insensitive; the token keeps its casing.
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

In [19]:
# Run the preprocessing pipeline over every row's raw text.
df['cleaned_text'] = df['Text'].map(preprocess_text)

In [20]:
# Class names in the order of their integer codes (code == position).
label_names = ["Positive", "Not Tulu", "Neutral", "Mixed", "Negative"]
label_mapping = {name: code for code, name in enumerate(label_names)}

# Encode the string labels; values not present in the mapping become NaN.
df['Label'] = df['Label'].map(label_mapping)

In [21]:
# Eyeball the preprocessed text.
df.loc[:, 'cleaned_text']

0                               Aunty log bohot kadak hai
1       Shruthi awesome Nice collaboration Hope see ur...
2                       Gol gappadh ammana sajjigene best
3                         Chaddida brand thojodijji marre
4                                  Memories got refreshed
                              ...                        
1474    Enchina avaste marree ck attavar edde comedy show
1475                  Corona apaga itthnda Corona suvarna
1476    Nishith perfect acting nice story well done guyss
1477                                  ov nataka full ejja
1478                                 Pukuli maya ge ayana
Name: cleaned_text, Length: 16422, dtype: object

In [22]:
# The first `train_len` rows are the labelled data; everything after is test.
labelled_rows, unlabelled_rows = df.iloc[:train_len], df.iloc[train_len:]
X_train, y_train = labelled_rows['cleaned_text'], labelled_rows['Label']
X_test = unlabelled_rows['cleaned_text']

In [23]:
# Character-level TF-IDF over 1- to 4-grams, capped at 5000 features.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    max_features=5000,
)
# Fit the vocabulary on the training text only, then reuse it for the test text.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [24]:
# class_weight='balanced' reweights classes inversely to their frequency.
lr_params = {"max_iter": 2000, "class_weight": 'balanced'}
lr_model = LogisticRegression(**lr_params)
lr_model.fit(X_train_tfidf, y_train)

In [25]:
# Predicted integer class codes for the test rows.
y_pred = lr_model.predict(X=X_test_tfidf)

In [26]:
# Inverse lookup: integer code -> class name.
reverse_label_mapping = dict((code, name) for name, code in label_mapping.items())

In [27]:
# Build the submission frame positionally. Assigning a fresh pd.Series next to
# a slice of `df` relied on both happening to share the same index labels;
# plain arrays pair Id and Label row-for-row regardless of the index.
test_ids = df.iloc[train_len:]['Id'].to_numpy()
assert len(test_ids) == len(y_pred), "prediction count does not match number of test rows"

# Cast to int before the lookup: predictions may be float-typed if the Label
# column was upcast by missing values (assumption; the cast is harmless for ints).
labels = [reverse_label_mapping[int(pred)] for pred in y_pred]

ans = pd.DataFrame({"Id": test_ids, "Label": labels})
ans.head(50)

Unnamed: 0,Id,Label
0,SA_TU_01,Not Tulu
1,SA_TU_02,Negative
2,SA_TU_03,Not Tulu
3,SA_TU_04,Neutral
4,SA_TU_05,Neutral
5,SA_TU_06,Not Tulu
6,SA_TU_07,Not Tulu
7,SA_TU_08,Neutral
8,SA_TU_09,Positive
9,SA_TU_10,Positive


In [30]:
# Write the predictions without the DataFrame index column.
submission_path = "lr_tulu.csv"
ans.to_csv(submission_path, index=False)