In [None]:
import re
import warnings

import nltk
import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Read the training split first, then the development (validation) split.
df_train, df_val = (
    pd.read_csv(csv_path) for csv_path in ("tam_train.csv", "tam_dev.csv")
)

In [None]:
# Number of training rows per label; the recorded output shows the classes
# are imbalanced, with "Positive" by far the most frequent.
print(df_train["Label"].value_counts())

Label
Positive          18145
unknown_state      5164
Negative           4151
Mixed_feelings     3662
Name: count, dtype: int64


In [None]:
# Fetch the NLTK resources needed by the preprocessing functions.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the word_tokenize models from 'punkt_tab' rather
# than 'punkt'; without it a fresh kernel fails with LookupError on the first
# tokenization call.
nltk.download('punkt_tab')

# English stopword list used to filter tokens during preprocessing.
english_stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
def clean_text(text):
    """
    Strip URLs, non-letter symbols and elongated character runs from raw text.

    Steps, applied in order:
      1. Remove everything from an occurrence of ``http`` or ``www`` up to
         the next whitespace character.
      2. Delete every character that is not an ASCII letter, a code point in
         the Tamil Unicode block (U+0B80-U+0BFF) or whitespace. Digits and
         punctuation are dropped without inserting a space.
      3. Collapse runs of three or more identical characters (other than
         newlines) to exactly two, e.g. "sooooper" -> "sooper".

    Args:
        text: Raw text. Missing values (None, float NaN, pd.NA) yield an
            empty string; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as NaN; treat them as empty text instead of
        # letting re.sub raise TypeError (or turning them into the word "nan").
        if (
            text is None
            or (isinstance(text, float) and text != text)
            or text is pd.NA
        ):
            return ""
        text = str(text)

    # "http\S+" already covers "https..." links, so no separate alternative.
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Z\u0B80-\u0BFF\s]", "", text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return text.strip()

def transliterate_to_english(text):
    """
    Romanize Tamil-script words (ITRANS scheme) and keep all other words.

    The text is split on whitespace. Every word containing at least one code
    point from the Tamil Unicode block (U+0B80-U+0BFF) is passed through
    ``transliterate`` (Tamil -> ITRANS); other words are left untouched. The
    words are re-joined with single spaces, so the original whitespace layout
    is not preserved.

    Transliteration is best-effort: if it fails for a word, that word is kept
    unchanged and a RuntimeWarning is emitted so the failure is visible
    instead of being silently ignored.

    Args:
        text: Input string, possibly mixing Tamil and Latin script.

    Returns:
        The space-joined string with Tamil-script words romanized.
    """
    result = []
    for word in text.split():
        if re.search(r'[\u0B80-\u0BFF]', word):
            # Only the library call is guarded; the regex test cannot fail.
            try:
                word = transliterate(word, sanscript.TAMIL, sanscript.ITRANS)
            except Exception as exc:
                warnings.warn(
                    f"Tamil transliteration failed ({type(exc).__name__}: {exc}); "
                    "keeping the word unchanged",
                    RuntimeWarning,
                )
        result.append(word)
    return " ".join(result)

def preprocess_text(text):
    """
    Run the full normalisation pipeline on a single raw text.

    The text goes through clean_text, then transliterate_to_english, and is
    tokenized with nltk.word_tokenize. Tokens whose lowercase form appears in
    english_stopwords are dropped; the remaining tokens keep their original
    casing and are joined with single spaces.
    """
    romanized = transliterate_to_english(clean_text(text))
    kept_tokens = []
    for token in nltk.word_tokenize(romanized):
        if token.lower() in english_stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
# Add the preprocessed text as a new column on both splits (train first).
for _split_df in (df_train, df_val):
    _split_df['cleaned_text'] = _split_df['Text'].apply(preprocess_text)

In [None]:
# Integer code assigned to each sentiment label.
label_mapping = {
    "Positive": 0,
    "Mixed_feelings": 1,
    "unknown_state": 2,
    "Negative": 3
}


def _encode_labels(labels):
    """
    Convert a Series of label names into the integer codes of label_mapping.

    A Series that already holds only codes from label_mapping is returned
    unchanged, so re-running this cell does not turn every label into NaN the
    way a second plain ``.map`` call would.

    Args:
        labels: pandas Series of label names (or already-encoded codes).

    Returns:
        Series of integer codes aligned with the input index.

    Raises:
        ValueError: if any value cannot be mapped through label_mapping.
    """
    if labels.isin(list(label_mapping.values())).all():
        return labels
    encoded = labels.map(label_mapping)
    # .map leaves NaN for anything that is not a key; fail loudly here rather
    # than letting NaN targets reach the model or the metrics.
    unmapped = labels[encoded.isna()].unique().tolist()
    if unmapped:
        raise ValueError(f"Labels missing from label_mapping: {unmapped}")
    return encoded


df_train['Label'] = _encode_labels(df_train['Label'])
df_val['Label'] = _encode_labels(df_val['Label'])

In [None]:
# Model inputs are the preprocessed texts; targets are the label column.
X_train, y_train = df_train['cleaned_text'], df_train['Label']
X_val, y_val = df_val['cleaned_text'], df_val['Label']

In [None]:
# Character-level TF-IDF over 1- to 4-grams, capped at 5000 features.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    max_features=5000,
)
# Fit on the training texts only; the validation texts are transformed with
# the already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

In [None]:
# Logistic regression with balanced class weights and a raised iteration cap.
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=2000,
)
lr_model.fit(X_train_tfidf, y_train)

In [None]:
# Predict a label code for every validation row from its TF-IDF features.
y_pred_val = lr_model.predict(X_val_tfidf)

In [None]:
# Tie every class name to its integer code explicitly. Relying on the
# insertion order of label_mapping only works while that order matches the
# sorted codes and every class occurs in the validation data.
class_ids = sorted(label_mapping.values())
name_by_id = {code: name for name, code in label_mapping.items()}
class_names = [name_by_id[code] for code in class_ids]

print("Classification Report:")
print(classification_report(y_val, y_pred_val, labels=class_ids, target_names=class_names))
print("Confusion Matrix:")
# Same label order as the report: rows are true classes, columns predictions.
print(confusion_matrix(y_val, y_pred_val, labels=class_ids))

Classification Report:
                precision    recall  f1-score   support

      Positive       0.85      0.58      0.69      2272
Mixed_feelings       0.24      0.38      0.29       472
 unknown_state       0.38      0.52      0.44       619
      Negative       0.35      0.51      0.41       480

      accuracy                           0.54      3843
     macro avg       0.45      0.50      0.46      3843
  weighted avg       0.64      0.54      0.57      3843

Confusion Matrix:
[[1321  350  367  234]
 [  98  178   77  119]
 [  82  109  319  109]
 [  50  112   72  246]]
