In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import nltk
import re
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

In [18]:
# Read the train and dev splits and stack them into one frame. The row index
# is not reset, so each split keeps its own index labels.
labelled_parts = [pd.read_csv(name) for name in ("tam_train.csv", "tam_dev.csv")]
df_val = labelled_parts[1]
df = pd.concat(labelled_parts)
print(df["Label"].value_counts())

# Number of train + dev rows; used later to separate them positionally from
# the test rows appended below.
train_len = df.shape[0]

# Append the test split so the text preprocessing runs over every row at once.
df_test = pd.read_csv("tam_test.csv")
df = pd.concat([df, df_test])

Label
Positive          20417
unknown_state      5783
Negative           4631
Mixed_feelings     4134
Name: count, dtype: int64


In [20]:
# Fetch the NLTK resources this notebook relies on: the stopword lists and the
# Punkt models used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.8.2 and later load Punkt from the 'punkt_tab' package rather than the
# pickled 'punkt' one; without it word_tokenize raises LookupError on a fresh
# install. Older NLTK versions simply ignore the extra package.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\justa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\justa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
# Kept as a set: the stopword filter tests membership once per token, and a
# set lookup is constant-time where the original list was scanned linearly.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

In [22]:
def clean_text(text):
    """
    Strips URLs and non-letter characters and shortens elongated character runs.

    Steps, in order:
      1. Remove URLs, i.e. whitespace-delimited chunks starting with
         ``http://``, ``https://`` or ``www.`` at a word boundary.
      2. Drop every character that is not an ASCII letter, a character in the
         Tamil Unicode block (U+0B80-U+0BFF), or whitespace.
      3. Shorten any character repeated three or more times in a row to two
         occurrences (e.g. "sooooper" -> "sooper").
      4. Trim leading and trailing whitespace.

    Args:
        text: Raw text. Non-string values (such as None or the NaN pandas
            yields for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned string; "" when the input is not a string.
    """
    if not isinstance(text, str):
        return ""
    # The URL prefixes are anchored at a word boundary and require "://" or
    # "." so that elongated words such as "awwww" are not mistaken for a
    # "www..." link and deleted.
    text = re.sub(r"\b(?:https?://|www\.)\S+", "", text)
    text = re.sub(r"[^a-zA-Z\u0B80-\u0BFF\s]", "", text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return text.strip()

def transliterate_to_english(text):
    """
    Romanizes Tamil-script words with the ITRANS scheme, word by word.

    The text is split on whitespace; only words containing a character in the
    Tamil Unicode block (U+0B80-U+0BFF) are handed to ``transliterate``, and
    every other word passes through untouched. Words are re-joined with single
    spaces, so runs of whitespace in the input are collapsed.
    """
    def _romanize(token):
        # Best-effort conversion: on any error the token is kept as it was.
        try:
            if re.search(r'[\u0B80-\u0BFF]', token) is None:
                return token
            return transliterate(token, sanscript.TAMIL, sanscript.ITRANS)
        except Exception:
            return token

    return " ".join(_romanize(token) for token in text.split())

def preprocess_text(text):
    """
    Runs the full text pipeline and returns a space-joined token string.

    The text is cleaned with ``clean_text``, passed through
    ``transliterate_to_english``, tokenized with ``nltk.word_tokenize``, and
    stripped of every token whose lowercase form is in ``english_stopwords``.
    """
    romanized = transliterate_to_english(clean_text(text))
    kept = []
    for token in nltk.word_tokenize(romanized):
        # Stopword matching is case-insensitive; the kept token keeps its case.
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

In [23]:
# Preprocess every row's raw text and store the result in a new column.
df['cleaned_text'] = [preprocess_text(raw) for raw in df['Text']]

In [24]:
# Integer code assigned to each sentiment label.
label_mapping = {
    "Positive": 0,
    "Mixed_feelings": 1,
    "unknown_state": 2,
    "Negative": 3
}
# Encode only while the column still holds the raw labels. Series.map turns
# every value that is not a key of the dict into NaN, so running this cell a
# second time on already-encoded labels would wipe out all targets.
if not pd.api.types.is_numeric_dtype(df['Label']):
    df['Label'] = df['Label'].map(label_mapping)

In [25]:
# Inspect the preprocessed text column.
df.loc[:, 'cleaned_text']

0             Ennq pa idhu paei padama twist nalla irkkae
1       Na oru thalaivar veriyanintha padam pakanum in...
2            last shot apdiye moratu kaala paatha feeling
3                  Darbar motion poster see petta trailer
4                  ln minutes k likes Ajith sir fans like
                              ...                        
3454    indha bhaDhadhdhiன Adharavai bhArdhdhAl indha ...
3455    Trailer time mella pathavung madun like podung...
3456                 la vachu pathe la pakura maari iruku
3457    vauLALar bhiLLai jhamughaM jhArbhAgha ibhbhaDh...
3458    Thalaivar semma style marana mass adichi under...
Name: cleaned_text, Length: 38424, dtype: object

In [26]:
# Split positionally: the first train_len rows are the train + dev data, the
# remainder is the test split appended after them.
train_rows = df.iloc[:train_len]
test_rows = df.iloc[train_len:]
X_train, y_train = train_rows['cleaned_text'], train_rows['Label']
X_test = test_rows['cleaned_text']

In [27]:
# Character-level TF-IDF over 1- to 4-grams, limited to 5000 features.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    max_features=5000,
)
# The vectorizer is fitted on the training text only; the test text is
# transformed with that fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [28]:
# Logistic regression with class weights balanced against label frequency.
lr_model = LogisticRegression(class_weight='balanced', max_iter=2000)
lr_model.fit(X=X_train_tfidf, y=y_train)

In [29]:
# Predict an encoded label for every test row.
y_pred = lr_model.predict(X=X_test_tfidf)

In [30]:
# Inverse lookup: integer code -> original label string.
reverse_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))

In [31]:
# Decode the predicted codes back to label strings.
labels = [reverse_label_mapping[pred] for pred in y_pred]
# Build the frame from plain arrays so Id and Label are paired by position.
# Assigning pd.Series(labels) to a column aligns on index labels instead, which
# only lines up when the test rows happen to carry a 0..n-1 index; with any
# other index the labels silently become NaN.
ans = pd.DataFrame({
    "Id": df.iloc[train_len:]['Id'].to_numpy(),
    "Label": labels,
})
ans.head(50)

Unnamed: 0,Id,Label
0,SA_Ta_01,Positive
1,SA_Ta_02,unknown_state
2,SA_Ta_03,unknown_state
3,SA_Ta_04,Mixed_feelings
4,SA_Ta_05,Negative
5,SA_Ta_06,Positive
6,SA_Ta_07,Negative
7,SA_Ta_08,Positive
8,SA_Ta_09,Positive
9,SA_Ta_10,Positive


In [33]:
# Write the predictions to disk without the row index column.
ans.to_csv(path_or_buf="lr_tam.csv", index=False)