# HASOC2019 Hinglish Hate Speech Classification Pipeline

This notebook covers:
1. Library installation
2. Imports
3. Data loading
4. Preprocessing (transliteration, emoji mapping, text normalization)
5. Feature engineering (TF-IDF, char n-grams, numeric, lexicon, code-mix)
6. Train/test split and oversampling
7. Classical ML baseline (LogReg, SVM, RF)
8. Transformer fine-tuning (mBERT / XLM-R)
9. Ensembling
10. Evaluation & saving models


In [None]:
# 1. Install necessary libraries
!pip install emoji aksharamukha sklearn imblearn transformers torch langdetect indic-nlp-library

In [None]:
# 2. Imports
import inspect
import re
import unicodedata

import emoji
import joblib
import numpy as np
import pandas as pd
import torch
from aksharamukha import transliterate
from imblearn.over_sampling import SMOTE
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from scipy.sparse import hstack
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.svm import LinearSVC
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
# 3. Load HASOC2019 Hinglish dataset
# Adjust the path to where your CSV/TSV file is located
# (for a tab-separated file, also pass sep='\t' to read_csv).
df = pd.read_csv('hasoc2019_hinglish_train.csv')
# Expect columns: 'text', 'label'
# Fail fast with a clear message if the schema differs, instead of hitting an
# opaque KeyError several cells further down.
missing_cols = {'text', 'label'} - set(df.columns)
if missing_cols:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_cols)}")
df.head()

In [None]:
# 4. Preprocessing functions

# a) Emoji mapping
def map_emojis(text):
    """Run ``emoji.demojize`` on ``text`` with a single space as both delimiters.

    Using spaces instead of the library's default delimiters means each emoji
    name ends up set apart from the neighbouring characters by spaces.
    """
    space_delimiters = (" ", " ")
    demojized = emoji.demojize(text, delimiters=space_delimiters)
    return demojized

# b) Transliterate Roman-Hindi to Devanagari (optional)
def transliterate_hinglish(text):
    """Best-effort conversion of romanised text to Devanagari.

    Calls Aksharamukha's ``transliterate.process`` with 'IAST' as the source
    scheme and 'Devanagari' as the target scheme.

    Args:
        text: The string to transliterate.

    Returns:
        The transliterated string, or ``text`` unchanged if the transliteration
        call raises.
    """
    try:
        return transliterate.process('IAST', 'Devanagari', text)
    except Exception:
        # Deliberate best-effort fallback. `except Exception` (instead of a bare
        # `except:`) lets KeyboardInterrupt / SystemExit propagate, so a long
        # pass over the dataset can still be interrupted.
        return text

# c) Normalize text
def _strip_punctuation(text):
    """Replace every char that is not a word char, whitespace or combining mark with a space."""
    # `re`'s \w does not match combining marks (Unicode categories Mn/Mc/Me), so
    # a plain r'[^\w\s]' substitution also blanks out Devanagari vowel signs and
    # viramas and shreds transliterated words. Keep those marks explicitly.
    return ''.join(
        ch if (ch.isalnum() or ch == '_' or ch.isspace()
               or unicodedata.category(ch).startswith('M'))
        else ' '
        for ch in text
    )


def normalize_text(text):
    """Clean one raw post for feature extraction.

    Steps, in order:
      1. map emojis to their names (``map_emojis``);
      2. drop URLs and @mentions -- done *before* transliteration, because the
         Latin-only patterns no longer match once the text has been converted;
      3. transliterate to Devanagari (``transliterate_hinglish``, best effort);
      4. replace punctuation with spaces while keeping combining marks;
      5. collapse whitespace, strip, lowercase.

    Args:
        text: Raw text. ``None`` / NaN (missing values) are treated as empty;
            any other non-string value is converted with ``str``.

    Returns:
        The normalised string (possibly empty).
    """
    # Missing values would otherwise crash inside the emoji/regex calls.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    text = map_emojis(text)
    text = re.sub(r'http\S+|www\.\S+', '', text)       # remove URLs
    text = re.sub(r'@\w+', '', text)                     # remove mentions
    text = transliterate_hinglish(text)
    text = _strip_punctuation(text)                      # punct -> space
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

# Run the normalisation pipeline over every raw text and keep the result in a
# separate column so the original 'text' stays available.
df['clean_text'] = df['text'].map(normalize_text)
df.head()

In [None]:
# 5. Feature engineering

# a) Numeric features
def extract_numeric_feats(text, detect_fn=None):
    """Compute five surface-level numeric features for one text.

    Args:
        text: Input string.
        detect_fn: Optional callable mapping a token to a language code. When
            omitted, ``langdetect.detect`` is used. Mainly useful for plugging
            in a different language identifier or a stub.

    Returns:
        ``[num_words, uppercase_ratio, excls, ques, hinglish_ratio]`` where
        ``num_words`` is the whitespace token count, ``uppercase_ratio`` the
        share of uppercase characters, ``excls`` / ``ques`` the number of '!'
        and '?' characters, and ``hinglish_ratio`` the share of tokens tagged
        'hi' by the detector.
    """
    if detect_fn is None:
        detect_fn = detect

    tokens = text.split()
    num_words = len(tokens)
    uppercase_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
    excls = text.count('!')
    ques = text.count('?')

    # code-mix ratio
    lang_tags = []
    for tok in tokens:
        try:
            lang_tags.append(detect_fn(tok))
        except LangDetectException:
            # langdetect raises on tokens it cannot extract features from
            # (e.g. digits only). Keep the token in the denominator but count
            # it as "not Hindi" instead of aborting the whole feature pass.
            lang_tags.append(None)
    hinglish_ratio = lang_tags.count('hi') / max(len(lang_tags), 1)
    return [num_words, uppercase_ratio, excls, ques, hinglish_ratio]

# Seed langdetect so the per-token language tags (and therefore the code-mix
# ratio) are reproducible between runs.
DetectorFactory.seed = 0

# Numeric features are computed from the RAW text: `clean_text` is already
# lowercased and stripped of punctuation, which would force the uppercase ratio
# and the '!' / '?' counts to zero for every row.
raw_texts = df['text'].fillna('').astype(str)
numeric_feats = np.array([extract_numeric_feats(t) for t in raw_texts])

# b) TF-IDF word n-grams
# `clean_text` is whitespace-delimited, so tokenise on whitespace. The default
# token pattern is built on \w, which does not match Devanagari combining marks
# and would split words at every vowel sign.
tfidf_word = TfidfVectorizer(
    ngram_range=(1,3), min_df=3, max_df=0.9, max_features=20000,
    token_pattern=r'(?u)\S\S+'
)
X_tfidf_word = tfidf_word.fit_transform(df['clean_text'])

# c) TF-IDF char n-grams
tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(3,6), max_features=20000)
X_tfidf_char = tfidf_char.fit_transform(df['clean_text'])

# NOTE(review): both vectorizers are fitted on the full dataset before the
# train/test split, so held-out rows influence the vocabulary and IDF weights.
# For an unbiased estimate, fit them on the training rows only.
# (`hstack` comes from scipy.sparse, imported in the imports cell.)
X = hstack([X_tfidf_word, X_tfidf_char, numeric_feats], format='csr')
y = df['label']

In [None]:
# 6. Train/test split and oversampling
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# A float `sampling_strategy` is only accepted by SMOTE for binary targets, and
# it raises when the minority class already meets the requested ratio. Choose
# the strategy from the actual class counts of the training split instead of
# assuming a strongly imbalanced binary problem.
SMOTE_TARGET_RATIO = 0.9
train_counts = y_train.value_counts()
# Number of minority samples SMOTE would have to synthesise in the binary case.
n_to_synthesise = int(train_counts.max() * SMOTE_TARGET_RATIO - train_counts.min())

if len(train_counts) > 2:
    # Multi-class: bring every non-majority class up to the majority count.
    smote = SMOTE(random_state=42, sampling_strategy='not majority')
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
elif n_to_synthesise > 0:
    smote = SMOTE(random_state=42, sampling_strategy=SMOTE_TARGET_RATIO)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
else:
    # Already at (or above) the target ratio: nothing to synthesise.
    X_train_res, y_train_res = X_train, y_train

In [None]:
# 7. Classical ML baseline
# The stochastic estimators get a fixed seed so the reported scores are
# reproducible from one run to the next.
models = {
    'LogReg': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'SVM': LinearSVC(class_weight='balanced', max_iter=5000, random_state=42),
    'RF': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
}

# Fit each model on the resampled training data and report macro-F1 on the
# untouched test split.
for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    preds = model.predict(X_test)
    print(f"{name} F1: {f1_score(y_test, preds, average='macro'):.4f}")

In [None]:
# 8. Transformer fine-tuning (e.g. XLM-RoBERTa)
checkpoint = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Map raw label values to contiguous integer ids: the classification head and
# torch.tensor() need integers, whatever dtype the 'label' column has.
label_values = sorted(df['label'].unique())
label_to_id = {value: idx for idx, value in enumerate(label_values)}

# Hold out a validation set. Per-epoch evaluation and best-model selection need
# one, and fine-tuning on every row would leave nothing unseen to score on.
# Same split parameters as the classical train/test split.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['clean_text'], df['label'].map(label_to_id),
    test_size=0.2, random_state=42, stratify=df['label']
)

# Tokenization
train_enc = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_enc = tokenizer(val_texts.tolist(), truncation=True, padding=True)

# Build dataset object
class HateDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping of field name -> per-example sequences, as returned
            by the tokenizer.
        labels: Iterable of integer labels (pandas Series, array or list), in
            the same order as the encodings.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # A plain list of ints gives positional lookup regardless of the
        # container (and index) the labels arrived in.
        self.labels = [int(label) for label in labels]
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = HateDataset(train_enc, train_labels)
val_dataset = HateDataset(val_enc, val_labels)

def compute_metrics(eval_pred):
    """Macro-F1 on the validation set (same metric as the classical models)."""
    logits, labels = eval_pred
    return {'f1_macro': f1_score(labels, np.argmax(logits, axis=-1), average='macro')}

# Training arguments
# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy`, older ones `evaluation_strategy`; use whichever this
# installation accepts.
strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)
args = TrainingArguments(
    output_dir='./transformer_out', num_train_epochs=3,
    per_device_train_batch_size=16,
    # load_best_model_at_end requires the save and evaluation schedules to match.
    save_strategy='epoch',
    save_total_limit=1, load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    **{strategy_kwarg: 'epoch'}
)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label_to_id),
    id2label={idx: str(value) for value, idx in label_to_id.items()},
    label2id={str(value): idx for value, idx in label_to_id.items()},
)
trainer = Trainer(
    model=model, args=args,
    train_dataset=train_dataset, eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# 9. Soft-voting ensemble of the classical models
# (the fine-tuned transformer is not part of this ensemble)
# Calibrate SVM for probabilities: soft voting averages predict_proba outputs,
# which LinearSVC does not provide on its own.
svm_cal = CalibratedClassifierCV(models['SVM'], cv=3)
ensemble = VotingClassifier(
    estimators=[
        ('lr', models['LogReg']),
        ('svm', svm_cal),
        ('rf', models['RF'])
    ],
    voting='soft'
)
# VotingClassifier.fit clones and fits every estimator itself, so the
# calibrated SVM is passed in unfitted rather than being trained twice.
ensemble.fit(X_train_res, y_train_res)
ensemble_preds = ensemble.predict(X_test)
print(f"Ensemble F1: {f1_score(y_test, ensemble_preds, average='macro'):.4f}")

In [None]:
# 10. Save pipelines and models
# NOTE: `extract_numeric_feats` is pickled by reference (module + function name),
# not by value, so the same function must be defined or importable in whatever
# process later loads this file.
joblib.dump({
    'tfidf_word': tfidf_word,
    'tfidf_char': tfidf_char,
    'numeric_feats_fn': extract_numeric_feats,
    'classical_ensemble': ensemble
}, 'hasoc_hinglish_classical.pkl')
# Save transformer
model.save_pretrained('hasoc_transformer/')
tokenizer.save_pretrained('hasoc_transformer/')