In [52]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [53]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn import svm
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from collections import Counter
import nltk

In [54]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# HELPER FUNCTIONS

In [55]:
nltk.download('stopwords')
stopwords_list = stopwords.words('english')
stopwords_list.extend(['nalla','ena','unaku','per','irukanga','panna','yarum','mattum','ivan','ada','pesa','unakku','k','sari','idhu','vida','vittu','enga','yen','ithu','poda','dey','irundhu','ya','la', 'u','r','s','bro','da','dei','dai','nu','ah','nee','ni','illa','un','ok','na','pls','ur','unga']) 
stopwords_list.extend(['🥰','indha','antha','vera','iruka','pola','innum','avan','summa','ellam','thaan','romba','❤️','ana','ama','apdi','ithula','po','evlo','eruku','irukum','nama','enna','va','hi','h','ku','iruku','naa','va','oru','athu','avanga','neenga','tha','en','di','dhan','ne','ella','intha']) 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
label = { 'None-of-the-above': 0, 'Transphobic':1, 'Counter-speech':2, 'Misandry':3,
       'Homophobia':4, 'Hope-Speech':5, 'Xenophobia':6, 'Misogyny':7}

In [57]:
def tolower(text):
  return text.lower()

def removepunctuation(text):
    clean="".join([i for i in text if i not in string.punctuation])
    return clean

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def removestopwords(text):
  clean = " ".join([word for word in text.split() if word not in stopwords_list])
  return clean

In [58]:
def preprocess(df):
  df['text']= df['text'].apply(lambda x:tolower(x))
  df['text']= df['text'].apply(lambda x:removepunctuation(x))
  df['text']= df['text'].apply(lambda x:deEmojify(x))
  df['text']= df['text'].apply(lambda x:removestopwords(x))
  df['category'] = df['category'].map(label)
  return df

In [59]:
def tokenize(text):
  return text.split()

def tokenizedf(df):
  df['token']= df['text'].apply(lambda x:tokenize(x))
  df['len'] = df.text.apply(lambda x : len(x))
  return df

In [60]:
def stats(value):
    if value == 0:
        return 0
    else:
        return 1

In [61]:
def vocab(df):
  return df.groupby('category')['text'].apply(lambda x: Counter(" ".join(x).split()).most_common(100))

In [62]:
def getVocab(df):
  df_vocab = vocab(df)
  processed = list(set([key for groups in df_vocab for key, value in groups]))
  return processed

In [63]:
def fitTfidfVectrain(df, tfidf_vec):
  x_df = tfidf_vec.fit_transform(df['text'])
  return x_df

In [64]:
def getXtrain(model, df, tfidf_vec):
  model_tf_x = fitTfidfVectrain(df, tfidf_vec)
  model_val_x = model.encode(df['text'].tolist())
  Xval = [np.append(list1, list2) for list1, list2 in zip(model_val_x, model_tf_x.toarray())]
  return Xval

In [65]:
def fitTfidfVectest(df, tfidf_vec):
  x_df = tfidf_vec.transform(df['text'])
  return x_df

In [66]:
def getXtest(model, df, tfidf_vec):
  model_tf_x = fitTfidfVectest(df, tfidf_vec)
  model_val_x = model.encode(df['text'].tolist())
  Xval = [np.append(list1, list2) for list1, list2 in zip(model_val_x, model_tf_x.toarray())]
  return Xval

In [67]:
def getY(df):
  return df['category']

# MODELS

In [68]:
labse_embedding_model = SentenceTransformer('sentence-transformers/LaBSE')

# DATASETS AND PREPROCESSING

In [69]:
df_train = pd.read_csv('/content/drive/Shareddrives/NLP Research work - PANDAS team/ACL/Datasets/ACD/ta-en-misogyny-train.csv',sep='\t', names = ["category", "text"])
df_train.text=df_train.text.astype(str)
train = df_train
train = preprocess(df_train)

In [70]:
df_test = pd.read_csv('/content/drive/Shareddrives/NLP Research work - PANDAS team/ACL/Datasets/ACD/ta-en-misogyny-dev.csv',sep='\t', names = ["category", "text"])
df_test.text=df_test.text.astype(str)
test = df_test
test = preprocess(test)

In [71]:
train

Unnamed: 0,category,text
0,0,enaku unmaikum aluha wantu thirunangaigal thei...
1,1,superstar vijay arivuketta polu thappu pavam a...
2,0,ugka smile cute
3,0,anna waiting 🥰🥰🥰
4,0,yanda tamilnadu evvalavo pirachana athalam vit...
...,...,...
5943,0,noq day caste religious certificate
5944,0,mimicry escape aitaru
5945,0,rajesh age
5946,2,videos nallarukku dont prank eena varavanga ep...


In [72]:
test

Unnamed: 0,category,text
0,0,black saree hot
1,2,halo first ayunga kai thatti amount keatkyrang...
2,0,hello manithana sollunga pramanana solla vanam
3,6,china kaaranai kalaipan daily enjoying 2016
4,0,onnota ponnu ipdi vituviya
...,...,...
1483,1,9 usu vinoth
1484,0,love panuradhaaaa soliyaeeee cover panura niiii
1485,0,part 2 aippa varum
1486,3,jii nariya thetunga andha potta gopiya


#VOCABULARY AND FEATURE EXTRACTION

In [73]:
vocab = getVocab(train)

In [74]:
vocab[3]

'lam'

#TRAINING - Level 1 - max classification

In [75]:
tfidf_vec = TfidfVectorizer(analyzer = 'word', vocabulary = vocab)

In [76]:
Xtrain = getXtrain(labse_embedding_model, train, tfidf_vec)
Xtest = getXtest(labse_embedding_model, test, tfidf_vec)

In [77]:
y_train = getY(train)
y_test = getY(test)

In [78]:
svm_model = svm.SVC(kernel='linear', C=1, verbose=True, probability = True)
svm_model.fit(Xtrain, y_train)

[LibSVM]

SVC(C=1, kernel='linear', probability=True, verbose=True)

Original

In [79]:
ypred = svm_model.predict(Xtest)
print(classification_report(y_test, ypred))

              precision    recall  f1-score   support

           0       0.76      0.96      0.85       919
           1       0.56      0.25      0.34        40
           2       0.44      0.29      0.35        95
           3       0.74      0.55      0.63       218
           4       0.76      0.30      0.43        43
           5       0.40      0.08      0.13        53
           6       0.83      0.57      0.68        70
           7       0.79      0.22      0.34        50

    accuracy                           0.74      1488
   macro avg       0.66      0.40      0.47      1488
weighted avg       0.72      0.74      0.71      1488



Linear model with Intercept

In [80]:
svm_model = svm.LinearSVC(C=1, verbose=True, fit_intercept = True, intercept_scaling = 1)
svm_model.fit(Xtrain, y_train)
ypred = svm_model.predict(Xtest)
print(classification_report(y_test, ypred))

[LibLinear]              precision    recall  f1-score   support

           0       0.79      0.92      0.85       919
           1       0.32      0.20      0.25        40
           2       0.49      0.31      0.38        95
           3       0.66      0.59      0.62       218
           4       0.59      0.37      0.46        43
           5       0.36      0.15      0.21        53
           6       0.78      0.67      0.72        70
           7       0.52      0.30      0.38        50

    accuracy                           0.74      1488
   macro avg       0.56      0.44      0.48      1488
weighted avg       0.71      0.74      0.71      1488



Linear model with modified Class Weights

In [82]:
svm_model = svm.LinearSVC(C = 1, class_weight = {0: 0.09, 1: 0.13, 2: 0.13, 3: 0.13, 4: 0.13, 5: 0.13, 6: 0.13, 7: 0.13}, verbose = True, fit_intercept = True, intercept_scaling = 1)
svm_model.fit(Xtrain, y_train)
ypred = svm_model.predict(Xtest)
print(classification_report(y_test, ypred))

[LibLinear]              precision    recall  f1-score   support

           0       0.81      0.89      0.85       919
           1       0.34      0.28      0.31        40
           2       0.44      0.34      0.38        95
           3       0.61      0.61      0.61       218
           4       0.53      0.42      0.47        43
           5       0.37      0.25      0.30        53
           6       0.70      0.67      0.69        70
           7       0.48      0.24      0.32        50

    accuracy                           0.73      1488
   macro avg       0.54      0.46      0.49      1488
weighted avg       0.71      0.73      0.71      1488



# Lime XAI

In [None]:
!pip install lime

In [None]:
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline
from lime.lime_text import IndexedString,IndexedCharacters
from lime.lime_base import LimeBase
from sklearn.linear_model import Ridge, lars_path
from lime.lime_text import explanation
from functools import partial
import scipy as sp
from sklearn.utils import check_random_state

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class InputTransformer(BaseEstimator, TransformerMixin):
  def __init__(self,labse = labse_embedding_model, df = train, vec = tfidf_vec):
    self.labse = labse
    self.df = df
    self.vec = vec
  def fit(self, x, y=None):
    return self
  def transform(self, x):
    model_tf_x = self.vec.transform(x)
    model_val_x = self.labse.encode(x)
    Xval = [np.append(list1, list2) for list1, list2 in zip(model_val_x, model_tf_x.toarray())]
    return Xval

In [None]:
iptransformer = InputTransformer(labse = labse_embedding_model, df = train, vec = tfidf_vec)

In [None]:
x = train["text"][1]

In [None]:
iptransformer.transform([x])

In [None]:
from lime import lime_text
from sklearn.pipeline import make_pipeline
c = make_pipeline(iptransformer, svm_model)

In [None]:
class_names = svm_model.classes_
explainer = LimeTextExplainer(class_names = class_names)

#Checking indexes

In [None]:
train.loc[(train['category'] == 4)]

#test

In [None]:
idx = train.index[12]

In [None]:
exp = explainer.explain_instance(train["text"][idx], c.predict_proba, num_features = 6, top_labels=23)

In [None]:
print("Question: \n", train["text"][idx])
print("Probability (None-of-the-above) =", c.predict_proba([train["text"][idx]])[0, 0])
print("Probability (Transphobia) =", c.predict_proba([train["text"][idx]])[0, 1])
print("Probability (Counter-Speech) =", c.predict_proba([train["text"][idx]])[0, 2])
print("Probability (Misandry) =", c.predict_proba([train["text"][idx]])[0, 3])
print("Probability (Homophobia) =", c.predict_proba([train["text"][idx]])[0, 4])
print("Probability (Hope-Speech) =", c.predict_proba([train["text"][idx]])[0, 5])
print("Probability (Xenophobia) =", c.predict_proba([train["text"][idx]])[0, 6])
print("Probability (Misogyny) =", c.predict_proba([train["text"][idx]])[0, 7])
print("True Class is:", train["category"][idx])

In [None]:
print(exp.as_list(label = 4))

In [None]:
print(exp.as_list)
exp.show_in_notebook(text=train["text"][idx])

Weights

In [None]:
tfidf_vec

In [None]:
vect = fitTfidfVectrain(train, tfidf_vec)

In [None]:
vect

In [None]:
vect.toarray()