In [1]:
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [109]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn import svm
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from collections import Counter

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report

HELPER FUNCTIONS

In [111]:
nltk.download('stopwords')
stopwords_list = stopwords.words('english')
stopwords_list.extend(['nalla','ena','unaku','per','irukanga','panna','yarum','mattum','ivan','ada','pesa','unakku','k','sari','idhu','vida','vittu','enga','yen','ithu','poda','dey','irundhu','ya','la', 'u','r','s','bro','da','dei','dai','nu','ah','nee','ni','illa','un','ok','na','pls','ur','unga']) 
stopwords_list.extend(['🥰','indha','antha','vera','iruka','pola','innum','avan','summa','ellam','thaan','❤️','ana','ama','apdi','ithula','po','evlo','eruku','irukum','nama','enna','va','hi','h','ku','iruku','naa','va','oru','athu','avanga','neenga','tha','en','di','dhan','ne','ella','intha']) 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [112]:
label = { 'None-of-the-above': 0, 'Transphobic':1, 'Counter-speech':2, 'Misandry':3,
       'Homophobia':4, 'Hope-Speech':5, 'Xenophobia':6, 'Misogyny':7}

In [113]:
def tolower(text):
  return text.lower()

def removepunctuation(text):
    clean="".join([i for i in text if i not in string.punctuation])
    return clean

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def removestopwords(text):
  clean = " ".join([word for word in text.split() if word not in stopwords_list])
  return clean

In [114]:
def preprocess(df):
  df['text']= df['text'].apply(lambda x:tolower(x))
  df['text']= df['text'].apply(lambda x:removepunctuation(x))
  df['text']= df['text'].apply(lambda x:deEmojify(x))
  df['text']= df['text'].apply(lambda x:removestopwords(x))
  df['category'] = df['category'].map(label)
  return df

In [115]:
def dropnota(df):
  return df.drop(df[df['category']==0].index)

In [116]:
def tokenize(text):
  return text.split()

def tokenizedf(df):
  df['token']= df['text'].apply(lambda x:tokenize(x))
  df['len'] = df.text.apply(lambda x : len(x))
  return df

In [117]:
def stats(value):
    if value == 0:
        return 0
    else:
        return 1

In [118]:
def vocab(df):
  return df.groupby('category')['text'].apply(lambda x: Counter(" ".join(x).split()).most_common(100))

In [201]:
def getVocab(df):
  df_vocab = vocab(df)
  processed = list(set([key for groups in df_vocab for key, value in groups]))
  return processed

In [202]:
def fitCountVec(df, df_vocab):
  count_vec = CountVectorizer(vocabulary = df_vocab)
  x_df = count_vec.fit_transform(df['text'])
  return x_df

In [203]:
def fitTfidfVec(df, df_vocab):
  tfidf_vec = TfidfVectorizer(analyzer = 'word', vocabulary = df_vocab)
  x_df = tfidf_vec.fit_transform(df['text'])
  return x_df

In [236]:
def getX(model, df, df_vocab):
  model_tf_x = fitTfidfVec(df, df_vocab)
  model_val_x = model.encode(df['text'].tolist())
  Xval = [np.append(list1, list2) for list1, list2 in zip(model_val_x, model_tf_x.toarray())]
  return Xval

In [204]:
def getY(df):
  return df['category']

In [205]:
def split_max(df):
  data_max = {}
  data_max['category'] = df['category'].map(stats)
  data_max['text'] = df['text']
  
  df_max = pd.DataFrame(data_max)
  return df_max

In [206]:
def split_min(df):
  df_min = dropnota(df)
  return df_min

MODELS

In [207]:
labse_embedding_model = SentenceTransformer('sentence-transformers/LaBSE')

DATASETS AND PREPROCESSING

In [257]:
df_train = pd.read_csv('/content/gdrive/Shareddrives/NLP Research work - PANDAS team/ACL/Datasets/ACD/ta-en-misogyny-train.csv',sep='\t', names = ["category", "text"])
df_train = df_train.dropna()
train = df_train
train = preprocess(train)

In [258]:
df_test = pd.read_csv('/content/gdrive/Shareddrives/NLP Research work - PANDAS team/ACL/Datasets/ACD/ta-en-misogyny-dev.csv',sep='\t', names = ["category", "text"])
df_test = df_test.dropna()
test = df_test
test = preprocess(test)

In [259]:
train

Unnamed: 0,category,text
0,0,enaku unmaikum aluha wantu thirunangaigal thei...
1,1,superstar vijay arivuketta polu thappu pavam a...
2,0,ugka smile cute
3,0,anna waiting 🥰🥰🥰
4,0,yanda tamilnadu evvalavo pirachana athalam vit...
...,...,...
5943,0,noq day caste religious certificate
5944,0,mimicry escape aitaru
5945,0,rajesh age
5946,2,videos nallarukku dont prank eena varavanga ep...


In [260]:
test

Unnamed: 0,category,text
0,0,black saree hot
1,2,halo first ayunga kai thatti amount keatkyrang...
2,0,hello manithana sollunga pramanana solla vanam
3,6,china kaaranai kalaipan daily enjoying 2016
4,0,onnota ponnu ipdi vituviya
...,...,...
1483,1,9 usu vinoth
1484,0,love panuradhaaaa soliyaeeee cover panura niiii
1485,0,part 2 aippa varum
1486,3,jii nariya thetunga andha potta gopiya


In [261]:
#train_max - category (yes - 1, no - 0)
train_max = split_max(train)
test_max = split_max(test)

In [262]:
train_max

Unnamed: 0,category,text
0,0,enaku unmaikum aluha wantu thirunangaigal thei...
1,1,superstar vijay arivuketta polu thappu pavam a...
2,0,ugka smile cute
3,0,anna waiting 🥰🥰🥰
4,0,yanda tamilnadu evvalavo pirachana athalam vit...
...,...,...
5943,0,noq day caste religious certificate
5944,0,mimicry escape aitaru
5945,0,rajesh age
5946,1,videos nallarukku dont prank eena varavanga ep...


In [263]:
test_max

Unnamed: 0,category,text
0,0,black saree hot
1,1,halo first ayunga kai thatti amount keatkyrang...
2,0,hello manithana sollunga pramanana solla vanam
3,1,china kaaranai kalaipan daily enjoying 2016
4,0,onnota ponnu ipdi vituviya
...,...,...
1483,1,9 usu vinoth
1484,0,love panuradhaaaa soliyaeeee cover panura niiii
1485,0,part 2 aippa varum
1486,1,jii nariya thetunga andha potta gopiya


In [264]:
#train_min - categories other than NOTA
train_min = split_min(train)
test_min = split_min(test)

In [265]:
train_min

Unnamed: 0,category,text
1,1,superstar vijay arivuketta polu thappu pavam a...
5,1,night fulla mater alaiuradhu day fulla pichaik...
9,2,pradeep kodi maataanga nenaikireengala ellaaru...
10,3,yena public pudikalayadei arivu ellathavane at...
12,4,515 paiyyan straight girl friend gay illai ava...
...,...,...
5934,3,hiphopgamer92 thevdiaaa paiyaaa pottaa amaaa j...
5938,3,mendal epo ponnunga ambala rights thanthangale...
5941,3,endha thevidiya payalavairamuthuva thaane solr...
5942,7,nadu roomba irunthathu epo ponnunga ambala rig...


In [266]:
test_min

Unnamed: 0,category,text
1,2,halo first ayunga kai thatti amount keatkyrang...
3,6,china kaaranai kalaipan daily enjoying 2016
6,3,புரட்சியாளன் சுபா sonipdila sollapidadhuunga a...
11,7,guru murthi dhevudiyalukku porantha dhevudiya ...
14,3,arumaiyana speech akka karu nai pudunguvan ipo
...,...,...
1473,3,shinchan shinchan deii otha avangalukku suppor...
1478,2,ananda sreniwasan mekaum nallamanither maduvan...
1482,4,swetha ingayum wanthutiya homo 9 homo support ...
1483,1,9 usu vinoth


VOCABULARY AND FEATURE EXTRACTION

In [267]:
vocab_max = getVocab(train_max)
vocab_min = getVocab(train_min)

In [268]:
'''#train_max
x_train_cv_max = fitCountVec(train_max, vocab_max)
x_train_tf_max = fitTfidfVec(train_max, vocab_max)
y_train_max = getY(train_max)'''

'#train_max\nx_train_cv_max = fitCountVec(train_max, vocab_max)\nx_train_tf_max = fitTfidfVec(train_max, vocab_max)\ny_train_max = getY(train_max)'

In [269]:
'''#test_max
x_test_cv_max = fitCountVec(test_max, vocab_max)
x_test_tf_max = fitTfidfVec(test_max, vocab_max)
y_test_max = getY(test_max)'''

'#test_max\nx_test_cv_max = fitCountVec(test_max, vocab_max)\nx_test_tf_max = fitTfidfVec(test_max, vocab_max)\ny_test_max = getY(test_max)'

In [270]:
'''#train_min
x_train_cv_min = fitCountVec(train_min, vocab_min)
x_train_tf_min = fitTfidfVec(train_min, vocab_min)
y_train_min = getY(train_min)'''

'#train_min\nx_train_cv_min = fitCountVec(train_min, vocab_min)\nx_train_tf_min = fitTfidfVec(train_min, vocab_min)\ny_train_min = getY(train_min)'

In [271]:
'''#test_min
x_test_cv_min = fitCountVec(test_min, vocab_min)
x_test_tf_min = fitTfidfVec(test_min, vocab_min)
y_test_min = getY(test_min)'''

'#test_min\nx_test_cv_min = fitCountVec(test_min, vocab_min)\nx_test_tf_min = fitTfidfVec(test_min, vocab_min)\ny_test_min = getY(test_min)'

TRAINING - Level 1 - max classification

In [272]:
Xtrain_max = getX(labse_embedding_model, train_max, vocab_max)
Xtest_max = getX(labse_embedding_model, test_max, vocab_max)

In [273]:
y_train_max = getY(train_max)
y_test_max = getY(test_max)

In [274]:
svm_model_max = svm.SVC(kernel='rbf', C=1, verbose=True, probability = True)
svm_model_max.fit(Xtrain_max, y_train_max)

[LibSVM]

SVC(C=1, probability=True, verbose=True)

In [275]:
ypred_max = svm_model_max.predict(Xtest_max)
print(classification_report(y_test_max, ypred_max))

              precision    recall  f1-score   support

           0       0.80      0.90      0.84       917
           1       0.79      0.64      0.71       569

    accuracy                           0.80      1486
   macro avg       0.80      0.77      0.78      1486
weighted avg       0.80      0.80      0.79      1486



TRIAL

In [276]:
Xtest_max