In [None]:
# Colab-only: mount Google Drive so the dataset paths under /content/gdrive
# used by the data-loading cells resolve.
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn import svm
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from collections import Counter

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report

In [None]:
!pip install lime

HELPER FUNCTIONS

In [None]:
# Build the stopword list used by removestopwords(): NLTK's English stopwords
# extended with two hand-written lists of romanized filler words (presumably
# Tamil/Tanglish - confirm with the dataset owners) plus two emoji.
nltk.download('stopwords')
stopwords_list = stopwords.words('english')
stopwords_list.extend(['nalla','ena','unaku','per','irukanga','panna','yarum','mattum','ivan','ada','pesa','unakku','k','sari','idhu','vida','vittu','enga','yen','ithu','poda','dey','irundhu','ya','la', 'u','r','s','bro','da','dei','dai','nu','ah','nee','ni','illa','un','ok','na','pls','ur','unga']) 
# Note: 'va' is listed twice below; duplicates are harmless for membership tests.
stopwords_list.extend(['🥰','indha','antha','vera','iruka','pola','innum','avan','summa','ellam','thaan','❤️','ana','ama','apdi','ithula','po','evlo','eruku','irukum','nama','enna','va','hi','h','ku','iruku','naa','va','oru','athu','avanga','neenga','tha','en','di','dhan','ne','ella','intha']) 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Maps each category name found in the dataset to its integer class id.
# Id 0 ('None-of-the-above') is the class the two-level pipeline separates first.
label = {
    'None-of-the-above': 0,
    'Transphobic': 1,
    'Counter-speech': 2,
    'Misandry': 3,
    'Homophobia': 4,
    'Hope-Speech': 5,
    'Xenophobia': 6,
    'Misogyny': 7,
}

In [None]:
def tolower(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def removepunctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Only ASCII punctuation is stripped; all other characters are kept as-is.
    """
    strip_table = str.maketrans("", "", string.punctuation)
    return text.translate(strip_table)

def deEmojify(text):
    """Remove emoji and pictographic symbols from ``text``.

    In addition to the four blocks originally handled (emoticons, symbols &
    pictographs, transport & map symbols, regional-indicator flags) this also
    strips the Supplemental Symbols and Pictographs block, Miscellaneous
    Symbols, Dingbats and the emoji variation selector. Without those ranges,
    emoji such as U+1F970 and U+2764 pass through preprocessing untouched.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with all matched characters deleted; other characters
        (including non-Latin scripts) are left unchanged.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\u2600-\u26FF"          # miscellaneous symbols
        "\u2700-\u27BF"          # dingbats (includes U+2764 heavy black heart)
        "\uFE0F"                 # variation selector-16 that trails many emoji
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub("", text)

def removestopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``stopwords_list``.

    The surviving tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed.
    """
    kept = []
    for token in text.split():
        if token in stopwords_list:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
def preprocess(df):
  """Clean the ``text`` column and encode ``category`` as integer ids.

  Steps, in order: lowercase, strip ASCII punctuation, strip emoji, remove
  stopwords; then map ``category`` through ``label`` and store the row index
  in an ``index`` column (used later to restore the original row order).

  The input frame is no longer modified: the work is done on a copy, which
  avoids pandas' SettingWithCopyWarning when ``df`` is itself derived from
  another frame (e.g. the result of ``dropna()``) and keeps the raw data
  available to the caller.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``text`` (str) and ``category`` (label name) columns.

  Returns
  -------
  pandas.DataFrame
      A new frame with cleaned ``text``, integer ``category`` and ``index``.
      Category names missing from ``label`` become NaN (``Series.map``
      behaviour) - NOTE(review): consider asserting there are none.
  """
  df = df.copy()
  df['text'] = df['text'].apply(tolower)
  df['text'] = df['text'].apply(removepunctuation)
  df['text'] = df['text'].apply(deEmojify)
  df['text'] = df['text'].apply(removestopwords)
  df['category'] = df['category'].map(label)
  df['index'] = df.index
  return df

In [None]:
def dropnota(df):
    """Return ``df`` without the rows whose ``category`` equals 0.

    0 is the id assigned to 'None-of-the-above' in ``label``. Rows are
    removed by index label; the input frame is not modified.
    """
    is_nota = df['category'] == 0
    nota_labels = df.loc[is_nota].index
    return df.drop(index=nota_labels)

In [None]:
def dropnota_test(df):
    """Return ``df`` without the rows whose predicted ``class`` equals 0.

    Same as ``dropnota`` but keyed on the ``class`` column. Rows are removed
    by index label; the input frame is not modified.
    """
    predicted_nota = df['class'] == 0
    nota_labels = df.loc[predicted_nota].index
    return df.drop(index=nota_labels)

In [None]:
def getnota_test(df):
    """Return only the rows of ``df`` whose predicted ``class`` equals 0."""
    predicted_nota = df['class'].eq(0)
    return df.loc[predicted_nota]

In [None]:
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def tokenizedf(df):
    """Add ``token`` and ``len`` columns to ``df`` in place and return it.

    ``token`` holds the whitespace tokens of ``text``; ``len`` holds the
    character length of ``text`` (not the number of tokens).
    """
    texts = df['text']
    df['token'] = texts.apply(tokenize)
    df['len'] = df.text.apply(len)
    return df

In [None]:
def stats(value):
    """Collapse a category id to a binary flag: 0 stays 0, anything else becomes 1."""
    return 0 if value == 0 else 1

In [None]:
def vocab(df, top_n=100):
  """Return the most frequent tokens of ``text`` for each ``category``.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``category`` and ``text`` (str) columns.
  top_n : int or None, default 100
      Number of most common tokens to keep per category. ``None`` keeps
      every token (``Counter.most_common`` semantics). The default matches
      the previously hard-coded value.

  Returns
  -------
  pandas.Series
      Indexed by category; each value is a list of ``(token, count)`` tuples
      ordered from most to least frequent.
  """
  def _top_tokens(texts):
    # Join the group's texts so token counts are taken over the whole category.
    return Counter(" ".join(texts).split()).most_common(top_n)

  return df.groupby('category')['text'].apply(_top_tokens)

In [None]:
def getVocab(df):
  """Return the union of the per-category top tokens as a sorted list.

  The tokens come from ``vocab(df)``; duplicates across categories are
  merged. The result is sorted so that the vocabulary order (and therefore
  the column order of any vectorizer built from it) is the same on every
  run - iterating a ``set`` of strings gives a run-dependent order.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with ``category`` and ``text`` columns.

  Returns
  -------
  list of str
      Unique tokens in ascending order.
  """
  per_category = vocab(df)
  unique_words = {word for top_words in per_category for word, _ in top_words}
  return sorted(unique_words)

In [None]:
def fitCountVec(df, df_vocab):
    """Return the term-count matrix of ``df['text']`` over the fixed vocabulary ``df_vocab``.

    A new ``CountVectorizer`` is built and fitted on ``df`` on every call.
    """
    vectorizer = CountVectorizer(vocabulary=df_vocab)
    return vectorizer.fit_transform(df['text'])

In [None]:
def fitTfidfVec(df, df_vocab):
    """Return the TF-IDF matrix of ``df['text']`` over the fixed vocabulary ``df_vocab``.

    A new word-level ``TfidfVectorizer`` is built and fitted on ``df`` on
    every call, so the IDF weights always come from the frame passed in.
    """
    vectorizer = TfidfVectorizer(vocabulary=df_vocab, analyzer='word')
    return vectorizer.fit_transform(df['text'])

In [None]:
def getX(model, df, df_vocab, fit_df=None):
  """Build one feature vector per row: sentence embedding followed by TF-IDF.

  Parameters
  ----------
  model : object
      Anything exposing ``encode(list_of_str)`` that returns one embedding
      per input text (here a SentenceTransformer).
  df : pandas.DataFrame
      Frame whose ``text`` column is featurised.
  df_vocab : list of str
      Fixed vocabulary for the TF-IDF part.
  fit_df : pandas.DataFrame, optional
      Frame whose ``text`` column is used to fit the TF-IDF weights. When
      omitted (the default, and the original behaviour) the weights are
      fitted on ``df`` itself. Pass the training frame when featurising
      validation/test data so that the IDF statistics are not computed from
      the evaluation set.

  Returns
  -------
  list of numpy.ndarray
      One 1-D array per row of ``df``: embedding values, then TF-IDF values.
  """
  if fit_df is None:
    tfidf_matrix = fitTfidfVec(df, df_vocab)
  else:
    # Fit IDF on the reference frame only, then project `df` onto it.
    tfidf_vec = TfidfVectorizer(analyzer = 'word', vocabulary = df_vocab)
    tfidf_vec.fit(fit_df['text'])
    tfidf_matrix = tfidf_vec.transform(df['text'])
  embeddings = model.encode(df['text'].tolist())
  Xval = [np.append(emb_row, tfidf_row) for emb_row, tfidf_row in zip(embeddings, tfidf_matrix.toarray())]
  return Xval

In [None]:
def getY(df):
    """Return the ``category`` column of ``df`` (the target labels)."""
    targets = df['category']
    return targets

In [None]:
def split_max(df):
    """Build the level-1 (binary) dataset from ``df``.

    ``category`` is collapsed through ``stats`` (0 stays 0, every other id
    becomes 1); ``text`` is carried over unchanged. Only these two columns
    are present in the returned frame.
    """
    return pd.DataFrame({
        'category': df['category'].map(stats),
        'text': df['text'],
    })

In [None]:
def split_min(df):
    """Build the level-2 dataset: ``df`` without its category-0 rows (see ``dropnota``)."""
    return dropnota(df)

MODELS

In [None]:
# Load the LaBSE sentence-embedding model by name; on first use the weights
# are downloaded (see the progress bars below). It supplies the dense part of
# the features built by getX().
labse_embedding_model = SentenceTransformer('sentence-transformers/LaBSE')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

DATASETS AND PREPROCESSING

In [None]:
# Training split: tab-separated file read with explicit column names
# (category, text); rows with missing values are dropped.
df_train = pd.read_csv('/content/gdrive/Shareddrives/NLP Research work - PANDAS team/ACL/Datasets/ACD/ta-en-misogyny-train.csv',sep='\t', names = ["category", "text"])
df_train = df_train.dropna()
# Preprocess an explicit copy: `train = df_train` only created an alias, so the
# in-place edits made by preprocess() also rewrote the raw frame (and can
# trigger pandas' SettingWithCopyWarning). df_train now keeps the raw data.
train = preprocess(df_train.copy())

In [None]:
# Evaluation split (the "dev" file): tab-separated file read with explicit
# column names (category, text); rows with missing values are dropped.
df_test = pd.read_csv('/content/gdrive/Shareddrives/NLP Research work - PANDAS team/ACL/Datasets/ACD/ta-en-misogyny-dev.csv',sep='\t', names = ["category", "text"])
df_test = df_test.dropna()
# Preprocess an explicit copy: `test = df_test` only created an alias, so the
# in-place edits made by preprocess() also rewrote the raw frame and raised
# pandas' SettingWithCopyWarning. df_test now keeps the raw data.
test = preprocess(df_test.copy())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text']= df['text'].apply(lambda x:tolower(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text']= df['text'].apply(lambda x:removepunctuation(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text']= df['text'].apply(lambda x:deEmojify(x))
A value is trying to be set on a copy of a 

In [None]:
train

Unnamed: 0,category,text,index
0,0,enaku unmaikum aluha wantu thirunangaigal thei...,0
1,1,superstar vijay arivuketta polu thappu pavam a...,1
2,0,ugka smile cute,2
3,0,anna waiting 🥰🥰🥰,3
4,0,yanda tamilnadu evvalavo pirachana athalam vit...,4
...,...,...,...
5943,0,noq day caste religious certificate,5943
5944,0,mimicry escape aitaru,5944
5945,0,rajesh age,5945
5946,2,videos nallarukku dont prank eena varavanga ep...,5946


In [None]:
test

Unnamed: 0,category,text,index
0,0,black saree hot,0
1,2,halo first ayunga kai thatti amount keatkyrang...,1
2,0,hello manithana sollunga pramanana solla vanam,2
3,6,china kaaranai kalaipan daily enjoying 2016,3
4,0,onnota ponnu ipdi vituviya,4
...,...,...,...
1483,1,9 usu vinoth,1483
1484,0,love panuradhaaaa soliyaeeee cover panura niiii,1484
1485,0,part 2 aippa varum,1485
1486,3,jii nariya thetunga andha potta gopiya,1486


In [None]:
test[test['category'] == 1]

Unnamed: 0,category,text,index
75,1,ivaluga yella ore marri irukkaluga iclvalthan ...,75
90,1,thirunainggai mareyathi kuduingga solreingga p...,90
107,1,ippadi nadanthukondal ivarkal mel eppadi mariy...,107
113,1,yedukurathu picha item nai manusana ipadilam k...,113
120,1,ivangala vetti podanum ivangala ippadiye vitto...,120
165,1,ipdi irukura nalathan ungala mathikama mithika...,165
268,1,muruganantham g nanum ungala madhiri dha nenac...,268
296,1,rajeswari rajagopalan true bayam thatti ketta ...,296
297,1,pompala mathiri dress panu ponda movala,297
329,1,ningal penna ka pennai nesikuren yenrdru sonna...,329


In [None]:
# Level-1 ("max") datasets: category collapsed to binary by split_max()
# (0 = None-of-the-above, 1 = any other category).
train_max = split_max(train)
test_max = split_max(test)

In [None]:
train_max

Unnamed: 0,category,text
0,0,enaku unmaikum aluha wantu thirunangaigal thei...
1,1,superstar vijay arivuketta polu thappu pavam a...
2,0,ugka smile cute
3,0,anna waiting 🥰🥰🥰
4,0,yanda tamilnadu evvalavo pirachana athalam vit...
...,...,...
5943,0,noq day caste religious certificate
5944,0,mimicry escape aitaru
5945,0,rajesh age
5946,1,videos nallarukku dont prank eena varavanga ep...


In [None]:
test_max

Unnamed: 0,category,text
0,0,black saree hot
1,1,halo first ayunga kai thatti amount keatkyrang...
2,0,hello manithana sollunga pramanana solla vanam
3,1,china kaaranai kalaipan daily enjoying 2016
4,0,onnota ponnu ipdi vituviya
...,...,...
1483,1,9 usu vinoth
1484,0,love panuradhaaaa soliyaeeee cover panura niiii
1485,0,part 2 aippa varum
1486,1,jii nariya thetunga andha potta gopiya


In [None]:
# Level-2 ("min") training set: only rows whose category is not 0 (NOTA).
train_min = split_min(train)
# The string literal below is disabled code kept for reference: the level-2
# test set is not taken from the gold labels but built later from the level-1
# predictions.
'''test_min = split_min(test)''' #actual test_min, but we need to generate our own test_min based on max level's category prediction

'test_min = split_min(test)'

In [None]:
train_min

Unnamed: 0,category,text,index
1,1,superstar vijay arivuketta polu thappu pavam a...,1
5,1,night fulla mater alaiuradhu day fulla pichaik...,5
9,2,pradeep kodi maataanga nenaikireengala ellaaru...,9
10,3,yena public pudikalayadei arivu ellathavane at...,10
12,4,515 paiyyan straight girl friend gay illai ava...,12
...,...,...,...
5934,3,hiphopgamer92 thevdiaaa paiyaaa pottaa amaaa j...,5934
5938,3,mendal epo ponnunga ambala rights thanthangale...,5938
5941,3,endha thevidiya payalavairamuthuva thaane solr...,5941
5942,7,nadu roomba irunthathu epo ponnunga ambala rig...,5942


In [None]:
'''test_min'''

'test_min'

VOCABULARY AND FEATURE EXTRACTION

In [None]:
# Fixed TF-IDF vocabularies, one per level, each built from that level's
# training frame only (union of the most frequent tokens per category).
vocab_max = getVocab(train_max)
vocab_min = getVocab(train_min)

TRAINING - Level 1 - max classification

In [None]:
# Level-1 features: LaBSE embedding concatenated with TF-IDF over vocab_max.
# NOTE(review): getX() fits the TF-IDF weights on whichever frame it is given,
# so the test matrix uses IDF statistics computed from the test texts rather
# than from the training texts - confirm this is intended.
Xtrain_max = getX(labse_embedding_model, train_max, vocab_max)
Xtest_max = getX(labse_embedding_model, test_max, vocab_max)

In [None]:
# Binary level-1 targets (0 = NOTA, 1 = any other category).
y_train_max = getY(train_max)
y_test_max = getY(test_max)

In [None]:
# Level-1 classifier: RBF-kernel SVM on the binary task.
# NOTE(review): probability=True is requested but only predict() is used in
# the cells shown here, and no random_state is set - confirm both are intended.
svm_model_max = svm.SVC(kernel='rbf', C=1, verbose=True, probability = True)
svm_model_max.fit(Xtrain_max, y_train_max)

[LibSVM]

SVC(C=1, probability=True, verbose=True)

In [None]:
# Level-1 predictions on the evaluation split and their binary report.
# ypred_max is reused below to route rows to the level-2 classifier.
ypred_max = svm_model_max.predict(Xtest_max)
print(classification_report(y_test_max, ypred_max))

              precision    recall  f1-score   support

           0       0.80      0.90      0.84       917
           1       0.79      0.64      0.71       569

    accuracy                           0.80      1486
   macro avg       0.80      0.77      0.78      1486
weighted avg       0.80      0.80      0.79      1486



MERGING LEVELS - Level 2 - min classification

In [None]:
test

Unnamed: 0,category,text,index
0,0,black saree hot,0
1,2,halo first ayunga kai thatti amount keatkyrang...,1
2,0,hello manithana sollunga pramanana solla vanam,2
3,6,china kaaranai kalaipan daily enjoying 2016,3
4,0,onnota ponnu ipdi vituviya,4
...,...,...,...
1483,1,9 usu vinoth,1483
1484,0,love panuradhaaaa soliyaeeee cover panura niiii,1484
1485,0,part 2 aippa varum,1485
1486,3,jii nariya thetunga andha potta gopiya,1486


In [None]:
# Route evaluation rows by their level-1 prediction.
# Work on a copy: `newtest = test` was only an alias, so adding the 'class'
# column silently modified `test` as well.
newtest = test.copy()
newtest['class'] = ypred_max.tolist()
# Rows predicted as NOTA keep class 0 as their final prediction.
newtest_nota = getnota_test(newtest)
# Rows predicted as non-NOTA go on to level 2. The 'class' column is dropped
# here and refilled with the level-2 prediction later; .copy() detaches the
# subset so that later assignment does not write into a derived frame.
newtest_min = dropnota_test(newtest)[['category', 'text', 'index']].copy()
newtest_min

Unnamed: 0,category,text,index
1,2,halo first ayunga kai thatti amount keatkyrang...,1
3,6,china kaaranai kalaipan daily enjoying 2016,3
6,3,புரட்சியாளன் சுபா sonipdila sollapidadhuunga a...,6
11,7,guru murthi dhevudiyalukku porantha dhevudiya ...,11
12,0,vendam verupa show pandra mari pandran entha a...,12
...,...,...,...
1470,2,juz wanna say hetro natural right coz reproduc...,1470
1473,3,shinchan shinchan deii otha avangalukku suppor...,1473
1481,0,bumble bee athan endru sonningale athe nan pai...,1481
1482,4,swetha ingayum wanthutiya homo 9 homo support ...,1482


In [None]:
# Level-2 features: LaBSE embedding concatenated with TF-IDF over vocab_min.
# The test side is built from the rows level 1 predicted as non-NOTA.
# NOTE(review): getX() fits the TF-IDF weights on whichever frame it is given,
# so the test matrix uses IDF statistics from newtest_min - confirm intended.
Xtrain_min = getX(labse_embedding_model, train_min, vocab_min)
Xtest_min = getX(labse_embedding_model, newtest_min, vocab_min)

In [None]:
# Level-2 targets. y_test_min can contain 0 (NOTA) for rows level 1
# misrouted, a label the level-2 model never sees during training.
y_train_min = getY(train_min)
y_test_min = getY(newtest_min)

In [None]:
# Level-2 classifier: RBF-kernel SVM trained on the non-NOTA categories only.
# NOTE(review): same settings as level 1; no random_state is set.
svm_model_min = svm.SVC(kernel='rbf', C=1, verbose=True, probability = True)
svm_model_min.fit(Xtrain_min, y_train_min)

[LibSVM]

SVC(C=1, probability=True, verbose=True)

In [None]:
# Level-2 predictions for the rows level 1 flagged as non-NOTA. No separate
# report is printed here; the evaluation is done on the recombined test set.
ypred_min = svm_model_min.predict(Xtest_min)
#print(classification_report(y_test_min, ypred_min)) #need to do classification report with entire test dataset, not max and min separately

In [None]:
len(ypred_min)

457

In [None]:
# Store the level-2 prediction of each routed row in a 'class' column.
# ('class' is a Python keyword, hence the dict-unpacking form of assign.)
newtest_min = newtest_min.assign(**{'class': ypred_min.tolist()})
newtest_min

Unnamed: 0,category,text,index,class
1,2,halo first ayunga kai thatti amount keatkyrang...,1,3
3,6,china kaaranai kalaipan daily enjoying 2016,3,6
6,3,புரட்சியாளன் சுபா sonipdila sollapidadhuunga a...,6,3
11,7,guru murthi dhevudiyalukku porantha dhevudiya ...,11,3
12,0,vendam verupa show pandra mari pandran entha a...,12,3
...,...,...,...,...
1470,2,juz wanna say hetro natural right coz reproduc...,1470,2
1473,3,shinchan shinchan deii otha avangalukku suppor...,1473,3
1481,0,bumble bee athan endru sonningale athe nan pai...,1481,2
1482,4,swetha ingayum wanthutiya homo 9 homo support ...,1482,4


In [None]:
newtest_nota

Unnamed: 0,category,text,index,class
0,0,black saree hot,0,0
2,0,hello manithana sollunga pramanana solla vanam,2,0
4,0,onnota ponnu ipdi vituviya,4,0
5,0,vinothkumar vinoth varalam ellamey nenga gay,5,0
7,0,vera11 thalaiva love ❤,7,0
...,...,...,...,...
1480,0,hari haran mobile number sent pana,1480,0
1483,1,9 usu vinoth,1483,0
1484,0,love panuradhaaaa soliyaeeee cover panura niiii,1484,0
1485,0,part 2 aippa varum,1485,0


In [None]:
# Recombine both prediction groups into one frame. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
totaltest = pd.concat([newtest_nota, newtest_min], ignore_index = True)
# Restore the original row order using the 'index' column saved by preprocess().
sorted_test = totaltest.sort_values(by = 'index')
sorted_test

Unnamed: 0,category,text,index,class
0,0,black saree hot,0,0
1029,2,halo first ayunga kai thatti amount keatkyrang...,1,3
1,0,hello manithana sollunga pramanana solla vanam,2,0
1030,6,china kaaranai kalaipan daily enjoying 2016,3,6
2,0,onnota ponnu ipdi vituviya,4,0
...,...,...,...,...
1025,1,9 usu vinoth,1483,0
1026,0,love panuradhaaaa soliyaeeee cover panura niiii,1484,0
1027,0,part 2 aippa varum,1485,0
1485,3,jii nariya thetunga andha potta gopiya,1486,3


In [None]:
# Final two-level predictions ('class') and gold labels ('category') for the
# whole evaluation split, aligned row by row in sorted_test.
ypred_test = sorted_test['class']
y_test = getY(sorted_test)

In [None]:
# End-to-end report over all 8 categories for the combined two-level pipeline.
print(classification_report(y_test, ypred_test))

              precision    recall  f1-score   support

           0       0.80      0.90      0.84       917
           1       0.60      0.15      0.24        40
           2       0.36      0.53      0.43        95
           3       0.54      0.56      0.55       218
           4       0.82      0.33      0.47        43
           5       0.46      0.11      0.18        53
           6       0.83      0.54      0.66        70
           7       1.00      0.18      0.31        50

    accuracy                           0.72      1486
   macro avg       0.68      0.41      0.46      1486
weighted avg       0.73      0.72      0.70      1486

