In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn import svm
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from collections import Counter
import nltk

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m105.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.

# HELPER FUNCTIONS

In [4]:
nltk.download('stopwords')
stopwords_list = stopwords.words('english')
stopwords_list.extend(['nalla','ena','unaku','per','irukanga','panna','yarum','mattum','ivan','ada','pesa','unakku','k','sari','idhu','vida','vittu','enga','yen','ithu','poda','dey','irundhu','ya','la', 'u','r','s','bro','da','dei','dai','nu','ah','nee','ni','illa','un','ok','na','pls','ur','unga']) 
stopwords_list.extend(['🥰','indha','antha','vera','iruka','pola','innum','avan','summa','ellam','thaan','romba','❤️','ana','ama','apdi','ithula','po','evlo','eruku','irukum','nama','enna','va','hi','h','ku','iruku','naa','va','oru','athu','avanga','neenga','tha','en','di','dhan','ne','ella','intha']) 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
label = { 'None-of-the-above': 0, 'Transphobic':1, 'Counter-speech':2, 'Misandry':3,
       'Homophobia':4, 'Hope-Speech':5, 'Xenophobia':6, 'Misogyny':7}

In [6]:
def tolower(text):
  return text.lower()

def removepunctuation(text):
    clean="".join([i for i in text if i not in string.punctuation])
    return clean

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def removestopwords(text):
  clean = " ".join([word for word in text.split() if word not in stopwords_list])
  return clean

In [7]:
def preprocess(df):
  df['text']= df['text'].apply(lambda x:tolower(x))
  df['text']= df['text'].apply(lambda x:removepunctuation(x))
  df['text']= df['text'].apply(lambda x:deEmojify(x))
  df['text']= df['text'].apply(lambda x:removestopwords(x))
  df['category'] = df['category'].map(label)
  return df

In [8]:
def tokenize(text):
  return text.split()

def tokenizedf(df):
  df['token']= df['text'].apply(lambda x:tokenize(x))
  df['len'] = df.text.apply(lambda x : len(x))
  return df

In [9]:
def stats(value):
    if value == 0:
        return 0
    else:
        return 1

In [10]:
def vocab(df):
  return df.groupby('category')['text'].apply(lambda x: Counter(" ".join(x).split()).most_common(100))

In [11]:
def getVocab(df):
  df_vocab = vocab(df)
  processed = list(set([key for groups in df_vocab for key, value in groups]))
  return processed

In [12]:
def fitTfidfVectrain(df, tfidf_vec):
  x_df = tfidf_vec.fit_transform(df['text'])
  return x_df

In [13]:
def getXtrain(model, df, tfidf_vec):
  model_tf_x = fitTfidfVectrain(df, tfidf_vec)
  model_val_x = model.encode(df['text'].tolist())
  Xval = [np.append(list1, list2) for list1, list2 in zip(model_val_x, model_tf_x.toarray())]
  return Xval

In [14]:
def fitTfidfVectest(df, tfidf_vec):
  x_df = tfidf_vec.transform(df['text'])
  return x_df

In [15]:
def getXtest(model, df, tfidf_vec):
  model_tf_x = fitTfidfVectest(df, tfidf_vec)
  model_val_x = model.encode(df['text'].tolist())
  Xval = [np.append(list1, list2) for list1, list2 in zip(model_val_x, model_tf_x.toarray())]
  return Xval

In [16]:
def getY(df):
  return df['category']

# MODELS

In [17]:
labse_embedding_model = SentenceTransformer('sentence-transformers/LaBSE')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

# DATASETS AND PREPROCESSING

In [18]:
df_train = pd.read_csv('/content/drive/Shareddrives/NLP Research work - PANDAS team/ACL/Datasets/ACD/ta-en-misogyny-train.csv',sep='\t', names = ["category", "text"])
df_train.text=df_train.text.astype(str)
train = df_train
train = preprocess(df_train)

In [19]:
df_test = pd.read_csv('/content/drive/Shareddrives/NLP Research work - PANDAS team/ACL/Datasets/ACD/ta-en-misogyny-dev.csv',sep='\t', names = ["category", "text"])
df_test.text=df_test.text.astype(str)
test = df_test
test = preprocess(test)

In [20]:
train

Unnamed: 0,category,text
0,0,enaku unmaikum aluha wantu thirunangaigal thei...
1,1,superstar vijay arivuketta polu thappu pavam a...
2,0,ugka smile cute
3,0,anna waiting 🥰🥰🥰
4,0,yanda tamilnadu evvalavo pirachana athalam vit...
...,...,...
5943,0,noq day caste religious certificate
5944,0,mimicry escape aitaru
5945,0,rajesh age
5946,2,videos nallarukku dont prank eena varavanga ep...


In [21]:
test

Unnamed: 0,category,text
0,0,black saree hot
1,2,halo first ayunga kai thatti amount keatkyrang...
2,0,hello manithana sollunga pramanana solla vanam
3,6,china kaaranai kalaipan daily enjoying 2016
4,0,onnota ponnu ipdi vituviya
...,...,...
1483,1,9 usu vinoth
1484,0,love panuradhaaaa soliyaeeee cover panura niiii
1485,0,part 2 aippa varum
1486,3,jii nariya thetunga andha potta gopiya


#VOCABULARY AND FEATURE EXTRACTION

In [22]:
vocab = getVocab(train)

In [23]:
vocab[3]

'malini'

#Getting Xtrain, y_train and test lists

In [24]:
tfidf_vec = TfidfVectorizer(analyzer = 'word', vocabulary = vocab)

In [25]:
Xtrain = getXtrain(labse_embedding_model, train, tfidf_vec)
Xtest = getXtest(labse_embedding_model, test, tfidf_vec)

In [26]:
y_train = getY(train)
y_test = getY(test)

#test

In [42]:
pip install hiclass

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hiclass
  Downloading hiclass-4.3.0-py3-none-any.whl (25 kB)
Installing collected packages: hiclass
Successfully installed hiclass-4.3.0


In [43]:
from hiclass import LocalClassifierPerNode
from sklearn.ensemble import RandomForestClassifier
from hiclass import LocalClassifierPerParentNode
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [53]:
lr = LogisticRegression()
pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('lcppn', LocalClassifierPerParentNode(local_classifier=lr)),
])

In [51]:
y_train=np.array(y_train)

In [55]:
pipeline.fit(Xtrain, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('tfidf', TfidfTransformer()),
                ('lcppn',
                 LocalClassifierPerParentNode(local_classifier=LogisticRegression()))])

In [56]:
ypred = pipeline.predict(Xtest)

In [59]:
print(classification_report(y_test, ypred))

              precision    recall  f1-score   support

           0       0.72      0.97      0.83       919
           1       0.50      0.05      0.09        40
           2       0.52      0.13      0.20        95
           3       0.70      0.53      0.61       218
           4       0.89      0.19      0.31        43
           5       0.33      0.04      0.07        53
           6       0.90      0.50      0.64        70
           7       0.67      0.12      0.20        50

    accuracy                           0.72      1488
   macro avg       0.65      0.32      0.37      1488
weighted avg       0.70      0.72      0.67      1488

