In [60]:
import pandas as pd
import numpy as np

In [52]:
# Locations of the three source lexicons (relative to the notebook's working directory).
expanded_lexicon_path = './Data/lexicon-of-abusive-words-master/Lexicons/expandedLexicon.txt'
hurtlex_path = './Data/hurtlex_EN.tsv'
mol_path = './Data/mol.csv'

# Expanded lexicon: read as tab-separated with no header row, so the columns are labelled 0 and 1.
expanded_lexicon = pd.read_csv(expanded_lexicon_path, header=None, sep="\t")
# HurtLex: read as tab-separated; the first row supplies the column names.
hurtlex = pd.read_csv(hurtlex_path, sep="\t")
# MOL: read with pandas' default CSV settings.
mol = pd.read_csv(mol_path)




### Explore data:

In [43]:
# Preview the first five rows of the expanded lexicon.
expanded_lexicon.head(n=5)

Unnamed: 0,0,1
0,horrible_noun,3.679601
1,disgusting_adj,3.493682
2,moron_noun,3.469677
3,bastard_noun,3.399238
4,stupid_noun,3.323882


In [17]:
# Preview the first five rows of HurtLex.
hurtlex.head(n=5)

Unnamed: 0,id,pos,category,stereotype,lemma,level
0,EN1382,n,qas,no,gag reel,inclusive
1,EN7077,a,cds,no,snotty,conservative
2,EN6856,n,is,yes,mendicant,conservative
3,EN5485,n,re,no,maffias,conservative
4,EN5024,n,cds,no,lying in trade,conservative


In [18]:
# Preview the first five rows of MOL.
mol.head(n=5)

Unnamed: 0,term-or-expression,explicit-or-implicit,pt-brazilian-portuguese,pt-contextual-label,pt-hate-label,pt-deeply-culture-rooted,en-american-english,en-contextual-label,en-hate-label,es-latin-spanish,...,fr-african-french,fr-contextual-label,fr-hate-Label,ge-german,ge-contextual-label,ge-hate-Label,tu-turkish,tu-contextual-label,tu-hate-Label,Meaning Sources
0,term,explicit,chorume,1.0,0,0.0,rotten,0.0,0,0,...,0,0.0,0,Scheiße,0.0,0,bulamaç,0.0,0,https://michaelis.uol.com.br/moderno-portugues...
1,term,explicit,baixaria,1.0,0,0.0,fuckfest,1.0,0,vileza,...,0,0.0,0,Betrüger,1.0,0,değersiz,1.0,0,https://michaelis.uol.com.br/moderno-portugues...
2,term,explicit,cu,0.0,0,0.0,asshole,0.0,0,culo,...,cul,0.0,0,Arsch,0.0,0,büzük,0.0,0,https://michaelis.uol.com.br/moderno-portugues...
3,expression,explicit,cu pra tomar,1.0,0,0.0,out on your ass,1.0,0,culo para coger,...,cul à prendre,1.0,0,Schieb es dir sonst wohin,1.0,0,götüne sok,1.0,0,https://dictionary.cambridge.org/pt/dicionario...
4,term,explicit,vagabundo,1.0,0,0.0,degenerate,1.0,0,vagabundo,...,clochard,1.0,0,Penner,1.0,0,serseri,1.0,0,https://michaelis.uol.com.br/moderno-portugues...


In [71]:
# Helper that prints a short structural summary of a dataframe
def print_basic_info(df, name):
    """Print the row count, column count and column names of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    name : str
        Label shown in the heading line of the report.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    report_lines = [
        f"Information for dataframe: {name}",
        "-" * 40,
        f"Number of rows: {df.shape[0]}",
        f"Number of columns: {df.shape[1]}",
        f"Column names: {df.columns.tolist()}",
        # A lone newline entry, plus print's own line ending, leaves two blank lines after the report.
        "\n",
    ]
    print("\n".join(report_lines))

# Print basic information for each dataframe, in the order: expanded_lexicon, hurtlex, mol
for frame_name, frame in (
    ('expanded_lexicon', expanded_lexicon),
    ('hurtlex', hurtlex),
    ('mol', mol),
):
    print_basic_info(frame, frame_name)



Information for dataframe: expanded_lexicon
----------------------------------------
Number of rows: 8478
Number of columns: 2
Column names: ['hate-label', 'unit']


Information for dataframe: hurtlex
----------------------------------------
Number of rows: 8228
Number of columns: 6
Column names: ['id', 'pos', 'category', 'stereotype', 'unit', 'level']


Information for dataframe: mol
----------------------------------------
Number of rows: 1011
Number of columns: 6
Column names: ['term-or-expression', 'explicit-or-implicit', 'unit', 'en-contextual-label', 'en-hate-label', 'Meaning Sources']




# Check for overlapping words

### Convert the word / string units to the same format:

In [53]:
# Bring the word column of every lexicon under the shared name 'unit'.
# Every step is written so the cell can be re-run safely: once a step has been
# applied, running it again is skipped or has no effect instead of raising KeyError.

# Expanded lexicon: column 0 holds tokens such as 'horrible_noun';
# strip the trailing '_<letters>' tag so only the bare word remains.
if 0 in expanded_lexicon.columns:
    expanded_lexicon['unit'] = expanded_lexicon[0].str.replace(r'_[a-zA-Z]+$', '', regex=True)
    expanded_lexicon = expanded_lexicon.drop(columns=[0])

# HurtLex: the word column is called 'lemma' (rename ignores keys that are already gone).
hurtlex = hurtlex.rename(columns={'lemma': 'unit'})

# MOL: keep the American-English term as 'unit' and discard the columns of the other languages.
mol = mol.rename(columns={'en-american-english': 'unit'})
columns_to_drop = ['pt-brazilian-portuguese', 'pt-contextual-label', 'pt-hate-label', 'pt-deeply-culture-rooted', 
                   'es-latin-spanish', 'es-contextual-label', 'es-hate-label', 'fr-african-french', 
                   'fr-contextual-label', 'fr-hate-Label', 'ge-german', 'ge-contextual-label', 'ge-hate-Label', 
                   'tu-turkish', 'tu-contextual-label', 'tu-hate-Label']

# errors='ignore' prevents a KeyError when the columns were already removed by an earlier run.
mol = mol.drop(columns=columns_to_drop, errors='ignore')

# NOTE(review): units are compared downstream by exact string match; no case or
# whitespace normalisation is applied here — confirm whether that is intended.

In [55]:
# Preview the first five rows of MOL after the column clean-up.
mol.head(n=5)

Unnamed: 0,term-or-expression,explicit-or-implicit,unit,en-contextual-label,en-hate-label,Meaning Sources
0,term,explicit,rotten,0.0,0,https://michaelis.uol.com.br/moderno-portugues...
1,term,explicit,fuckfest,1.0,0,https://michaelis.uol.com.br/moderno-portugues...
2,term,explicit,asshole,0.0,0,https://michaelis.uol.com.br/moderno-portugues...
3,expression,explicit,out on your ass,1.0,0,https://dictionary.cambridge.org/pt/dicionario...
4,term,explicit,degenerate,1.0,0,https://michaelis.uol.com.br/moderno-portugues...


In [54]:
# Check result: first rows of each lexicon, separated by a horizontal rule
divider = '-------------------------------------------------'
print(expanded_lexicon.head(), hurtlex.head(), mol.head(), sep=f"\n{divider}\n")

          1        unit
0  3.679601    horrible
1  3.493682  disgusting
2  3.469677       moron
3  3.399238     bastard
4  3.323882      stupid
-------------------------------------------------
       id pos category stereotype            unit         level
0  EN1382   n      qas         no        gag reel     inclusive
1  EN7077   a      cds         no          snotty  conservative
2  EN6856   n       is        yes       mendicant  conservative
3  EN5485   n       re         no         maffias  conservative
4  EN5024   n      cds         no  lying in trade  conservative
-------------------------------------------------
  term-or-expression explicit-or-implicit             unit  \
0               term             explicit           rotten   
1               term             explicit         fuckfest   
2               term             explicit          asshole   
3         expression             explicit  out on your ass   
4               term             explicit       degenerate   


In [56]:
# Distinct 'unit' values of each lexicon.
expanded_lexicon_words = set(expanded_lexicon['unit'])
hurtlex_words = set(hurtlex['unit'])
mol_words = set(mol['unit'])

# Pairwise overlaps; units count as shared only when they are exactly equal.
overlap_expanded_hurtlex = expanded_lexicon_words & hurtlex_words
overlap_expanded_mol = expanded_lexicon_words & mol_words
overlap_hurtlex_mol = hurtlex_words & mol_words

print(f"Number of overlapping words between expanded_lexicon and hurtlex: {len(overlap_expanded_hurtlex)}")
print(f"Number of overlapping words between expanded_lexicon and mol: {len(overlap_expanded_mol)}")
print(f"Number of overlapping words between hurtlex and mol: {len(overlap_hurtlex_mol)}")


Number of overlapping words between expanded_lexicon and hurtlex: 896
Number of overlapping words between expanded_lexicon and mol: 169
Number of overlapping words between hurtlex and mol: 126


# Create hate-label and convert / fill with no-hate(0) or hate(1)

In [72]:
# Print basic information for each dataframe, in the order: expanded_lexicon, hurtlex, mol
for frame_name, frame in (
    ('expanded_lexicon', expanded_lexicon),
    ('hurtlex', hurtlex),
    ('mol', mol),
):
    print_basic_info(frame, frame_name)

Information for dataframe: expanded_lexicon
----------------------------------------
Number of rows: 8478
Number of columns: 2
Column names: ['hate-label', 'unit']


Information for dataframe: hurtlex
----------------------------------------
Number of rows: 8228
Number of columns: 6
Column names: ['id', 'pos', 'category', 'stereotype', 'unit', 'level']


Information for dataframe: mol
----------------------------------------
Number of rows: 1011
Number of columns: 6
Column names: ['term-or-expression', 'explicit-or-implicit', 'unit', 'en-contextual-label', 'en-hate-label', 'Meaning Sources']




### 1. Expanded lexicon -> Inducing a Lexicon of Abusive Words – A Feature-Based Approach, Michael Wiegand

In [57]:
# Give the score column (integer label 1) the name 'hate-label'
expanded_lexicon = expanded_lexicon.rename({1: 'hate-label'}, axis='columns')

In [62]:
# Binarise the score: strictly positive -> hate (1), everything else -> no-hate (0)
expanded_lexicon['hate-label'] = expanded_lexicon['hate-label'].gt(0).astype(int)
# A negative n makes head() return every row except the last 10.
expanded_lexicon.head(-10)

Unnamed: 0,hate-label,unit
0,1,horrible
1,1,disgusting
2,1,moron
3,1,bastard
4,1,stupid
...,...,...
8463,0,downturn
8464,0,disappoint
8465,0,decline
8466,0,trouble


### 2. Hurtlex -> A Multilingual Lexicon of Words to Hurt.

In [111]:
# How many HurtLex entries fall into each 'level' value
value_counts_hurtlex = hurtlex.loc[:, 'level'].value_counts()
value_counts_hurtlex

level
inclusive       4868
conservative    3360
Name: count, dtype: int64

In [114]:
# Add the binary hate label: no-hate (0) or hate (1).
# TODO: THIS NEEDS TO BE CHANGED! '01010' is a placeholder value; any row whose
# category differs from it is labelled 1 (all rows shown in the preview are).
hurtlex['hate-label'] = hurtlex['category'].ne('01010').astype(int)

In [115]:
# Preview the first five rows of HurtLex with the new 'hate-label' column.
hurtlex.head(n=5)

Unnamed: 0,id,pos,category,stereotype,unit,level,hate-label
0,EN1382,n,qas,no,gag reel,inclusive,1
1,EN7077,a,cds,no,snotty,conservative,1
2,EN6856,n,is,yes,mendicant,conservative,1
3,EN5485,n,re,no,maffias,conservative,1
4,EN5024,n,cds,no,lying in trade,conservative,1


### 3. MOL -> Contextual-Lexicon Approach for Abusive Language Detection

In [134]:
# Check which label from the git read-me corresponds to no hate,
# by counting how often each 'en-hate-label' value occurs
value_counts = mol.loc[:, 'en-hate-label'].value_counts()
print(value_counts)
# Conclusion: 0 = no-hate

unit
0                         38
bitch                     10
faggot                    10
scammer                    8
jerk                       8
                          ..
even a big bitch           1
ideological garbage        1
drop dead                  1
mind your own business     1
worthlessness              1
Name: count, Length: 576, dtype: int64


In [107]:
# Add the binary hate label: no-hate (0) or hate (1).
# Compare numerically instead of against the string '0', so the result does not
# depend on whether pandas read 'en-hate-label' as text, int or float
# (an integer 0 is not equal to '0' and would otherwise be labelled as hate).
en_hate_numeric = pd.to_numeric(mol['en-hate-label'], errors='coerce')
# Everything that is not exactly 0 — including missing or non-numeric entries — is labelled 1.
# NOTE(review): confirm that rows without a usable 'en-hate-label' should count as hate.
mol['hate-label'] = np.where(en_hate_numeric == 0, 0, 1)

Unnamed: 0,term-or-expression,explicit-or-implicit,unit,en-contextual-label,en-hate-label,Meaning Sources,hate-label
0,term,explicit,rotten,0.0,0,https://michaelis.uol.com.br/moderno-portugues...,0
1,term,explicit,fuckfest,1.0,0,https://michaelis.uol.com.br/moderno-portugues...,0
2,term,explicit,asshole,0.0,0,https://michaelis.uol.com.br/moderno-portugues...,0
3,expression,explicit,out on your ass,1.0,0,https://dictionary.cambridge.org/pt/dicionario...,0
4,term,explicit,degenerate,1.0,0,https://michaelis.uol.com.br/moderno-portugues...,0
...,...,...,...,...,...,...,...
95,term,explicit,dumb,1.0,0,https://michaelis.uol.com.br/moderno-portugues...,0
96,expression,explicit,shut up,1.0,0,https://www.dicionarioinformal.com.br/cala%20a...,0
97,expression,explicit,shut up,1.0,0,https://dictionary.cambridge.org/pt/dicionario...,0
98,expression,explicit,shut the hell up,1.0,0,https://dictionary.cambridge.org/pt/dicionario...,0


# Merge 3 dataframes to our final dataframe

In [133]:
# Number of missing 'unit' values per lexicon, printed in the order: expanded_lexicon, hurtlex, mol
for lexicon in (expanded_lexicon, hurtlex, mol):
    print(lexicon['unit'].isna().sum())


0
0
12


In [132]:
# Stack the ('unit', 'hate-label') columns of the three lexicons on top of each other;
# each frame keeps its own row index, so index values repeat in the result.
all_data = pd.concat(
    [lexicon[['unit', 'hate-label']] for lexicon in (expanded_lexicon, hurtlex, mol)]
)

# Function to apply the majority vote, with a random decision only on exact ties
def decide_label(group):
    """Collapse all hate-label votes collected for one unit into a single label.

    A unit can appear any number of times, so the decision is based on the vote
    counts rather than on fixed group sizes. Two agreeing entries return their
    shared label, and an even split in a larger group is treated as a tie.

    Parameters
    ----------
    group : pandas.Series
        The 'hate-label' values of one unit; assumed to contain only 0 and 1.

    Returns
    -------
    int
        1 if the 1-votes outnumber the 0-votes, 0 if the 0-votes outnumber the
        1-votes, otherwise a random pick between 0 and 1.

    Raises
    ------
    ValueError
        If ``group`` is empty.
    """
    total_votes = len(group)
    if total_votes == 0:
        raise ValueError("decide_label() received an empty group")

    hate_votes = int(group.sum())
    no_hate_votes = total_votes - hate_votes

    if hate_votes > no_hate_votes:
        return 1
    if no_hate_votes > hate_votes:
        return 0
    # Exact tie: fall back to a coin flip (uses NumPy's global random state).
    return int(np.random.choice([0, 1]))

# decide_label breaks ties with np.random.choice; seed NumPy's global RNG first so
# that re-running the notebook yields the same merged labels every time.
np.random.seed(42)

# Group by 'unit' and apply the decision function.
# groupby skips rows whose 'unit' is NaN by default, so they do not reach `merged`.
merged = all_data.groupby('unit')['hate-label'].agg(decide_label).reset_index()

# NOTE: a notebook cell only renders its last expression, so merged.head() produces
# no output here; the cell displays all_data.
merged.head()
all_data

Unnamed: 0,unit,hate-label
0,horrible,1
1,disgusting,1
2,moron,1
3,bastard,1
4,stupid,1
...,...,...
1006,,1
1007,,1
1008,,1
1009,,1
