In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

import langid
from sklearn.model_selection import StratifiedKFold

In [2]:
data_path = '../data/'

In [3]:
dataset = '../data/new_all_DEFR_comments_27062022.csv'
expert_dataset = '../data/experts_for_annots_16062023_master.xlsx'

In [4]:
# Read the data
df_raw_all = pd.read_csv(dataset)
df_raw_expert = pd.read_excel(expert_dataset)

df_raw_all.shape, df_raw_expert.shape

  df_raw_all = pd.read_csv(dataset)


((422046, 16), (500, 16))

In [5]:
df_raw_expert.columns

Index(['ArticleID', 'ID', 'Titel', 'Text', 'Kommentar', 'Hate Speech_KD',
       'Target Group_KD', 'Hate Speech_FG', 'Target Group_FG',
       'Hate Speech_SK', 'Target Group_SK', 'Initial Agreement',
       'Konsensus HS', 'Konsensus Target 1', 'Unnamed: 14', 'Difficult case?'],
      dtype='object')

In [6]:
df_raw_all.columns[2:12]

Index(['geschlecht', 'alter', 'sexualitaet', 'religion', 'nationalitaet',
       'beeintraechtigung', 'sozialer_status', 'politik', 'aussehen',
       'andere'],
      dtype='object')

In [7]:
df_raw_expert['Initial Agreement'].unique()

array(['full', 'partial_HS', 'partial_toxic'], dtype=object)

In [8]:
df_raw_expert['Hate Speech_SK'].sum(), df_raw_expert['Hate Speech_FG'].sum(), df_raw_expert['Hate Speech_KD'].sum()

(255, 220, 225)

In [9]:
# Remove data without labels, text and drop duplicates
df_cleaned_all = df_raw_all[~df_raw_all.kommentar_original.isna()]
df_cleaned_all = df_cleaned_all[~df_cleaned_all['hatespeech'].isna()]
df_cleaned_all = df_cleaned_all.drop_duplicates(subset=['kommentar_original'])

df_cleaned_expert = df_raw_expert[~df_raw_expert.Kommentar.isna()]
df_cleaned_expert = df_cleaned_expert.drop_duplicates(subset='Kommentar')

df_cleaned_all.shape, df_cleaned_expert.shape

((417454, 16), (500, 16))

In [10]:
# Remove expert labels from potential training data
df_cleaned_all = df_cleaned_all[~df_cleaned_all.progress_apply(lambda x: df_cleaned_expert.Kommentar.isin([x['kommentar_original']]).any(), axis=1)]
df_cleaned_all.shape

100%|█████████████████████████████████████████████████████████████████████████| 417454/417454 [00:51<00:00, 8038.09it/s]


(416973, 16)

In [12]:
df_cleaned_expert = df_cleaned_expert.rename(columns={'Kommentar' : 'kommentar_original'})
df_cleaned_expert['hatespeech'] = df_cleaned_expert['Konsensus HS']

In [13]:
df_cleaned_all['general_toxic'] = df_cleaned_all['hatespeech'] | df_cleaned_all['toxic_new']

In [14]:
# Determine language
langid.set_languages(['fr', 'de'])
df_cleaned_all['language'] = df_cleaned_all["kommentar_original"].progress_apply(lambda x :langid.classify(x)[0])
df_cleaned_expert['language'] = df_cleaned_expert["kommentar_original"].progress_apply(lambda x :langid.classify(x)[0])

100%|█████████████████████████████████████████████████████████████████████████| 416973/416973 [01:25<00:00, 4866.73it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 4618.78it/s]


In [15]:
# Split df based on languages
df_german = df_cleaned_all[df_cleaned_all['language'] == 'de']
df_french = df_cleaned_all[df_cleaned_all['language'] == 'fr']

In [24]:
# Function which creates a split 80% Train, 10% Val and 10% Test stratified split

def get_train_val_test_split(df, random_state=42):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state*random_state)
    for i, (train_index, other_index) in enumerate(skf.split(df, df['hatespeech'].astype(int))):
        df_train = df.iloc[train_index,:]
        df_other = df.iloc[other_index,:]
        skf_new = StratifiedKFold(n_splits=2, shuffle=True, random_state=random_state)
        for i, (val_index, test_index) in enumerate(skf_new.split(df_other, df_other['hatespeech'].astype(int))):
            df_val = df_other.iloc[val_index,:]
            df_test = df_other.iloc[test_index,:]
            break
        break
    return df_train, df_val, df_test

In [25]:
def generate_split(df, random_state=42):
    # split language datasets into 80% Train, 10% Val and 10% Test stratified split
    df_german_train, df_german_val, df_german_test = get_train_val_test_split(df_german, random_state=random_state)
    df_french_train, df_french_val, df_french_test = get_train_val_test_split(df_french, random_state=random_state)

    # Combine language data into a stratified split according to language and label
    df_train = pd.concat([df_german_train, df_french_train])
    df_val = pd.concat([df_german_val, df_french_val])
    df_test = pd.concat([df_german_test, df_french_test])

    df_train.to_csv(data_path + f'train_{random_state}.csv')
    df_val.to_csv(data_path + f'val_{random_state}.csv')
    df_test.to_csv(data_path + f'test_{random_state}.csv')
    pd.concat([df_train, df_val]).to_csv(data_path + f'trainval_{random_state}.csv')

In [26]:
generate_split(df_cleaned_all, random_state=42)
generate_split(df_cleaned_all, random_state=43)
generate_split(df_cleaned_all, random_state=44)

In [22]:
df_cleaned_expert.to_csv(data_path + 'expert.csv')

((333577, 18), (41697, 18), (41699, 18))

In [23]:
df_train.to_csv(data_path + 'train.csv')
df_val.to_csv(data_path + 'val.csv')
df_test.to_csv(data_path + 'test.csv')
df_cleaned_expert.to_csv(data_path + 'expert.csv')
pd.concat([df_train, df_val]).to_csv(data_path + 'trainval.csv')

In [37]:
def get_ratio(df, column='hatespeech'):
    return (1 - df[column]).sum() / df[column].sum()

In [38]:
get_ratio(df_train), get_ratio(df_val), get_ratio(df_test), get_ratio(df_cleaned_expert)

(4.614545638159998, 4.615001346619984, 4.613758750673129, 1.0661157024793388)

In [26]:
df_test = pd.read_csv('../data/test.csv')

In [27]:
df_test.shape

(41699, 19)

In [28]:
import pickle

with open("../data/outputs/e5_preds.pkl", 'rb') as f:
    tmp = pickle.load(f)

In [29]:
df_test.loc[tmp['idx'],'labels'] = tmp['labels']
(df_test['labels'] == df_test['hatespeech'].astype(float)).all()

True

In [30]:
df_test.loc[tmp['idx'],'preds'] = tmp['predictions']

In [31]:
df_test.loc[tmp['idx'],'neg_prob'] = tmp['probabilities'][:,0]
df_test.loc[tmp['idx'],'pos_prob'] = tmp['probabilities'][:,1]

In [32]:
from sklearn.metrics import f1_score

In [39]:
f1_score(df_test['preds'], df_test['labels'], average='macro')

0.7363224880480469

In [40]:
df_train = pd.read_csv('../data/train.csv')
df_val = pd.read_csv('../data/val.csv')

  df_train = pd.read_csv('../data/train.csv')


In [42]:
get_ratio(df_train, 'general_toxic'), get_ratio(df_val, 'general_toxic'), get_ratio(df_test, 'general_toxic'), get_ratio(pd.concat([df_train, df_val]), 'general_toxic')

(3.28074430542188, 3.2656777493606137, 3.3130947455523376, 3.2790649942987455)

In [36]:
df_train['general_toxic']

0          True
1          True
2          True
3         False
4         False
          ...  
333572    False
333573    False
333574    False
333575    False
333576    False
Name: general_toxic, Length: 333577, dtype: bool

In [16]:
import pandas as pd
df_val = pd.read_csv('../data/val.csv')

In [3]:
from data.nli_dataset import get_dataloader
import data.dataset

In [4]:
tmp = get_dataloader(
    model_name='MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7', 
    data_path='../data/', 
    dataset_name='test.csv',
    use_cache=False,
    batch_size=32,
    is_test=True
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 7428/7428 [00:02<00:00, 2616.10it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/74280 [00:00<?, ? examples/s]

In [5]:
for x in tmp:
    cool = x
    break

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [6]:
cool.items()

dict_items([('labels', tensor([1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
        1, 1, 1, 1, 0, 1, 1, 1])), ('input_ids', tensor([[     1,    260,   5551,  ...,      0,      0,      0],
        [     1,    500, 143664,  ...,      0,      0,      0],
        [     1,    828,    327,  ...,      0,      0,      0],
        ...,
        [     1,    260, 211927,  ...,      0,      0,      0],
        [     1,    351,    260,  ...,      0,      0,      0],
        [     1,    260,  82480,  ...,      0,      0,      0]])), ('attention_mask', tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]))])

In [7]:
tmp1= data.dataset.get_dataloader(
    model_name='MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7', 
    data_path='../data/', 
    dataset_name='test.csv',
    use_cache=False,
    batch_size=32,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/41634 [00:00<?, ? examples/s]

In [8]:
d = {}
for x in tmp1:
    for i in x['labels']:
        d[i] = 0

print(d)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{tensor(1): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(1): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(1): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(1): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(1): 0, tensor(0): 0, tensor(0): 0, tensor(1): 0, tensor(0): 0, tensor(0): 0, tensor(1): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(1): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(0): 0, tensor(1): 0, tenso

In [24]:
cool['input_ids']

tensor([[    1,  1219,  1144,  ...,     0,     0,     0],
        [    1,   289,  2035,  ...,     0,     0,     0],
        [    1, 15680,   260,  ...,     0,     0,     0],
        ...,
        [    1, 69944,   260,  ...,     0,     0,     0],
        [    1,   384,  1854,  ...,     0,     0,     0],
        [    1,  3231, 16224,  ...,     0,     0,     0]])

In [25]:
cool1['input_ids']

tensor([[     1,    629,    260,  ...,      0,      0,      0],
        [     1,    399,  17459,  ...,      0,      0,      0],
        [     1,    900,   9200,  ...,      0,      0,      0],
        ...,
        [     1,    433,    828,  ...,      0,      0,      0],
        [     1, 223360,   6541,  ...,      0,      0,      0],
        [     1,  36101,    266,  ...,      0,      0,      0]])