In [1]:
import pandas as pd

In [2]:
# Run from the repository root: the data/ paths used below are resolved
# relative to the current working directory.
import os

# Only step up one level when data/ is not already visible, so re-running this
# cell (or launching the notebook from the root) does not walk out of the project.
if not os.path.isdir("data"):
    os.chdir("..")

In [3]:
import config

In [4]:
# Lookup tables taken from the project config module:
#   index_map  - applied to a global_label to obtain the binary_label
#   config_map - per-dataset settings, keyed by dataset name
index_map = config.index_class_mapping
config_map = config.dataset_category_mapping

# HCI Harms

In [15]:
# NOTE(review): placeholder only - hci_harms_df is rebuilt from scratch by the
# pd.concat cell further down, so this empty frame is never read.
hci_harms_df = pd.DataFrame()

In [6]:
#student anxiety dataset
#unsafe: every row labelled 1 (about 730 per the original note), safe: 5500 sampled

df_1_name = "anxiety"
student_anxiety = pd.read_csv("data/HCI_harms/anxiety.csv")
student_anxiety['global_label'] = student_anxiety['label'].map(config_map[df_1_name]['column_mapping'])
# For this dataset the raw 'label' column is used directly as the binary label.
student_anxiety = student_anxiety.rename(columns={'label': 'binary_label'})
student_anxiety = student_anxiety[['text', 'binary_label', 'global_label']]

# Keep every unsafe row and down-sample the safe rows. The fixed seed makes a
# fresh Restart & Run All draw the same sample every time.
anxiety_unsafe = student_anxiety[student_anxiety['binary_label'] == 1]
anxiety_safe = student_anxiety[student_anxiety['binary_label'] == 0].sample(n=5500, random_state=42)


In [11]:
#suicide detection dataset
#unsafe: 9300 sampled, safe: 5500 sampled
df_2_name = "suicide"
suicide = pd.read_csv("data/HCI_harms/suicide.csv")
suicide['global_label'] = suicide['class'].map(config_map[df_2_name]['column_mapping'])
# binary_label is derived from the global label through index_map.
suicide['binary_label'] = suicide['global_label'].map(index_map)
suicide = suicide[['text', 'binary_label', 'global_label']]

# Seeded so that the sampled subsets are identical across kernel restarts.
suicide_unsafe = suicide[suicide['binary_label'] == 1].sample(n=9300, random_state=42)
suicide_safe = suicide[suicide['binary_label'] == 0].sample(n=5500, random_state=42)



In [12]:
hci_harms_df = pd.concat([anxiety_unsafe, anxiety_safe, suicide_unsafe, suicide_safe])

In [13]:
tmp2 = hci_harms_df.drop_duplicates(subset='text')

In [14]:
tmp2['binary_label'].value_counts()

0.0    10951
1.0    10012
Name: binary_label, dtype: int64

In [16]:
tmp2.head(1)

Unnamed: 0,text,binary_label,global_label
0,oh my gosh,1.0,7


In [19]:
# Balance the de-duplicated frame at 10000 rows per class. The seed keeps the
# exported CSV reproducible across runs.
t1 = tmp2[tmp2['binary_label'] == 0].sample(n=10000, random_state=42)
t2 = tmp2[tmp2['binary_label'] == 1].sample(n=10000, random_state=42)

df = pd.concat([t1, t2])

In [20]:
df['binary_label'].value_counts()

0.0    10000
1.0    10000
Name: binary_label, dtype: int64

In [21]:
df.to_csv("data/hci_harms_df.csv", index=False)

# Malicious Uses

In [5]:
def generate_df(*args, n_per_class=10000, random_state=None):
    """Build a class-balanced, de-duplicated frame from several labelled frames.

    Parameters
    ----------
    *args : tuple of (DataFrame, int, int)
        Each positional argument is ``(df, safe_sample, unsafe_sample)``.
        ``df`` must have ``text`` and ``binary_label`` columns; ``safe_sample``
        rows are drawn where ``binary_label == 0`` and ``unsafe_sample`` rows
        where ``binary_label == 1``.
    n_per_class : int, default 10000
        Number of rows kept per class after duplicates on ``text`` are dropped.
    random_state : optional
        Forwarded unchanged to every ``DataFrame.sample`` call. The default
        ``None`` keeps the sampling unseeded; pass an int for a reproducible result.

    Returns
    -------
    DataFrame
        ``n_per_class`` rows with ``binary_label == 0`` followed by
        ``n_per_class`` rows with ``binary_label == 1``.

    Raises
    ------
    ValueError
        If no inputs are given, or if any requested sample is larger than the
        number of rows available for that class (raised by pandas).
    """
    pieces = []
    for df, safe_sample, unsafe_sample in args:
        pieces.append(df[df['binary_label'] == 0].sample(n=safe_sample, random_state=random_state))
        pieces.append(df[df['binary_label'] == 1].sample(n=unsafe_sample, random_state=random_state))

    # Concatenate once rather than growing a frame inside the loop.
    new_df = pd.concat(pieces).drop_duplicates(subset='text')
    # Show how many rows per class survived de-duplication, before balancing.
    print(new_df['binary_label'].value_counts())

    t1 = new_df[new_df['binary_label'] == 0].sample(n=n_per_class, replace=False, random_state=random_state)
    t2 = new_df[new_df['binary_label'] == 1].sample(n=n_per_class, replace=False, random_state=random_state)

    return pd.concat([t1, t2])

In [6]:
config_map['fake_news']

{'column_text': 'content',
 'column_label': 'binary_label',
 'column_mapping': {1: 18}}

In [7]:
def process_df(df, df_name):
    """Normalise a raw dataset frame to the columns text / binary_label / global_label.

    The entry ``config_map[df_name]`` supplies the name of the label column,
    the name of the text column, and the mapping from raw label values to
    global labels. The binary label is then obtained by mapping each global
    label through ``index_map``. The input frame is not modified.
    """
    settings = config_map[df_name]
    label_col = settings['column_label']
    text_col = settings['column_text']
    to_global = settings['column_mapping']

    # Compute the global labels from the untouched input before any column is
    # overwritten (the label column may itself be named 'binary_label').
    global_labels = df[label_col].map(to_global)

    out = df.copy()
    out['global_label'] = global_labels
    out['binary_label'] = global_labels.map(index_map)
    out = out.rename(columns={text_col: "text"})

    return out[['text', 'binary_label', 'global_label']]

In [78]:
# Load the cyberbullying data and normalise it to text / binary_label / global_label.
df_name = "cyberbullying"
bully = process_df(pd.read_csv("data/malicious_uses/cyberbullying.csv"), df_name)


In [79]:
sus_act = pd.read_csv("data/malicious_uses/suspicious_activity.csv")
sus_act = process_df(sus_act, "suspicious_activity")

In [75]:
bully['binary_label'].value_counts()

0    391223
1     57651
Name: binary_label, dtype: int64

In [80]:
mal_df = generate_df((bully, 5500, 5500), (sus_act, 5500, 5500))

In [81]:
mal_df['binary_label'].value_counts()

0    10000
1    10000
Name: binary_label, dtype: int64

In [82]:
mal_df  = mal_df.drop_duplicates(subset='text')

In [83]:
len(mal_df)

20000

In [84]:
mal_df.to_csv("data/malicious_activity.csv", index=False)

# Discrimination, Exclusion, Toxicity

In [85]:
adult = pd.read_excel("data/disc_excl_toxicity/adult_data.xlsx")
adult = process_df(adult, "adult_content")

In [86]:
hate = pd.read_csv("data/disc_excl_toxicity/hate_speech.csv")
hate = process_df(hate, "hate_speech")

In [87]:
hate['binary_label'].value_counts()

0    80624
1    46021
Name: binary_label, dtype: int64

In [88]:
toxigen = pd.read_csv("data/disc_excl_toxicity/toxigen.csv")
toxigen = process_df(toxigen, "toxigen")

In [90]:
adult['binary_label'].value_counts()

0    512
1    338
Name: binary_label, dtype: int64

In [91]:
hate['binary_label'].value_counts()

0    80624
1    46021
Name: binary_label, dtype: int64

In [92]:
toxigen['binary_label'].value_counts()

1    125672
0    125279
Name: binary_label, dtype: int64

In [98]:
disc_df = generate_df((adult, 512, 338), (hate, 7800,8900), (toxigen,7800,8900))

0    11684
1    10211
Name: binary_label, dtype: int64


In [99]:
disc_df['binary_label'].value_counts()

0    10000
1    10000
Name: binary_label, dtype: int64

In [100]:
disc_df.to_csv("data/discrimination.csv", index=False)

# Misinformation

In [8]:
covid = pd.read_csv("data/misinformation/covid_fake_news_data.csv")
covid = process_df(covid, "covid_fake_news")

In [9]:
true_false = pd.read_csv("data/misinformation/true_false.csv")
true_false = process_df(true_false, "true_false")


In [10]:
fake = pd.read_csv("data/misinformation/fake_news.csv")
fake = process_df(fake, "fake_news")

In [11]:
covid['binary_label'].value_counts()

0    4064
1     397
Name: binary_label, dtype: int64

In [12]:
true_false['binary_label'].value_counts()

1    9929
0    9733
Name: binary_label, dtype: int64

In [13]:
fake.head(1)

Unnamed: 0,text,binary_label,global_label
0,Donald Trump Sends Out Embarrassing New Year’...,1,18


In [14]:
fake['binary_label'].value_counts()

1    17908
Name: binary_label, dtype: int64

In [15]:
misinfo = generate_df((covid, 4064,397), (true_false,9700,9900), (fake, 0, 10000))

1    18246
0    11594
Name: binary_label, dtype: int64


In [16]:
misinfo['binary_label'].value_counts()

0    10000
1    10000
Name: binary_label, dtype: int64

In [17]:
# index=False matches the other exports in this notebook and keeps the
# leftover row index out of the CSV as a stray unnamed column.
misinfo.to_csv("data/misinfo.csv", index=False)