In [1]:
import pandas as pd

In [2]:
# Run from the root directory: every relative path below ("data/...") and the
# `config` import are resolved against the parent of the kernel's start directory.
import os

# Remember where the kernel started so that re-running this cell is idempotent.
# A bare os.chdir("..") would climb one level higher on every re-run.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [3]:
import config

In [4]:
# Per-dataset settings keyed by dataset name; the cells below read the
# 'column_label', 'column_text' and 'column_mapping' entries from it.
config_map = config.dataset_category_mapping
# Lookup applied to 'global_label' values to derive 'binary_label'
# (0 / 1 as used by the filters below).
index_map  = config.index_class_mapping

# HCI Harms

In [2]:
# Placeholder only: this empty frame is overwritten by the pd.concat(...) result in a later cell.
hci_harms_df = pd.DataFrame()

In [18]:
# Student anxiety dataset.
# Target composition -> unsafe: 730 (every unsafe row is kept), safe: 5000 (sampled).
df_1_name = "anxiety"
student_anxiety = pd.read_csv("data/HCI_harms/anxiety.csv")
# Derive the shared-taxonomy label from the raw `label` column before it is renamed.
student_anxiety['global_label'] = student_anxiety['label'].map(config_map[df_1_name]['column_mapping'])
# The raw `label` column is used as-is for the binary label (1 = unsafe, 0 = safe).
student_anxiety = student_anxiety.rename(columns={'label': 'binary_label'})
student_anxiety = student_anxiety[['text', 'binary_label', 'global_label']]

# Keep all unsafe rows.
anxiety_unsafe = student_anxiety[student_anxiety['binary_label'] == 1]

# Down-sample the safe rows; the fixed seed makes Restart & Run All reproduce
# exactly the same subset.
anxiety_safe = student_anxiety[student_anxiety['binary_label'] == 0].sample(n=5000, random_state=42)


In [20]:
# Suicide detection dataset.
# Target composition -> unsafe: 9270, safe: 5000 (both sampled).
df_2_name = "suicide"
suicide = pd.read_csv("data/HCI_harms/suicide.csv")
# Raw `class` values -> shared-taxonomy label -> binary label.
suicide['global_label'] = suicide['class'].map(config_map[df_2_name]['column_mapping'])
suicide['binary_label'] = suicide['global_label'].map(index_map)
suicide = suicide[['text', 'binary_label', 'global_label']]

# Fixed seeds so the drawn subsets are identical on every full re-run.
suicide_unsafe = suicide[suicide['binary_label'] == 1].sample(n=9270, random_state=42)
suicide_safe = suicide[suicide['binary_label'] == 0].sample(n=5000, random_state=42)



In [22]:
# Stack the four HCI-harms subsets; the original row labels are kept as-is.
hci_harms_df = pd.concat(
    [
        anxiety_unsafe,
        anxiety_safe,
        suicide_unsafe,
        suicide_safe,
    ]
)

In [23]:
# Sanity check on the combined size: 730 + 5000 + 9270 + 5000 = 20000 rows expected.
len(hci_harms_df)

20000

In [25]:
# Persist the combined HCI-harms sample; index=False keeps the row index out of the CSV.
hci_harms_df.to_csv("data/hci_harms_df.csv", index=False)

# Malicious Uses

In [5]:
def generate_df(*args, random_state=None):
    """Draw fixed-size safe/unsafe samples from several frames and stack them.

    Parameters
    ----------
    *args : tuple
        Each positional argument is a ``(df, safe_sample, unsafe_sample)``
        triple. ``df`` must have a ``binary_label`` column; ``safe_sample``
        rows are drawn from ``binary_label == 0`` and ``unsafe_sample`` rows
        from ``binary_label == 1``, without replacement.
    random_state : optional
        Forwarded unchanged to every ``DataFrame.sample`` call. The default
        ``None`` keeps the original unseeded behaviour; pass an int for a
        reproducible draw.

    Returns
    -------
    pandas.DataFrame
        The draws concatenated in argument order (safe rows before unsafe
        rows for each frame), with the source row labels preserved. An empty
        DataFrame when called with no arguments.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a requested size exceeds the
        number of available rows in that class.
    """
    # Collect every draw first and concatenate once: growing a frame with
    # pd.concat inside the loop is quadratic, and seeding it with an empty
    # DataFrame relies on pandas' deprecated handling of empty concat entries.
    pieces = []
    for df, safe_sample, unsafe_sample in args:
        labels = df['binary_label']
        pieces.append(df[labels == 0].sample(n=safe_sample, random_state=random_state))
        pieces.append(df[labels == 1].sample(n=unsafe_sample, random_state=random_state))

    if not pieces:
        # pd.concat rejects an empty list; mirror the original no-argument result.
        return pd.DataFrame()

    return pd.concat(pieces)

In [7]:
def process_df(df, df_name):
    """Return a copy of ``df`` reduced to the common three-column layout.

    The settings for ``df_name`` are read from ``config_map``: the column
    named by ``'column_label'`` is mapped through ``'column_mapping'`` to
    produce ``global_label``; ``global_label`` is mapped through
    ``index_map`` to produce ``binary_label``; and the column named by
    ``'column_text'`` is renamed to ``text``. The input frame is not modified.

    Returns a DataFrame with exactly the columns
    ``['text', 'binary_label', 'global_label']``.
    """
    settings = config_map[df_name]
    source_label = settings['column_label']
    source_text = settings['column_text']

    global_labels = df[source_label].map(settings['column_mapping'])

    return (
        df.assign(
            global_label=global_labels,
            binary_label=global_labels.map(index_map),
        )
        .rename(columns={source_text: "text"})
        [['text', 'binary_label', 'global_label']]
    )

In [39]:
# Cyberbullying dataset, reduced to text / binary_label / global_label.
# The dataset key is defined once and reused, instead of repeating the literal.
df_name = "cyberbullying"
bully = pd.read_csv("data/malicious_uses/cyberbullying.csv")
bully = process_df(bully, df_name)


In [41]:
# Suspicious-activity dataset, loaded and reduced to the common layout in one step.
sus_act = (
    pd.read_csv("data/malicious_uses/suspicious_activity.csv")
    .pipe(process_df, "suspicious_activity")
)

In [45]:
# Each spec is (frame, n_safe, n_unsafe).
# Totals: safe 5000 + 5000 = 10000, unsafe 5000 + 5000 = 10000.
mal_df = generate_df(
    (bully, 5000, 5000),
    (sus_act, 5000, 5000),
)

In [48]:
# Persist the malicious-uses sample; index=False keeps the row index out of the CSV.
mal_df.to_csv("data/malicious_activity.csv", index=False)

# Discrimination, Exclusion, Toxicity

In [8]:
# Adult-content dataset (Excel source), loaded and reduced to the common layout.
adult = (
    pd.read_excel("data/disc_excl_toxicity/adult_data.xlsx")
    .pipe(process_df, "adult_content")
)

In [9]:
# Hate-speech dataset, loaded and reduced to the common layout.
hate = (
    pd.read_csv("data/disc_excl_toxicity/hate_speech.csv")
    .pipe(process_df, "hate_speech")
)

In [10]:
# Class balance of the processed hate-speech frame (row counts per binary_label value).
hate['binary_label'].value_counts()

0    80624
1    46021
Name: binary_label, dtype: int64

In [11]:
# ToxiGen dataset, loaded and reduced to the common layout.
toxigen = (
    pd.read_csv("data/disc_excl_toxicity/toxigen.csv")
    .pipe(process_df, "toxigen")
)

In [12]:
# Each spec is (frame, n_safe, n_unsafe).
# Totals: safe 512 + 4744 + 4744 = 10000, unsafe 338 + 4831 + 4831 = 10000.
disc_df = generate_df(
    (adult, 512, 338),
    (hate, 4744, 4831),
    (toxigen, 4744, 4831),
)

In [14]:
# Verify the combined sample is balanced: 10000 rows expected per binary_label value.
disc_df['binary_label'].value_counts()

0    10000
1    10000
Name: binary_label, dtype: int64

In [15]:
# Persist the discrimination/exclusion/toxicity sample; index=False keeps the row index out of the CSV.
disc_df.to_csv("data/discrimination.csv", index=False)

# Misinformation

In [17]:
# COVID fake-news dataset, loaded and reduced to the common layout.
covid = (
    pd.read_csv("data/misinformation/covid_fake_news_data.csv")
    .pipe(process_df, "covid_fake_news")
)

In [19]:
# True/false news dataset, loaded and reduced to the common layout.
true_false = (
    pd.read_csv("data/misinformation/true_false.csv")
    .pipe(process_df, "true_false")
)


In [20]:
# Each spec is (frame, n_safe, n_unsafe).
# Totals: safe 4064 + 5936 = 10000, unsafe 397 + 9603 = 10000.
misinfo = generate_df(
    (covid, 4064, 397),
    (true_false, 5936, 9603),
)

In [21]:
# Verify the combined sample is balanced: 10000 rows expected per binary_label value.
misinfo['binary_label'].value_counts()

0    10000
1    10000
Name: binary_label, dtype: int64

In [28]:
# Persist the misinformation sample. index=False matches the other three exports:
# without it the leftover row index from sampling is written as an extra unnamed column.
misinfo.to_csv("data/misinfo.csv", index=False)