## Import Libraries

In [30]:
import os
import glob
import pandas as pd
from tqdm import tqdm

In [31]:
DISORDERS = ['agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'anxiety']

In [32]:
# *** Reddit ***
reddit_df = pd.DataFrame()

for disorder_file_name in tqdm(glob.glob("../Data/Anxiety_Detection_Data/Reddit/*")):
    disorder_df_temp = pd.read_excel(disorder_file_name)
    disorder_df = pd.DataFrame({
        "text": disorder_df_temp['Translated_Text'],
        "labels": os.path.basename(disorder_file_name).split('.')[0],
        "source": "Reddit"
    })

    if len(reddit_df) != 0:
        reddit_df = pd.concat([reddit_df, disorder_df], axis=0).reset_index(drop=True)

    else:
        reddit_df = disorder_df.copy()

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 11.93it/s]


In [33]:
# *** Eksi ***
eksi_df = pd.DataFrame()

for disorder_file_name in tqdm(glob.glob("../Data/Anxiety_Detection_Data/Eksi/*")):
    disorder_df_temp = pd.read_excel(disorder_file_name)
    disorder_df = pd.DataFrame({
        "text": disorder_df_temp['Text'],
        "labels": os.path.basename(disorder_file_name).split('.')[0],
        "source": "Eksi"
    })

    if len(eksi_df) != 0:
        eksi_df = pd.concat([eksi_df, disorder_df], axis=0).reset_index(drop=True)

    else:
        eksi_df = disorder_df.copy()

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 26.56it/s]


In [34]:
# *** Youtube ***
youtube_df = pd.read_excel("../Data/Anxiety_Detection_Data/Youtube/anxiety_youtube_data.xlsx")
youtube_df = youtube_df.rename(columns = {"Text": "text", "Disorder": "labels"})
youtube_df['source'] = "Youtube"

## Concat

In [56]:
total_df = pd.concat([reddit_df, eksi_df, youtube_df], axis = 0).reset_index(drop=True)
total_df

Unnamed: 0,text,labels,source
0,"Sıkışmış hissetmek (yerine tekrar giriş yok, y...",agoraphobia,Reddit
1,Yakın zamanda başka bir şehre taşındım ve nele...,agoraphobia,Reddit
2,"Panik atak geçirmenin eşiğindeydim, sadece bun...",agoraphobia,Reddit
3,Geçen hafta maruz kalma terapimi yaparken soka...,agoraphobia,Reddit
4,Bazen özgüven ve benlik imajıyla çok mücadele ...,agoraphobia,Reddit
...,...,...,...
19139,Benim kızım 7 yaşlnda anaokuluna göderdim öğre...,selectivemutism,Youtube
19140,Merhaba oğlum da anasınıfına gidiyor kimseyle ...,selectivemutism,Youtube
19141,benim de 5 buçuk yaşında oğlum aile bireyleri ...,selectivemutism,Youtube
19142,merhaba benim oğlumda okulda çocuklarla sohbet...,selectivemutism,Youtube


In [36]:
# total_df.to_excel("../Data/Anxiety_Detection_Data/total_df.xlsx", index=False)
# print("Done")

In [37]:
total_df['labels'].value_counts()

labels
socialanxiety      8752
anxiety            3422
agoraphobia        2478
selectivemutism    1845
panic              1394
phobia             1253
Name: count, dtype: int64

## Convert them to Multi Label

In [57]:
total_df = total_df[total_df['labels'] != 'anxiety'] # not getting anxiety (optional)

In [59]:
one_hot_encoded = pd.get_dummies(total_df['labels'])

concat_df = pd.concat([total_df['text'], total_df['source'], one_hot_encoded], axis=1)
concat_df = concat_df.dropna().reset_index(drop=True)

concat_df.iloc[:, 2:] = concat_df.iloc[:, 2:].astype(int)

concat_df['all_labels'] = concat_df.apply(lambda row: [label for label in one_hot_encoded.columns if row[label] == 1], axis=1)
concat_df = concat_df.loc[:, ['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels', 'source']]

In [60]:
concat_df.columns

Index(['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism',
       'socialanxiety', 'all_labels', 'source'],
      dtype='object')

## Push to Hub

In [61]:
from datasets import Dataset, DatasetDict
Dataset.cleanup_cache_files
from sklearn.model_selection import train_test_split

train, test = train_test_split(concat_df, test_size = 0.23, random_state=42)

train_data = Dataset.from_pandas(train, preserve_index=False)
test_data = Dataset.from_pandas(test, preserve_index=False)

hg_data = DatasetDict({
    "train": train_data,
    "test": test_data
})

In [62]:
hg_data

DatasetDict({
    train: Dataset({
        features: ['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels', 'source'],
        num_rows: 12105
    })
    test: Dataset({
        features: ['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels', 'source'],
        num_rows: 3616
    })
})

In [63]:
!huggingface-cli login --token=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

hg_data.push_to_hub("halilibr/dilbazlar-anxiety-disorders-recognition-not-augmented-not-anxiety-tr-dataset")
print("Data was pushed :)")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Data was pushed :)


In [64]:
hg_data.save_to_disk('../Data/Anxiety_Detection_Data/anxiety_not_augmented__not_anxiety_hg_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/12105 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3616 [00:00<?, ? examples/s]