# Create Augmented Data

Bu notebook, mevcut veri kümesine yeni, artırılmış (augmented) veriler ekleyerek veri kümesini genişletir ve temizler. Artırılmış veri oluşturma işlemi, modelin genel performansını artırmak ve çeşitli veri çeşitliliği sağlamak için önemlidir. Bu süreç, modelin farklı veri koşullarına daha iyi uyum sağlamasını ve genelleme yeteneğini artırabilir.

In [60]:
import os
import glob
import pandas as pd
from tqdm import tqdm
import pickle

Verisetinin içeriği:

['agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety']

In [61]:
total_augmented_df = pd.DataFrame()

In [62]:
total_df = pd.read_excel('../Data/Anxiety_Detection_Data/total_df.xlsx')
socialanxiety_texts = total_df[total_df['labels'] == 'socialanxiety']['text'].values

total_augmented_df['Text'] = socialanxiety_texts
total_augmented_df['labels'] = "socialanxiety"

##  Pickle file reading

In [63]:
for augmented_disorder in glob.glob("../Data/Anxiety_Detection_Data/Augmente Edilmiş Veriler/*"):
    temp_df = pd.DataFrame()
    if augmented_disorder.split('.')[-1] == 'pkl':
        with open(augmented_disorder, 'rb') as file:
            temp_df['Text'] = pickle.load(file)
            temp_df['labels'] = os.path.basename(augmented_disorder).split('.')[0]

    
    elif augmented_disorder.split('.')[-1] == 'csv':
        temp_df['Text'] = pd.read_csv(augmented_disorder)['Text']
        temp_df['labels'] = os.path.basename(augmented_disorder).split('.')[0] 
        
    if len(total_augmented_df) == 0:
        total_augmented_df = temp_df.copy()
    else:
        total_augmented_df = pd.concat([total_augmented_df, temp_df], axis=0)

In [64]:
total_augmented_df['labels'].value_counts()

labels
selectivemutism    10955
agoraphobia        10116
socialanxiety       8752
panic               8314
phobia              7493
Name: count, dtype: int64

In [65]:
total_augmented_df = total_augmented_df.rename(columns = {"Text": "text"})
total_augmented_df.columns

Index(['text', 'labels'], dtype='object')

## Convert it to Multi Label

In [82]:
# total_augmented_df.to_excel("../Data/Anxiety_Detection_Data/total_not_multi_label_augmented_data.xlsx", index=False)

In [74]:
one_hot_encoded = pd.get_dummies(total_augmented_df['labels'])

concat_df = pd.concat([total_augmented_df['text'], one_hot_encoded], axis=1)
concat_df = concat_df.dropna().reset_index(drop=True)

concat_df.iloc[:, 1:] = concat_df.iloc[:, 1:].astype(int)

concat_df['all_labels'] = concat_df.apply(lambda row: [label for label in one_hot_encoded.columns if row[label] == 1], axis=1)
concat_df = concat_df.loc[:, ['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels']]

In [76]:
concat_df.head()

Unnamed: 0,text,agoraphobia,panic,phobia,selectivemutism,socialanxiety,all_labels
0,Bir şekilde sarhoş olmadığım sürece sosyalleşe...,0,0,0,0,1,[socialanxiety]
1,"Son zamanlarda sosyal kaygılarla boğuşuyorum, ...",0,0,0,0,1,[socialanxiety]
2,İnsanlar benden nefret mi ediyor yoksa ben mi ...,0,0,0,0,1,[socialanxiety]
3,Genellikle organize olduğum için kendimi çok k...,0,0,0,0,1,[socialanxiety]
4,Son zamanlarda kendimi hayal kırıklığına uğrat...,0,0,0,0,1,[socialanxiety]


## Push to Hub

In [77]:
from datasets import Dataset, DatasetDict
Dataset.cleanup_cache_files
from sklearn.model_selection import train_test_split

train, test = train_test_split(concat_df, test_size = 0.25, random_state=42)

train_data = Dataset.from_pandas(train, preserve_index=False)
test_data = Dataset.from_pandas(test, preserve_index=False)

hg_data = DatasetDict({
    "train": train_data,
    "test": test_data
})

In [78]:
hg_data

DatasetDict({
    train: Dataset({
        features: ['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels'],
        num_rows: 34221
    })
    test: Dataset({
        features: ['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels'],
        num_rows: 11408
    })
})

In [79]:
!huggingface-cli login --token=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

hg_data.push_to_hub("halilibr/dilbazlar-anxiety-disorders-recognition-multilabel-augmented-not-anxiety-tr-dataset")
print("Data was pushed :)")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Data was pushed :)


In [80]:
hg_data.save_to_disk('../Data/Anxiety_Detection_Data/anxiety_multilabel_augmented_not_anxiety_hg_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/34221 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11408 [00:00<?, ? examples/s]