<a href="https://colab.research.google.com/github/Jeremy-su1/ai-algorithm/blob/main/datasets/DATASET_stackExchange.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Each source dataset is handled in one place: its CSV path, the loaded frame,
# and a `title_content` column copied from that frame's title column.

# Biology
biology_path = '/content/drive/MyDrive/LLMEmbed/biology.csv'
biology = pd.read_csv(biology_path).assign(title_content=lambda frame: frame['title'])

# Cooking
cooking_path = '/content/drive/MyDrive/LLMEmbed/cooking.csv'
cooking = pd.read_csv(cooking_path).assign(title_content=lambda frame: frame['title'])

# DIY
diy_path = '/content/drive/MyDrive/LLMEmbed/diy.csv'
diy = pd.read_csv(diy_path).assign(title_content=lambda frame: frame['title'])

# Travel
travel_path = '/content/drive/MyDrive/LLMEmbed/travel.csv'
travel = pd.read_csv(travel_path).assign(title_content=lambda frame: frame['title'])

# StackOverflow (note: this file names its title column 'Title', capitalised)
stackoverflow_path = '/content/drive/MyDrive/LLMEmbed/MultiLabel/rev_tag_training_samples.csv'
stackoverflow = pd.read_csv(stackoverflow_path).assign(title_content=lambda frame: frame['Title'])

# Integer class label assigned to every row of each source dataset
labels = dict(biology=0, cooking=1, diy=2, travel=3, stackoverflow=4)

# Split a dataset into train/validation/test parts and attach a class label
def split_and_label(dataset, label, validation_size=200, train_size=2000, test_size=200, additional_columns=None):
    """Draw disjoint validation, test and train samples and tag them with `label`.

    Rows are drawn in the order validation -> test -> train, each time from the
    rows not yet used, with fixed random seeds (1, 2 and 1 respectively) so the
    split is reproducible. Each requested size is capped at the number of rows
    still available. The input frame is not modified.

    Args:
        dataset: DataFrame containing a 'title_content' column plus every
            column named in `additional_columns`.
        label: Value written to the 'label' column of every returned row.
        validation_size: Maximum number of validation rows.
        train_size: Maximum number of training rows.
        test_size: Maximum number of test rows.
        additional_columns: Optional iterable of extra column names to keep
            after 'title_content' and 'label'.

    Returns:
        Tuple `(train_data, validation_data, test_data)` of DataFrames whose
        columns are 'title_content', 'label' and the additional columns, and
        whose index labels are those of the sampled rows in `dataset`.

    Raises:
        KeyError: If a selected column is missing from `dataset`.
    """
    selected_columns = ['title_content', 'label'] + list(additional_columns or [])

    # Sample on a positional 0..n-1 index. Dropping by the caller's own index
    # labels would also discard every other row that shares a sampled label
    # when the index contains duplicates, silently shrinking the later splits.
    pool = dataset.reset_index(drop=True)

    # Validation rows come first
    validation_data = pool.sample(n=min(validation_size, len(pool)), random_state=1)
    pool = pool.drop(validation_data.index)

    # Test rows come from what is left
    test_data = pool.sample(n=min(test_size, len(pool)), random_state=2)
    pool = pool.drop(test_data.index)

    # Training rows come from the remainder
    train_data = pool.sample(n=min(train_size, len(pool)), random_state=1)

    def _finalize(part):
        # Add the label, keep only the requested columns, and restore the
        # original index labels of the sampled rows.
        positions = part.index.to_numpy(dtype='int64')
        part = part.assign(label=label)[selected_columns]
        part.index = dataset.index.take(positions)
        return part

    return _finalize(train_data), _finalize(validation_data), _finalize(test_data)

# Extra columns that exist in the StackOverflow dataset and must be carried
# through the split; they are passed only for that dataset below.
additional_columns = [
    'Tags_new',
    'Algorithms',
    'Backend',
    'Data Science',
    'Databases',
    'Dev Tools',
    'Frontend',
    'Mobile',
    'Systems',
    'iOS/macOS',
]

# Split every source dataset into (train, validation, test) with its class label
biology_train, biology_val, biology_test = split_and_label(biology, labels['biology'])
cooking_train, cooking_val, cooking_test = split_and_label(cooking, labels['cooking'])
diy_train, diy_val, diy_test = split_and_label(diy, labels['diy'])
travel_train, travel_val, travel_test = split_and_label(travel, labels['travel'])
stackoverflow_train, stackoverflow_val, stackoverflow_test = split_and_label(
    stackoverflow,
    labels['stackoverflow'],
    additional_columns=additional_columns,
)

# Merge the per-source parts into one frame per split.
# NOTE: only the StackOverflow parts carry the additional columns, so rows from
# the other sources hold NaN in those columns after concatenation.
train_parts = [biology_train, cooking_train, diy_train, travel_train, stackoverflow_train]
val_parts = [biology_val, cooking_val, diy_val, travel_val, stackoverflow_val]
test_parts = [biology_test, cooking_test, diy_test, travel_test, stackoverflow_test]

combined_train_dataset = pd.concat(train_parts, ignore_index=True)
combined_val_dataset = pd.concat(val_parts, ignore_index=True)
combined_test_dataset = pd.concat(test_parts, ignore_index=True)

# Destination CSV files for the combined splits
train_csv_path = '/content/drive/MyDrive/LLMEmbed/dataset/stackexchange_train_dataset_10000_title.csv'
val_csv_path = '/content/drive/MyDrive/LLMEmbed/dataset/stackexchange_val_dataset_1000_title.csv'
test_csv_path = '/content/drive/MyDrive/LLMEmbed/dataset/stackexchange_test_dataset_1000_title.csv'

# Write all three files first, then report where they went
for combined_frame, destination in (
    (combined_train_dataset, train_csv_path),
    (combined_val_dataset, val_csv_path),
    (combined_test_dataset, test_csv_path),
):
    combined_frame.to_csv(destination, index=False)

print(f"Train dataset saved to {train_csv_path}")
print(f"Validation dataset saved to {val_csv_path}")
print(f"Test dataset saved to {test_csv_path}")

Train dataset saved to /content/drive/MyDrive/LLMEmbed/dataset/stackexchange_train_dataset_10000_title.csv
Validation dataset saved to /content/drive/MyDrive/LLMEmbed/dataset/stackexchange_val_dataset_1000_title.csv
Test dataset saved to /content/drive/MyDrive/LLMEmbed/dataset/stackexchange_test_dataset_1000_title.csv
