<a href="https://colab.research.google.com/github/Jeremy-su1/ai-algorithm/blob/main/DATASET_stackExchange%2Bmultilable.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
from ast import literal_eval

# Shared base directory for every dataset file
base_path = '/content/drive/MyDrive/ML/AI-algorithm/dataset'

# Dataset file paths
biology_path = f'{base_path}/biology.csv'
cooking_path = f'{base_path}/cooking.csv'
diy_path = f'{base_path}/diy.csv'
travel_path = f'{base_path}/travel.csv'
stackoverflow_train_path = f'{base_path}/rev_tag_training_samples.csv'
stackoverflow_test_path = f'{base_path}/rev_tag_test_samples.csv'


def _load_with_text(csv_path, title_col, body_col):
    """Read a CSV file and add a combined ``title_content`` text column.

    Args:
        csv_path: Path of the CSV file to read.
        title_col: Name of the column holding the question title.
        body_col: Name of the column holding the question body.

    Returns:
        The loaded DataFrame with an extra ``title_content`` column made of
        the title, a single space, and the body.
    """
    frame = pd.read_csv(csv_path)
    # String concatenation with NaN yields NaN, so a single missing title or
    # body would wipe out the whole combined text; treat missing cells as ''.
    frame['title_content'] = frame[title_col].fillna('') + ' ' + frame[body_col].fillna('')
    return frame


# Load the datasets and combine title and body into one text column.
# The Stack Exchange topic files use lowercase 'title'/'content' columns,
# while the StackOverflow files use 'Title'/'Body'.
biology = _load_with_text(biology_path, 'title', 'content')
cooking = _load_with_text(cooking_path, 'title', 'content')
diy = _load_with_text(diy_path, 'title', 'content')
travel = _load_with_text(travel_path, 'title', 'content')

stackoverflow_train = _load_with_text(stackoverflow_train_path, 'Title', 'Body')
# The StackOverflow "test" file is used as the validation split.
stackoverflow_val = _load_with_text(stackoverflow_test_path, 'Title', 'Body')

# Top-level class id for each source dataset; ids follow the order listed here
# (biology=0, cooking=1, diy=2, travel=3, stackoverflow=4).
labels = {
    name: class_id
    for class_id, name in enumerate(('biology', 'cooking', 'diy', 'travel', 'stackoverflow'))
}

# Collect every distinct tag that appears in the StackOverflow train/validation
# 'Tags_new' columns (each non-null cell is a string holding a Python list literal).
tag_vocabulary = set()
for frame in (stackoverflow_train, stackoverflow_val):
    for raw_tags in frame['Tags_new'].dropna():
        tag_vocabulary.update(literal_eval(raw_tags))

# Sort so that every tag gets a stable, reproducible index.
unique_tags = sorted(tag_vocabulary)
tag2id = dict(zip(unique_tags, range(len(unique_tags))))

# Convert a raw tag cell into a multi-label list of tag indices
def encode_sub_label(tags):
    """Translate a stringified tag list into the list of its ``tag2id`` indices.

    Args:
        tags: A string containing a Python list literal of tag names. Any
            non-string value (for example NaN) is treated as "no tags".

    Returns:
        The indices of the tags found in ``tag2id``, in their original order;
        tags missing from ``tag2id`` are skipped.
    """
    if not isinstance(tags, str):
        return []

    encoded = []
    for name in literal_eval(tags):
        if name in tag2id:
            encoded.append(tag2id[name])
    return encoded

# Encode the tag strings of both StackOverflow splits into index lists.
for so_frame in (stackoverflow_train, stackoverflow_val):
    so_frame['sub_label'] = so_frame['Tags_new'].apply(encode_sub_label)

# Split a dataset into train/validation samples and attach its class label
def split_and_label(dataset, label, validation_size=400, train_size=2000, sublabel=False):
    """Draw disjoint validation and train samples from ``dataset`` and label them.

    Args:
        dataset: DataFrame with a ``title_content`` column (and a
            ``sub_label`` column when ``sublabel`` is true).
        label: Value written to the ``label`` column of both samples.
        validation_size: Maximum number of validation rows to draw.
        train_size: Maximum number of train rows to draw from the rows that
            were not picked for validation.
        sublabel: When true, keep the ``sub_label`` column in the output.

    Returns:
        A ``(train_data, validation_data)`` tuple of DataFrames restricted to
        ``title_content``, ``label`` and, optionally, ``sub_label``.
    """
    # Sample row *positions* instead of index labels. Dropping the validation
    # rows by label removes every row sharing that label, so with a duplicated
    # index fewer rows remain than expected and the train sample could request
    # more rows than exist (ValueError). Positions are always unique.
    positions = pd.Series(range(len(dataset)))
    validation_positions = positions.sample(
        n=min(validation_size, len(positions)), random_state=1
    )
    remaining_positions = positions.drop(validation_positions.index)
    # Size the train sample from what is actually left over.
    train_positions = remaining_positions.sample(
        n=min(train_size, len(remaining_positions)), random_state=1
    )

    # Explicit copies so adding the label column never touches the input frame.
    validation_data = dataset.iloc[validation_positions.to_numpy()].copy()
    train_data = dataset.iloc[train_positions.to_numpy()].copy()
    validation_data['label'] = label
    train_data['label'] = label

    columns = ['title_content', 'label']
    if sublabel:
        columns.append('sub_label')
    return train_data[columns], validation_data[columns]

# Split each single-topic dataset into train/validation samples and label it
biology_train, biology_val = split_and_label(dataset=biology, label=labels['biology'])
cooking_train, cooking_val = split_and_label(dataset=cooking, label=labels['cooking'])
diy_train, diy_val = split_and_label(dataset=diy, label=labels['diy'])
travel_train, travel_val = split_and_label(dataset=travel, label=labels['travel'])

# The StackOverflow data already comes as separate files, so only the label is assigned
for so_frame in (stackoverflow_train, stackoverflow_val):
    so_frame['label'] = labels['stackoverflow']

# Merge all train parts into one frame and all validation parts into another.
# Only the StackOverflow parts carry 'sub_label'; the other rows get NaN there.
output_columns = ['title_content', 'label', 'sub_label']
train_parts = [biology_train, cooking_train, diy_train, travel_train, stackoverflow_train[output_columns]]
val_parts = [biology_val, cooking_val, diy_val, travel_val, stackoverflow_val[output_columns]]
combined_train_dataset = pd.concat(train_parts, ignore_index=True)
combined_val_dataset = pd.concat(val_parts, ignore_index=True)

# Store sub_label as text: NaN and empty lists become '', anything else its str() form
for combined in (combined_train_dataset, combined_val_dataset):
    combined['sub_label'] = combined['sub_label'].fillna('').apply(lambda ids: str(ids) if ids else '')

# Inspect a few random rows of each combined dataset before saving
previews = (
    ("Train dataset sample:", combined_train_dataset),
    ("\nValidation dataset sample:", combined_val_dataset),
)
for heading, frame in previews:
    print(heading)
    print(frame.sample(5))

# Write the combined datasets to CSV files next to the source data
train_csv_path = f'{base_path}/stackexchange_train_dataset.csv'
val_csv_path = f'{base_path}/stackexchange_val_dataset.csv'
for frame, destination in ((combined_train_dataset, train_csv_path), (combined_val_dataset, val_csv_path)):
    frame.to_csv(destination, index=False)

print(f"\nTrain dataset saved to {train_csv_path}")
print(f"Validation dataset saved to {val_csv_path}")


Train dataset sample:
                                           title_content  label sub_label
1245   What triggers creative thought in humans? <p>C...      0          
27079  What is the difference between Expo CLI and Re...      4       [8]
8111   How to check JSP varilable and hide if value i...      4       [0]
23881  Return first letter of each word capitalized |...      4       [7]
12099  Using php in python language is really a good ...      4    [2, 1]

Validation dataset sample:
                                          title_content  label sub_label
3458  when i use "=" copy a object ,why source value...      4       [0]
3042  Cross origin error in jquery.load() <p>I want ...      4       [5]
3748  C# Use textbox value in another cs file I'd li...      4    [7, 4]
1534  How to find car rental companies in Kalibo, Ph...      3          
2210  Convert an int** array into a char** array of ...      4    [2, 7]

Train dataset saved to /content/drive/MyDrive/ML/AI-algorithm/datas