In [1]:
import os
import time
import datetime
import shutil

import pandas as pd
import polars as pl
import numpy as np
from tqdm import tqdm

# Setup

In [2]:
# Directory layout: raw CSV inputs and the per-split outputs both live under
# the shared input directory.
DATA_PATH = "../data"
INPUT_DATA_PATH = os.path.join(DATA_PATH, "input")
RAW_DATA_DIR = os.path.join(DATA_PATH, "input", "raw")
KFLOD_DATA_DIRNAME = os.path.join(DATA_PATH, "input", "kflod_data")

# Raw CSV files read when building the splits.
CONTENT_PATH = os.path.join(RAW_DATA_DIR, "content.csv")
TOPIC_PATH = os.path.join(RAW_DATA_DIR, "topics.csv")
CORRELATIONS_PATH = os.path.join(RAW_DATA_DIR, "correlations.csv")

# Number of splits, and the RNG seed used for each split (indexed by split number).
FLOD = 5
SEED_LIST = [42, 20, 91, 41, 44]

# Create one output directory per split up front; existing directories are kept.
for i in range(FLOD):
    os.makedirs(os.path.join(KFLOD_DATA_DIRNAME, f"flod{i}"), exist_ok=True)

# Build

In [3]:
# Base sizes that the train/validation sizes are derived from.
RAW_TRAIN_DATA_SIZE = 154047
TEST_ADDITIONAL_DATA_SIZE = 10000

# Fraction of topic ids held out of the training subset.
VALID_RATIO = 0.2

# Shrink the raw size by the factor raw / (raw + additional), rounding down;
# whatever is left over becomes the validation size.
TRAIN_DATA_SIZE = RAW_TRAIN_DATA_SIZE ** 2 // (RAW_TRAIN_DATA_SIZE + TEST_ADDITIONAL_DATA_SIZE)
VALID_DATA_SIZE = RAW_TRAIN_DATA_SIZE - TRAIN_DATA_SIZE

print(f"train data size: {TRAIN_DATA_SIZE}, valid data size: {VALID_DATA_SIZE}")

train data size: 144656, valid data size: 9391


In [4]:
%%time

def generate_flod_data(k):
    """Build the train/validation parquet files for split ``k``.

    Reads the raw content, topics and correlations CSVs, draws a seeded random
    training subset of content ids (``TRAIN_DATA_SIZE`` of them) and of topic
    ids (a ``1 - VALID_RATIO`` fraction), and writes six parquet files into
    ``KFLOD_DATA_DIRNAME/flod{k}``:

    * ``train_content`` / ``valid_content``
    * ``train_topics`` / ``valid_topics``
    * ``train_correlations`` / ``valid_correlations`` (one row per
      topic/content pair)

    Args:
        k: Split index; selects ``SEED_LIST[k]`` and names the output
            directory and files.

    Returns:
        None. The results are only written to disk.

    Raises:
        IndexError: If ``SEED_LIST`` has no entry for ``k``.
        ValueError: Raised by the sampler if ``TRAIN_DATA_SIZE`` exceeds the
            number of content ids (sampling is without replacement).
    """
    seed = SEED_LIST[k]
    flod_dir = os.path.join(KFLOD_DATA_DIRNAME, f"flod{k}")
    # Do not rely on another cell having created the directory already.
    os.makedirs(flod_dir, exist_ok=True)

    def _save(df, stem):
        # Write one frame of this split as "<stem>_flod<k>.pqt".
        df.to_parquet(os.path.join(flod_dir, f"{stem}_flod{k}.pqt"))

    # Content
    # A private RandomState seeded with the split's seed yields the same draw
    # as reseeding the global legacy RNG, without mutating process-wide state.
    df_content = pd.read_csv(CONTENT_PATH)
    content_list = df_content["id"].tolist()
    train_content_list = np.random.RandomState(seed).choice(
        content_list, TRAIN_DATA_SIZE, replace=False
    )
    df_train_content = df_content[df_content["id"].isin(train_content_list)].reset_index(drop=True)
    _save(df_train_content, "train_content")
    # The validation content file is the full content table, not only the
    # complement of the training subset.
    # NOTE(review): presumably intentional so validation topics can match any
    # content -- confirm.
    _save(df_content, "valid_content")

    # Topics
    # Fresh RandomState with the same seed, mirroring the per-draw reseed.
    df_topics = pd.read_csv(TOPIC_PATH)
    topic_list = df_topics["id"].tolist()
    train_topic_list = np.random.RandomState(seed).choice(
        topic_list, int(len(topic_list) * (1 - VALID_RATIO)), replace=False
    )
    topic_is_train = df_topics["id"].isin(train_topic_list)
    _save(df_topics[topic_is_train].reset_index(drop=True), "train_topics")
    _save(df_topics[~topic_is_train].reset_index(drop=True), "valid_topics")

    # Correlations
    # Split the whitespace-separated id string and explode it so that each row
    # holds a single (topic_id, content_ids) pair.
    df_correlations = pd.read_csv(CORRELATIONS_PATH)
    df_correlations_exploded = (
        df_correlations[["topic_id", "content_ids"]]
        .assign(content_ids=df_correlations["content_ids"].apply(lambda x: x.split()))
        .explode("content_ids")
    )
    pair_topic_is_train = df_correlations_exploded["topic_id"].isin(train_topic_list)
    pair_content_is_train = df_correlations_exploded["content_ids"].isin(train_content_list)
    # Train keeps only pairs whose topic AND content are both in the training
    # subsets; validation keeps every pair of a non-training topic.
    df_train_correlations = df_correlations_exploded[
        pair_topic_is_train & pair_content_is_train
    ].reset_index(drop=True)
    df_valid_correlations = df_correlations_exploded[~pair_topic_is_train].reset_index(drop=True)
    _save(df_train_correlations, "train_correlations")
    _save(df_valid_correlations, "valid_correlations")
    
    
# Build every split in turn; each call reads the raw CSVs and writes that
# split's parquet files. tqdm renders a progress bar over the FLOD iterations.
for k in tqdm(range(FLOD)):
    generate_flod_data(k)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:31<00:00, 18.30s/it]

CPU times: user 1min 7s, sys: 12.6 s, total: 1min 20s
Wall time: 1min 31s



