In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import StratifiedGroupKFold

import os
from tqdm.auto import tqdm
from pathlib import Path

# Locations of the input CSV files and of the output root for the CV splits.
# Despite the *_DIR suffix, the first three names point at files, not directories.
TOPIC_DIR = '/root/autodl-nas/data/k12/topics.csv'
CONTENT_DIR = '/root/autodl-nas/data/k12/content.csv'
CORR_DIR = '/root/autodl-nas/data/k12/correlations.csv'
CV_DIR = '/root/autodl-tmp/data/k12/cv_split_new'

# Load the three input tables in one pass (topics, content, correlations).
topic, content, corr = (
    pd.read_csv(csv_path) for csv_path in (TOPIC_DIR, CONTENT_DIR, CORR_DIR)
)

In [2]:
N_FOLD = 5  # number of cross-validation folds produced by cv_split
SEED = 11   # random_state for the shuffled fold assignment
def cv_split(df_train, labels=['id', 'channel', 'category', 'language', 'fold']):
    """Assign a cross-validation fold to every row whose category is not 'source'.

    Rows with ``category == 'source'`` keep ``fold == -1``. All other rows
    receive a fold index from a shuffled ``StratifiedGroupKFold`` with
    ``N_FOLD`` splits and ``random_state=SEED``, stratified on ``language``
    and grouped on ``id``.

    The input frame is left untouched; the fold column is written to a copy.

    Args:
        df_train: DataFrame containing ``id``, ``category`` and ``language``
            columns, plus every column named in ``labels`` (other than
            ``fold``, which is created here).
        labels: Column selection applied to the result. The default list is
            only read, never mutated.

    Returns:
        The selected columns of the re-assembled frame: non-source rows
        first, followed by the source rows, on a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame does not silently gain a 'fold' column.
    df_folds = df_train.assign(fold=-1)
    is_source = df_folds['category'] == 'source'

    # A clean RangeIndex is required: the positional indices yielded by
    # cv.split() are used as labels in the .loc assignment below.
    df_candidates = df_folds[~is_source].copy().reset_index(drop=True)

    # NOTE(review): groups are the 'id' column; if ids are unique per row the
    # grouping constraint has no effect -- confirm this is the intended key.
    cv = StratifiedGroupKFold(n_splits=N_FOLD, random_state=SEED, shuffle=True)
    splits = cv.split(df_candidates, df_candidates['language'], df_candidates['id'])
    for fold_idx, (_, test_index) in enumerate(splits):
        df_candidates.loc[test_index, 'fold'] = fold_idx

    df_source = df_folds[is_source]
    df_out = pd.concat([df_candidates, df_source]).reset_index(drop=True)
    return df_out[labels]

# Only topics flagged with has_content take part in the split; the result
# keeps just the topic id and its assigned fold (-1 for 'source' topics).
topics_with_content = (
    pd.read_csv(TOPIC_DIR)
    .query('has_content')
    .reset_index(drop=True)
)
df_train = cv_split(topics_with_content, ['id', 'fold'])
df_train

Unnamed: 0,id,fold
0,t_0008768bdee6,0
1,t_0008a1bd84ba,1
2,t_000d1fb3f2f5,2
3,t_00102869fbcb,3
4,t_0012a45fa09c,4
...,...,...
61512,t_fff5da49c4d3,-1
61513,t_fff7f2dd208b,-1
61514,t_fff830472691,-1
61515,t_fffbe1d5d43c,-1


In [3]:
# Export one self-contained dataset directory per fold under CV_DIR.
# The last fold is held out: it is never part of any train/fold_i training
# set and becomes the test side of valid/fold_<HOLDOUT_FOLD>.
HOLDOUT_FOLD = N_FOLD - 1

# Load every input table once; they are identical for all folds.
df_content = pd.read_csv(CONTENT_DIR)
df_topic = pd.read_csv(TOPIC_DIR)
df_corr = pd.read_csv(CORR_DIR)


def _write_fold_dataset(dir_name, train, test):
    """Write topics/content/correlations/sample_submission CSVs into dir_name.

    Args:
        dir_name: Output directory; created if it does not exist yet.
        train: Frame with an ``id`` column; its rows are joined against the
            correlations table and written as ``correlations.csv``.
        test: Frame with an ``id`` column; its rows are joined against the
            correlations table and written as ``sample_submission.csv``.

    Returns:
        Tuple ``(correlations, submission)`` of the two joined frames, each
        reduced to the ``topic_id`` and ``content_ids`` columns.
    """
    os.makedirs(dir_name, exist_ok=True)

    keep_cols = ['topic_id', 'content_ids']
    correlations = train.merge(df_corr, left_on='id', right_on='topic_id')[keep_cols]
    submission = test.merge(df_corr, left_on='id', right_on='topic_id')[keep_cols]

    # The full topic and content tables are copied unchanged into every fold.
    df_topic.to_csv(os.path.join(dir_name, 'topics.csv'), index=False)
    df_content.to_csv(os.path.join(dir_name, 'content.csv'), index=False)
    correlations.to_csv(os.path.join(dir_name, 'correlations.csv'), index=False)
    submission.to_csv(os.path.join(dir_name, 'sample_submission.csv'), index=False)
    return correlations, submission


# train fold
for i in tqdm(range(HOLDOUT_FOLD)):
    dir_name = os.path.join(CV_DIR, 'train', f'fold_{i}')
    # Rows with fold == -1 satisfy both conditions and therefore stay in train.
    train = df_train.query('fold!=@i and fold!=@HOLDOUT_FOLD')
    test = df_train.query('fold==@i')
    correlations, submission = _write_fold_dataset(dir_name, train, test)


# valid fold
dir_name = os.path.join(CV_DIR, 'valid', f'fold_{HOLDOUT_FOLD}')
train = df_train.query('fold!=@HOLDOUT_FOLD')
test = df_train.query('fold==@HOLDOUT_FOLD')
correlations, submission = _write_fold_dataset(dir_name, train, test)

  0%|          | 0/4 [00:00<?, ?it/s]