In [17]:
import os
import time
import datetime
import shutil

import pandas as pd
import polars as pl
import numpy as np
from tqdm import tqdm

# Setup

In [18]:
# Directory layout, all relative to the notebook's working directory.
DATA_PATH = "../data"
INPUT_DATA_PATH = os.path.join(DATA_PATH, "input")
RAW_DATA_DIR = os.path.join(INPUT_DATA_PATH, "raw")
KFLOD_DATA_DIRNAME = os.path.join(INPUT_DATA_PATH, "kflod_data")  # root of the per-fold outputs

# Raw input tables.
CONTENT_PATH, TOPIC_PATH, CORRELATIONS_PATH = (
    os.path.join(RAW_DATA_DIR, csv_name)
    for csv_name in ("content.csv", "topics.csv", "correlations.csv")
)

# Number of folds and one RNG seed per fold (indexed by fold number).
FLOD = 5
SEED_LIST = [42, 20, 91, 41, 44]

# Pre-create one output directory per fold; directories that already exist are left alone.
for i in range(FLOD):
    os.makedirs(
        os.path.join(KFLOD_DATA_DIRNAME, f"flod{i}"),
        exist_ok=True,
    )

# Build

In [19]:
RAW_TRAIN_DATA_SIZE = 154047
TEST_ADDITIONAL_DATA_SIZE = 10000

# Chosen so that TRAIN / RAW equals RAW / (RAW + ADDITIONAL), floored to an integer.
TRAIN_DATA_SIZE = RAW_TRAIN_DATA_SIZE**2 // (RAW_TRAIN_DATA_SIZE + TEST_ADDITIONAL_DATA_SIZE)
VALID_DATA_SIZE = RAW_TRAIN_DATA_SIZE - TRAIN_DATA_SIZE

# Share of topics held out for validation in each fold.
VALID_RATIO = 0.2

print(f"train data size: {TRAIN_DATA_SIZE}, valid data size: {VALID_DATA_SIZE}")

train data size: 144656, valid data size: 9391


In [22]:
%%time

def generate_flod_data(k):
    """Create the train/valid split for fold ``k`` and write it to parquet.

    Content, topics and correlations are re-read from their CSV files on every
    call. The global NumPy RNG is re-seeded with ``SEED_LIST[k]`` before each
    of the two random draws:

    * content: ``TRAIN_DATA_SIZE`` ids are drawn without replacement for the
      train file; the valid file is the complete, unfiltered content table.
    * topics: ``int(n_topics * (1 - VALID_RATIO))`` ids are drawn without
      replacement for train; all remaining topics form the valid set.
    * correlations: exploded to one row per (topic, content) pair. Train keeps
      pairs whose topic and content are both in the train draws; valid keeps
      every pair whose topic is not a train topic.

    All six tables are written under ``KFLOD_DATA_DIRNAME/flod{k}``.

    Args:
        k: Fold index; selects the seed and the output directory.

    Returns:
        Tuple ``(train correlations, train topics)`` of DataFrames.
    """
    fold_dir = os.path.join(KFLOD_DATA_DIRNAME, f"flod{k}")

    # --- Content ---
    content = pd.read_csv(CONTENT_PATH)
    all_content_ids = content["id"].tolist()
    np.random.seed(SEED_LIST[k])
    train_content_ids = np.random.choice(all_content_ids, TRAIN_DATA_SIZE, replace=False)
    content_train = content[content["id"].isin(train_content_ids)].reset_index(drop=True)
    # The valid content file holds the whole content table, not the complement of the train draw.
    content_valid = content
    content_train.to_parquet(os.path.join(fold_dir, f"train_content_flod{k}.pqt"))
    content_valid.to_parquet(os.path.join(fold_dir, f"valid_content_flod{k}.pqt"))

    # --- Topics ---
    topics = pd.read_csv(TOPIC_PATH)
    all_topic_ids = topics["id"].tolist()
    np.random.seed(SEED_LIST[k])
    n_train_topics = int(len(all_topic_ids)*(1-VALID_RATIO))
    train_topic_ids = np.random.choice(all_topic_ids, n_train_topics, replace=False)
    topic_is_train = topics["id"].isin(train_topic_ids)
    topics_train = topics[topic_is_train].reset_index(drop=True)
    topics_valid = topics[~topic_is_train].reset_index(drop=True)
    topics_train.to_parquet(os.path.join(fold_dir, f"train_topics_flod{k}.pqt"))
    topics_valid.to_parquet(os.path.join(fold_dir, f"valid_topics_flod{k}.pqt"))

    # --- Correlations ---
    correlations = pd.read_csv(CORRELATIONS_PATH)
    # "content_ids" is a whitespace-separated string of ids; turn it into a list per topic.
    correlations["content_ids_list"] = correlations["content_ids"].apply(lambda ids: ids.split())
    pairs = (
        correlations[["topic_id", "content_ids_list"]]
        .explode("content_ids_list")
        .rename({"content_ids_list": "content_ids"}, axis=1)
    )
    pair_topic_is_train = pairs["topic_id"].isin(train_topic_ids)
    pair_content_is_train = pairs["content_ids"].isin(train_content_ids)
    corr_train = pairs[pair_topic_is_train & pair_content_is_train].reset_index(drop=True)
    corr_valid = pairs[~pair_topic_is_train].reset_index(drop=True)
    corr_train.to_parquet(os.path.join(fold_dir, f"train_correlations_flod{k}.pqt"))
    corr_valid.to_parquet(os.path.join(fold_dir, f"valid_correlations_flod{k}.pqt"))

    return corr_train, topics_train
    
    
# Generate the fold data and keep the returned train correlations (`x`) and
# train topics (`b`) as globals.
# NOTE(review): the unconditional `break` exits after the first iteration, so
# only fold 0 is generated here — confirm whether this is a debugging leftover
# before relying on the remaining fold directories.
for k in tqdm(range(FLOD)):
    x, b = generate_flod_data(k)
    break

  0%|                                                                                                                                              | 0/5 [00:17<?, ?it/s]

CPU times: user 13.7 s, sys: 2.01 s, total: 15.8 s
Wall time: 17.8 s





In [23]:
# Train correlations returned for fold 0: one row per (topic_id, content_ids) pair.
x

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d
1,t_00004da3a1b2,c_376c5a8eb028
2,t_00004da3a1b2,c_5bc0e1e2cba0
3,t_00004da3a1b2,c_76231f9d0b5e
4,t_00069b63a70a,c_11a1dc0bfb99
...,...,...
211308,t_fff9e5407d13,c_d64037a72376
211309,t_fffbe1d5d43c,c_46f852a49c08
211310,t_fffbe1d5d43c,c_6659207b25d5
211311,t_fffe14f1be1e,c_cece166bad6a


In [24]:
# Train topics returned for fold 0.
b

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
3,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
4,t_0008a1bd84ba,12. 20: Bird Reproduction,,ebc86c,supplemental,5,en,t_c44ac9711007,True
...,...,...,...,...,...,...,...,...,...
61572,t_fff9e5407d13,NA_U06 - El periódico,,71fd51,supplemental,2,es,t_5bd8f6ae9f7d,True
61573,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True
61574,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True
61575,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,9fd860,source,2,ar,t_5b4f3ba4eb7d,True


In [27]:
# Number of distinct topic ids in the fold-0 train topics.
b["id"].unique().size

61577

In [28]:
# Number of distinct topic ids that appear in the fold-0 train correlations.
x["topic_id"].unique().size

48573

# Fork

In [8]:
def get_topic_field(d):
    """Build the text field for one topic row.

    Args:
        d: Mapping-like row providing ``'title_level'`` (an iterable of
            titles) and ``'description'``.

    Returns:
        ``'[TITLE] <titles>. [DESCRIPTION]<description>. '`` where the non-null
        titles are joined in reverse order with ``' of '``. An empty joined
        title or a null description is replaced by ``'No information'``.
    """
    present_titles = [t for t in d['title_level'] if pd.notna(t)]
    joined = ' of '.join(reversed(present_titles))
    if joined == '':
        joined = 'No information'

    desc = d['description']
    if pd.isna(desc):
        desc = 'No information'

    title_part = '[TITLE] ' + joined + '. '
    description_part = '[DESCRIPTION]' + desc + '. '
    return title_part + description_part

def get_content_field(d):
    """Build the text field for one content row.

    Args:
        d: Mapping-like row providing ``'title'``, ``'description'`` and
            ``'kind'``.

    Returns:
        ``'[<kind>] [TITLE] <title>. [DESCRIPTION]<description>. '``; a null
        title or description is replaced by ``'No information'``.
    """
    heading = d['title']
    if pd.isna(heading):
        heading = 'No information'
    title_part = '[TITLE] ' + heading + '. '

    desc = d['description']
    if pd.isna(desc):
        desc = 'No information'
    description_part = '[DESCRIPTION]' + desc + '. '

    kind_part = '[' + d['kind'] + '] '
    return kind_part + title_part + description_part

In [11]:
class DataPreparation:
    """Load one split's topics, content and correlations and derive text fields.

    After ``prepare()``:

    * ``self.topic`` has a ``title_level`` column (list of titles from the
      level-0 ancestor down to the topic itself) and a ``field`` text column.
    * ``self.content`` has a ``field`` text column.
    * ``self.match_dict`` maps each language to a pair of single-column
      ``id`` frames ``(topics, content)`` for that language.
    """

    def __init__(self, topic_path, content_path, submission_path):
        """Read the three parquet inputs.

        Args:
            topic_path: Parquet file with the topic table.
            content_path: Parquet file with the content table.
            submission_path: Parquet file with the topic/content correlations;
                stored on ``self.corr``.
        """
        self.topic = pd.read_parquet(topic_path)
        self.content = pd.read_parquet(content_path)
        self.corr = pd.read_parquet(submission_path)
        # Optional restriction to topics present in the correlations (disabled):
        # self.topic = self.topic[self.topic['id'].isin(self.corr['topic_id'].to_list())]
        self.match_dict = None

    def prepare_topic(self):
        """Add ``title_level`` and ``field`` to ``self.topic`` and return it.

        The inner merge keeps only the topics returned by
        ``_get_level_features``; every other topic row is dropped from
        ``self.topic``.
        """
        df_level = self._get_level_features(self.topic)
        self.topic = self.topic.merge(df_level, on='id', how='inner')
        self.topic['field'] = self.topic.apply(get_topic_field, axis=1)
        return self.topic

    def prepare_content(self):
        """Add the ``field`` text column to ``self.content`` and return it."""
        self.content['field'] = self.content.apply(get_content_field, axis=1)
        return self.content

    def prepare_language_match(self):
        """Group topic ids and content ids by language.

        Topics are right-joined onto the correlations, so the topic side has
        one row per correlation row (ids may repeat).

        Returns:
            Dict ``language -> (topic id frame, content id frame)``; also
            stored on ``self.match_dict``.
        """
        topic = self.topic[['id', 'language']].merge(
            self.corr, left_on='id', right_on='topic_id', how='right'
        )[['id', 'language']]
        match_dict = {}
        for language in topic['language'].unique():
            match_dict[language] = (
                topic.query('language==@language')[['id']],
                self.content.query('language==@language')[['id']],
            )
        self.match_dict = match_dict
        return match_dict

    def _get_level_features(self, df_topic, level_cols=None):
        """Collect, for each topic, the chain of values from its root ancestor.

        Starting from the ``level == 0`` rows, each iteration joins the rows
        of the next level onto their parents (inner join on ``parent``) and
        appends the child's value to the parent's list. Only rows with a truthy
        ``has_content`` are emitted, and a topic whose parent is not part of
        the chain built so far (for example because the parent row is missing
        from ``df_topic``) is dropped together with its descendants.

        Args:
            df_topic: Topic frame with ``id``, ``parent``, ``level``,
                ``has_content`` and every column named in ``level_cols``.
            level_cols: Columns to accumulate along the hierarchy. Defaults
                to ``['title']``.

        Returns:
            DataFrame with ``id`` followed by one ``<col>_level`` list column
            per entry of ``level_cols``.
        """
        # A None default avoids sharing one mutable list between calls.
        level_cols = ['title'] if level_cols is None else list(level_cols)
        # Order-preserving de-duplication keeps the column order deterministic.
        cols = list(dict.fromkeys(level_cols + ['id', 'parent', 'level', 'has_content']))
        df_hier = df_topic[cols]

        highest_level = df_hier['level'].max()
        print(f'Highest Level: {highest_level}')

        df_level = df_hier.query('level == 0').copy(deep=True)
        for col in level_cols:
            df_level[f'{col}_level'] = df_level[col].apply(lambda x: [x])

        level_list = []
        for i in tqdm(range(highest_level + 1)):
            # Emit the current level, then descend to its children.
            level_list.append(df_level[df_level['has_content']])
            df_level_high = df_hier.query('level == @i+1')
            df_level = df_level_high.merge(
                df_level, left_on='parent', right_on='id',
                suffixes=('', '_parent'), how='inner',
            )
            for col in level_cols:
                # Object-dtype addition concatenates the parent's list with [child value].
                df_level[f'{col}_level'] = df_level[f'{col}_level'] + df_level[col].apply(lambda x: [x])
            # Parent-side copies are only needed for the join itself.
            parent_cols = [c for c in df_level.columns if c.endswith('_parent')]
            df_level = df_level.drop(columns=parent_cols)

        df = pd.concat(level_list).reset_index(drop=True)
        # Select with a list: a set indexer is deprecated and raises in pandas >= 2.0.
        return df[['id'] + [f'{col}_level' for col in level_cols]]

    def prepare(self):
        """Run topic, content and language-match preparation in that order."""
        self.prepare_topic()
        self.prepare_content()
        self.prepare_language_match()

In [12]:
%%time
# Fold-0 training files written by generate_flod_data. The paths are built from
# KFLOD_DATA_DIRNAME rather than a machine-specific absolute path, so the cell
# reads the files from the same location they were written to.
topic_dir = os.path.join(KFLOD_DATA_DIRNAME, "flod0", "train_topics_flod0.pqt")
content_dir = os.path.join(KFLOD_DATA_DIRNAME, "flod0", "train_content_flod0.pqt")
corr_dir = os.path.join(KFLOD_DATA_DIRNAME, "flod0", "train_correlations_flod0.pqt")


dp = DataPreparation(topic_dir, content_dir, corr_dir)
dp.prepare()

Highest Level: 10


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 98.34it/s]
  return df[set(['id'] + [f'{col}_level' for col in level_cols])]


CPU times: user 4.69 s, sys: 1.28 s, total: 5.97 s
Wall time: 7.02 s


In [13]:
# Topic table after prepare(): the loaded columns plus `title_level` and `field`.
dp.topic

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,title_level,field
0,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True,"[MIT Blossoms, Engineering, Flow Charts: Logic...",[TITLE] Transcripts of Flow Charts: Logical Th...
1,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True,"[Khan Academy (български език), Математика, Ал...",[TITLE] Графики на експоненциални функции (Алг...
2,t_000feba42136,As vacinas,,8e286a,source,4,pt,t_dfc8ec591573,True,"[Khan Academy (Português (Brasil)), Ciências p...",[TITLE] As vacinas of Vida e evolução: o siste...
3,t_0010852b7049,Divisão com vários algarismos,You know your multiplication tables and are ge...,8e286a,source,4,pt,t_33355476f050,True,"[Khan Academy (Português (Brasil)), Matemática...",[TITLE] Divisão com vários algarismos of Númer...
4,t_0014d6945f7e,يحدّد بعض قواعد الاشتقاق: يجد مشتقة اقتران باس...,5acb7211ecf6d9049f561089,9fd860,source,3,ar,t_48e4fb5d42e1,True,"[Edraak (العربيّة), التفاضل والتكامل, المفاهيم...",[TITLE] يحدّد بعض قواعد الاشتقاق: يجد مشتقة اق...
...,...,...,...,...,...,...,...,...,...,...,...
20472,t_fff1f01cfeb0,Desarrollo de poliedros,Identifica prismas con su desarrollo plano.,998df9,supplemental,5,es,t_82dd0e9526f0,True,"[Chile - 1° Básico a 2° Medio - Matemáticas, C...",[TITLE] Desarrollo de poliedros of Sólidos geo...
20473,t_fff5da49c4d3,Business Writing,,735876,source,3,en,t_c4c783ae6cc1,True,"[Thoughtful Learning, Student Models, Level: G...",[TITLE] Business Writing of Level: Grade 9 of ...
20474,t_fff7f2dd208b,Fatoração de polinômios encontrando fatores co...,Aprenda a fatorar utilizando fatores comuns.,8e286a,source,4,pt,t_28dfc9e80110,True,"[Khan Academy (Português (Brasil)), Matemática...",[TITLE] Fatoração de polinômios encontrando fa...
20475,t_fff830472691,Scalar Projections,,fef095,source,4,en,t_c75d6acecf78,True,"[K-12, Math, Analysis, Vector Analysis, Scalar...",[TITLE] Scalar Projections of Vector Analysis ...


In [14]:
# Content table after prepare(): the loaded columns plus `field`.
dp.content

Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license,field
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,,[video] [TITLE] Sumar números de varios dígito...
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,,[video] [TITLE] Trovare i fattori di un numero...
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,,[video] [TITLE] Sumar curvas de demanda. [DESC...
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND,[document] [TITLE] Nado de aproximação. [DESCR...
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA,[document] [TITLE] geometry-m3-topic-a-overvie...
...,...,...,...,...,...,...,...,...,...
144651,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA,[html5] [TITLE] 2. 12: Diffusion. [DESCRIPTION...
144652,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,,[video] [TITLE] Sommare facendo gruppi da 10. ...
144653,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,,[video] [TITLE] Introdução à subtração. [DESCR...
144654,c_ffff04ba7ac7,SA of a Cone,,video,,en,,,[video] [TITLE] SA of a Cone. [DESCRIPTION]No ...


In [None]:
%%time
# NOTE(review): this cell referenced TOPIC_DIR / CONTENT_DIR / CORR_DIR, which
# are not defined in any visible cell and would raise NameError on a fresh
# kernel. It now uses the lower-case path variables defined earlier — confirm
# these are the intended inputs.
dp = DataPreparation(topic_dir, content_dir, corr_dir)
dp.prepare()