In [1]:
import pandas as pd
import os

from tqdm import tqdm
from sklearn.model_selection import StratifiedGroupKFold

# Let pandas print up to 100 rows before it truncates a displayed frame.
pd.options.display.max_rows = 100

In [2]:
class CFG:
    """Paths to the competition input files.

    Despite the ``_dir`` suffix, every attribute except ``input_dir`` is the
    full path of a single CSV file inside ``input_dir``.
    """

    # Root folder that holds all of the CSV files below.
    input_dir = '/kaggle/input/learning-equality-curriculum-recommendations'

    def _in_input(filename, _root=input_dir):
        # The default argument captures input_dir, because names from a class
        # body are not visible inside a nested function.
        return os.path.join(_root, filename)

    topic_dir = _in_input('topics.csv')
    content_dir = _in_input('content.csv')
    correlations_dir = _in_input('correlations.csv')
    submission_dir = _in_input('sample_submission.csv')

    # The helper is only needed while the class body runs; keep CFG free of it.
    del _in_input

In [3]:
# Read the four competition CSV files; every path comes from CFG.
df_content, df_corr, df_sub, df_topic = map(
    pd.read_csv,
    (CFG.content_dir, CFG.correlations_dir, CFG.submission_dir, CFG.topic_dir),
)

In [4]:
# Count how many topics have has_content True vs. False.
df_topic['has_content'].value_counts()

True     61517
False    15455
Name: has_content, dtype: int64

## 为含内容的topic抽取祖先节点的特征
沿根节点到当前节点的路径逐层收集特征；结果仅保留has_content=True的topic

In [5]:
# Build root-to-topic feature paths for every topic that has content.
def get_level_features(df_topic, level_cols=('title',)):
    """Collect the values of ``level_cols`` along each topic's ancestor path.

    Starting from the level-0 topics, the hierarchy is walked one level at a
    time by joining every topic to its parent (``parent`` -> ``id``).  For each
    column ``col`` in ``level_cols`` a list column ``{col}_level`` is built that
    holds the value of ``col`` for the root, its descendants on the path, and
    finally the topic itself.

    Parameters
    ----------
    df_topic : pd.DataFrame
        Must contain ``id``, ``parent``, ``level``, ``has_content`` and every
        column named in ``level_cols``.  ``has_content`` is used as a boolean
        mask, ``level`` must be integer-like.
    level_cols : sequence of str, default ``('title',)``
        Columns to accumulate along the path.  Duplicate names are ignored.

    Returns
    -------
    pd.DataFrame
        One row per topic whose ``has_content`` is true and whose chain of
        parents reaches a level-0 topic (the inner join drops topics with a
        broken chain).  Columns are ``id`` followed by ``{col}_level`` in the
        order given by ``level_cols``.
    """
    # De-duplicate while keeping the caller's order; a duplicated name would
    # otherwise be appended twice at every level.
    level_cols = list(dict.fromkeys(level_cols))
    out_cols = ['id'] + [f'{col}_level' for col in level_cols]

    # Select with a list, not a set: pandas 2.x rejects set indexers and a set
    # would also make the column order arbitrary.
    hier_cols = list(dict.fromkeys(level_cols + ['id', 'parent', 'level', 'has_content']))
    df_hier = df_topic[hier_cols]
    if df_hier.empty:
        # No topics at all: there is no maximum level to iterate up to.
        return pd.DataFrame(columns=out_cols)

    highest_level = int(df_hier['level'].max())
    print(f'Highest Level: {highest_level}')

    # Seed the walk with the roots; each path starts as a one-element list.
    df_level = df_hier[df_hier['level'] == 0].copy(deep=True)
    for col in level_cols:
        df_level[f'{col}_level'] = df_level[col].apply(lambda x: [x])

    level_list = []
    for i in tqdm(range(highest_level + 1)):
        # Keep only the topics of the current level that carry content.
        with_content = df_level[df_level['has_content']]
        if not with_content.empty:
            level_list.append(with_content)
        if i == highest_level:
            # Deepest level reached; there are no children left to expand.
            continue

        # Attach the parents' accumulated paths to the topics one level deeper.
        df_children = df_hier[df_hier['level'] == i + 1]
        df_level = df_children.merge(
            df_level,
            left_on='parent',
            right_on='id',
            suffixes=('', '_parent'),
            how='inner',
        )
        for col in level_cols:
            # Element-wise list concatenation: parent path + [own value].
            df_level[f'{col}_level'] = df_level[f'{col}_level'] + df_level[col].apply(lambda x: [x])
        # The parent's own scalar columns are no longer needed after the join.
        df_level = df_level.drop(columns=[c for c in df_level.columns if c.endswith('_parent')])

    if not level_list:
        return pd.DataFrame(columns=out_cols)
    df = pd.concat(level_list).reset_index(drop=True)
    return df[out_cols]

In [6]:
# Columns whose values are accumulated along each topic's path from the root.
level_cols = ['id', 'title', 'description', 'level', 'language', 'has_content']
# fillna('') replaces missing values with empty strings before the paths are built.
df_level = get_level_features(df_topic.fillna(''), level_cols)
df_level

Highest Level: 10


100%|██████████| 11/11 [00:01<00:00,  6.25it/s]


Unnamed: 0,id,has_content_level,title_level,description_level,id_level,language_level,level_level
0,t_10035396d740,[True],[قناة كم كلمة لتمكين المعلّمين الناطقين باللغة...,[],[t_10035396d740],[ar],[0]
1,t_30dd476279c8,[True],[Medicine],[],[t_30dd476279c8],[en],[0]
2,t_3efcae0132f0,[True],[وزارة التربية والتعليم الأردنية],[],[t_3efcae0132f0],[ar],[0]
3,t_470986f56fbe,[True],[Khan Academy - Standardized Test Preparation],[],[t_470986f56fbe],[en],[0]
4,t_72d1321023d9,[True],[Matemáticas Quinto Grado (Guatemala)],[],[t_72d1321023d9],[es],[0]
...,...,...,...,...,...,...,...
61512,t_ed58066fa353,"[False, False, True, False, True, True, True, ...","[Ganar - Habilidades para el Trabajo, Laborato...","[, , , , , , , , , ]","[t_44623019938d, t_0d57669638bd, t_4cc29e19e15...","[es, es, es, es, es, es, es, es, es, es]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
61513,t_d7e92d224506,"[False, False, True, False, True, True, True, ...","[Ganar - Habilidades para el Trabajo, Laborato...","[, , , , , , , , , ]","[t_44623019938d, t_0d57669638bd, t_4cc29e19e15...","[es, es, es, es, es, es, es, es, es, es]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
61514,t_f60d6940436e,"[False, False, True, False, True, False, True,...","[Ganar - Habilidades para el Trabajo, Laborato...","[, , , , , , , , , ]","[t_44623019938d, t_0d57669638bd, t_4cc29e19e15...","[es, es, es, es, es, es, es, es, es, es]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
61515,t_2cb874d9bfed,"[False, False, True, False, True, True, True, ...","[Ganar - Habilidades para el Trabajo, Laborato...","[, , , , , , , , , , ]","[t_44623019938d, t_0d57669638bd, t_4cc29e19e15...","[es, es, es, es, es, es, es, es, es, es, es]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"


## TODOs

### Train Valid Split
1. 同一个channel的category唯一
2. 不同语言的字数统计方式不同，按字长分可能对部分语言不友好
3. 适合用于split的candidate feature有channel、category、language（是否合适尚待确认）
4. test set中的channel会出现训练集中有过的channel + 全新的channel
5. test set中的category只有aligned和supplemental，没有source
6. 将train set和valid set中的content分很开可能没用，因为测试环境中的content是训练集content的超集

category为source的topic不会在test set中出现

In [7]:
# Exclude source category since they won't appear in test set
df_valid_candidate = df_topic[df_topic['category']!='source']
df_valid_candidate.head()

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
5,t_0008768bdee6,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,5223e0,supplemental,4,gu,t_0da7a331d666,True
6,t_0008a1bd84ba,12. 20: Bird Reproduction,,ebc86c,supplemental,5,en,t_c44ac9711007,True
8,t_000d1fb3f2f5,2.1.2 - Logarithms,,e77b55,aligned,5,en,t_b897d168db90,True
10,t_00102869fbcb,Triangles and polygons,Learning outcomes: students must be able to so...,a91e32,aligned,3,en,t_039cecc12bb8,True


In [8]:
# Display the candidate frame (pandas truncates the middle rows in the output).
df_valid_candidate

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
5,t_0008768bdee6,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,5223e0,supplemental,4,gu,t_0da7a331d666,True
6,t_0008a1bd84ba,12. 20: Bird Reproduction,,ebc86c,supplemental,5,en,t_c44ac9711007,True
8,t_000d1fb3f2f5,2.1.2 - Logarithms,,e77b55,aligned,5,en,t_b897d168db90,True
10,t_00102869fbcb,Triangles and polygons,Learning outcomes: students must be able to so...,a91e32,aligned,3,en,t_039cecc12bb8,True
...,...,...,...,...,...,...,...,...,...
76964,t_fff80f4eee89,Polynomial division,"After we have added, subtracted, and multiplie...",6b09a4,supplemental,5,en,t_e93f4fad3893,False
76966,t_fff9e5407d13,NA_U06 - El periódico,,71fd51,supplemental,2,es,t_5bd8f6ae9f7d,True
76967,t_fffb0bf2801d,4.3 Graph of functions,,e77b55,aligned,4,en,t_676e6a1a4dc7,False
76969,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True


In [9]:
# Bare reference: this only displays the imported class, no split is performed.
# NOTE(review): presumably a placeholder for the train/valid split listed under TODOs — confirm.
StratifiedGroupKFold

sklearn.model_selection._split.StratifiedGroupKFold

## Content

Content中存在少数title为空的row（训练集中共9条），暂时不做处理