In [1]:
import os

import pandas as pd
import torch
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, KFold
from tqdm import tqdm

pd.set_option('display.max_rows', 100)

In [2]:
class CFG:
    """Notebook-wide configuration: data locations and split/tokenizer settings.

    Despite the ``_dir`` suffix, ``content_dir``, ``correlation_dir``,
    ``submission_dir`` and ``topic_dir`` are paths to CSV *files* inside
    ``input_dir``; only ``input_dir`` and ``cv_dir`` are directories.
    """
    # Tunable settings.
    n_fold = 4      # number of GroupKFold splits / exported fold folders
    seed = 17
    max_len = 38    # forwarded as max_length to the tokenizer in prepare_input

    # Root folders: read-only competition data and the writable CV output.
    input_dir = '/kaggle/input/learning-equality-curriculum-recommendations'
    cv_dir = '/kaggle/working/cv_data'

    # Competition CSV files.
    topic_dir = os.path.join(input_dir, 'topics.csv')
    content_dir = os.path.join(input_dir, 'content.csv')
    correlation_dir = os.path.join(input_dir, 'correlations.csv')
    submission_dir = os.path.join(input_dir, 'sample_submission.csv')

In [3]:
# Load the four competition tables (read in the order content, correlations,
# sample submission, topics).
df_content, df_corr, df_sub, df_topic = (
    pd.read_csv(csv_path)
    for csv_path in (
        CFG.content_dir,
        CFG.correlation_dir,
        CFG.submission_dir,
        CFG.topic_dir,
    )
)

In [4]:
# Distribution of `has_content` among topics whose category is not 'source'.
df_topic.loc[df_topic['category'] != 'source', 'has_content'].value_counts()

True     25003
False     8482
Name: has_content, dtype: int64

In [5]:
# Distribution of `has_content` over all topics, for comparison with the non-source subset.
df_topic['has_content'].value_counts()

True     61517
False    15455
Name: has_content, dtype: int64

## 准备训练数据

In [6]:
def prepare_topic(cfg):
    """Read the topics CSV and back-fill missing titles.

    Args:
        cfg: object exposing ``topic_dir``, the path handed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, with missing ``title`` values replaced by the
        row's ``description`` and, if that is missing too, by ``''``.
    """
    df = pd.read_csv(cfg.topic_dir)
    return df.assign(
        title=lambda frame: frame['title'].fillna(frame['description']).fillna('')
    )
    
def prepare_content(cfg):
    """Read the content CSV and back-fill missing titles.

    Args:
        cfg: object exposing ``content_dir``, the path handed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, with missing ``title`` values replaced by the first
        available of ``description``, ``text`` and finally ``''``.
    """
    df = pd.read_csv(cfg.content_dir)
    title = df['title']
    # Apply the fallbacks in priority order; each pass only touches still-missing titles.
    for fallback in (df['description'], df['text'], ''):
        title = title.fillna(fallback)
    df['title'] = title
    return df

def prepare_correlation(cfg):
    """Read the correlations CSV and expand it to one row per (topic, content) pair.

    Args:
        cfg: object exposing ``correlation_dir``, the path handed to ``pd.read_csv``.

    Returns:
        DataFrame in which the whitespace-separated ``content_ids`` column is
        replaced by a ``content_id`` column holding a single id per row. Rows that
        came from the same input row keep the same index label.
    """
    raw = pd.read_csv(cfg.correlation_dir)
    id_lists = raw['content_ids'].map(lambda ids: ids.split())
    exploded = raw.assign(content_id=id_lists).explode('content_id')
    return exploded.drop(columns='content_ids')

def merge_train_data(cfg, content_cols=['id', 'title'], topic_cols=['id', 'title']):
    """Build the flat training table of (topic, content) pairs with their metadata.

    Args:
        cfg: configuration object passed through to ``prepare_topic``,
            ``prepare_content`` and ``prepare_correlation``.
        content_cols: content columns to attach; must include ``'id'`` (join key).
        topic_cols: topic columns to attach; must include ``'id'`` (join key).

    Returns:
        The exploded correlations left-joined with the selected content and topic
        columns. Any attached ``title`` / ``description`` / ``language`` column is
        prefixed with ``content_`` or ``topic_``; other columns keep their names.
        The joined ``id`` columns are dropped.
    """
    shared = ['id', 'title', 'description', 'language']
    content_names = {col: 'content_' + col for col in shared}
    topic_names = {col: 'topic_' + col for col in shared}

    topic = prepare_topic(cfg)
    content = prepare_content(cfg)
    pairs = prepare_correlation(cfg)

    # Attach content metadata first and rename it, so the topic columns merged
    # afterwards do not collide with it.
    with_content = pairs.merge(content[content_cols], left_on='content_id', right_on='id', how='left')
    with_content = with_content.drop(columns='id').rename(columns=content_names)

    with_topic = with_content.merge(topic[topic_cols], left_on='topic_id', right_on='id', how='left')
    return with_topic.drop(columns='id').rename(columns=topic_names)

In [7]:
%%time
# Columns to pull from content.csv and topics.csv; 'id' is needed as the join key.
content_cols = ['id', 'title', 'description', 'language']
topic_cols = ['id', 'title', 'description', 'channel', 'category', 'language']
# One row per (topic_id, content_id) pair with the selected metadata attached.
df_train = merge_train_data(CFG, content_cols, topic_cols)
df_train

CPU times: user 12.9 s, sys: 2.03 s, total: 15 s
Wall time: 15 s


Unnamed: 0,topic_id,content_id,content_title,content_description,content_language,topic_title,topic_description,channel,category,topic_language
0,t_00004da3a1b2,c_1108dd0c7a5d,Молив като резистор,"Моливът причинява промяна в отклонението, подо...",bg,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,bg
1,t_00004da3a1b2,c_376c5a8eb028,Да чуем променливото съпротивление,Тук чертаем линия на лист хартия и я използвам...,bg,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,bg
2,t_00004da3a1b2,c_5bc0e1e2cba0,Променлив резистор (реостат) с графит от молив,Използваме сърцевината на молива (неговия граф...,bg,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,bg
3,t_00004da3a1b2,c_76231f9d0b5e,Последователно свързване на галваничен елемент...,"Защо отклонението се променя, когато се свърже...",bg,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,bg
4,t_00068291e9a4,c_639ea2ef9c95,Dados e resultados de funções: gráficos,Encontre todas as entradas que correspondem a ...,pt,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,pt
...,...,...,...,...,...,...,...,...,...,...
279914,t_fff9e5407d13,c_d64037a72376,Introducción: El periódico,,es,NA_U06 - El periódico,,71fd51,supplemental,es
279915,t_fffbe1d5d43c,c_46f852a49c08,Proof: Right triangles inscribed in circles -d...,Proof showing that a triangle inscribed in a c...,sw,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,sw
279916,t_fffbe1d5d43c,c_6659207b25d5,Area of inscribed equilateral triangle -dubbed...,A worked example of finding the area of an equ...,sw,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,sw
279917,t_fffe14f1be1e,c_cece166bad6a,Juego con las palabras,,es,Lección 7,,6e90a7,aligned,es


## One Fold CV

### Train Valid Split
1. 同一个channel的category唯一
2. 不同语言的字数统计方式不同，按字长分可能对部分语言不友好
3. 合适的candidate feature for split有channel category language[???]
4. test set中的channel会出现训练集中有过的channel + 全新的channel
5. test set中的category只有aligned和supplemental，没有source
6. 将train set和valid set中的content分很开可能没用，因为测试环境中的content是训练集content的超集

In [8]:
def _assign_group_folds(df, y_col, group_col, n_splits):
    # Write each row's GroupKFold test-fold number into df['fold'] (in place).
    # An empty frame is left untouched instead of letting GroupKFold raise.
    if df.empty:
        return
    gkf = GroupKFold(n_splits=n_splits)
    for fold, (_, test_index) in enumerate(gkf.split(df, df[y_col], df[group_col])):
        # df has a fresh RangeIndex, so positional indices are valid labels.
        df.loc[test_index, 'fold'] = fold


def cv_split(df_train, cfg, shared_threshold=4000):
    """Add a ``fold`` column assigning every non-source row to a validation fold.

    Rows whose ``category`` is ``'source'`` keep ``fold == -1``. The remaining
    rows are partitioned by how many rows their channel has:

    * channels with at most ``shared_threshold`` rows are split with
      ``GroupKFold`` grouped by ``channel``, so a channel never spans folds;
    * channels with more rows are split with ``GroupKFold`` grouped by
      ``topic_id``, so a topic never spans folds.

    The input frame is not modified.

    Args:
        df_train: DataFrame with at least ``category``, ``channel`` and
            ``topic_id`` columns.
        cfg: object exposing ``n_fold``, the number of splits.
        shared_threshold: row count above which a channel is split by topic
            rather than kept whole. Defaults to 4000.

    Returns:
        A new DataFrame with the rows ordered as small-channel rows, large-channel
        rows, source rows, a fresh RangeIndex and an integer ``fold`` column.

    Raises:
        ValueError: propagated from ``GroupKFold`` when a non-empty partition has
            fewer distinct groups than ``cfg.n_fold``.
    """
    # assign() works on a copy, so the caller's frame is left as it was.
    df_all = df_train.assign(fold=-1)
    is_source = df_all['category'] == 'source'
    df_source = df_all[is_source]

    df_candidates = df_all[~is_source].copy(deep=True)
    df_candidates['count'] = df_candidates.groupby('channel')['channel'].transform('count')
    df_shared_topic = df_candidates.query('count > @shared_threshold').reset_index(drop=True)
    df_independent_topic = df_candidates.query('count <= @shared_threshold').reset_index(drop=True)

    _assign_group_folds(df_independent_topic, 'category', 'channel', cfg.n_fold)
    _assign_group_folds(df_shared_topic, 'channel', 'topic_id', cfg.n_fold)

    df_out = pd.concat([df_independent_topic, df_shared_topic, df_source]).reset_index(drop=True)
    # 'count' was only needed to partition the channels.
    return df_out.drop(columns=['count'])

In [9]:
%%time
# Rebinds df_train to the frame returned by cv_split, which carries the `fold` column.
df_train = cv_split(df_train, CFG)

CPU times: user 473 ms, sys: 46.8 ms, total: 519 ms
Wall time: 520 ms


In [10]:
# (train, valid) pair for fold 0: every row outside fold 0, then the fold-0 rows.
tk0 = (df_train.query('fold!=0'), df_train.query('fold==0'))

Save train test dataset

In [11]:
# Create the CV output folder; exist_ok makes the cell safe to re-run.
os.makedirs(CFG.cv_dir, exist_ok=True)

In [12]:
%%time
for i in tqdm(range(CFG.n_fold)):
    dir_name = os.path.join(CFG.cv_dir, f'fold_{i}')
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    train = df_train.query('fold!=@i')
    test = df_train.query('fold==@i')
    correlations = train.groupby('topic_id')['content_id'].apply(lambda x: " ".join(x)).reset_index(name='content_ids')
    submission = test.groupby('topic_id')['content_id'].apply(lambda x: " ".join(x)).reset_index(name='content_ids')
    
    df_topic.to_csv(os.path.join(dir_name, 'topics.csv'), index=False)
    df_content.to_csv(os.path.join(dir_name, 'content.csv'), index=False)
    correlations.to_csv(os.path.join(dir_name, 'correlations.csv'), index=False)
    submission.to_csv(os.path.join(dir_name, 'sample_submission.csv'), index=False)

100%|██████████| 4/4 [01:33<00:00, 23.48s/it]

CPU times: user 1min 28s, sys: 5.63 s, total: 1min 33s
Wall time: 1min 33s





In [13]:
!ls cv_data
!ls cv_data/fold_0

fold_0	fold_1	fold_2	fold_3
content.csv  correlations.csv  sample_submission.csv  topics.csv


## 为叶子topic抽取先前节点的特征
仅为has_content=True的topic各输出一行；每行的`*_level`列表包含从根节点到该topic路径上所有节点的取值（祖先节点无论has_content取值如何都会保留）。

In [14]:
# function to extract the level features for each leaf topic
def get_level_features(df_topic, level_cols=['title']):
    """Collect, for each topic with content, the root-to-topic values of ``level_cols``.

    The topic tree is walked one level at a time, starting at ``level == 0`` and
    joining each level to its parents through ``parent`` -> ``id``. For every
    column in ``level_cols`` a ``<col>_level`` list is grown that holds the value
    at each node on the path from the root down to the topic itself.

    Args:
        df_topic: DataFrame with ``id``, ``parent``, ``level`` and a boolean
            ``has_content`` column, plus every column named in ``level_cols``.
        level_cols: columns whose per-level values are accumulated.

    Returns:
        DataFrame with one row per topic whose ``has_content`` is true, holding
        ``id`` followed by one ``<col>_level`` list column per entry of
        ``level_cols``, in that order. Ancestors appear in the lists whatever
        their own ``has_content`` value is.
    """
    level_cols = list(level_cols)
    # Select with ordered, de-duplicated lists: indexing a DataFrame with a set
    # raises TypeError on pandas >= 2.0 and gave an arbitrary column order before.
    hier_cols = list(dict.fromkeys(level_cols + ['id', 'parent', 'level', 'has_content']))
    df_hier = df_topic[hier_cols]
    highest_level = df_hier['level'].max()
    print(f'Highest Level: {highest_level}')

    df_level = df_hier.query('level == 0').copy(deep=True)
    level_list = list()
    for col in level_cols:
        df_level[f'{col}_level'] = df_level[col].apply(lambda x: [x])

    for i in tqdm(range(highest_level + 1)):
        # Keep the current level's topics that have content before descending.
        level_list.append(df_level[df_level['has_content']])
        df_level_high = df_hier.query('level == @i+1')
        # Children inherit the parent's accumulated *_level lists; the parent's
        # plain columns come back with a '_parent' suffix and are dropped below.
        df_level = df_level_high.merge(df_level, left_on='parent', right_on='id', suffixes=['', '_parent'], how='inner')
        for col in level_cols:
            df_level[f'{col}_level'] = df_level[f'{col}_level'] + df_level[col].apply(lambda x: [x])
        parent_cols = [col for col in df_level.columns if col.endswith('_parent')]
        df_level = df_level.drop(columns=parent_cols)
    df = pd.concat(level_list).reset_index(drop=True)
    out_cols = list(dict.fromkeys(['id'] + [f'{col}_level' for col in level_cols]))
    return df[out_cols]

In [15]:
%%time
# Columns whose root-to-topic values are collected into `<col>_level` lists.
level_cols = ['id', 'title', 'description', 'level', 'language', 'has_content']
# Missing values are replaced by '' before the per-level lists are built.
df_level = get_level_features(df_topic.fillna(''), level_cols)
df_level

Highest Level: 10


100%|██████████| 11/11 [00:02<00:00,  4.56it/s]


CPU times: user 2.48 s, sys: 116 ms, total: 2.6 s
Wall time: 2.65 s


Unnamed: 0,language_level,level_level,id,description_level,has_content_level,title_level,id_level
0,[ar],[0],t_10035396d740,[],[True],[قناة كم كلمة لتمكين المعلّمين الناطقين باللغة...,[t_10035396d740]
1,[en],[0],t_30dd476279c8,[],[True],[Medicine],[t_30dd476279c8]
2,[ar],[0],t_3efcae0132f0,[],[True],[وزارة التربية والتعليم الأردنية],[t_3efcae0132f0]
3,[en],[0],t_470986f56fbe,[],[True],[Khan Academy - Standardized Test Preparation],[t_470986f56fbe]
4,[es],[0],t_72d1321023d9,[],[True],[Matemáticas Quinto Grado (Guatemala)],[t_72d1321023d9]
...,...,...,...,...,...,...,...
61512,"[es, es, es, es, es, es, es, es, es, es]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",t_ed58066fa353,"[, , , , , , , , , ]","[False, False, True, False, True, True, True, ...","[Ganar - Habilidades para el Trabajo, Laborato...","[t_44623019938d, t_0d57669638bd, t_4cc29e19e15..."
61513,"[es, es, es, es, es, es, es, es, es, es]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",t_d7e92d224506,"[, , , , , , , , , ]","[False, False, True, False, True, True, True, ...","[Ganar - Habilidades para el Trabajo, Laborato...","[t_44623019938d, t_0d57669638bd, t_4cc29e19e15..."
61514,"[es, es, es, es, es, es, es, es, es, es]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",t_f60d6940436e,"[, , , , , , , , , ]","[False, False, True, False, True, False, True,...","[Ganar - Habilidades para el Trabajo, Laborato...","[t_44623019938d, t_0d57669638bd, t_4cc29e19e15..."
61515,"[es, es, es, es, es, es, es, es, es, es, es]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",t_2cb874d9bfed,"[, , , , , , , , , , ]","[False, False, True, False, True, True, True, ...","[Ganar - Habilidades para el Trabajo, Laborato...","[t_44623019938d, t_0d57669638bd, t_4cc29e19e15..."


## Match by Language
超过99%的correlations为同一语种，可以通过匹配language缩小retrieval candidates

In [16]:
def prepare_language_match(cfg, mode='train'):
    """Group topic ids and candidate content ids by language.

    Args:
        cfg: object exposing ``topic_dir``, ``content_dir`` and, depending on
            ``mode``, ``correlation_dir`` or ``submission_dir`` (CSV paths).
        mode: ``'train'`` takes the topics listed in ``cfg.correlation_dir``;
            ``'valid'`` takes those listed in ``cfg.submission_dir``.

    Returns:
        dict mapping each language found among the selected topics to a tuple
        ``(topic_ids, content_ids)`` of single-column (``id``) DataFrames: the
        selected topics in that language and all content in that language.

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'valid'``.
    """
    # Validate up front: previously an unknown mode surfaced as an
    # UnboundLocalError only after the large CSVs had already been read.
    if mode == 'train':
        corr_path = cfg.correlation_dir
    elif mode == 'valid':
        corr_path = cfg.submission_dir
    else:
        raise ValueError(f"mode must be 'train' or 'valid', got {mode!r}")

    id_lang = ['id', 'language']
    # Only these columns are used, so skip parsing the rest of each file.
    topic = pd.read_csv(cfg.topic_dir, usecols=id_lang)[id_lang]
    content = pd.read_csv(cfg.content_dir, usecols=id_lang)[id_lang]
    corr = pd.read_csv(corr_path, usecols=['topic_id'])

    # Inner join keeps only the topics listed in the correlation/submission file.
    topic = topic.merge(corr, left_on='id', right_on='topic_id')[id_lang]
    match_dict = {}
    for language in topic['language'].unique():
        match_dict[language] = (
            topic.query('language==@language')[['id']],
            content.query('language==@language')[['id']],
        )
    return match_dict

In [17]:
%%time
# Default mode='train': topics are taken from the correlations file.
topic_content_match = prepare_language_match(CFG)

CPU times: user 13.5 s, sys: 1.14 s, total: 14.6 s
Wall time: 14.6 s


In [18]:
def prepare_input(text, cfg):
    """Tokenize ``text`` to a fixed length and return the fields as ``torch.long`` tensors.

    Args:
        text: value forwarded unchanged to ``cfg.tokenizer.encode_plus``.
        cfg: object exposing ``max_len`` and ``tokenizer``. The tokenizer must
            provide ``encode_plus``; it is presumably a Hugging Face tokenizer,
            but it is not defined in this notebook -- TODO confirm.

    Returns:
        The mapping returned by ``encode_plus``, with every value replaced in
        place by ``torch.tensor(value, dtype=torch.long)``.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        # Assuming a Hugging Face tokenizer: padding='max_length' is the
        # supported spelling of the deprecated pad_to_max_length=True.
        padding = 'max_length',
        truncation = True
    )
    # Only existing keys are reassigned, so mutating during iteration is safe.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

In [19]:
# Unpack the (topic ids, content ids) pair stored for English ('en').
topic, content = topic_content_match['en']

In [20]:
# Preview the topic ids selected from topic_content_match['en'].
topic

Unnamed: 0,id
2,t_00069b63a70a
5,t_0008a1bd84ba
6,t_000d1fb3f2f5
8,t_00102869fbcb
10,t_0012a45fa09c
...,...
61504,t_fff05585df72
61507,t_fff51448598c
61509,t_fff5da49c4d3
61510,t_fff7782561f4


In [21]:
# Preview the content ids selected from topic_content_match['en'].
content

Unnamed: 0,id
5,c_00019840d110
8,c_00027d03ca7d
10,c_000425df0161
11,c_00046806ad8a
18,c_000751f58836
...,...
154035,c_fffa90b024e3
154041,c_fffbfc3d60c6
154042,c_fffcbdd4de8b
154045,c_ffff04ba7ac7


## TODOs

### F2
$$
F_2 = \frac{5\cdot{}precision\cdot{}recall}{4\cdot{}precision+recall}
$$

### Recall@N