In [9]:
import os
from tqdm.auto import tqdm
import pandas as pd

from sklearn.model_selection import StratifiedGroupKFold, GroupKFold

from transformers import AutoTokenizer, AutoModel

In [16]:
class PATH:
    input_dir = '/root/autodl-nas/data/k12'
    output_dir = '/root/autodl-nas/data/k12/out'
    cv_dir = '/root/autodl-nas/data/k12/cv_data'
    pretrained_dir = '/root/autodl-nas/model/'
    content_dir = os.path.join(input_dir, 'content.csv')
    correlation_dir = os.path.join(input_dir, 'correlations.csv')
    submission_dir = os.path.join(input_dir, 'sample_submission.csv')
    topic_dir = os.path.join(input_dir, 'topics.csv')
    
    
class CFG:
    seed = 11
    n_fold = 3
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'

## Save Tokenizer

In [11]:
special_tokens = [
    '[TITLE]',
    '[DESCRIPTION]',
    '[video]',
    '[document]',
    '[html5]',
    '[exercise]',
    '[audio]',
]

In [14]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name, padding=True)
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
model = AutoModel.from_pretrained(CFG.model_name)
model.resize_token_embeddings(len(tokenizer))

Embedding(30529, 384)

In [15]:
tokenizer

PreTrainedTokenizerFast(name_or_path='sentence-transformers/all-MiniLM-L6-v2', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[TITLE]', '[DESCRIPTION]', '[video]', '[document]', '[html5]', '[exercise]', '[audio]']})

In [20]:
tokenizer.save_pretrained(f"{PATH.pretrained_dir}/{CFG.model_name}_new/tokenizer")
model.save_pretrained(f"{PATH.pretrained_dir}/{CFG.model_name}_new/model")

In [19]:
pwd

'/root/kaggle/k12/notebook'

## Process & Save topic & content

In [4]:
def get_level_features(df_topic, level_cols=['title']):
    df_hier = df_topic[set(level_cols + ['id', 'parent', 'level', 'has_content'])]
    highest_level = df_hier['level'].max()
    print(f'Highest Level: {highest_level}')
    
    df_level = df_hier.query('level == 0').copy(deep=True)
    level_list = list()
    for col in level_cols:
        df_level[f'{col}_level'] = df_level[f'{col}'].apply(lambda x: [x])

    for i in tqdm(range(highest_level + 1)):
        level_list.append(df_level[df_level['has_content']])
        df_level_high = df_hier.query('level == @i+1')
        df_level = df_level_high.merge(df_level, left_on='parent', right_on='id', suffixes=['', '_parent'], how='inner')
        for col in level_cols:
            df_level[f'{col}_level'] = df_level[f'{col}_level'] + df_level[f'{col}'].apply(lambda x: [x])
        for col in df_level.columns:
            if col.endswith('_parent'):
                df_level.drop(columns=col, inplace=True)
    df = pd.concat(level_list).reset_index(drop=True)
    return df[set(['id'] + [f'{col}_level' for col in level_cols])]

In [5]:
def get_topic_field(d):
    title = list(filter(lambda x: pd.notna(x), d['title_level']))
    title = ' of '.join(title[-1::-1])
    title = 'No information' if title=='' else title
    title = '[TITLE] ' + title + '. '
    description = d['description'] if pd.notna(d['description']) else 'No information'
    description = '[DESCRIPTION]' + description + '. '
    field = title + description
    return field

def get_content_field(d):
    title = d['title']
    title = 'No information' if pd.isna(title) else title
    title = '[TITLE] ' + title + '. '
    description = d['description'] if pd.notna(d['description']) else 'No information'
    description = '[DESCRIPTION]' + description + '. '
    kind = '[' + d['kind'] + '] '
    field = kind + title + description
    return field

In [19]:
%%time
level_cols = ['title']
df_topic = pd.read_csv(PATH.topic_dir)
df_level = get_level_features(df_topic, level_cols)
df_topic = df_topic.merge(df_level, on='id', how='inner')
df_topic['field'] = df_topic.apply(lambda x: get_topic_field(x), axis=1)
df_topic = df_topic[['id', 'field']]
df_topic

Highest Level: 10


  df_hier = df_topic[set(level_cols + ['id', 'parent', 'level', 'has_content'])]


  0%|          | 0/11 [00:00<?, ?it/s]

  return df[set(['id'] + [f'{col}_level' for col in level_cols])]


CPU times: user 2.06 s, sys: 38.5 ms, total: 2.1 s
Wall time: 2.09 s


Unnamed: 0,id,field
0,t_00004da3a1b2,[TITLE] Откриването на резисторите of Открития...
1,t_00068291e9a4,[TITLE] Entradas e saídas de uma função of Álg...
2,t_00069b63a70a,[TITLE] Transcripts of Flow Charts: Logical Th...
3,t_0006d41a73a8,[TITLE] Графики на експоненциални функции (Алг...
4,t_0008768bdee6,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...
...,...,...
61512,t_fff830472691,[TITLE] Scalar Projections of Vector Analysis ...
61513,t_fff9e5407d13,[TITLE] NA_U06 - El periódico of Lengua españo...
61514,t_fffbe1d5d43c,[TITLE] Inscribed shapes problem solving of Mi...
61515,t_fffe14f1be1e,[TITLE] Lección 7 of Unidad 4 of Español Activ...


In [21]:
df_content = pd.read_csv(PATH.content_dir)
df_content['field'] = df_content.apply(lambda x: get_content_field(x), axis=1)
df_content = df_content[['id', 'field']]
df_content

Unnamed: 0,id,field
0,c_00002381196d,[video] [TITLE] Sumar números de varios dígito...
1,c_000087304a9e,[video] [TITLE] Trovare i fattori di un numero...
2,c_0000ad142ddb,[video] [TITLE] Sumar curvas de demanda. [DESC...
3,c_0000c03adc8d,[document] [TITLE] Nado de aproximação. [DESCR...
4,c_00016694ea2a,[document] [TITLE] geometry-m3-topic-a-overvie...
...,...,...
154042,c_fffcbdd4de8b,[html5] [TITLE] 2. 12: Diffusion. [DESCRIPTION...
154043,c_fffe15a2d069,[video] [TITLE] Sommare facendo gruppi da 10. ...
154044,c_fffed7b0d13a,[video] [TITLE] Introdução à subtração. [DESCR...
154045,c_ffff04ba7ac7,[video] [TITLE] SA of a Cone. [DESCRIPTION]No ...


In [23]:
df_content.to_parquet(os.path.join(PATH.output_dir, 'content_field.pqt'))
df_topic.to_parquet(os.path.join(PATH.output_dir, 'topic_field.pqt'))

## Split and Save

In [74]:
def cv_split(df_train, cfg, labels=['id', 'channel', 'category', 'language', 'fold']):
    df_train['fold'] = -1
    df_candidates = df_train[df_train["category"]!='source'].copy(deep=True).reset_index(drop=True)
    cv = StratifiedGroupKFold(n_splits=cfg.n_fold, random_state=cfg.seed, shuffle=True)
    for i, (train_index, test_index) in enumerate(cv.split(df_candidates, df_candidates['language'], df_candidates['id'],)):
        df_candidates.loc[test_index, 'fold'] = i
    df_source = df_train.query('category=="source"')
    df_train = pd.concat([df_candidates, df_source]).reset_index(drop=True) 
    return df_train[labels]

In [86]:
%%time
df_train = pd.read_csv(PATH.topic_dir).query('has_content').reset_index(drop=True)
df_train = cv_split(df_train, CFG, ['id', 'fold'])

CPU times: user 6.05 s, sys: 62.7 ms, total: 6.11 s
Wall time: 6.11 s


In [95]:
fold=0
df_train.query('fold!=@fold').merge(pd.read_csv(PATH.correlation_dir), left_on='id', right_on='topic_id')[['topic_id', 'content_ids']]

Unnamed: 0,topic_id,content_ids
0,t_0008a1bd84ba,c_7ff92a954a3d c_8790b074383e
1,t_000d1fb3f2f5,c_07f1d0eec4b2 c_15a6fb858696 c_175e9db3fc44 c...
2,t_0012a45fa09c,c_dde078b8ea7a
3,t_0016d30772f3,c_061d9f90bb06 c_242ddc729eec c_61b851222e17 c...
4,t_001bd01717d7,c_16f1cff519b3 c_289a31069ea7 c_6f0a3cf19895 c...
...,...,...
53177,t_fff5da49c4d3,c_4a0305f5876d c_cbceabf1d5d6
53178,t_fff7f2dd208b,c_036efdd9e8c1 c_112de3281469 c_15470abc39f4 c...
53179,t_fff830472691,c_61fb63326e5d c_8f224e321c87
53180,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5


In [90]:
pd.read_csv(PATH.correlation_dir)

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4
...,...,...
61512,t_fff830472691,c_61fb63326e5d c_8f224e321c87
61513,t_fff9e5407d13,c_026db653a269 c_0fb048a6412c c_20de77522603 c...
61514,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5
61515,t_fffe14f1be1e,c_cece166bad6a


In [98]:
%%time
df_content = pd.read_csv(PATH.content_dir)
df_topic = pd.read_csv(PATH.topic_dir)
for i in tqdm(range(CFG.n_fold)):
    dir_name = os.path.join(PATH.cv_dir, f'fold_{i}')
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    train = df_train.query('fold!=@i')
    test = df_train.query('fold==@i')
    
    correlations = train.merge(pd.read_csv(PATH.correlation_dir), left_on='id', right_on='topic_id')[['topic_id', 'content_ids']]
    submission = test.merge(pd.read_csv(PATH.correlation_dir), left_on='id', right_on='topic_id')[['topic_id', 'content_ids']]
    
    df_topic.to_csv(os.path.join(dir_name, 'topics.csv'), index=False)
    df_content.to_csv(os.path.join(dir_name, 'content.csv'), index=False)
    correlations.to_csv(os.path.join(dir_name, 'correlations.csv'), index=False)
    submission.to_csv(os.path.join(dir_name, 'sample_submission.csv'), index=False)

  0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 1min 3s, sys: 5.56 s, total: 1min 8s
Wall time: 1min 8s
