In [39]:
import os
import gc
import psutil
from pathlib import Path

import pandas as pd
import numpy as np
import random
pd.set_option('display.max_rows', 100)
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding

device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [40]:
class PATH:
    input_dir = '/root/autodl-tmp/data/k12/cv_split_new/train/fold_3'
    output_dir = '/root/autodl-tmp/data/k12/out/'
    # cv_dir = '/root/autodl-nas/data/k12/cv_data'
    content_dir = os.path.join(input_dir, 'content.csv')
    correlation_dir = os.path.join(input_dir, 'correlations.csv')
    submission_dir = os.path.join(input_dir, 'sample_submission.csv')
    topic_dir = os.path.join(input_dir, 'topics.csv')
    
class CFG:
    fold = 3

## Text Field Features

In [41]:
df_content = pd.read_parquet(os.path.join(PATH.output_dir, 'content_field.pqt'))
df_topic = pd.read_parquet(os.path.join(PATH.output_dir, 'topic_field.pqt'))

In [42]:
df_content

Unnamed: 0,id,field
0,c_00002381196d,[video] [TITLE] Sumar números de varios dígito...
1,c_000087304a9e,[video] [TITLE] Trovare i fattori di un numero...
2,c_0000ad142ddb,[video] [TITLE] Sumar curvas de demanda. [DESC...
3,c_0000c03adc8d,[document] [TITLE] Nado de aproximação. [DESCR...
4,c_00016694ea2a,[document] [TITLE] geometry-m3-topic-a-overvie...
...,...,...
154042,c_fffcbdd4de8b,[html5] [TITLE] 2. 12: Diffusion. [DESCRIPTION...
154043,c_fffe15a2d069,[video] [TITLE] Sommare facendo gruppi da 10. ...
154044,c_fffed7b0d13a,[video] [TITLE] Introdução à subtração. [DESCR...
154045,c_ffff04ba7ac7,[video] [TITLE] SA of a Cone. [DESCRIPTION]No ...


In [43]:
correlations = pd.read_csv(PATH.correlation_dir)

In [44]:
correlations['content_id'] = correlations['content_ids'].apply(lambda x: x.split())
correlations = correlations.explode('content_id').drop(columns='content_ids').reset_index(drop=True)

In [45]:
correlations

Unnamed: 0,topic_id,content_id
0,t_0008768bdee6,c_34e1424229b4
1,t_0008768bdee6,c_7d1a964d66d5
2,t_0008768bdee6,c_aab93ee667f4
3,t_0008a1bd84ba,c_7ff92a954a3d
4,t_0008a1bd84ba,c_8790b074383e
...,...,...
239420,t_fff830472691,c_61fb63326e5d
239421,t_fff830472691,c_8f224e321c87
239422,t_fffbe1d5d43c,c_46f852a49c08
239423,t_fffbe1d5d43c,c_6659207b25d5


In [46]:
df_train = correlations.merge(df_topic, left_on='topic_id', right_on='id').drop(columns='id')
df_train = df_train.merge(df_content, left_on='content_id', right_on='id', suffixes=['_topic', '_content']).drop(columns='id')
df_train

Unnamed: 0,topic_id,content_id,field_topic,field_content
0,t_0008768bdee6,c_34e1424229b4,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...
1,t_0476b243186a,c_34e1424229b4,[TITLE] 2-અંકની સંખ્યાઓ સાથેના સરવાળાનો પરિચય ...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...
2,t_f50e76d3ddb4,c_34e1424229b4,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...
3,t_0008768bdee6,c_7d1a964d66d5,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવીને ઉમેરવું . [DESCRI...
4,t_f50e76d3ddb4,c_7d1a964d66d5,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવીને ઉમેરવું . [DESCRI...
...,...,...,...,...
239420,t_fff1047917af,c_e6b95de6962f,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਰੋਲ ਪਲੇ (ਨਾਟਕ). [DESCRIPTION]s...
239421,t_fff1047917af,c_f59987cf8a75,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਤਾੜੀ ਵਜਾਉਣਾ. [DESCRIPTION]sour...
239422,t_fff1047917af,c_fc1eca95e2f3,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਹੋਮ ਵਿਜ਼ਿਟ. [DESCRIPTION]source...
239423,t_fff5d93d4dc2,c_79903740e1e8,[TITLE] Discriminação de preços of Decisões de...,[video] [TITLE] Discriminação de preços. [DESC...


In [47]:
df_train = df_train.merge(pd.read_csv(PATH.topic_dir)[['id', 'language']], left_on='topic_id', right_on='id')
df_train = df_train.groupby('language').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)

In [48]:
df_train

Unnamed: 0,topic_id,content_id,field_topic,field_content,id,language
0,t_501db1866d8b,c_ff3033d9f0ab,[TITLE] PHP 102 of Abdulla Eid Network (العربي...,[video] [TITLE] 4. كيفية تفعيل الـ Extension. ...,t_501db1866d8b,ar
1,t_b32cf2527fce,c_10fe018a6bd0,[TITLE] PHP 101 of Abdulla Eid Network (العربي...,[video] [TITLE] 112. throw.mov. [DESCRIPTION]N...,t_b32cf2527fce,ar
2,t_81e994ac4f2c,c_4b758908348b,[TITLE] أَنا وَأُمّي (٤) of تكنولوجيا وعلوم of...,[html5] [TITLE] أَنا وَأُمّي (٤). [DESCRIPTION...,t_81e994ac4f2c,ar
3,t_c598f150b21b,c_d0e930396f01,[TITLE] I LIKE vs. I'D LIKE LAY and LIE of Pro...,[video] [TITLE] Learn English Phrases: I LIKE ...,t_c598f150b21b,ar
4,t_f233115e2c97,c_9e288a3fc9eb,[TITLE] دورة بناء سكربت منتدى of مصادر إثرائية...,[document] [TITLE] شرح تعديل البيانات للمستخدم...,t_f233115e2c97,ar
...,...,...,...,...,...,...
239420,t_26bc5f0fc4bd,c_9a5c87a91f65,[TITLE] 招聘员工 of HP LIFE - 课程 (中国大陆). [DESCRIPT...,[html5] [TITLE] 故事. [DESCRIPTION]Ming的生意发展非常迅速...,t_26bc5f0fc4bd,zh
239421,t_44f97886b076,c_47a14cd9eb9a,[TITLE] 不规则名词复数： 突变体和外来词复数 of 词类：名词 of 语法 of 艺...,[video] [TITLE] 额外视频 — — 突变体复数的起源. [DESCRIPTIO...,t_44f97886b076,zh
239422,t_2c338a8d761c,c_4d88420afbad,[TITLE] 体验泳池乐趣 of 克服对水的恐惧 of 运动 of Sikana (中文)...,[video] [TITLE] 戴脚蹼游泳. [DESCRIPTION]本视频适用于游泳初学...,t_2c338a8d761c,zh
239423,t_81189d6b4f76,c_d5401beff097,[TITLE] 如何射门 of 如何踢足球 of 运动 of Sikana (中文). [D...,[video] [TITLE] 如何用脚背射门. [DESCRIPTION]本视频面向初学者...,t_81189d6b4f76,zh


In [49]:
# df_no_source = pd.read_csv(PATH.topic_dir).query('category!="source"')[['id']]
# df_ = df_train[df_train['topic_id'].isin(df_no_source['id'].to_list())]
# df_.groupby('language').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)[['field_topic', 'field_content']].to_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled_no_source.csv'), index=None)

In [50]:
correlations

Unnamed: 0,topic_id,content_id
0,t_0008768bdee6,c_34e1424229b4
1,t_0008768bdee6,c_7d1a964d66d5
2,t_0008768bdee6,c_aab93ee667f4
3,t_0008a1bd84ba,c_7ff92a954a3d
4,t_0008a1bd84ba,c_8790b074383e
...,...,...
239420,t_fff830472691,c_61fb63326e5d
239421,t_fff830472691,c_8f224e321c87
239422,t_fffbe1d5d43c,c_46f852a49c08
239423,t_fffbe1d5d43c,c_6659207b25d5


## Sample Negatives

In [51]:
correlations = pd.read_csv(PATH.correlation_dir)
df_samples = correlations.merge(pd.read_csv('/root/autodl-tmp/data/k12/cv_split_new/train/fold_3/sample/f3r2_top50.csv'), on='topic_id', suffixes=['', '_samples'])
df_samples['content_ids'] = df_samples['content_ids'].apply(lambda x: x.split())
df_samples['content_ids_samples'] = df_samples['content_ids_samples'].apply(lambda x: x.split())
df_samples['n_pos'] = df_samples['content_ids'].apply(len)
df_samples['negatives'] = df_samples.apply(lambda x: set(x['content_ids_samples']).difference(x['content_ids']), axis=1)
df_samples['neg_samples'] = df_samples.apply(lambda x: random.choices(x['content_ids_samples'], k=x['n_pos']), axis=1)
df_samples = df_samples[['topic_id', 'content_ids', 'neg_samples']].explode(['content_ids', 'neg_samples']).rename(columns={
    'content_ids': 'content_id', 'neg_samples': 'neg_content_id'})
df_samples

Unnamed: 0,topic_id,content_id,neg_content_id
0,t_0008768bdee6,c_34e1424229b4,c_1b34550c4077
0,t_0008768bdee6,c_7d1a964d66d5,c_3b7a0e2578ac
0,t_0008768bdee6,c_aab93ee667f4,c_f99e90e3f2a3
1,t_0008a1bd84ba,c_7ff92a954a3d,c_ff831da83c53
1,t_0008a1bd84ba,c_8790b074383e,c_9d2cfd89891b
...,...,...,...
51513,t_fff830472691,c_61fb63326e5d,c_ce6455bfe272
51513,t_fff830472691,c_8f224e321c87,c_b04528ab0a3d
51514,t_fffbe1d5d43c,c_46f852a49c08,c_d5056724582e
51514,t_fffbe1d5d43c,c_6659207b25d5,c_bac72256e994


In [52]:
df_samples = df_samples.merge(df_topic, left_on='topic_id', right_on='id')
df_samples = df_samples.merge(df_content, left_on='content_id', right_on='id', suffixes=['_topic', '_content'])
df_samples = df_samples.merge(df_content, left_on='neg_content_id', right_on='id')
df_samples = df_samples[['topic_id', 'content_id', 'neg_content_id', 'field_topic', 'field_content', 'field']].rename(columns={'field': 'neg_content_field'})

In [53]:
df_samples

Unnamed: 0,topic_id,content_id,neg_content_id,field_topic,field_content,neg_content_field
0,t_0008768bdee6,c_34e1424229b4,c_1b34550c4077,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...,[exercise] [TITLE] દસ બનાવીને 2-અંકની સંખ્યાઓ ...
1,t_f50e76d3ddb4,c_34e1424229b4,c_1b34550c4077,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...,[exercise] [TITLE] દસ બનાવીને 2-અંકની સંખ્યાઓ ...
2,t_331ff094f036,c_7d1a964d66d5,c_1b34550c4077,[TITLE] 100 સુધીનો સરવાળો of સરવાળો અને બાદબાક...,[video] [TITLE] સમૂહ બનાવીને ઉમેરવું . [DESCRI...,[exercise] [TITLE] દસ બનાવીને 2-અંકની સંખ્યાઓ ...
3,t_0476b243186a,c_34e1424229b4,c_e99b3d423200,[TITLE] 2-અંકની સંખ્યાઓ સાથેના સરવાળાનો પરિચય ...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...,[video] [TITLE] સ્થાન કિંમતનો ઉપયોગ કરીને 10 બ...
4,t_7a9391e2802f,c_25f5ae2a130c,c_e99b3d423200,[TITLE] 100 સુધીની બાદબાકી of 100 સુધીના સરવાળ...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...,[video] [TITLE] સ્થાન કિંમતનો ઉપયોગ કરીને 10 બ...
...,...,...,...,...,...,...
239420,t_fff1047917af,c_e6b95de6962f,c_9ef26bfefe60,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਰੋਲ ਪਲੇ (ਨਾਟਕ). [DESCRIPTION]s...,[video] [TITLE] ਕਾਗਜ਼ ਦੇ ਲਿਫਾਫੇ ਦੇ ਫੁੱਲ. [DESCR...
239421,t_fff1047917af,c_9a1e7ca073bb,c_33ae4162f588,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਕਿਤਾਬਾਂ ਦਾ ਮਹੱਤਵ. [DESCRIPTION...,[video] [TITLE] ਤੈਰਨ ਵਾਲੀ ਕਿਸ਼ਤੀ. [DESCRIPTION]...
239422,t_fff1047917af,c_9c85c3660b6b,c_3bb33bd78c3a,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਮੈਚਿੰਗ ਖੇਡ. [DESCRIPTION]sourc...,[video] [TITLE] ਦਰੱਖਤ ਦਾ ਚਿੱਤਰ ਅਤੇ ਪੱਤੇ ਚਿਪਕਾਉ...
239423,t_fff1047917af,c_acb27fc151fc,c_463da5be56ce,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਬੱਚਿਆਂ ਦੇ ਬੈਗ. [DESCRIPTION]so...,[video] [TITLE] ਟਾਵਰ ਬਣਾਉਣਾ. [DESCRIPTION]sour...


In [54]:
df_samples = df_samples.merge(pd.read_csv(PATH.topic_dir)[['id', 'language']], left_on='topic_id', right_on='id')
df_samples = df_samples.groupby('language').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)
df_samples

Unnamed: 0,topic_id,content_id,neg_content_id,field_topic,field_content,neg_content_field,id,language
0,t_64b99783ffe9,c_6d7394bed259,c_9c3fe3e99008,[TITLE] يفهم النسبة المئويّة الأكبر من 100% في...,[video] [TITLE] Video. [DESCRIPTION]5ad47fa56b...,[video] [TITLE] Video. [DESCRIPTION]5ad4aca66b...,t_64b99783ffe9,ar
1,t_6c0419bd14c1,c_0e9c72c2904e,c_af7603d7f79d,[TITLE] يفهم معنى خط الانحدار ويقدّر أهميته في...,[video] [TITLE] Video. [DESCRIPTION]5b6cd5036b...,[video] [TITLE] Video. [DESCRIPTION]5ad47ee13a...,t_6c0419bd14c1,ar
2,t_00a2b86f7630,c_b6994dabea69,c_e691463ff118,[TITLE] تدريبات عامة فى الميكانيكا of ميكانيكا...,[video] [TITLE] مخطط الجسم الحر وشنطة السفر(3/...,[video] [TITLE] تعرف تبرم لسانك؟ | الجينات لما...,t_00a2b86f7630,ar
3,t_5ac8bc3412cd,c_bebd23323f46,c_bebd23323f46,[TITLE] علامات الترقيم of قواعد الإملاء العربي...,[video] [TITLE] علامات الترقيم 5. [DESCRIPTION...,[video] [TITLE] علامات الترقيم 5. [DESCRIPTION...,t_5ac8bc3412cd,ar
4,t_e11985323a41,c_4192d30ccab7,c_631a30b93180,[TITLE] PHP منوعات في of مصادر إثرائية of الحا...,[document] [TITLE] شرح طريقة إنشاء ملف وكتابة ...,[document] [TITLE] شرح لطريقة معرفة معلومات حو...,t_e11985323a41,ar
...,...,...,...,...,...,...,...,...
239420,t_bc38e08b20d5,c_de76120630e8,c_9afcdaade1ed,[TITLE] 共同基金和交易所交易基金 of 投资工具、保险和退休计划 of 金融和资本市...,[video] [TITLE] 封闭式共同基金. [DESCRIPTION]封闭式和开放式共...,[video] [TITLE] 托马斯 · 马尔萨斯和种群增长. [DESCRIPTION]...,t_bc38e08b20d5,zh
239421,t_50310b13bc42,c_9afd6faba548,c_06c7d4dc7359,[TITLE] 如何选择设备 of 学习北欧式健走 of 运动 of Sikana (中文)...,[video] [TITLE] 温和天气的服装选择. [DESCRIPTION]在本视频中，...,[document] [TITLE] 跳步走练习. [DESCRIPTION]在本视频中，您...,t_50310b13bc42,zh
239422,t_053885ea8395,c_2b3647638fda,c_3678c040d039,[TITLE] 血液系统简介 of 人体解剖学和生理学 of 卫生与医学 of 科学 of ...,[video] [TITLE] 氧含量. [DESCRIPTION]学习氧含量（CaO2）与...,[video] [TITLE] 人体循环系统的温度调节. [DESCRIPTION]No i...,t_053885ea8395,zh
239423,t_1cd108c690f0,c_5412483801a0,c_19d7cab18d2f,[TITLE] 开始健步走 of 速度运动：健步走 of 运动 of Sikana (中文)...,[video] [TITLE] 一次完整的健步走锻炼. [DESCRIPTION]在本视频中...,[video] [TITLE] 正确的姿势. [DESCRIPTION]在本视频中，您将学会...,t_1cd108c690f0,zh


## Save

In [55]:
# df_samples[['field_topic', 'field_content', 'neg_content_field']].to_parquet(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled_sampled_r3.pqt'))
df_samples[['field_topic', 'field_content', 'neg_content_field']].to_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_f3r3_top50_shuffled_sampled_r3.csv'), index=None)

In [56]:
# df_train[['field_topic', 'field_content']].to_parquet(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled.pqt'))
## df_train[['field_topic', 'field_content']].to_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled.csv'), index=None)


In [57]:
pd.read_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_f3r3_top50_shuffled_sampled_r3.csv'))

Unnamed: 0,field_topic,field_content,neg_content_field
0,[TITLE] يفهم النسبة المئويّة الأكبر من 100% في...,[video] [TITLE] Video. [DESCRIPTION]5ad47fa56b...,[video] [TITLE] Video. [DESCRIPTION]5ad4aca66b...
1,[TITLE] يفهم معنى خط الانحدار ويقدّر أهميته في...,[video] [TITLE] Video. [DESCRIPTION]5b6cd5036b...,[video] [TITLE] Video. [DESCRIPTION]5ad47ee13a...
2,[TITLE] تدريبات عامة فى الميكانيكا of ميكانيكا...,[video] [TITLE] مخطط الجسم الحر وشنطة السفر(3/...,[video] [TITLE] تعرف تبرم لسانك؟ | الجينات لما...
3,[TITLE] علامات الترقيم of قواعد الإملاء العربي...,[video] [TITLE] علامات الترقيم 5. [DESCRIPTION...,[video] [TITLE] علامات الترقيم 5. [DESCRIPTION...
4,[TITLE] PHP منوعات في of مصادر إثرائية of الحا...,[document] [TITLE] شرح طريقة إنشاء ملف وكتابة ...,[document] [TITLE] شرح لطريقة معرفة معلومات حو...
...,...,...,...
239420,[TITLE] 共同基金和交易所交易基金 of 投资工具、保险和退休计划 of 金融和资本市...,[video] [TITLE] 封闭式共同基金. [DESCRIPTION]封闭式和开放式共...,[video] [TITLE] 托马斯 · 马尔萨斯和种群增长. [DESCRIPTION]...
239421,[TITLE] 如何选择设备 of 学习北欧式健走 of 运动 of Sikana (中文)...,[video] [TITLE] 温和天气的服装选择. [DESCRIPTION]在本视频中，...,[document] [TITLE] 跳步走练习. [DESCRIPTION]在本视频中，您...
239422,[TITLE] 血液系统简介 of 人体解剖学和生理学 of 卫生与医学 of 科学 of ...,[video] [TITLE] 氧含量. [DESCRIPTION]学习氧含量（CaO2）与...,[video] [TITLE] 人体循环系统的温度调节. [DESCRIPTION]No i...
239423,[TITLE] 开始健步走 of 速度运动：健步走 of 运动 of Sikana (中文)...,[video] [TITLE] 一次完整的健步走锻炼. [DESCRIPTION]在本视频中...,[video] [TITLE] 正确的姿势. [DESCRIPTION]在本视频中，您将学会...
