In [51]:
import os
import gc
import psutil
from pathlib import Path

import pandas as pd
import numpy as np
import random
pd.set_option('display.max_rows', 100)
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding

device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [52]:
class CFG:
    fold = 4

class PATH:
    input_dir = f'/root/autodl-tmp/data/k12/cv_split_new/valid/fold_{CFG.fold}'
    output_dir = '/root/autodl-tmp/data/k12/out'
    cv_dir = '/root/autodl-tmp/data/k12/cv_split_new'
    content_dir = os.path.join(input_dir, 'content.csv')
    correlation_dir = os.path.join(input_dir, 'correlations.csv')
    submission_dir = os.path.join(input_dir, 'sample_submission.csv')
    topic_dir = os.path.join(input_dir, 'topics.csv')
    


In [53]:
# import pandas as pd
# pd.read_csv('/root/autodl-tmp/data/k12/out/retrieval/retrieval_field_fold_0_shuffled.csv')

## Text Field Features

In [54]:
df_content = pd.read_parquet(os.path.join(PATH.output_dir, 'content_field.pqt'))
df_topic = pd.read_parquet(os.path.join(PATH.output_dir, 'topic_field.pqt'))

In [55]:
df_content

Unnamed: 0,id,field
0,c_00002381196d,[video] [TITLE] Sumar números de varios dígito...
1,c_000087304a9e,[video] [TITLE] Trovare i fattori di un numero...
2,c_0000ad142ddb,[video] [TITLE] Sumar curvas de demanda. [DESC...
3,c_0000c03adc8d,[document] [TITLE] Nado de aproximação. [DESCR...
4,c_00016694ea2a,[document] [TITLE] geometry-m3-topic-a-overvie...
...,...,...
154042,c_fffcbdd4de8b,[html5] [TITLE] 2. 12: Diffusion. [DESCRIPTION...
154043,c_fffe15a2d069,[video] [TITLE] Sommare facendo gruppi da 10. ...
154044,c_fffed7b0d13a,[video] [TITLE] Introdução à subtração. [DESCR...
154045,c_ffff04ba7ac7,[video] [TITLE] SA of a Cone. [DESCRIPTION]No ...


In [56]:
correlations = pd.read_csv(PATH.correlation_dir)

In [57]:
correlations['content_id'] = correlations['content_ids'].apply(lambda x: x.split())
correlations = correlations.explode('content_id').drop(columns='content_ids').reset_index(drop=True)

In [58]:
correlations

Unnamed: 0,topic_id,content_id
0,t_0008768bdee6,c_34e1424229b4
1,t_0008768bdee6,c_7d1a964d66d5
2,t_0008768bdee6,c_aab93ee667f4
3,t_0008a1bd84ba,c_7ff92a954a3d
4,t_0008a1bd84ba,c_8790b074383e
...,...,...
259435,t_fff830472691,c_61fb63326e5d
259436,t_fff830472691,c_8f224e321c87
259437,t_fffbe1d5d43c,c_46f852a49c08
259438,t_fffbe1d5d43c,c_6659207b25d5


In [59]:
df_train = correlations.merge(df_topic, left_on='topic_id', right_on='id').drop(columns='id')
df_train = df_train.merge(df_content, left_on='content_id', right_on='id', suffixes=['_topic', '_content']).drop(columns='id')
df_train

Unnamed: 0,topic_id,content_id,field_topic,field_content
0,t_0008768bdee6,c_34e1424229b4,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...
1,t_0476b243186a,c_34e1424229b4,[TITLE] 2-અંકની સંખ્યાઓ સાથેના સરવાળાનો પરિચય ...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...
2,t_f50e76d3ddb4,c_34e1424229b4,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...
3,t_0008768bdee6,c_7d1a964d66d5,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવીને ઉમેરવું . [DESCRI...
4,t_f50e76d3ddb4,c_7d1a964d66d5,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવીને ઉમેરવું . [DESCRI...
...,...,...,...,...
259435,t_fff1047917af,c_e6b95de6962f,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਰੋਲ ਪਲੇ (ਨਾਟਕ). [DESCRIPTION]s...
259436,t_fff1047917af,c_f59987cf8a75,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਤਾੜੀ ਵਜਾਉਣਾ. [DESCRIPTION]sour...
259437,t_fff1047917af,c_fc1eca95e2f3,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਹੋਮ ਵਿਜ਼ਿਟ. [DESCRIPTION]source...
259438,t_fff5d93d4dc2,c_79903740e1e8,[TITLE] Discriminação de preços of Decisões de...,[video] [TITLE] Discriminação de preços. [DESC...


In [60]:
df_train = df_train.merge(pd.read_csv(PATH.topic_dir)[['id', 'language']], left_on='topic_id', right_on='id')
df_train = df_train.groupby('language').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)

In [61]:
df_train

Unnamed: 0,topic_id,content_id,field_topic,field_content,id,language
0,t_e11985323a41,c_3ae1e6b63ece,[TITLE] PHP منوعات في of مصادر إثرائية of الحا...,[document] [TITLE] شرح طريقة نسخ الملفات بإستخ...,t_e11985323a41,ar
1,t_14cdcb1953f6,c_bc0497ed6f2d,[TITLE] مثال عملي على Static Method - عملي of ...,[video] [TITLE] 34. مثال على Static Method. [D...,t_14cdcb1953f6,ar
2,t_3205564ddc8a,c_084288f99d74,[TITLE] سَبْعونَ سَنَةٍ مَرَّتْ of قضايا اجتما...,[html5] [TITLE] سَبْعونَ سَنَةٍ مَرَّتْ. [DESC...,t_3205564ddc8a,ar
3,t_cae8ba7d5d0a,c_9efb2065585d,[TITLE] Idiomatic Expressions with the Word HA...,[video] [TITLE] 18 Idiomatic Expressions with ...,t_cae8ba7d5d0a,ar
4,t_f233115e2c97,c_d7d61cadac70,[TITLE] دورة بناء سكربت منتدى of مصادر إثرائية...,[document] [TITLE] شرح التاكيد من ان المستخدم ...,t_f233115e2c97,ar
...,...,...,...,...,...,...
259435,t_74d190e54edd,c_aa0f9a333840,[TITLE] 栽培技术 of 有机园艺 of 自然 of Sikana (中文). [DE...,[video] [TITLE] 有机园艺：种植花草甸. [DESCRIPTION]No in...,t_74d190e54edd,zh
259436,t_3462b3883258,c_ee931b86c45f,[TITLE] 神经系统简介 of Advanced nervous system phys...,[video] [TITLE] 自主神经系统. [DESCRIPTION]No inform...,t_3462b3883258,zh
259437,t_85054bea1344,c_b51d17b93124,[TITLE] 如何建造堆肥 of 建造堆肥 of 自然 of Sikana (中文). [...,[video] [TITLE] 堆肥：含碳植物肥料. [DESCRIPTION]微博：htt...,t_85054bea1344,zh
259438,t_10fec8edaa9a,c_302de4d6e346,[TITLE] 可下載的資源 of 生产能力最大化 of HP LIFE - 课程 (中国大...,[document] [TITLE] 员工工作时间量计算器电子表格(LibreOffice)...,t_10fec8edaa9a,zh


In [62]:
# df_no_source = pd.read_csv(PATH.topic_dir).query('category!="source"')[['id']]
# df_ = df_train[df_train['topic_id'].isin(df_no_source['id'].to_list())]
# df_.groupby('language').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)[['field_topic', 'field_content']].to_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled_no_source.csv'), index=None)

In [63]:
df_train[['field_topic', 'field_content']].to_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_fold_{CFG.fold}_shuffled.csv'), index=None)

## Sample Negatives

In [10]:
correlations = pd.read_csv(PATH.correlation_dir)
df_samples = correlations.merge(pd.read_csv('samples_r4.csv'), on='topic_id', suffixes=['', '_samples'])
df_samples['content_ids'] = df_samples['content_ids'].apply(lambda x: x.split())
df_samples['content_ids_samples'] = df_samples['content_ids_samples'].apply(lambda x: x.split())
df_samples['n_pos'] = df_samples['content_ids'].apply(len)
df_samples['negatives'] = df_samples.apply(lambda x: set(x['content_ids_samples']).difference(x['content_ids']), axis=1)
df_samples['neg_samples'] = df_samples.apply(lambda x: random.choices(x['content_ids_samples'], k=x['n_pos']), axis=1)
df_samples = df_samples[['topic_id', 'content_ids', 'neg_samples']].explode(['content_ids', 'neg_samples']).rename(columns={
    'content_ids': 'content_id', 'neg_samples': 'neg_content_id'})
df_samples

Unnamed: 0,topic_id,content_id,neg_content_id
0,t_0008a1bd84ba,c_7ff92a954a3d,c_09aea29572e2
0,t_0008a1bd84ba,c_8790b074383e,c_31a1dab1f0b6
1,t_000d1fb3f2f5,c_07f1d0eec4b2,c_15a6fb858696
1,t_000d1fb3f2f5,c_15a6fb858696,c_15a6fb858696
1,t_000d1fb3f2f5,c_175e9db3fc44,c_4698dc0a94dc
...,...,...,...
53179,t_fff830472691,c_61fb63326e5d,c_61d312cafa03
53179,t_fff830472691,c_8f224e321c87,c_b47b9cf0dbb8
53180,t_fffbe1d5d43c,c_46f852a49c08,c_bb1db83147d6
53180,t_fffbe1d5d43c,c_6659207b25d5,c_467796e7240b


In [11]:
df_samples = df_samples.merge(df_topic, left_on='topic_id', right_on='id')
df_samples = df_samples.merge(df_content, left_on='content_id', right_on='id', suffixes=['_topic', '_content'])
df_samples = df_samples.merge(df_content, left_on='neg_content_id', right_on='id')
df_samples = df_samples[['topic_id', 'content_id', 'neg_content_id', 'field_topic', 'field_content', 'field']].rename(columns={'field': 'neg_content_field'})

In [12]:
df_samples

Unnamed: 0,topic_id,content_id,neg_content_id,field_topic,field_content,neg_content_field
0,t_0008a1bd84ba,c_7ff92a954a3d,c_09aea29572e2,[TITLE] 12. 20: Bird Reproduction of 12: Verte...,[html5] [TITLE] 12. 20: Bird Reproduction. [DE...,[video] [TITLE] How the sun will die : and wha...
1,t_802587e51d37,c_2b49c61c4579,c_09aea29572e2,[TITLE] Life in the universe of Life on earth ...,[video] [TITLE] Detectable civilizations in ou...,[video] [TITLE] How the sun will die : and wha...
2,t_0008a1bd84ba,c_8790b074383e,c_31a1dab1f0b6,[TITLE] 12. 20: Bird Reproduction of 12: Verte...,[video] [TITLE] Astounding Mating Dance Birds ...,[video] [TITLE] What is Mitosis? . [DESCRIPTIO...
3,t_cc793ab0157a,c_da0b96708086,c_31a1dab1f0b6,[TITLE] Stages of Mitosis of Genetics of Biol...,[video] [TITLE] DNA Replication . [DESCRIPTION...,[video] [TITLE] What is Mitosis? . [DESCRIPTIO...
4,t_000d1fb3f2f5,c_07f1d0eec4b2,c_15a6fb858696,[TITLE] 2.1.2 - Logarithms of 2.1 - Exponents ...,[video] [TITLE] Proof of the logarithm change ...,[video] [TITLE] Intro to logarithm properties ...
...,...,...,...,...,...,...
246471,t_fff1047917af,c_9a1e7ca073bb,c_198ec6788329,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਕਿਤਾਬਾਂ ਦਾ ਮਹੱਤਵ. [DESCRIPTION...,[video] [TITLE] ਸਕੂਲ ਦੀ ਤਿਆਰੀ. [DESCRIPTION]so...
246472,t_fff1047917af,c_fc1eca95e2f3,c_198ec6788329,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਹੋਮ ਵਿਜ਼ਿਟ. [DESCRIPTION]source...,[video] [TITLE] ਸਕੂਲ ਦੀ ਤਿਆਰੀ. [DESCRIPTION]so...
246473,t_fff1047917af,c_9c85c3660b6b,c_db15ac2f1b6a,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਮੈਚਿੰਗ ਖੇਡ. [DESCRIPTION]sourc...,[video] [TITLE] ਸੁਤੰਤਰ ਚਿੱਤਰ. [DESCRIPTION]sou...
246474,t_fff1047917af,c_e6b95de6962f,c_db15ac2f1b6a,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਰੋਲ ਪਲੇ (ਨਾਟਕ). [DESCRIPTION]s...,[video] [TITLE] ਸੁਤੰਤਰ ਚਿੱਤਰ. [DESCRIPTION]sou...


In [13]:
df_samples = df_samples.merge(pd.read_csv(PATH.topic_dir)[['id', 'language']], left_on='topic_id', right_on='id')
df_samples = df_samples.groupby('language').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)
df_samples

Unnamed: 0,topic_id,content_id,neg_content_id,field_topic,field_content,neg_content_field,id,language
0,t_a8fcd3872f24,c_a06a51cf19bf,c_b7f51b67d1fb,[TITLE] الفيزياء of الثالث الإعدادي of Science...,[video] [TITLE] الدرس الأول : الخصائص الفيزيائ...,[video] [TITLE] درس : المبادئ الأساسية للوراثة...,t_a8fcd3872f24,ar
1,t_3098e7b7f7c1,c_39a4118a92e0,c_721365da986b,[TITLE] تعرّف على المتجهات في الفراغ من خلال ت...,[exercise] [TITLE] تعرّف على المتجهات في الفرا...,[video] [TITLE] Video. [DESCRIPTION]5ad47e11f3...,t_3098e7b7f7c1,ar
2,t_732c59949a46,c_e1fce8bce0ec,c_6b9b424357c1,[TITLE] يوجد محصّلة قوّتين مقدارا واتجاها (الق...,[video] [TITLE] Video. [DESCRIPTION]5acc83e63a...,[exercise] [TITLE] يحدّد بعض قواعد الاشتقاق: م...,t_732c59949a46,ar
3,t_84cf80c84113,c_fe76033dfea0,c_165017b1d9fc,"[TITLE] ""كشّ مَلِك"" of فنّ وثقافة of صف ٤-٦ of...",[exercise] [TITLE] المفردات والتراكيب. [DESCRI...,[exercise] [TITLE] الاستيعاب. [DESCRIPTION]No ...,t_84cf80c84113,ar
4,t_ee01a6ee89be,c_c6ec4ee7a3f2,c_20ea49d54f30,[TITLE] نحو للصف الثالث الإعدادي of النحو وقوا...,[video] [TITLE] الثوابت الإعرابية. [DESCRIPTIO...,[video] [TITLE] الإستفهام. [DESCRIPTION]No inf...,t_ee01a6ee89be,ar
...,...,...,...,...,...,...,...,...
246471,t_dd62a56c1d37,c_93df549e5744,c_f56358d3ee60,[TITLE] 乘法运算定律 of 运算定律 of 四年级 of 数学 of Khan Ac...,[exercise] [TITLE] 面积和分配律. [DESCRIPTION]在求长方形的...,[exercise] [TITLE] 商是整十数的除法题 2. [DESCRIPTION]练...,t_dd62a56c1d37,zh
246472,t_225167cfab39,c_f060c2d07245,c_cd0997614fef,[TITLE] 书写基本的应用题代数表达式 of 变量和表达式 of 六年级（美国） of ...,[exercise] [TITLE] 书写基本的应用题表达式. [DESCRIPTION]练...,"[video] [TITLE] 主语和宾语代词. [DESCRIPTION]在学习代词之前,...",t_225167cfab39,zh
246473,t_53914f169741,c_45a9c81b0f81,c_562e0c255c9b,[TITLE] 亿以内数的认识 of 大数的认识 of 四年级 of 数学 of Khan ...,[exercise] [TITLE] 整数四舍五入应用题. [DESCRIPTION]四舍五...,[exercise] [TITLE] 挑战：三位数的数位. [DESCRIPTION]练习：...,t_53914f169741,zh
246474,t_197ddff5fbba,c_69ff45ca7e68,c_05b75540830a,[TITLE] 分数的意义 of 分数的意义和性质 of 五年级 of 数学 of Khan...,[video] [TITLE] 一半与四分之一的认识. [DESCRIPTION]当我们把一...,"[video] [TITLE] 用分数除法造带分数. [DESCRIPTION]解释分数, ...",t_197ddff5fbba,zh


## Save

In [14]:
# df_samples[['field_topic', 'field_content', 'neg_content_field']].to_parquet(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled_sampled_r3.pqt'))
df_samples[['field_topic', 'field_content', 'neg_content_field']].to_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled_sampled_r4.csv'), index=None)

In [15]:
# df_train[['field_topic', 'field_content']].to_parquet(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled.pqt'))
## df_train[['field_topic', 'field_content']].to_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled.csv'), index=None)


In [16]:
pd.read_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled_sampled_r4.csv'))

Unnamed: 0,field_topic,field_content,neg_content_field
0,[TITLE] الفيزياء of الثالث الإعدادي of Science...,[video] [TITLE] الدرس الأول : الخصائص الفيزيائ...,[video] [TITLE] درس : المبادئ الأساسية للوراثة...
1,[TITLE] تعرّف على المتجهات في الفراغ من خلال ت...,[exercise] [TITLE] تعرّف على المتجهات في الفرا...,[video] [TITLE] Video. [DESCRIPTION]5ad47e11f3...
2,[TITLE] يوجد محصّلة قوّتين مقدارا واتجاها (الق...,[video] [TITLE] Video. [DESCRIPTION]5acc83e63a...,[exercise] [TITLE] يحدّد بعض قواعد الاشتقاق: م...
3,"[TITLE] ""كشّ مَلِك"" of فنّ وثقافة of صف ٤-٦ of...",[exercise] [TITLE] المفردات والتراكيب. [DESCRI...,[exercise] [TITLE] الاستيعاب. [DESCRIPTION]No ...
4,[TITLE] نحو للصف الثالث الإعدادي of النحو وقوا...,[video] [TITLE] الثوابت الإعرابية. [DESCRIPTIO...,[video] [TITLE] الإستفهام. [DESCRIPTION]No inf...
...,...,...,...
246471,[TITLE] 乘法运算定律 of 运算定律 of 四年级 of 数学 of Khan Ac...,[exercise] [TITLE] 面积和分配律. [DESCRIPTION]在求长方形的...,[exercise] [TITLE] 商是整十数的除法题 2. [DESCRIPTION]练...
246472,[TITLE] 书写基本的应用题代数表达式 of 变量和表达式 of 六年级（美国） of ...,[exercise] [TITLE] 书写基本的应用题表达式. [DESCRIPTION]练...,"[video] [TITLE] 主语和宾语代词. [DESCRIPTION]在学习代词之前,..."
246473,[TITLE] 亿以内数的认识 of 大数的认识 of 四年级 of 数学 of Khan ...,[exercise] [TITLE] 整数四舍五入应用题. [DESCRIPTION]四舍五...,[exercise] [TITLE] 挑战：三位数的数位. [DESCRIPTION]练习：...
246474,[TITLE] 分数的意义 of 分数的意义和性质 of 五年级 of 数学 of Khan...,[video] [TITLE] 一半与四分之一的认识. [DESCRIPTION]当我们把一...,"[video] [TITLE] 用分数除法造带分数. [DESCRIPTION]解释分数, ..."
