In [20]:
import os
import gc
import psutil
from pathlib import Path

import pandas as pd
import numpy as np
import random
pd.set_option('display.max_rows', 100)
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding

device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [21]:
class PATH:
    input_dir = '/root/autodl-tmp/data/k12/cv_split_new/train/fold_3'
    output_dir = '/root/autodl-tmp/data/k12/out/'
    # cv_dir = '/root/autodl-nas/data/k12/cv_data'
    content_dir = os.path.join(input_dir, 'content.csv')
    correlation_dir = os.path.join(input_dir, 'correlations.csv')
    submission_dir = os.path.join(input_dir, 'sample_submission.csv')
    topic_dir = os.path.join(input_dir, 'topics.csv')
    
class CFG:
    fold = 3

## Text Field Features

In [22]:
df_content = pd.read_parquet(os.path.join(PATH.output_dir, 'content_field.pqt'))
df_topic = pd.read_parquet(os.path.join(PATH.output_dir, 'topic_field.pqt'))

In [23]:
df_content

Unnamed: 0,id,field
0,c_00002381196d,[video] [TITLE] Sumar números de varios dígito...
1,c_000087304a9e,[video] [TITLE] Trovare i fattori di un numero...
2,c_0000ad142ddb,[video] [TITLE] Sumar curvas de demanda. [DESC...
3,c_0000c03adc8d,[document] [TITLE] Nado de aproximação. [DESCR...
4,c_00016694ea2a,[document] [TITLE] geometry-m3-topic-a-overvie...
...,...,...
154042,c_fffcbdd4de8b,[html5] [TITLE] 2. 12: Diffusion. [DESCRIPTION...
154043,c_fffe15a2d069,[video] [TITLE] Sommare facendo gruppi da 10. ...
154044,c_fffed7b0d13a,[video] [TITLE] Introdução à subtração. [DESCR...
154045,c_ffff04ba7ac7,[video] [TITLE] SA of a Cone. [DESCRIPTION]No ...


In [24]:
correlations = pd.read_csv(PATH.correlation_dir)

In [25]:
correlations['content_id'] = correlations['content_ids'].apply(lambda x: x.split())
correlations = correlations.explode('content_id').drop(columns='content_ids').reset_index(drop=True)

In [26]:
correlations

Unnamed: 0,topic_id,content_id
0,t_0008768bdee6,c_34e1424229b4
1,t_0008768bdee6,c_7d1a964d66d5
2,t_0008768bdee6,c_aab93ee667f4
3,t_0008a1bd84ba,c_7ff92a954a3d
4,t_0008a1bd84ba,c_8790b074383e
...,...,...
238536,t_fff830472691,c_61fb63326e5d
238537,t_fff830472691,c_8f224e321c87
238538,t_fffbe1d5d43c,c_46f852a49c08
238539,t_fffbe1d5d43c,c_6659207b25d5


In [27]:
df_train = correlations.merge(df_topic, left_on='topic_id', right_on='id').drop(columns='id')
df_train = df_train.merge(df_content, left_on='content_id', right_on='id', suffixes=['_topic', '_content']).drop(columns='id')
df_train

Unnamed: 0,topic_id,content_id,field_topic,field_content
0,t_0008768bdee6,c_34e1424229b4,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...
1,t_0476b243186a,c_34e1424229b4,[TITLE] 2-અંકની સંખ્યાઓ સાથેના સરવાળાનો પરિચય ...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...
2,t_0008768bdee6,c_7d1a964d66d5,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવીને ઉમેરવું . [DESCRI...
3,t_12ae8350acdf,c_7d1a964d66d5,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવીને ઉમેરવું . [DESCRI...
4,t_23bd51dafdd2,c_7d1a964d66d5,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવીને ઉમેરવું . [DESCRI...
...,...,...,...,...
238536,t_fff1047917af,c_f59987cf8a75,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਤਾੜੀ ਵਜਾਉਣਾ. [DESCRIPTION]sour...
238537,t_fff1047917af,c_fc1eca95e2f3,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਹੋਮ ਵਿਜ਼ਿਟ. [DESCRIPTION]source...
238538,t_fff5d93d4dc2,c_79903740e1e8,[TITLE] Discriminação de preços of Decisões de...,[video] [TITLE] Discriminação de preços. [DESC...
238539,t_fff830472691,c_61fb63326e5d,[TITLE] Scalar Projections of Vector Analysis ...,[html5] [TITLE] Scalar Projections. [DESCRIPTI...


In [28]:
df_train = df_train.merge(pd.read_csv(PATH.topic_dir)[['id', 'language']], left_on='topic_id', right_on='id')
df_train = df_train.groupby('language').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)

In [29]:
df_train

Unnamed: 0,topic_id,content_id,field_topic,field_content,id,language
0,t_9b671585b1db,c_6a0c6a3c8e24,[TITLE] JavaScript DOM of Abdulla Eid Network ...,[video] [TITLE] 43. مثال على التعامل مع أكثر م...,t_9b671585b1db,ar
1,t_830b762427ad,c_5bfd6e59291e,[TITLE] توضيح مفهوم Return بتفصيل - نظري of جا...,[video] [TITLE] 25. مفهوم Return بتفصيل. [DESC...,t_830b762427ad,ar
2,t_8a4a7ffd1ca7,c_b067a77653ff,[TITLE] Unit 5 of المهارات اللغوية of Arabic L...,[video] [TITLE] الأسماء المرفوعة 51. [DESCRIP...,t_8a4a7ffd1ca7,ar
3,t_2b707e84b379,c_91a7bd26fb91,[TITLE] رَفْرِفْ يا عَلَمَ جُمْهوريَّةِ جيبوتي...,[exercise] [TITLE] المفردات والتراكيب. [DESCRI...,t_2b707e84b379,ar
4,t_d23864accb71,c_5c1d5064c7be,[TITLE] ما هو الإقتصاد؟ of علوم اجتماعية وإنسا...,[video] [TITLE] تخطى الإنحياز فى الإقتصاد | ما...,t_d23864accb71,ar
...,...,...,...,...,...,...
238536,t_4e0883cf050c,c_f65e3a55d749,"[TITLE] 地球,太阳,银河,以及宇宙的尺度 of 宇宙的尺度 of 宇宙学和天文学 o...",[video] [TITLE] 相邻恒星的尺度. [DESCRIPTION]离我们太阳系最近...,t_4e0883cf050c,zh
238537,t_2c5183f1189d,c_12cd3d36601c,[TITLE] 10-20的认识 of 计数和数位 of 幼儿园 of 数学 of Khan...,[video] [TITLE] 11~20各数的认识. [DESCRIPTION]探究数字十...,t_2c5183f1189d,zh
238538,t_8ea0d9e17796,c_2f1570de4669,[TITLE] 小数乘法 of 数学运算 of 六年级（美国） of 数学 of Khan ...,[exercise] [TITLE] 像2.45x3.6的小数乘法 (标准算法). [DES...,t_8ea0d9e17796,zh
238539,t_ff4e86f91a69,c_69932781aa8f,[TITLE] 简介 of 涂鸦：城市艺术 of 艺术 of Sikana (中文). [D...,[document] [TITLE] 城市涂鸦：使用喷漆. [DESCRIPTION]微博：...,t_ff4e86f91a69,zh


In [30]:
# df_no_source = pd.read_csv(PATH.topic_dir).query('category!="source"')[['id']]
# df_ = df_train[df_train['topic_id'].isin(df_no_source['id'].to_list())]
# df_.groupby('language').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)[['field_topic', 'field_content']].to_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled_no_source.csv'), index=None)

In [31]:
correlations

Unnamed: 0,topic_id,content_id
0,t_0008768bdee6,c_34e1424229b4
1,t_0008768bdee6,c_7d1a964d66d5
2,t_0008768bdee6,c_aab93ee667f4
3,t_0008a1bd84ba,c_7ff92a954a3d
4,t_0008a1bd84ba,c_8790b074383e
...,...,...
238536,t_fff830472691,c_61fb63326e5d
238537,t_fff830472691,c_8f224e321c87
238538,t_fffbe1d5d43c,c_46f852a49c08
238539,t_fffbe1d5d43c,c_6659207b25d5


## Sample Negatives

In [32]:
correlations = pd.read_csv(PATH.correlation_dir)
df_samples = correlations.merge(pd.read_csv('/root/autodl-tmp/data/k12/cv_split_new/train/fold_3/sample/f3r2_top50.csv'), on='topic_id', suffixes=['', '_samples'])
df_samples['content_ids'] = df_samples['content_ids'].apply(lambda x: x.split())
df_samples['content_ids_samples'] = df_samples['content_ids_samples'].apply(lambda x: x.split())
df_samples['n_pos'] = df_samples['content_ids'].apply(len)
df_samples['negatives'] = df_samples.apply(lambda x: set(x['content_ids_samples']).difference(x['content_ids']), axis=1)
df_samples['neg_samples'] = df_samples.apply(lambda x: random.choices(x['content_ids_samples'], k=x['n_pos']), axis=1)
df_samples = df_samples[['topic_id', 'content_ids', 'neg_samples']].explode(['content_ids', 'neg_samples']).rename(columns={
    'content_ids': 'content_id', 'neg_samples': 'neg_content_id'})
df_samples

Unnamed: 0,topic_id,content_id,neg_content_id
0,t_0008768bdee6,c_34e1424229b4,c_d975a404e6b1
0,t_0008768bdee6,c_7d1a964d66d5,c_71a2b6ad17d9
0,t_0008768bdee6,c_aab93ee667f4,c_42ab073d6b99
1,t_0008a1bd84ba,c_7ff92a954a3d,c_ed3edd51507c
1,t_0008a1bd84ba,c_8790b074383e,c_9d2cfd89891b
...,...,...,...
51514,t_fff830472691,c_61fb63326e5d,c_67b919667b2b
51514,t_fff830472691,c_8f224e321c87,c_705360d9e63d
51515,t_fffbe1d5d43c,c_46f852a49c08,c_b77fd3e7e442
51515,t_fffbe1d5d43c,c_6659207b25d5,c_0dd3eab0f444


In [33]:
df_samples = df_samples.merge(df_topic, left_on='topic_id', right_on='id')
df_samples = df_samples.merge(df_content, left_on='content_id', right_on='id', suffixes=['_topic', '_content'])
df_samples = df_samples.merge(df_content, left_on='neg_content_id', right_on='id')
df_samples = df_samples[['topic_id', 'content_id', 'neg_content_id', 'field_topic', 'field_content', 'field']].rename(columns={'field': 'neg_content_field'})

In [34]:
df_samples

Unnamed: 0,topic_id,content_id,neg_content_id,field_topic,field_content,neg_content_field
0,t_0008768bdee6,c_34e1424229b4,c_d975a404e6b1,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] સમૂહ બનાવ્યા વિના 2-અંકની સંખ્...,[exercise] [TITLE] 2-અંકની સંખ્યા બાદ કરો (સમૂ...
1,t_66080e9067f6,c_aab93ee667f4,c_d975a404e6b1,[TITLE] સમૂહ બનાવીને ઉમેરવું of સમૂહ બનાવીને ...,[exercise] [TITLE] સ્થાનકિંમતના બ્લોકનો ઉપયોગ ...,[exercise] [TITLE] 2-અંકની સંખ્યા બાદ કરો (સમૂ...
2,t_66080e9067f6,c_6330c364e01e,c_d975a404e6b1,[TITLE] સમૂહ બનાવીને ઉમેરવું of સમૂહ બનાવીને ...,[video] [TITLE] 10 નો એક સમૂહ બનાવીને ઉમેરવું ...,[exercise] [TITLE] 2-અંકની સંખ્યા બાદ કરો (સમૂ...
3,t_a8d1356b2e4a,c_b38aa74197df,c_d975a404e6b1,[TITLE] 100 સુધીની બાદબાકી of 100 સુધીના સરવાળ...,[video] [TITLE] સમૂહ બનાવીને બાદબાકી કરવી (દશક...,[exercise] [TITLE] 2-અંકની સંખ્યા બાદ કરો (સમૂ...
4,t_7a9391e2802f,c_ce289db53b64,c_d975a404e6b1,[TITLE] 100 સુધીની બાદબાકી of 100 સુધીના સરવાળ...,[exercise] [TITLE] સ્થાનકિંમતના બ્લોકનો ઉપયોગ ...,[exercise] [TITLE] 2-અંકની સંખ્યા બાદ કરો (સમૂ...
...,...,...,...,...,...,...
238536,t_fff1047917af,c_463da5be56ce,c_9a1e7ca073bb,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਟਾਵਰ ਬਣਾਉਣਾ. [DESCRIPTION]sour...,[video] [TITLE] ਕਿਤਾਬਾਂ ਦਾ ਮਹੱਤਵ. [DESCRIPTION...
238537,t_fff1047917af,c_e6b95de6962f,c_9a1e7ca073bb,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਰੋਲ ਪਲੇ (ਨਾਟਕ). [DESCRIPTION]s...,[video] [TITLE] ਕਿਤਾਬਾਂ ਦਾ ਮਹੱਤਵ. [DESCRIPTION...
238538,t_fff1047917af,c_65d9ac4c7e96,c_016def92802a,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਬਲਗੀਤ. [DESCRIPTION]source_url...,[video] [TITLE] ਆਂਗਨਵਾੜੀ ਦੀ ਸਜਾਵਟ. [DESCRIPTIO...
238539,t_fff1047917af,c_86627e3a623c,c_3bb33bd78c3a,[TITLE] ਅਧਿਆਪਕਾਂ ਲਈ of ਦੇਖੋ ਅਤੇ ਕਰੋ of 3-6 yea...,[video] [TITLE] ਬੱਚਿਆਂ ਦੀ ਸਿੱਖਿਆ ਲਈ ਸਮੁਦਾਇ ਦੀ ...,[video] [TITLE] ਦਰੱਖਤ ਦਾ ਚਿੱਤਰ ਅਤੇ ਪੱਤੇ ਚਿਪਕਾਉ...


In [35]:
df_samples = df_samples.merge(pd.read_csv(PATH.topic_dir)[['id', 'language']], left_on='topic_id', right_on='id')
df_samples = df_samples.groupby('language').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)
df_samples

Unnamed: 0,topic_id,content_id,neg_content_id,field_topic,field_content,neg_content_field,id,language
0,t_45798878fbb2,c_b7839730b0e1,c_1a9b3b5f171c,[TITLE] تَعَرَّفْ إِلى مُخْتَرِعِ كِمامَةِ N95...,[audio] [TITLE] تَعَرَّفْ إِلى مُخْتَرِعِ كِما...,[exercise] [TITLE] الاستماع. [DESCRIPTION]No i...,t_45798878fbb2,ar
1,t_99ae13cd44e9,c_5f3ba67f4972,c_38b61bc3e344,"[TITLE] ""رامي"" الحَكيم of قضايا اجتماعيّة وقيم...","[html5] [TITLE] ""رامي"" الحَكيم. [DESCRIPTION]ت...",[exercise] [TITLE] القواعد. [DESCRIPTION]No in...,t_99ae13cd44e9,ar
2,t_68390eefd5bc,c_fa254546d734,c_a1eca938db81,"[TITLE] سيّاراتُ ""تِسْلا"" of تكنولوجيا وعلوم o...",[exercise] [TITLE] الاستيعاب. [DESCRIPTION]No ...,[exercise] [TITLE] المفردات والتراكيب. [DESCRI...,t_68390eefd5bc,ar
3,t_d79eeb04dec2,c_801b9cce854a,c_b6f5216e4c6a,[TITLE] XMind of Abdulla Eid Network (العربيّة...,[video] [TITLE] 2. التعرف على بيئة XMind. [DES...,[video] [TITLE] 19. انشاء دوالك الخاصة في الـ ...,t_d79eeb04dec2,ar
4,t_027b0363589d,c_2ce52d6ac96b,c_72e72979dd08,[TITLE] Unit 3 of تقييم وتشخيص صعوبات التعلم o...,[video] [TITLE] تقييم وتشخيص صعوبات التعلم | م...,[video] [TITLE] تقييم وتشخيص صعوبات التعلم | ق...,t_027b0363589d,ar
...,...,...,...,...,...,...,...,...
238536,t_2b53fc2691c0,c_06c6a5dbd66c,c_c3b030650102,[TITLE] 传球技巧 of 如何踢足球 of 运动 of Sikana (中文). [D...,[video] [TITLE] 如何进行人球分过. [DESCRIPTION]在本视频中您将...,[video] [TITLE] 踩单车动作. [DESCRIPTION]本视频面向初学者，教...,t_2b53fc2691c0,zh
238537,t_8fbc333ff702,c_44305c24dba9,c_44305c24dba9,[TITLE] 有氧运动 of 老年人健身运动 of 运动 of Sikana (中文). ...,[document] [TITLE] 单人有氧运动：原地跑步. [DESCRIPTION]在...,[document] [TITLE] 单人有氧运动：原地跑步. [DESCRIPTION]在...,t_8fbc333ff702,zh
238538,t_098583a3882e,c_6ca908eae6d9,c_a7c7ace20b37,[TITLE] 遇到意外情况应该怎么办 of 远足徒步登山教程 of 运动 of Sikan...,[document] [TITLE] 何预防蜱虫叮咬. [DESCRIPTION]本视频针对...,[document] [TITLE] 外野手练习1. [DESCRIPTION]请跟随我们的...,t_098583a3882e,zh
238539,t_d710da654b7c,c_eb972abec0b2,c_979b53262495,[TITLE] 基础技巧 of 冲浪 of 运动 of Sikana (中文). [DESC...,[document] [TITLE] 划水姿势. [DESCRIPTION]在本视频中，你将...,[document] [TITLE] 冲管浪. [DESCRIPTION]在本视频中，你将学...,t_d710da654b7c,zh


## Save

In [36]:
# df_samples[['field_topic', 'field_content', 'neg_content_field']].to_parquet(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled_sampled_r3.pqt'))
df_samples[['field_topic', 'field_content', 'neg_content_field']].to_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_f3r3_top50_shuffled_sampled_r3.csv'), index=None)

In [37]:
# df_train[['field_topic', 'field_content']].to_parquet(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled.pqt'))
## df_train[['field_topic', 'field_content']].to_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_{CFG.fold}_shuffled.csv'), index=None)


In [38]:
pd.read_csv(os.path.join(PATH.output_dir, 'retrieval', f'retrieval_field_f3r3_top50_shuffled_sampled_r3.csv'))

Unnamed: 0,field_topic,field_content,neg_content_field
0,[TITLE] تَعَرَّفْ إِلى مُخْتَرِعِ كِمامَةِ N95...,[audio] [TITLE] تَعَرَّفْ إِلى مُخْتَرِعِ كِما...,[exercise] [TITLE] الاستماع. [DESCRIPTION]No i...
1,"[TITLE] ""رامي"" الحَكيم of قضايا اجتماعيّة وقيم...","[html5] [TITLE] ""رامي"" الحَكيم. [DESCRIPTION]ت...",[exercise] [TITLE] القواعد. [DESCRIPTION]No in...
2,"[TITLE] سيّاراتُ ""تِسْلا"" of تكنولوجيا وعلوم o...",[exercise] [TITLE] الاستيعاب. [DESCRIPTION]No ...,[exercise] [TITLE] المفردات والتراكيب. [DESCRI...
3,[TITLE] XMind of Abdulla Eid Network (العربيّة...,[video] [TITLE] 2. التعرف على بيئة XMind. [DES...,[video] [TITLE] 19. انشاء دوالك الخاصة في الـ ...
4,[TITLE] Unit 3 of تقييم وتشخيص صعوبات التعلم o...,[video] [TITLE] تقييم وتشخيص صعوبات التعلم | م...,[video] [TITLE] تقييم وتشخيص صعوبات التعلم | ق...
...,...,...,...
238536,[TITLE] 传球技巧 of 如何踢足球 of 运动 of Sikana (中文). [D...,[video] [TITLE] 如何进行人球分过. [DESCRIPTION]在本视频中您将...,[video] [TITLE] 踩单车动作. [DESCRIPTION]本视频面向初学者，教...
238537,[TITLE] 有氧运动 of 老年人健身运动 of 运动 of Sikana (中文). ...,[document] [TITLE] 单人有氧运动：原地跑步. [DESCRIPTION]在...,[document] [TITLE] 单人有氧运动：原地跑步. [DESCRIPTION]在...
238538,[TITLE] 遇到意外情况应该怎么办 of 远足徒步登山教程 of 运动 of Sikan...,[document] [TITLE] 何预防蜱虫叮咬. [DESCRIPTION]本视频针对...,[document] [TITLE] 外野手练习1. [DESCRIPTION]请跟随我们的...
238539,[TITLE] 基础技巧 of 冲浪 of 运动 of Sikana (中文). [DESC...,[document] [TITLE] 划水姿势. [DESCRIPTION]在本视频中，你将...,[document] [TITLE] 冲管浪. [DESCRIPTION]在本视频中，你将学...
