In [25]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import pandas as pd
import numpy as np
from tqdm import tqdm

import polars as pl
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from transformers import AutoTokenizer, AutoModel, AutoConfig, BertForSequenceClassification

from sklearn.neighbors import NearestNeighbors

In [None]:
ON_STAGE2 = True  # set to False to disable Stage 2
N_NEIGHBOR = 20 # number of nearest content items retrieved per topic in Stage 1

# INPUT_DIR = '/kaggle/input/learning-equality-curriculum-recommendations'
# Directory where the per-language topic/content frames (.pqt) and their embeddings (.npy) are written.
OUTPUT_PATH = '/home/search3/lichunyu/k12-curriculum-recommendations/tmp/output'

# TOPIC_DIR = os.path.join(INPUT_DIR, 'topics.csv')
# CONTENT_DIR = os.path.join(INPUT_DIR, 'content.csv')
# CORR_DIR = os.path.join(INPUT_DIR, 'sample_submission.csv')

# Checkpoint passed to AutoModel.from_pretrained by the Stage-1 retriever.
MODEL_DIR = '/home/search3/lichunyu/k12-curriculum-recommendations/data/input/models/stage1/all-MiniLM-L6-v2/r1_12_38520/checkpoint-38520'
# Tokenizer directory passed to AutoTokenizer.from_pretrained by the Stage-1 retriever.
TOKENIZER_DIR = '/home/search3/lichunyu/k12-curriculum-recommendations/data/input/models/stage1/all-MiniLM-L6-v2/all-MiniLM-L6-v2/all-MiniLM-L6-v2_new'

In [None]:
def get_topic_field(d):
    """Build the model input text for one topic row.

    ``d['title_level']`` is the list of titles from the root down to the
    topic; missing entries are skipped and the rest are joined leaf-first
    with ' of '. A missing title chain or description is replaced by
    'No information'.

    Returns the string '[TITLE] <titles>. [DESCRIPTION]<description>. '.
    """
    known_titles = [t for t in d['title_level'] if pd.notna(t)]
    breadcrumb = ' of '.join(reversed(known_titles))
    if breadcrumb == '':
        breadcrumb = 'No information'
    title_part = '[TITLE] ' + breadcrumb + '. '

    desc = d['description']
    if pd.isna(desc):
        desc = 'No information'
    # No space after the tag: this exact format must be kept byte-for-byte.
    desc_part = '[DESCRIPTION]' + desc + '. '
    return title_part + desc_part

def get_content_field(d):
    """Build the model input text for one content row.

    A missing title or description is replaced by 'No information'.

    Returns the string '[<kind>] [TITLE] <title>. [DESCRIPTION]<description>. '.
    """
    raw_title = d['title']
    title_text = raw_title if pd.notna(raw_title) else 'No information'
    title_part = '[TITLE] ' + title_text + '. '

    raw_desc = d['description']
    desc_text = 'No information' if pd.isna(raw_desc) else raw_desc
    # No space after the tag: this exact format must be kept byte-for-byte.
    desc_part = '[DESCRIPTION]' + desc_text + '. '

    kind_part = '[' + d['kind'] + '] '
    return kind_part + title_part + desc_part

In [None]:
class DataPreparation:
    """Load topics, content and correlations and derive the text fields used for retrieval.

    Attributes:
        topic: topics frame; gains ``title_level`` and ``field`` after ``prepare_topic``.
        content: content frame; gains ``field`` after ``prepare_content``.
        corr: correlations frame reduced to one row per ``topic_id``.
        match_dict: ``None`` until ``prepare_language_match`` has run.
    """

    def __init__(self, topic_path, content_path, submission_path):
        """Read the three inputs.

        Args:
            topic_path: CSV file with the topics.
            content_path: parquet file with the content items.
            submission_path: parquet file with a ``topic_id`` column.
        """
        self.topic = pd.read_csv(topic_path)
        self.content = pd.read_parquet(content_path)
        # Only the set of topic ids is needed from the correlations, so keep one row per topic.
        self.corr = pd.read_parquet(submission_path).drop_duplicates(subset="topic_id").reset_index(drop=True)
        self.match_dict = None

    def prepare_topic(self):
        """Attach the root-to-topic title chain and the ``field`` text to the topics.

        The inner merge keeps only topics returned by ``_get_level_features``,
        i.e. topics flagged ``has_content`` that are reachable from a level-0 root.

        Returns:
            The updated ``self.topic`` frame.
        """
        df_level = self._get_level_features(self.topic)
        self.topic = self.topic.merge(df_level, on='id', how='inner')
        self.topic['field'] = self.topic.apply(get_topic_field, axis=1)
        return self.topic

    def prepare_content(self):
        """Add the ``field`` text column to the content frame and return it."""
        self.content['field'] = self.content.apply(get_content_field, axis=1)
        return self.content

    def prepare_language_match(self):
        """Group the topics listed in ``self.corr`` and all content by language.

        Returns:
            dict mapping each language found among the selected topics to a
            tuple ``(topic_ids, content_ids)`` of single-column (``id``) frames.
            Because of the right merge, topics in ``self.corr`` that are absent
            from ``self.topic`` show up under a missing (NaN) language key.
        """
        topic = self.topic[['id', 'language']].merge(self.corr, left_on='id', right_on='topic_id', how='right')[['id', 'language']]
        match_dict = {}
        print(topic['language'].unique())
        for language in topic['language'].unique():
            match_dict[language] = (topic.query('language==@language')[['id']], self.content.query('language==@language')[['id']])
        self.match_dict = match_dict
        return match_dict

    def _get_level_features(self, df_topic, level_cols=None):
        """Collect, for every topic with content, the chain of values from its root.

        Walks the tree one level at a time, starting at level 0 and joining
        each level to its parents, so every row accumulates a list per column
        in ``level_cols`` ordered root -> topic.

        Args:
            df_topic: topics frame with ``id``, ``parent``, ``level``,
                ``has_content`` and every column named in ``level_cols``.
            level_cols: columns to accumulate along the path; defaults to ``['title']``.

        Returns:
            Frame with ``id`` followed by one ``<col>_level`` list column per
            entry of ``level_cols``, restricted to rows where ``has_content`` is true.
        """
        level_cols = ['title'] if level_cols is None else list(level_cols)
        base_cols = ['id', 'parent', 'level', 'has_content']
        cols = base_cols + [c for c in dict.fromkeys(level_cols) if c not in base_cols]
        df_hier = df_topic[cols]

        highest_level = df_hier['level'].max()
        print(f'Highest Level: {highest_level}')

        df_level = df_hier[df_hier['level'] == 0].copy(deep=True)
        for col in level_cols:
            df_level[f'{col}_level'] = df_level[col].apply(lambda x: [x])

        level_list = []
        for i in tqdm(range(highest_level + 1)):
            level_list.append(df_level[df_level['has_content']])
            df_level_high = df_hier[df_hier['level'] == i + 1]
            if df_level_high.empty:
                # Nothing deeper can attach to the tree once a level is empty.
                break
            df_level = df_level_high.merge(df_level, left_on='parent', right_on='id', suffixes=['', '_parent'], how='inner')
            for col in level_cols:
                # Element-wise list concatenation: parent's chain + this topic's own value.
                df_level[f'{col}_level'] = df_level[f'{col}_level'] + df_level[col].apply(lambda x: [x])
            # The parent's own columns were only needed for the join.
            df_level = df_level.drop(columns=[c for c in df_level.columns if c.endswith('_parent')])
        df = pd.concat(level_list).reset_index(drop=True)
        # Select with a list: pandas 2.x rejects a set as a column indexer.
        return df[['id'] + [f'{col}_level' for col in level_cols]]

    def prepare(self):
        """Run all preparation steps in order. Results are stored on the instance; returns None."""
        self.prepare_topic()
        self.prepare_content()
        self.prepare_language_match()

In [None]:
%%time

# Inputs for the training fold (disabled). Swap with the "Valid" block below to build training data.
# Train
# topic_dir = "/home/search3/lichunyu/k12-curriculum-recommendations/data/input/raw/topics.csv"
# content_dir = "/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/train_content_flod0.pqt"
# corr_dir = "/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/train_correlations_flod0.pqt"

# Inputs for the validation fold (active).
# Valid
topic_dir = "/home/search3/lichunyu/k12-curriculum-recommendations/data/input/raw/topics.csv"
content_dir = "/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/valid_content_flod0.pqt"
corr_dir = "/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/valid_correlations_flod0_no_source.pqt"


# Load the three inputs and build the text fields and the per-language id groups.
dp = DataPreparation(topic_dir, content_dir, corr_dir)
# NOTE: prepare() has no return statement, so x is always None; the results live on
# dp (dp.topic, dp.content, dp.match_dict).
x = dp.prepare()

# Retrieval

In [None]:
class PlainDataset(Dataset):
    """Dataset that tokenizes one text column of a DataFrame on the fly.

    Args:
        df: frame holding the texts.
        tokenizer: callable tokenizer (Hugging Face style) returning a mapping
            of tensors with a leading batch dimension of 1.
        label_name: name of the column that holds the TEXT to tokenize (despite
            the name, it is not a label). The default "" is only a placeholder;
            pass the real column name.
        max_length: every item is padded/truncated to this many tokens.
    """

    def __init__(self, df, tokenizer, label_name="", max_length=128) -> None:
        super().__init__()
        self.data = df[label_name].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, index):
        """Return the tokenizer outputs for item ``index`` with the batch dimension removed."""
        encoded = self.tokenizer(
            self.data[index],
            add_special_tokens=True,
            truncation='longest_first',
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The tokenizer returns shape (1, max_length); drop the leading 1 so the
        # DataLoader can stack items into (batch, max_length).
        return {key: tensor.squeeze(0) for key, tensor in encoded.items()}

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.data)

In [None]:
class Retrieval():
    """Stage-1 retrieval: embed topics and content per language and pick the nearest content.

    Intermediate frames (``.pqt``) and embeddings (``.npy``) are written to
    ``OUTPUT_PATH``; ``inference`` reads them back from there.
    """

    def __init__(self, model_path, tokenizer_path, dp):
        """Load the encoder (moved to the GPU) and tokenizer, and take the prepared data from ``dp``.

        Args:
            model_path: path passed to ``AutoModel.from_pretrained``.
            tokenizer_path: path passed to ``AutoTokenizer.from_pretrained``.
            dp: object exposing ``topic``, ``content``, ``corr`` and ``match_dict``
                (a ``DataPreparation`` on which ``prepare()`` has already run).
        """
        self.model = AutoModel.from_pretrained(model_path).cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        self.topic = dp.topic
        self.content = dp.content
        self.corr = dp.corr
        self.topic_content_match = dp.match_dict

    def convert2embed(self, df, label_name='field'):
        """Encode the texts in ``df[label_name]`` and return one embedding row per input row.

        Rows are processed in order in batches of 32; the model's
        ``pooler_output`` is used as the embedding.
        """
        dataset = PlainDataset(df, tokenizer=self.tokenizer, label_name=label_name)
        dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)
        chunks = []
        with torch.no_grad():
            for batch in dataloader:
                batch = {k: v.cuda() for k, v in batch.items()}
                embeddings = self.model(**batch, output_hidden_states=True, return_dict=True).pooler_output
                # Under no_grad the tensor carries no graph, and .cpu() already copies off the GPU.
                chunks.append(embeddings.cpu().numpy())
        return np.concatenate(chunks, axis=0)

    def get_embed(self):
        """Write the id/field frames and their embeddings for every language to ``OUTPUT_PATH``.

        For each language ``<lang>`` this creates ``topic_<lang>.pqt``,
        ``content_<lang>.pqt`` and the matching ``.npy`` embedding files.
        Non-string keys (the missing-language group) are skipped.
        """
        for lang in self.topic_content_match.keys():
            if not isinstance(lang, str):
                continue
            topic_, content_ = self.topic_content_match[lang]
            frames = {
                "content": content_[['id']].merge(self.content[['id', 'field']], on='id', how='left'),
                "topic": topic_[['id']].merge(self.topic[['id', 'field']], on='id', how='left'),
            }
            for kind, frame in frames.items():
                path = os.path.join(OUTPUT_PATH, f"{kind}_{lang}.pqt")
                # Persist the frame: inference() maps neighbour positions back to ids through it.
                frame.to_parquet(path)
                embed = self.convert2embed(frame, label_name="field")
                np.save(path.replace(".pqt", ".npy"), embed)

    def inference(self):
        """Find the nearest content items (cosine distance) for every topic, language by language.

        Stores the result in ``self.df_pred`` with columns ``topic_id`` and
        ``content_ids`` (space-separated content ids, nearest first).
        """
        df_pred_list = []
        for lang in self.topic_content_match.keys():
            if not isinstance(lang, str):
                continue
            content_path = os.path.join(OUTPUT_PATH, f"content_{lang}.npy")
            topics_path = os.path.join(OUTPUT_PATH, f"topic_{lang}.npy")
            content_array = np.load(content_path)
            topics_array = np.load(topics_path)

            # kneighbors() raises if asked for more neighbours than fitted samples,
            # which happens for languages with fewer than N_NEIGHBOR content items.
            n_neighbors = min(N_NEIGHBOR, len(content_array))
            model = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
            model.fit(content_array)
            _, neighbor_idx = model.kneighbors(topics_array)

            df_content = pd.read_parquet(content_path.replace(".npy", ".pqt"))
            df_topics = pd.read_parquet(topics_path.replace(".npy", ".pqt"))

            # Row i of neighbor_idx holds positions into df_content for topic i.
            content_ids = df_content["id"].to_numpy()
            df_pred = pd.DataFrame({
                "topic_id": df_topics["id"].to_numpy(),
                "content_ids": [content_ids[row].tolist() for row in neighbor_idx],
            }).astype({"topic_id": str})
            df_pred_list.append(df_pred)
        df_pred = pd.concat(df_pred_list)
        df_pred['content_ids'] = df_pred['content_ids'].apply(' '.join)
        self.df_pred = df_pred

    def save_pred(self, path='submission.csv'):
        """Write ``self.df_pred`` to ``path`` as CSV without the index column."""
        self.df_pred.to_csv(path, index=False)

    def retrieval(self):
        """Run embedding, neighbour search and CSV export; return the prediction frame."""
        self.get_embed()
        self.inference()
        self.save_pred()
        return self.df_pred

In [None]:
%%time
# Build the Stage-1 retriever and run the full pipeline (embed -> nearest neighbours -> submission.csv).
s1 = Retrieval(MODEL_DIR, TOKENIZER_DIR, dp)
df_retrival = s1.retrieval()

In [None]:
# The per-language prediction frames are concatenated without renumbering, so give the
# combined frame a fresh 0..n-1 index.
df_retrival = df_retrival.reset_index(drop=True)

In [None]:
# Display the Stage-1 predictions: one row per topic, content_ids as a space-separated string.
df_retrival

In [None]:
# Persist the raw per-topic predictions (content_ids still a space-separated string).
df_retrival.to_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/top20/valid/raw_submission.pqt")

In [None]:
# Persist the prepared topics (including the title_level and field columns).
dp.topic.to_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/top20/valid/topic.pqt")

In [None]:
# Persist the prepared content (including the field column).
dp.content.to_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/top20/valid/content.pqt")

In [None]:
# Turn the space-separated id string into a list, then emit one row per (topic, content) pair.
df_retrival["content_ids"] = df_retrival["content_ids"].map(str.split)
df_retrival = df_retrival.explode("content_ids").reset_index(drop=True)

# Attach the topic text, then the content text, keeping only the pair keys and the two text fields.
df_retrival = (
    df_retrival
    .merge(dp.topic[["id", "field"]], how="left", left_on="topic_id", right_on="id")
    .loc[:, ["topic_id", "content_ids", "field"]]
    .rename(columns={"field": "topic_field"})
    .merge(dp.content[["id", "field"]], how="left", left_on="content_ids", right_on="id")
    .loc[:, ["topic_id", "content_ids", "topic_field", "field"]]
    .rename(columns={"field": "content_field"})
)

df_retrival

In [None]:
# Every pair listed in the correlations file is a positive example.
label = pd.read_parquet(corr_dir).assign(label=1)
label

In [None]:
# Retrieved pairs found in the correlations get label 1; all other retrieved pairs become 0.
df_retrival = df_retrival.merge(label, how="left", on=["topic_id", "content_ids"])
df_retrival["label"] = df_retrival["label"].fillna(0).astype("int")
df_retrival

In [None]:
# Persist the labelled (topic, content) candidate pairs for the validation fold.
df_retrival.to_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/top20/valid/valid.pqt")

In [5]:
# Load the prepared content frame saved for the stage-2 training data (has the `field` text column).
df_train_content = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/stage2/train/content.pqt")
df_train_content

Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license,field
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,,[video] [TITLE] Sumar números de varios dígito...
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,,[video] [TITLE] Trovare i fattori di un numero...
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,,[video] [TITLE] Sumar curvas de demanda. [DESC...
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND,[document] [TITLE] Nado de aproximação. [DESCR...
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA,[document] [TITLE] geometry-m3-topic-a-overvie...
...,...,...,...,...,...,...,...,...,...
144651,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA,[html5] [TITLE] 2. 12: Diffusion. [DESCRIPTION...
144652,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,,[video] [TITLE] Sommare facendo gruppi da 10. ...
144653,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,,[video] [TITLE] Introdução à subtração. [DESCR...
144654,c_ffff04ba7ac7,SA of a Cone,,video,,en,,,[video] [TITLE] SA of a Cone. [DESCRIPTION]No ...


In [3]:
# Load the prepared topic frame saved for the stage-2 training data (has `title_level` and `field`).
df_train_topic = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/stage2/train/topic.pqt")
df_train_topic

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,title_level,field
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,"[Khan Academy (български език), Наука, Физика,...",[TITLE] Откриването на резисторите of Открития...
1,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,"[Khan Academy (Português (Brasil)), Matemática...",[TITLE] Entradas e saídas de uma função of Álg...
2,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True,"[MIT Blossoms, Engineering, Flow Charts: Logic...",[TITLE] Transcripts of Flow Charts: Logical Th...
3,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True,"[Khan Academy (български език), Математика, Ал...",[TITLE] Графики на експоненциални функции (Алг...
4,t_0008768bdee6,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,5223e0,supplemental,4,gu,t_0da7a331d666,True,"[DIGITAL EDUCATION WITH MEHUL, ધોરણ ૩, પ્રારંભ...",[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...
...,...,...,...,...,...,...,...,...,...,...,...
61512,t_fff830472691,Scalar Projections,,fef095,source,4,en,t_c75d6acecf78,True,"[K-12, Math, Analysis, Vector Analysis, Scalar...",[TITLE] Scalar Projections of Vector Analysis ...
61513,t_fff9e5407d13,NA_U06 - El periódico,,71fd51,supplemental,2,es,t_5bd8f6ae9f7d,True,"[PF (Español), Lengua española, NA_U06 - El pe...",[TITLE] NA_U06 - El periódico of Lengua españo...
61514,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True,"[Khan Academy (Kiswahili), Hisabati, Class 9 (...",[TITLE] Inscribed shapes problem solving of Mi...
61515,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True,"[CREE, Para el Estudiante, I Ciclo, 01 Primero...",[TITLE] Lección 7 of Unidad 4 of Español Activ...


In [2]:
# Load the labelled candidate pairs (topic_id, content_ids, topic_field, content_field, label).
df_train = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/stage2/train/train.pqt")
df_train

Unnamed: 0,topic_id,content_ids,topic_field,content_field,label
0,t_00004da3a1b2,c_3b7657ad7868,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Съпротивление и проводимост. [...,0
1,t_00004da3a1b2,c_5bc0e1e2cba0,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Променлив резистор (реостат) с...,1
2,t_00004da3a1b2,c_6fe03bf94e75,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Прост електрически мотор. [DES...,0
3,t_00004da3a1b2,c_1108dd0c7a5d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Молив като резистор. [DESCRIPT...,1
4,t_00004da3a1b2,c_29acb8a6a26d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Проводимост на електролити (за...,0
...,...,...,...,...,...
2428645,t_ed73fc05e532,c_6a7706ae6095,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Geri Dönüşüm: Tahta Kasayı ...,0
2428646,t_ed73fc05e532,c_173e3d350837,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Origami Nud Papillon. [DES...,0
2428647,t_ed73fc05e532,c_33ba1fc2915c,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[video] [TITLE] Sprey Boya Kullanımı. [DESCRIP...,0
2428648,t_ed73fc05e532,c_086df38ecc64,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Stenstil Uygulama. [DESCRIP...,0


In [11]:
# Load the ground-truth (topic_id, content_ids) pairs of the training fold, one pair per row.
df_train_corr = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/train_correlations_flod0.pqt")
df_train_corr

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d
1,t_00004da3a1b2,c_376c5a8eb028
2,t_00004da3a1b2,c_5bc0e1e2cba0
3,t_00004da3a1b2,c_76231f9d0b5e
4,t_00069b63a70a,c_11a1dc0bfb99
...,...,...
211308,t_fff9e5407d13,c_d64037a72376
211309,t_fffbe1d5d43c,c_46f852a49c08
211310,t_fffbe1d5d43c,c_6659207b25d5
211311,t_fffe14f1be1e,c_cece166bad6a


In [12]:
# Attach the topic text and the content text to every ground-truth pair and mark it as positive.
df_train_corr = (
    df_train_corr
    .merge(df_train_topic[["id", "field"]], how="left", left_on="topic_id", right_on="id")
    .loc[:, ["topic_id", "content_ids", "field"]]
    .rename(columns={"field": "topic_field"})
    .merge(df_train_content[["id", "field"]], how="left", left_on="content_ids", right_on="id")
    .loc[:, ["topic_id", "content_ids", "topic_field", "field"]]
    .rename(columns={"field": "content_field"})
    .assign(label=1)
)
df_train_corr

In [13]:
# Display the ground-truth pairs with their text fields and label column.
df_train_corr

Unnamed: 0,topic_id,content_ids,topic_field,content_field,label
0,t_00004da3a1b2,c_1108dd0c7a5d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Молив като резистор. [DESCRIPT...,1
1,t_00004da3a1b2,c_376c5a8eb028,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Да чуем променливото съпротивл...,1
2,t_00004da3a1b2,c_5bc0e1e2cba0,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Променлив резистор (реостат) с...,1
3,t_00004da3a1b2,c_76231f9d0b5e,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Последователно свързване на га...,1
4,t_00069b63a70a,c_11a1dc0bfb99,[TITLE] Transcripts of Flow Charts: Logical Th...,[document] [TITLE] Flow Charts: Logical..: Wri...,1
...,...,...,...,...,...
211308,t_fff9e5407d13,c_d64037a72376,[TITLE] NA_U06 - El periódico of Lengua españo...,[html5] [TITLE] Introducción: El periódico. [D...,1
211309,t_fffbe1d5d43c,c_46f852a49c08,[TITLE] Inscribed shapes problem solving of Mi...,[video] [TITLE] Proof: Right triangles inscrib...,1
211310,t_fffbe1d5d43c,c_6659207b25d5,[TITLE] Inscribed shapes problem solving of Mi...,[video] [TITLE] Area of inscribed equilateral ...,1
211311,t_fffe14f1be1e,c_cece166bad6a,[TITLE] Lección 7 of Unidad 4 of Español Activ...,[document] [TITLE] Juego con las palabras. [DE...,1


In [15]:
# Stack the retrieved candidates and the ground-truth positives under a fresh 0..n-1 index.
df_new_train = pd.concat([df_train, df_train_corr], ignore_index=True)
df_new_train

Unnamed: 0,topic_id,content_ids,topic_field,content_field,label
0,t_00004da3a1b2,c_3b7657ad7868,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Съпротивление и проводимост. [...,0
1,t_00004da3a1b2,c_5bc0e1e2cba0,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Променлив резистор (реостат) с...,1
2,t_00004da3a1b2,c_6fe03bf94e75,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Прост електрически мотор. [DES...,0
3,t_00004da3a1b2,c_1108dd0c7a5d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Молив като резистор. [DESCRIPT...,1
4,t_00004da3a1b2,c_29acb8a6a26d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Проводимост на електролити (за...,0
...,...,...,...,...,...
2639958,t_fff9e5407d13,c_d64037a72376,[TITLE] NA_U06 - El periódico of Lengua españo...,[html5] [TITLE] Introducción: El periódico. [D...,1
2639959,t_fffbe1d5d43c,c_46f852a49c08,[TITLE] Inscribed shapes problem solving of Mi...,[video] [TITLE] Proof: Right triangles inscrib...,1
2639960,t_fffbe1d5d43c,c_6659207b25d5,[TITLE] Inscribed shapes problem solving of Mi...,[video] [TITLE] Area of inscribed equilateral ...,1
2639961,t_fffe14f1be1e,c_cece166bad6a,[TITLE] Lección 7 of Unidad 4 of Español Activ...,[document] [TITLE] Juego con las palabras. [DE...,1


In [18]:
# Keep one row per (topic, content) pair and write the result. drop_duplicates keeps the
# first occurrence, so for pairs present in both inputs the row from df_train (first in
# the concat) is the one that is kept.
df_new_train.drop_duplicates(subset=["topic_id", "content_ids"]).reset_index(drop=True).to_parquet(
    "/home/search3/lichunyu/k12-curriculum-recommendations/data/output/stage2/train/train2.pqt"
)

In [19]:
# Read the file back to check what was written.
pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/stage2/train/train2.pqt")

Unnamed: 0,topic_id,content_ids,topic_field,content_field,label
0,t_00004da3a1b2,c_3b7657ad7868,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Съпротивление и проводимост. [...,0
1,t_00004da3a1b2,c_5bc0e1e2cba0,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Променлив резистор (реостат) с...,1
2,t_00004da3a1b2,c_6fe03bf94e75,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Прост електрически мотор. [DES...,0
3,t_00004da3a1b2,c_1108dd0c7a5d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Молив като резистор. [DESCRIPT...,1
4,t_00004da3a1b2,c_29acb8a6a26d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Проводимост на електролити (за...,0
...,...,...,...,...,...
2447197,t_ffba5459a977,c_04a421dba8aa,[TITLE] Filter of Ejercicios of Composition of...,[html5] [TITLE] solution_filter.js. [DESCRIPTI...,1
2447198,t_ffba5459a977,c_787a7a2e7217,[TITLE] Filter of Ejercicios of Composition of...,[html5] [TITLE] boilerplate_filter.js. [DESCRI...,1
2447199,t_ffba5459a977,c_a6c82a21c03a,[TITLE] Filter of Ejercicios of Composition of...,[html5] [TITLE] test_filter.spec.js. [DESCRIPT...,1
2447200,t_ffc2d9fdec62,c_0d337a8ef2dd,[TITLE] Unidad 2 of Módulo 1 of Grado 3 Artes ...,[document] [TITLE] Lección 6. [DESCRIPTION]LEE...,1


In [20]:
# Fresh copy of the labelled candidate pairs; this is the same file that df_train was loaded from.
df_top50_train = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/stage2/train/train.pqt")
df_top50_train

Unnamed: 0,topic_id,content_ids,topic_field,content_field,label
0,t_00004da3a1b2,c_3b7657ad7868,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Съпротивление и проводимост. [...,0
1,t_00004da3a1b2,c_5bc0e1e2cba0,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Променлив резистор (реостат) с...,1
2,t_00004da3a1b2,c_6fe03bf94e75,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Прост електрически мотор. [DES...,0
3,t_00004da3a1b2,c_1108dd0c7a5d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Молив като резистор. [DESCRIPT...,1
4,t_00004da3a1b2,c_29acb8a6a26d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Проводимост на електролити (за...,0
...,...,...,...,...,...
2428645,t_ed73fc05e532,c_6a7706ae6095,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Geri Dönüşüm: Tahta Kasayı ...,0
2428646,t_ed73fc05e532,c_173e3d350837,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Origami Nud Papillon. [DES...,0
2428647,t_ed73fc05e532,c_33ba1fc2915c,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[video] [TITLE] Sprey Boya Kullanımı. [DESCRIP...,0
2428648,t_ed73fc05e532,c_086df38ecc64,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Stenstil Uygulama. [DESCRIP...,0


In [32]:
# Number the rows of each topic 0, 1, 2, ... in their current row order.
# NOTE(review): presumably the stored row order is the retrieval order (nearest first) — confirm.
df_top50_train["rank"] = df_top50_train.groupby("topic_id").cumcount()
# df_top50_train["rank"] = df_top50_train["rank"].astype("str")
df_top50_train

Unnamed: 0,topic_id,content_ids,topic_field,content_field,label,rank
0,t_00004da3a1b2,c_3b7657ad7868,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Съпротивление и проводимост. [...,0,0
1,t_00004da3a1b2,c_5bc0e1e2cba0,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Променлив резистор (реостат) с...,1,1
2,t_00004da3a1b2,c_6fe03bf94e75,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Прост електрически мотор. [DES...,0,2
3,t_00004da3a1b2,c_1108dd0c7a5d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Молив като резистор. [DESCRIPT...,1,3
4,t_00004da3a1b2,c_29acb8a6a26d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Проводимост на електролити (за...,0,4
...,...,...,...,...,...,...
2428645,t_ed73fc05e532,c_6a7706ae6095,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Geri Dönüşüm: Tahta Kasayı ...,0,45
2428646,t_ed73fc05e532,c_173e3d350837,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Origami Nud Papillon. [DES...,0,46
2428647,t_ed73fc05e532,c_33ba1fc2915c,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[video] [TITLE] Sprey Boya Kullanımı. [DESCRIP...,0,47
2428648,t_ed73fc05e532,c_086df38ecc64,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Stenstil Uygulama. [DESCRIP...,0,48


In [33]:
# Replace the integer rank by a text token that repeats it five times, e.g. 3 -> "[3,3,3,3,3]".
# WARNING: not idempotent — the column is overwritten in place, so running this cell twice
# wraps the already-formatted string again. Re-run the cell that computes "rank" first.
df_top50_train["rank"] = df_top50_train["rank"].apply(lambda x: f"[{x},{x},{x},{x},{x}]")
df_top50_train

Unnamed: 0,topic_id,content_ids,topic_field,content_field,label,rank
0,t_00004da3a1b2,c_3b7657ad7868,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Съпротивление и проводимост. [...,0,"[0,0,0,0,0]"
1,t_00004da3a1b2,c_5bc0e1e2cba0,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Променлив резистор (реостат) с...,1,"[1,1,1,1,1]"
2,t_00004da3a1b2,c_6fe03bf94e75,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Прост електрически мотор. [DES...,0,"[2,2,2,2,2]"
3,t_00004da3a1b2,c_1108dd0c7a5d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Молив като резистор. [DESCRIPT...,1,"[3,3,3,3,3]"
4,t_00004da3a1b2,c_29acb8a6a26d,[TITLE] Откриването на резисторите of Открития...,[video] [TITLE] Проводимост на електролити (за...,0,"[4,4,4,4,4]"
...,...,...,...,...,...,...
2428645,t_ed73fc05e532,c_6a7706ae6095,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Geri Dönüşüm: Tahta Kasayı ...,0,"[45,45,45,45,45]"
2428646,t_ed73fc05e532,c_173e3d350837,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Origami Nud Papillon. [DES...,0,"[46,46,46,46,46]"
2428647,t_ed73fc05e532,c_33ba1fc2915c,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[video] [TITLE] Sprey Boya Kullanımı. [DESCRIP...,0,"[47,47,47,47,47]"
2428648,t_ed73fc05e532,c_086df38ecc64,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Stenstil Uygulama. [DESCRIP...,0,"[48,48,48,48,48]"


In [34]:
# Prepend the rank token to the topic text (no separator), e.g. "[3,3,3,3,3][TITLE] ...".
# WARNING: not idempotent — topic_field is overwritten in place, so each extra run of this
# cell adds the prefix once more. Reload df_top50_train before re-running.
df_top50_train["topic_field"] = df_top50_train["rank"].str.cat(df_top50_train["topic_field"])
df_top50_train

Unnamed: 0,topic_id,content_ids,topic_field,content_field,label,rank
0,t_00004da3a1b2,c_3b7657ad7868,"[0,0,0,0,0][TITLE] Откриването на резисторите ...",[video] [TITLE] Съпротивление и проводимост. [...,0,"[0,0,0,0,0]"
1,t_00004da3a1b2,c_5bc0e1e2cba0,"[1,1,1,1,1][TITLE] Откриването на резисторите ...",[video] [TITLE] Променлив резистор (реостат) с...,1,"[1,1,1,1,1]"
2,t_00004da3a1b2,c_6fe03bf94e75,"[2,2,2,2,2][TITLE] Откриването на резисторите ...",[video] [TITLE] Прост електрически мотор. [DES...,0,"[2,2,2,2,2]"
3,t_00004da3a1b2,c_1108dd0c7a5d,"[3,3,3,3,3][TITLE] Откриването на резисторите ...",[video] [TITLE] Молив като резистор. [DESCRIPT...,1,"[3,3,3,3,3]"
4,t_00004da3a1b2,c_29acb8a6a26d,"[4,4,4,4,4][TITLE] Откриването на резисторите ...",[video] [TITLE] Проводимост на електролити (за...,0,"[4,4,4,4,4]"
...,...,...,...,...,...,...
2428645,t_ed73fc05e532,c_6a7706ae6095,"[45,45,45,45,45][TITLE] ÖAP Kullanımı of Öğütü...",[document] [TITLE] Geri Dönüşüm: Tahta Kasayı ...,0,"[45,45,45,45,45]"
2428646,t_ed73fc05e532,c_173e3d350837,"[46,46,46,46,46][TITLE] ÖAP Kullanımı of Öğütü...",[document] [TITLE] Origami Nud Papillon. [DES...,0,"[46,46,46,46,46]"
2428647,t_ed73fc05e532,c_33ba1fc2915c,"[47,47,47,47,47][TITLE] ÖAP Kullanımı of Öğütü...",[video] [TITLE] Sprey Boya Kullanımı. [DESCRIP...,0,"[47,47,47,47,47]"
2428648,t_ed73fc05e532,c_086df38ecc64,"[48,48,48,48,48][TITLE] ÖAP Kullanımı of Öğütü...",[document] [TITLE] Stenstil Uygulama. [DESCRIP...,0,"[48,48,48,48,48]"
