In [1]:
import ast
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.neighbors import NearestNeighbors
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, AutoConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data locations. The defaults are the original absolute paths; set K12_INPUT_DIR /
# K12_OUTPUT_PATH in the environment to run the notebook on another machine without
# editing this cell.
INPUT_DIR = os.environ.get(
    'K12_INPUT_DIR',
    '/home/search3/lichunyu/k12-curriculum-recommendations/data/input/raw',
)
OUTPUT_PATH = os.environ.get(
    'K12_OUTPUT_PATH',
    '/home/search3/lichunyu/k12-curriculum-recommendations/data/output',
)

# NOTE: despite the *_DIR suffix, these three names hold CSV *file* paths.
# The names are kept because later cells reference them.
TOPIC_DIR = os.path.join(INPUT_DIR, 'topics.csv')
CONTENT_DIR = os.path.join(INPUT_DIR, 'content.csv')
CORR_DIR = os.path.join(INPUT_DIR, 'sample_submission.csv')

In [3]:
def get_topic_field(d):
    """Build the text field for a single topic record.

    Args:
        d: mapping-like row exposing ``'title_level'`` (an iterable of titles)
            and ``'description'``.

    Returns:
        A string of the form ``'[TITLE] <titles>. [DESCRIPTION]<description>. '``.
        The non-missing titles are joined in reverse order with ``' of '``.
        ``'No information'`` stands in for an empty title string or a missing
        description.
    """
    placeholder = 'No information'

    # Keep only the titles that are present, then read them last-to-first.
    present_titles = [name for name in d['title_level'] if pd.notna(name)]
    breadcrumb = ' of '.join(reversed(present_titles))
    if breadcrumb == '':
        breadcrumb = placeholder

    raw_description = d['description']
    body = raw_description if pd.notna(raw_description) else placeholder

    # str.join keeps the original's TypeError on non-string pieces.
    return ''.join(['[TITLE] ', breadcrumb, '. ', '[DESCRIPTION]', body, '. '])

def get_content_field(d):
    """Build the text field for a single content record.

    Args:
        d: mapping-like row exposing ``'title'``, ``'description'`` and ``'kind'``.

    Returns:
        A string of the form
        ``'[<kind>] [TITLE] <title>. [DESCRIPTION]<description>. '``, with
        ``'No information'`` substituted for a missing title or description.
    """
    placeholder = 'No information'

    heading = d['title']
    if pd.isna(heading):
        heading = placeholder

    body = d['description']
    if not pd.notna(body):
        body = placeholder

    # `kind` is used as-is; str.join raises TypeError if any piece is not a string.
    return ''.join(['[', d['kind'], '] ', '[TITLE] ', heading, '. ', '[DESCRIPTION]', body, '. '])

In [4]:
class DataPreparation:
    """Load the topic, content and submission CSVs and derive text/match features.

    Attributes:
        topic: topics DataFrame; after ``prepare_topic`` it also carries the
            ``title_level`` and ``field`` columns and is restricted to the rows
            returned by ``_get_level_features``.
        content: content DataFrame; after ``prepare_content`` it carries ``field``.
        corr: DataFrame read from ``submission_path``.
        match_dict: ``None`` until ``prepare_language_match`` runs, then a dict
            mapping language -> (topic ids frame, content ids frame).
    """

    def __init__(self, topic_path, content_path, submission_path):
        """Read the three CSV files into DataFrames.

        Args:
            topic_path: path of the topics CSV.
            content_path: path of the content CSV.
            submission_path: path of the CSV loaded into ``self.corr``.
        """
        self.topic = pd.read_csv(topic_path)
        self.content = pd.read_csv(content_path)
        self.corr = pd.read_csv(submission_path)
        # Disabled filter, kept for reference:
        # self.topic = self.topic[self.topic['id'].isin(self.corr['topic_id'].to_list())]
        self.match_dict = None

    def prepare_topic(self):
        """Attach the ancestor-title list and the text ``field`` to ``self.topic``.

        The inner merge keeps only topics returned by ``_get_level_features``.
        Safe to call more than once.

        Returns:
            The updated ``self.topic`` DataFrame.
        """
        df_level = self._get_level_features(self.topic)
        # On a re-run self.topic already has the level columns; drop them first so the
        # merge does not produce `_x`/`_y` duplicates that break get_topic_field.
        stale_cols = [c for c in df_level.columns if c != 'id' and c in self.topic.columns]
        topic = self.topic.drop(columns=stale_cols)
        self.topic = topic.merge(df_level, on='id', how='inner')
        self.topic['field'] = self.topic.apply(get_topic_field, axis=1)
        return self.topic

    def prepare_content(self):
        """Add the text ``field`` column to ``self.content`` and return the frame."""
        self.content['field'] = self.content.apply(get_content_field, axis=1)
        return self.content

    def prepare_language_match(self):
        """Group the topics listed in ``self.corr`` and all content by language.

        Returns:
            dict mapping each language found among the ``self.corr`` topics to a
            tuple ``(topic_ids, content_ids)`` of single-column ``id`` DataFrames.
        """
        # Right merge: one row per self.corr row. Rows whose topic_id is absent from
        # self.topic get a NaN language, and the NaN key then maps to empty frames.
        topic = self.topic[['id', 'language']].merge(
            self.corr, left_on='id', right_on='topic_id', how='right'
        )[['id', 'language']]
        match_dict = {}
        for language in topic['language'].unique():
            match_dict[language] = (
                topic.query('language==@language')[['id']],
                self.content.query('language==@language')[['id']],
            )
        self.match_dict = match_dict
        return match_dict

    def _get_level_features(self, df_topic, level_cols=('title',)):
        """Collect, for each topic with content, the root-to-topic list of each column.

        Starting from the ``level == 0`` rows, every iteration joins the next
        level onto its parents and appends the child's value to the inherited
        list. A topic is only reached if its whole parent chain joins back to
        a level-0 row.

        Args:
            df_topic: DataFrame with ``id``, ``parent``, ``level``,
                ``has_content`` and every column named in ``level_cols``.
            level_cols: columns to accumulate along the hierarchy.

        Returns:
            DataFrame with ``id`` plus one ``<col>_level`` list column per entry
            of ``level_cols``, restricted to rows where ``has_content`` is true.
        """
        level_cols = list(level_cols)
        # dict.fromkeys de-duplicates while keeping a deterministic column order.
        cols = list(dict.fromkeys(level_cols + ['id', 'parent', 'level', 'has_content']))
        df_hier = df_topic[cols]

        highest_level = df_hier['level'].max()
        print(f'Highest Level: {highest_level}')

        df_level = df_hier[df_hier['level'] == 0].copy(deep=True)
        for col in level_cols:
            df_level[f'{col}_level'] = df_level[col].apply(lambda x: [x])

        level_list = []
        for i in tqdm(range(highest_level + 1)):
            # df_level holds level i here: keep its content-bearing rows, then descend.
            level_list.append(df_level[df_level['has_content']])
            df_level_high = df_hier[df_hier['level'] == i + 1]
            df_level = df_level_high.merge(
                df_level, left_on='parent', right_on='id', suffixes=('', '_parent'), how='inner'
            )
            for col in level_cols:
                df_level[f'{col}_level'] = df_level[f'{col}_level'] + df_level[col].apply(lambda x: [x])
            # Parent-side copies are no longer needed once the lists are extended.
            df_level = df_level.drop(columns=[c for c in df_level.columns if c.endswith('_parent')])
        df = pd.concat(level_list).reset_index(drop=True)
        # Select with a list: indexing with a set is deprecated and fails on pandas 2.x.
        return df[['id'] + [f'{col}_level' for col in level_cols]]

    def prepare(self):
        """Run topic, content and language-match preparation in that order."""
        self.prepare_topic()
        self.prepare_content()
        self.prepare_language_match()

In [5]:
%%time
dp = DataPreparation(TOPIC_DIR, CONTENT_DIR, CORR_DIR)
dp.prepare()

Highest Level: 10


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 27.91it/s]
  return df[set(['id'] + [f'{col}_level' for col in level_cols])]


CPU times: user 10.5 s, sys: 800 ms, total: 11.3 s
Wall time: 12.3 s


In [None]:
class Stage2:
    """Placeholder second-stage step; currently passes its input straight through."""

    def __init__(self) -> None:
        """Nothing to initialise yet."""
        pass

    def inference(self, df_recall):
        """Return ``df_recall`` unchanged."""
        return df_recall

In [16]:
# Inspect the prepared topics, including the derived `title_level` and `field` columns.
dp.topic

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,title_level,field
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,"[Khan Academy (български език), Наука, Физика,...",[TITLE] Откриването на резисторите of Открития...
1,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,"[Khan Academy (Português (Brasil)), Matemática...",[TITLE] Entradas e saídas de uma função of Álg...
2,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True,"[MIT Blossoms, Engineering, Flow Charts: Logic...",[TITLE] Transcripts of Flow Charts: Logical Th...
3,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True,"[Khan Academy (български език), Математика, Ал...",[TITLE] Графики на експоненциални функции (Алг...
4,t_0008768bdee6,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,5223e0,supplemental,4,gu,t_0da7a331d666,True,"[DIGITAL EDUCATION WITH MEHUL, ધોરણ ૩, પ્રારંભ...",[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...
...,...,...,...,...,...,...,...,...,...,...,...
61512,t_fff830472691,Scalar Projections,,fef095,source,4,en,t_c75d6acecf78,True,"[K-12, Math, Analysis, Vector Analysis, Scalar...",[TITLE] Scalar Projections of Vector Analysis ...
61513,t_fff9e5407d13,NA_U06 - El periódico,,71fd51,supplemental,2,es,t_5bd8f6ae9f7d,True,"[PF (Español), Lengua española, NA_U06 - El pe...",[TITLE] NA_U06 - El periódico of Lengua españo...
61514,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True,"[Khan Academy (Kiswahili), Hisabati, Class 9 (...",[TITLE] Inscribed shapes problem solving of Mi...
61515,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True,"[CREE, Para el Estudiante, I Ciclo, 01 Primero...",[TITLE] Lección 7 of Unidad 4 of Español Activ...


In [17]:
# Inspect the content table, including the derived `field` column.
dp.content

Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license,field
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,,[video] [TITLE] Sumar números de varios dígito...
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,,[video] [TITLE] Trovare i fattori di un numero...
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,,[video] [TITLE] Sumar curvas de demanda. [DESC...
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND,[document] [TITLE] Nado de aproximação. [DESCR...
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA,[document] [TITLE] geometry-m3-topic-a-overvie...
...,...,...,...,...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA,[html5] [TITLE] 2. 12: Diffusion. [DESCRIPTION...
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,,[video] [TITLE] Sommare facendo gruppi da 10. ...
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,,[video] [TITLE] Introdução à subtração. [DESCR...
154045,c_ffff04ba7ac7,SA of a Cone,,video,,en,,,[video] [TITLE] SA of a Cone. [DESCRIPTION]No ...


In [30]:
# Load the retrieval output and normalise its column names.
df_submission = (
    pd.read_csv("../submission.csv")[["topics_id", "content_id"]]
    .rename(columns={"topics_id": "topic_id", "content_id": "content_ids"})
)
# Each `content_id` cell is presumably the text of a Python list of id strings.
# ast.literal_eval parses such literals without executing arbitrary code, which the
# previous eval() on file contents would have done.
df_submission["content_ids"] = df_submission["content_ids"].apply(
    lambda raw: " ".join(ast.literal_eval(raw))
)
df_submission

Unnamed: 0,topic_id,content_ids
0,t_0016d30772f3,c_d59e0f06425a c_c25053d6fafd c_061d9f90bb06 c...
1,t_001bcbb22694,c_264c04a3643d c_745a7c39bbee c_b887e284cb16 c...
2,t_001c75b83927,c_5bde36cd830f c_ad87ae1e36de c_48bda3de3813 c...
3,t_0021d8020514,c_5bde36cd830f c_ad87ae1e36de c_48bda3de3813 c...
4,t_005386225b10,c_ecdd62900c59 c_1b5287816358 c_429610f9043f c...
...,...,...
15387,t_cc4454f1465a,c_ccf4b1977851 c_803738ec21d6 c_d9b9801a44ef c...
15388,t_d3e0eb360723,c_84d00e2dbaf5 c_a8b129d20b3a c_a0d7850f33bd c...
15389,t_db9bb829fa51,c_00515a86b30f c_1f5e288b48e9 c_ee02f27e82d3 c...
15390,t_ec5c92daa38f,c_ccf4b1977851 c_803738ec21d6 c_4d0f0acc73c6 c...


In [31]:
# One row per (topic_id, content id) pair.
# Built with assign() so df_submission is left untouched: the previous version
# aliased df_submission and overwrote its column with lists, so re-running this cell
# failed and the frame displayed by the earlier cell went stale.
df_retrival = (
    df_submission
    .assign(content_ids=df_submission["content_ids"].str.split(" "))
    .explode("content_ids")
    .reset_index(drop=True)
)
df_retrival

Unnamed: 0,topic_id,content_ids
0,t_0016d30772f3,c_d59e0f06425a
0,t_0016d30772f3,c_c25053d6fafd
0,t_0016d30772f3,c_061d9f90bb06
0,t_0016d30772f3,c_e72cbf36b600
0,t_0016d30772f3,c_ea312de91d4f
...,...,...
15391,t_ecdaff0e35f5,c_96f20ad2bce9
15391,t_ecdaff0e35f5,c_3721a18f13e2
15391,t_ecdaff0e35f5,c_7e8e8462ff83
15391,t_ecdaff0e35f5,c_c39af48d9e90


In [33]:
# Attach the topic text (id_x / field_x) and the content text (id_y / field_y)
# to every retrieved pair; left joins keep pairs whose ids have no match.
pd.merge(
    pd.merge(
        df_retrival,
        dp.topic[["id", "field"]],
        how="left",
        left_on="topic_id",
        right_on="id",
    ),
    dp.content[["id", "field"]],
    how="left",
    left_on="content_ids",
    right_on="id",
)

Unnamed: 0,topic_id,content_ids,id_x,field_x,id_y,field_y
0,t_0016d30772f3,c_d59e0f06425a,t_0016d30772f3,[TITLE] Números mixtos of Nivel 3 of Fraccione...,c_d59e0f06425a,[video] [TITLE] Números mixtos y fracciones im...
1,t_0016d30772f3,c_c25053d6fafd,t_0016d30772f3,[TITLE] Números mixtos of Nivel 3 of Fraccione...,c_c25053d6fafd,[video] [TITLE] Comparar fracciones impropias ...
2,t_0016d30772f3,c_061d9f90bb06,t_0016d30772f3,[TITLE] Números mixtos of Nivel 3 of Fraccione...,c_061d9f90bb06,[video] [TITLE] Escribir números mixtos como f...
3,t_0016d30772f3,c_e72cbf36b600,t_0016d30772f3,[TITLE] Números mixtos of Nivel 3 of Fraccione...,c_e72cbf36b600,[video] [TITLE] Reescribir fracciones impropia...
4,t_0016d30772f3,c_ea312de91d4f,t_0016d30772f3,[TITLE] Números mixtos of Nivel 3 of Fraccione...,c_ea312de91d4f,[video] [TITLE] Escribir fracciones impropias ...
...,...,...,...,...,...,...
769595,t_ecdaff0e35f5,c_96f20ad2bce9,,,c_96f20ad2bce9,[document] [TITLE] Drewniana półka z oświetlen...
769596,t_ecdaff0e35f5,c_3721a18f13e2,,,c_3721a18f13e2,[video] [TITLE] Wieszak z odzyskanego drewna. ...
769597,t_ecdaff0e35f5,c_7e8e8462ff83,,,c_7e8e8462ff83,[document] [TITLE] Wieszak z odzyskanego drewn...
769598,t_ecdaff0e35f5,c_c39af48d9e90,,,c_c39af48d9e90,[video] [TITLE] Robienie polinezyjskiego stołu...


In [36]:
# Collapse the exploded pairs back into one list of content ids per topic.
(
    df_retrival
    .groupby("topic_id")
    .aggregate(list)
    .reset_index()
)

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,"[c_0feaaa5dc39d, c_3b7657ad7868, c_1108dd0c7a5..."
1,t_0006d41a73a8,"[c_41ad3d9bd68a, c_29117d57eff7, c_5c737630789..."
2,t_00102869fbcb,"[c_f9091a48e912, c_48562a9fd7a5, c_e7daeadb242..."
3,t_001139e52dbe,"[c_2429552008e3, c_d77d8303ce5c, c_e4f75fca285..."
4,t_001394e54503,"[c_1e98e759877b, c_6b14b3a8738f, c_042a2def970..."
...,...,...
15387,t_fff05585df72,"[c_d9bbe8422c6b, c_ad9da9f1a277, c_88bc7ee86c8..."
15388,t_fff7782561f4,"[c_69b61f90d63e, c_bca8280a9ad1, c_b5bb8220f73..."
15389,t_fff80f4eee89,"[c_9af2856b97ab, c_10ec42571537, c_4f90335e47a..."
15390,t_fff9e5407d13,"[c_0fb048a6412c, c_d1635b5d7097, c_20de7752260..."


In [41]:
# Toy frame: three keys, each with the same three-item list, flattened to one item per row.
x = pd.DataFrame(
    {
        "a": list("123"),
        "b": [list("qwe") for _ in range(3)],
    }
).explode("b")
x

Unnamed: 0,a,b
0,1,q
0,1,w
0,1,e
1,2,q
1,2,w
1,2,e
2,3,q
2,3,w
2,3,e


In [43]:
# Number the rows inside each `a` group (0, 1, 2, ...), then keep position 0 only.
x["cum"] = x.groupby("a").cumcount()
x.loc[x["cum"].eq(0)]

Unnamed: 0,a,b,cum
0,1,q,0
0,1,w,1
0,1,e,2
1,2,q,0
1,2,w,1
1,2,e,2
2,3,q,0
2,3,w,1
2,3,e,2


In [46]:
# Take the first row of each `a` group.
x.groupby("a").head(1)

Unnamed: 0,a,b,cum
0,1,q,0
1,2,q,0
2,3,q,0
