In [1]:
import os
import time
import datetime
import shutil
import pathlib

import pandas as pd
import polars as pl
import numpy as np
from tqdm import tqdm
from scipy.special import comb
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


# Setup

In [11]:
DATA_PATH = "../data"
OUTPUT_PATH = os.path.join(DATA_PATH, "output")
INPUT_DATA_PATH = os.path.join(DATA_PATH, "input")
RAW_DATA_DIR = os.path.join(INPUT_DATA_PATH, "raw")
KFLOD_DATA_DIRNAME = os.path.join(INPUT_DATA_PATH, "kflod_data")
FLOD0_PATH = os.path.join(KFLOD_DATA_DIRNAME, "flod0")

CONTENT_PATH = os.path.join(RAW_DATA_DIR, "content.csv")
TOPIC_PATH = os.path.join(RAW_DATA_DIR, "topics.csv")
CORRELATIONS_PATH = os.path.join(RAW_DATA_DIR, "correlations.csv")

TRAIN_CONTENT_PATH = os.path.join(FLOD0_PATH, "train_content_flod0.pqt")
TRAIN_TOPIC_PATH = os.path.join(FLOD0_PATH, "train_topics_flod0.pqt")
TRAIN_CORRELATIONS_PATH = os.path.join(FLOD0_PATH, "train_correlations_flod0.pqt")

VALID_CONTENT_PATH = os.path.join(FLOD0_PATH, "valid_content_flod0.pqt")
VALID_TOPIC_PATH = os.path.join(FLOD0_PATH, "valid_topics_flod0.pqt")
VALID_CORRELATIONS_PATH = os.path.join(FLOD0_PATH, "valid_correlations_flod0.pqt")

FLOD = 5
SEED_LIST = [42, 20, 91, 41, 44]

KFLOD_DATA_DIRNAME = os.path.join(INPUT_DATA_PATH, "kflod_data")

# Build Train Dataset

In [None]:
df_train_content = pd.read_parquet(TRAIN_CONTENT_PATH)
df_train_content["title"] = df_train_content["title"].apply(lambda x: x if x is not None else "")
df_train_content["description"] = df_train_content["description"].apply(lambda x: x if x is not None else "")
df_train_content["content_text"] = df_train_content["title"]+df_train_content["description"]
df_train_content

In [None]:
df_train_topic = pd.read_parquet(TRAIN_TOPIC_PATH)
df_train_topic["title"] = df_train_topic["title"].apply(lambda x: x if x is not None else "")
df_train_topic["description"] = df_train_topic["description"].apply(lambda x: x if x is not None else "")
df_train_topic["topic_text"] = df_train_topic["title"]+df_train_topic["description"]
df_train_topic

In [None]:
df_label = pd.read_parquet(TRAIN_CORRELATIONS_PATH)
df_label = df_label.merge(df_train_topic[["id", "topic_text"]], left_on="topic_id", right_on="id", how="inner")
df_label = df_label.merge(df_train_content[["id", "content_text"]], left_on="content_ids", right_on="id", how="inner")

df_label[["topic_text", "content_text"]].to_parquet(os.path.join(FLOD0_PATH, "unsup_cl_data.parquet"), index=False)
df_label

# Build Valid Dataset

In [None]:
df_content = pd.read_csv(CONTENT_PATH)
df_content["title"] = df_content["title"].apply(lambda x: x if x is not None else "")
df_content["description"] = df_content["description"].apply(lambda x: x if x is not None else "")
df_content["content_text"] = df_content["title"]+df_content["description"]
df_content = df_content[["id", "content_text", "language"]].fillna("")
df_content

In [None]:
df_valid_topics = pd.read_parquet(VALID_TOPIC_PATH)
df_valid_topics["title"] = df_valid_topics["title"].apply(lambda x: x if x is not None else "")
df_valid_topics["description"] = df_valid_topics["description"].apply(lambda x: x if x is not None else "")
df_valid_topics["topics_text"] = df_valid_topics["title"]+df_valid_topics["description"]
df_valid_topics = df_valid_topics[["id", "topics_text", "language"]].fillna("")
df_valid_topics

In [None]:
df_valid_label = pd.read_parquet(VALID_CORRELATIONS_PATH)
df_valid_label = df_valid_label.merge(df_valid_topics[["id", "language"]].drop_duplicates(subset=["id", "language"]), left_on="topic_id", right_on="id")[
    ["topic_id", "content_ids", "language"]
]
df_valid_label

In [None]:
df_valid_label

In [None]:
valid_language = df_valid_label["language"].unique().tolist()
with open(os.path.join(OUTPUT_PATH, "valid", "language.txt"), "w") as f:
    f.write("\n".join(valid_language))

for p in tqdm(valid_language):
    path = os.path.join(OUTPUT_PATH, "valid", p)
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)
    df_content[df_content["language"]==p].to_parquet(os.path.join(path, f"content_{p}.pqt"))
    df_valid_topics[df_valid_topics["language"]==p].to_parquet(os.path.join(path, f"topics_{p}.pqt"))
    df_valid_label[df_valid_label["language"]==p].to_parquet(os.path.join(path, f"correlations_{p}.pqt"))

# Craft

In [None]:
df_valid_flod1_topics = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod1/valid_topics_flod1.pqt")
df_valid_flod1_topics

In [None]:
sample_submission = df_valid_flod1_topics[["id"]].sample(3000).rename({"id": "topic_id"}, axis=1).reset_index(drop=True)
sample_submission["content_ids"] = ""
sample_submission

In [None]:
sample_submission.to_csv(
    "/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod1/sample_submission.csv",
    index=False
)

# Stage2

In [12]:
df_content = pd.read_csv(CONTENT_PATH)
df_content["title"] = df_content["title"].apply(lambda x: x if x is not None else "")
df_content["description"] = df_content["description"].apply(lambda x: x if x is not None else "")
df_content["content_text"] = df_content["title"]+df_content["description"]
df_content = df_content[["id", "content_text", "language"]].fillna("")
df_content

Unnamed: 0,id,content_text,language
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,93...",es
1,c_000087304a9e,Trovare i fattori di un numeroSal trova i fatt...,it
2,c_0000ad142ddb,Sumar curvas de demandaCómo añadir curvas de d...,es
3,c_0000c03adc8d,Nado de aproximaçãoNeste vídeo você vai aprend...,pt
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdfgeometry-m3-to...,es
...,...,...,...
154042,c_fffcbdd4de8b,,en
154043,c_fffe15a2d069,Sommare facendo gruppi da 10Sal somma 5+68 spe...,it
154044,c_fffed7b0d13a,Introdução à subtraçãoSal fala sobre o que sig...,pt
154045,c_ffff04ba7ac7,,en


In [16]:
df_topic = pd.read_parquet(TRAIN_TOPIC_PATH)
df_topic["title"] = df_topic["title"].apply(lambda x: x if x is not None else "")
df_topic["description"] = df_topic["description"].apply(lambda x: x if x is not None else "")
df_topic["topics_text"] = df_topic["title"]+df_topic["description"]
df_topic = df_topic[["id", "topics_text", "language"]].fillna("")
df_topic

Unnamed: 0,id,topics_text,language
0,t_00004da3a1b2,Откриването на резисторитеИзследване на матери...,bg
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,en
2,t_00069b63a70a,Transcripts,en
3,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,bg
4,t_0008a1bd84ba,12. 20: Bird Reproduction,en
...,...,...,...
61572,t_fff9e5407d13,NA_U06 - El periódico,es
61573,t_fffbe1d5d43c,Inscribed shapes problem solvingUse properties...,sw
61574,t_fffe14f1be1e,Lección 7,es
61575,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,ar


In [18]:
df_recall_label = pd.read_parquet(TRAIN_CORRELATIONS_PATH)
df_recall_label = df_recall_label.merge(df_topic[["id", "language"]].drop_duplicates(subset=["id", "language"]), left_on="topic_id", right_on="id")[
    ["topic_id", "content_ids", "language"]
]
df_recall_label

Unnamed: 0,topic_id,content_ids,language
0,t_00004da3a1b2,c_1108dd0c7a5d,bg
1,t_00004da3a1b2,c_376c5a8eb028,bg
2,t_00004da3a1b2,c_5bc0e1e2cba0,bg
3,t_00004da3a1b2,c_76231f9d0b5e,bg
4,t_00069b63a70a,c_11a1dc0bfb99,en
...,...,...,...
211308,t_fff9e5407d13,c_d64037a72376,es
211309,t_fffbe1d5d43c,c_46f852a49c08,sw
211310,t_fffbe1d5d43c,c_6659207b25d5,sw
211311,t_fffe14f1be1e,c_cece166bad6a,es


In [19]:
recall_language = df_recall_label["language"].unique().tolist()
with open(os.path.join(OUTPUT_PATH, "stage2", "language.txt"), "w") as f:
    f.write("\n".join(recall_language))

for p in tqdm(recall_language):
    path = os.path.join(OUTPUT_PATH, "stage2", p)
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)
    df_content[df_content["language"]==p].to_parquet(os.path.join(path, f"content_{p}.pqt"))
    df_topic[df_topic["language"]==p].to_parquet(os.path.join(path, f"topics_{p}.pqt"))
    df_recall_label[df_recall_label["language"]==p].to_parquet(os.path.join(path, f"correlations_{p}.pqt"))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:01<00:00, 22.54it/s]


In [20]:
df_recall = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/stage2/recall.pqt")
df_recall

Unnamed: 0,topics_id,content_id,label
0,t_00004da3a1b2,c_3b7657ad7868,0
1,t_00004da3a1b2,c_0feaaa5dc39d,0
2,t_00004da3a1b2,c_431a13312468,0
3,t_00004da3a1b2,c_d35077f2c3d3,0
4,t_00004da3a1b2,c_dfa229bd21df,0
...,...,...,...
2428645,t_ed73fc05e532,c_1abfef0cd811,0
2428646,t_ed73fc05e532,c_6fa210096020,0
2428647,t_ed73fc05e532,c_55957ce69cc4,0
2428648,t_ed73fc05e532,c_fa3efe520e86,0


In [21]:
df_topic = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/train_topics_flod0.pqt")
df_topic["title"] = df_topic["title"].apply(lambda x: x if x is not None else "")
df_topic["description"] = df_topic["description"].apply(lambda x: x if x is not None else "")
df_topic["topics_text"] = df_topic["title"]+df_topic["description"]
df_topic = df_topic[["id", "topics_text", "language"]].fillna("")
df_topic

Unnamed: 0,id,topics_text,language
0,t_00004da3a1b2,Откриването на резисторитеИзследване на матери...,bg
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,en
2,t_00069b63a70a,Transcripts,en
3,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,bg
4,t_0008a1bd84ba,12. 20: Bird Reproduction,en
...,...,...,...
61572,t_fff9e5407d13,NA_U06 - El periódico,es
61573,t_fffbe1d5d43c,Inscribed shapes problem solvingUse properties...,sw
61574,t_fffe14f1be1e,Lección 7,es
61575,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,ar


In [22]:
df_content = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/train_content_flod0.pqt")
df_content["title"] = df_content["title"].apply(lambda x: x if x is not None else "")
df_content["description"] = df_content["description"].apply(lambda x: x if x is not None else "")
df_content["content_text"] = df_content["title"]+df_content["description"]
df_content = df_content[["id", "content_text", "language"]].fillna("")
df_content

Unnamed: 0,id,content_text,language
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,93...",es
1,c_000087304a9e,Trovare i fattori di un numeroSal trova i fatt...,it
2,c_0000ad142ddb,Sumar curvas de demandaCómo añadir curvas de d...,es
3,c_0000c03adc8d,Nado de aproximaçãoNeste vídeo você vai aprend...,pt
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdfgeometry-m3-to...,es
...,...,...,...
144651,c_fffcbdd4de8b,2. 12: Diffusion,en
144652,c_fffe15a2d069,Sommare facendo gruppi da 10Sal somma 5+68 spe...,it
144653,c_fffed7b0d13a,Introdução à subtraçãoSal fala sobre o que sig...,pt
144654,c_ffff04ba7ac7,SA of a Cone,en


In [23]:
df_recall = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/output/stage2/recall.pqt")
df_recall = df_recall.merge(df_content, left_on="content_id", right_on="id", how="left")
df_recall = df_recall.merge(df_topic, left_on="topics_id", right_on="id", how="left")
df_recall

Unnamed: 0,topics_id,content_id,label,id_x,content_text,language_x,id_y,topics_text,language_y
0,t_00004da3a1b2,c_3b7657ad7868,0,c_3b7657ad7868,Съпротивление и проводимостДа разгледаме свойс...,bg,t_00004da3a1b2,Откриването на резисторитеИзследване на матери...,bg
1,t_00004da3a1b2,c_0feaaa5dc39d,0,c_0feaaa5dc39d,Успоредно свързани резисториУспоредно свързани...,bg,t_00004da3a1b2,Откриването на резисторитеИзследване на матери...,bg
2,t_00004da3a1b2,c_431a13312468,0,c_431a13312468,Успоредни резистори (част 2)Множество успоредн...,bg,t_00004da3a1b2,Откриването на резисторитеИзследване на матери...,bg
3,t_00004da3a1b2,c_d35077f2c3d3,0,c_d35077f2c3d3,Опростяване на мрежи от резисториДа упражним з...,bg,t_00004da3a1b2,Откриването на резисторитеИзследване на матери...,bg
4,t_00004da3a1b2,c_dfa229bd21df,0,,,,t_00004da3a1b2,Откриването на резисторитеИзследване на матери...,bg
...,...,...,...,...,...,...,...,...,...
2428645,t_ed73fc05e532,c_1abfef0cd811,0,,,,t_ed73fc05e532,ÖAP Kullanımı,tr
2428646,t_ed73fc05e532,c_6fa210096020,0,c_6fa210096020,Grue - Prison BreakÜcretsiz ve yüksek kalitede...,tr,t_ed73fc05e532,ÖAP Kullanımı,tr
2428647,t_ed73fc05e532,c_55957ce69cc4,0,c_55957ce69cc4,Feuille CarréeÜcretsiz ve yüksek kalitedeki ye...,tr,t_ed73fc05e532,ÖAP Kullanımı,tr
2428648,t_ed73fc05e532,c_fa3efe520e86,0,c_fa3efe520e86,Feuille CarréeÜcretsiz ve yüksek kalitedeki ye...,tr,t_ed73fc05e532,ÖAP Kullanımı,tr


In [51]:
df_recall[["content_text", "topics_text", "label"]].to_parquet("stage2_flod0.pqt")

In [52]:
df_recall[["content_text", "topics_text", "label"]]

Unnamed: 0,content_text,topics_text,label
0,Съпротивление и проводимостДа разгледаме свойс...,Откриването на резисторитеИзследване на матери...,0
1,Успоредно свързани резисториУспоредно свързани...,Откриването на резисторитеИзследване на матери...,0
2,Успоредни резистори (част 2)Множество успоредн...,Откриването на резисторитеИзследване на матери...,0
3,Опростяване на мрежи от резисториДа упражним з...,Откриването на резисторитеИзследване на матери...,0
4,,Откриването на резисторитеИзследване на матери...,0
...,...,...,...
2428645,,ÖAP Kullanımı,0
2428646,Grue - Prison BreakÜcretsiz ve yüksek kalitede...,ÖAP Kullanımı,0
2428647,Feuille CarréeÜcretsiz ve yüksek kalitedeki ye...,ÖAP Kullanımı,0
2428648,Feuille CarréeÜcretsiz ve yüksek kalitedeki ye...,ÖAP Kullanımı,0


In [31]:
df_recall["content_text"][0]

'Съпротивление и проводимостДа разгледаме свойствата на материалите, които карат резистора да пречи на\nпротичащия ток\n\n'

In [26]:
tokenizer = AutoTokenizer.from_pretrained("/home/search3/lichunyu/pretrain_model/bert-base-multilingual-uncased")

In [None]:
df_recall["content_text_prune"] = df_recall["content_text"].apply(lambda x: )

In [49]:
    from transformers.utils.generic import PaddingStrategy
    from transformers.tokenization_utils import TruncationStrategy

tokenizer(
    df_recall["content_text"][0],
    df_recall["content_text"][1],
    max_length=7,
    padding="max_length",
    truncation="longest_first"
#     padding_strategy=PaddingStrategy.MAX_LENGTH,
#     truncation_strategy=TruncationStrategy.LONGEST_FIRST
)

{'input_ids': [101, 323, 28583, 102, 325, 67832, 102], 'token_type_ids': [0, 0, 0, 0, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [2]:
df_topic = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/train_topics_flod0.pqt")
df_topic

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
3,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
4,t_0008a1bd84ba,12. 20: Bird Reproduction,,ebc86c,supplemental,5,en,t_c44ac9711007,True
...,...,...,...,...,...,...,...,...,...
61572,t_fff9e5407d13,NA_U06 - El periódico,,71fd51,supplemental,2,es,t_5bd8f6ae9f7d,True
61573,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True
61574,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True
61575,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,9fd860,source,2,ar,t_5b4f3ba4eb7d,True


In [4]:
df_topic = df_topic[df_topic["category"]!="source"]
df_topic

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
4,t_0008a1bd84ba,12. 20: Bird Reproduction,,ebc86c,supplemental,5,en,t_c44ac9711007,True
6,t_000d1fb3f2f5,2.1.2 - Logarithms,,e77b55,aligned,5,en,t_b897d168db90,True
8,t_00102869fbcb,Triangles and polygons,Learning outcomes: students must be able to so...,a91e32,aligned,3,en,t_039cecc12bb8,True
11,t_0012a45fa09c,Quiz: materials and techniques,,2ee29d,aligned,4,en,t_6957d4a9f469,True
...,...,...,...,...,...,...,...,...,...
61566,t_fff1f01cfeb0,Desarrollo de poliedros,Identifica prismas con su desarrollo plano.,998df9,supplemental,5,es,t_82dd0e9526f0,True
61569,t_fff7782561f4,Introduction,"In certain situations, comparison by division ...",d5fb04,supplemental,3,en,t_2a4dc28b0431,True
61572,t_fff9e5407d13,NA_U06 - El periódico,,71fd51,supplemental,2,es,t_5bd8f6ae9f7d,True
61574,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True


In [5]:
df_train_corr = pd.read_parquet("/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/train_correlations_flod0.pqt")
df_train_corr

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d
1,t_00004da3a1b2,c_376c5a8eb028
2,t_00004da3a1b2,c_5bc0e1e2cba0
3,t_00004da3a1b2,c_76231f9d0b5e
4,t_00069b63a70a,c_11a1dc0bfb99
...,...,...
211308,t_fff9e5407d13,c_d64037a72376
211309,t_fffbe1d5d43c,c_46f852a49c08
211310,t_fffbe1d5d43c,c_6659207b25d5
211311,t_fffe14f1be1e,c_cece166bad6a


In [8]:
df_train_corr[df_train_corr["topic_id"].isin(df_topic["id"])].reset_index(drop=True).to_parquet(
    "/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/train_correlations_flod0_no_source.pqt"
)

In [10]:
df_topic.reset_index(drop=True).to_parquet(
    "/home/search3/lichunyu/k12-curriculum-recommendations/data/input/kflod_data/flod0/train_topics_flod0_no_source.pqt"
)