In [None]:
import os
import time
import datetime
import shutil
import pathlib

import pandas as pd
import polars as pl
import numpy as np
from tqdm import tqdm
from scipy.special import comb
from sklearn.neighbors import NearestNeighbors

# Setup

In [None]:
# Root directories. DATA_PATH is relative, so every path below resolves
# against the working directory of the kernel.
DATA_PATH = "../data"
OUTPUT_PATH = os.path.join(DATA_PATH, "output")
INPUT_DATA_PATH = os.path.join(DATA_PATH, "input")
RAW_DATA_DIR = os.path.join(INPUT_DATA_PATH, "raw")
# NOTE(review): "flod" looks like a typo of "fold"; it is kept because the
# spelling is part of the on-disk directory and file names.
KFLOD_DATA_DIRNAME = os.path.join(INPUT_DATA_PATH, "kflod_data")
FLOD0_PATH = os.path.join(KFLOD_DATA_DIRNAME, "flod0")

# Raw CSV inputs.
CONTENT_PATH = os.path.join(RAW_DATA_DIR, "content.csv")
TOPIC_PATH = os.path.join(RAW_DATA_DIR, "topics.csv")
CORRELATIONS_PATH = os.path.join(RAW_DATA_DIR, "correlations.csv")

# Training split stored under FLOD0_PATH.
TRAIN_CONTENT_PATH = os.path.join(FLOD0_PATH, "train_content_flod0.pqt")
TRAIN_TOPIC_PATH = os.path.join(FLOD0_PATH, "train_topics_flod0.pqt")
TRAIN_CORRELATIONS_PATH = os.path.join(FLOD0_PATH, "train_correlations_flod0.pqt")

# Validation split stored under FLOD0_PATH.
VALID_CONTENT_PATH = os.path.join(FLOD0_PATH, "valid_content_flod0.pqt")
VALID_TOPIC_PATH = os.path.join(FLOD0_PATH, "valid_topics_flod0.pqt")
VALID_CORRELATIONS_PATH = os.path.join(FLOD0_PATH, "valid_correlations_flod0.pqt")

# Split settings; not referenced by the cells visible in this notebook section.
FLOD = 5
SEED_LIST = [42, 20, 91, 41, 44]

# Build Train Dataset

In [None]:
# Load the training content split and build one text field per row.
df_train_content = pd.read_parquet(TRAIN_CONTENT_PATH)
# fillna("") replaces both None and NaN, so a missing title or description
# can never turn the concatenated text into a missing value.
df_train_content["title"] = df_train_content["title"].fillna("")
df_train_content["description"] = df_train_content["description"].fillna("")
df_train_content["content_text"] = df_train_content["title"] + df_train_content["description"]
df_train_content

In [None]:
# Load the training topic split and build one text field per row.
df_train_topic = pd.read_parquet(TRAIN_TOPIC_PATH)
# fillna("") replaces both None and NaN, so a missing title or description
# can never turn the concatenated text into a missing value.
df_train_topic["title"] = df_train_topic["title"].fillna("")
df_train_topic["description"] = df_train_topic["description"].fillna("")
df_train_topic["topic_text"] = df_train_topic["title"] + df_train_topic["description"]
df_train_topic

In [None]:
# Attach the topic text and the content text to every training correlation row.
# Both joins are inner joins, so rows whose topic or content id is absent from
# the training frames are dropped.
df_label = (
    pd.read_parquet(TRAIN_CORRELATIONS_PATH)
    .merge(
        df_train_topic[["id", "topic_text"]],
        left_on="topic_id",
        right_on="id",
        how="inner",
    )
    .merge(
        df_train_content[["id", "content_text"]],
        left_on="content_ids",
        right_on="id",
        how="inner",
    )
)

# Persist only the (topic_text, content_text) pairs next to the fold data.
df_label[["topic_text", "content_text"]].to_parquet(
    os.path.join(FLOD0_PATH, "unsup_cl_data.parquet"),
    index=False,
)
df_label

# Build Valid Dataset

In [None]:
# Load the full raw content table and build one text field per row.
df_content = pd.read_csv(CONTENT_PATH)
# read_csv represents empty fields as NaN (not None), and str + NaN yields NaN.
# Fill the two text columns *before* concatenating; otherwise a row with a
# title but no description ends up with an empty content_text.
df_content["title"] = df_content["title"].fillna("")
df_content["description"] = df_content["description"].fillna("")
df_content["content_text"] = df_content["title"] + df_content["description"]
# Keep only the columns used downstream; remaining gaps (e.g. language) become "".
df_content = df_content[["id", "content_text", "language"]].fillna("")
df_content

In [None]:
# Load the validation topic split and build one text field per row.
df_valid_topics = pd.read_parquet(VALID_TOPIC_PATH)
# fillna("") replaces both None and NaN, so a missing title or description
# can never turn the concatenated text into a missing value.
df_valid_topics["title"] = df_valid_topics["title"].fillna("")
df_valid_topics["description"] = df_valid_topics["description"].fillna("")
df_valid_topics["topics_text"] = df_valid_topics["title"] + df_valid_topics["description"]
# Keep only the columns used downstream; remaining gaps (e.g. language) become "".
df_valid_topics = df_valid_topics[["id", "topics_text", "language"]].fillna("")
df_valid_topics

In [None]:
# One (topic id, language) row per distinct pair, used to tag each correlation.
topic_language = df_valid_topics[["id", "language"]].drop_duplicates(subset=["id", "language"])

# Default (inner) merge: correlations whose topic is not in df_valid_topics are dropped.
df_valid_label = (
    pd.read_parquet(VALID_CORRELATIONS_PATH)
    .merge(topic_language, left_on="topic_id", right_on="id")
    .loc[:, ["topic_id", "content_ids", "language"]]
)
df_valid_label

In [None]:
# Displays the merged validation labels again (same frame as built in the cell above).
df_valid_label

In [None]:
# Split the validation data by language and write one directory per language.
valid_dir = os.path.join(OUTPUT_PATH, "valid")
# Create the parent directory first: language.txt is written into it before
# any per-language sub-directory exists, which fails on a fresh checkout.
pathlib.Path(valid_dir).mkdir(parents=True, exist_ok=True)

valid_language = df_valid_label["language"].unique().tolist()
# One language code per line; read back by the embedding section below.
with open(os.path.join(valid_dir, "language.txt"), "w") as f:
    f.write("\n".join(valid_language))

for p in tqdm(valid_language):
    path = os.path.join(valid_dir, p)
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)
    # Content comes from the full raw table; topics and labels from the validation split.
    df_content[df_content["language"]==p].to_parquet(os.path.join(path, f"content_{p}.pqt"))
    df_valid_topics[df_valid_topics["language"]==p].to_parquet(os.path.join(path, f"topics_{p}.pqt"))
    df_valid_label[df_valid_label["language"]==p].to_parquet(os.path.join(path, f"correlations_{p}.pqt"))

# Convert to embedding

In [None]:
# Reload the language codes written by the export step, one code per line.
valid_language = pathlib.Path(OUTPUT_PATH, "valid", "language.txt").read_text().splitlines()

# The loop only derives each language's directory; nothing else is done with it
# in this cell, so `path` ends up pointing at the last language's directory.
for p in valid_language:
    path = os.path.join(OUTPUT_PATH, "valid", p)

In [None]:
# Directory of the "pt" validation split. Derived from OUTPUT_PATH instead of a
# machine-specific absolute path so the notebook runs on any checkout.
# NOTE(review): assumes the kernel's working directory makes OUTPUT_PATH resolve
# to the repository's data/output directory — confirm.
path = os.path.join(OUTPUT_PATH, "valid", "pt")

In [None]:
# Load the content matrix stored next to the "pt" parquet files and show its shape.
# NOTE(review): presumably one embedding row per row of content_pt.pqt — confirm.
content_matrix_file = os.path.join(path, "content_pt.npy")
array = np.load(content_matrix_file)
array.shape

In [None]:
# Cosine-distance nearest-neighbour index over the content matrix; each query
# returns 50 neighbours. fit() returns the estimator itself, so `model` is the
# fitted index.
model = NearestNeighbors(n_neighbors=50, metric="cosine").fit(array)
model

In [None]:
# Load the topic matrix used as queries against the content index and show its shape.
# NOTE(review): presumably one embedding row per row of topics_pt.pqt — confirm.
topic_matrix_file = os.path.join(path, "topics_pt.npy")
x = np.load(topic_matrix_file)
x.shape

In [None]:
# Query the fitted index with every row of `x`. With scikit-learn's default
# return_distance=True the result is a (distances, indices) tuple, so z[-1]
# holds the neighbour indices used further down.
z = model.kneighbors(x)

In [None]:
# Content rows of the "pt" split, used to translate neighbour positions into ids.
content_table_file = os.path.join(path, "content_pt.pqt")
content = pd.read_parquet(content_table_file)
content

In [None]:
# Inspect the first content row.
content.iloc[0]

In [None]:
# Topic/content correlation rows of the "pt" split.
label_table_file = os.path.join(path, "correlations_pt.pqt")
label = pd.read_parquet(label_table_file)
label

In [None]:
# Topic rows of the "pt" split.
topic_table_file = os.path.join(path, "topics_pt.pqt")
topics = pd.read_parquet(topic_table_file)
topics

In [None]:
# Correlation rows recorded for the single topic under inspection.
label[label["topic_id"]=="t_00068291e9a4"]

In [None]:
# Translate the neighbour positions returned for the first query row into content ids.
# NOTE(review): assumes row 0 of `x` belongs to topic t_00068291e9a4 — confirm.
neighbor_rows = z[-1][0]
r = content.iloc[neighbor_rows]["id"].tolist()

# Content ids recorded in the labels for that topic.
expected_ids = label.loc[label["topic_id"]=="t_00068291e9a4", "content_ids"].unique().tolist()

# Retrieved ids that also appear in the labels.
set(r).intersection(expected_ids)

In [None]:
# Neighbour indices (row positions in the fitted content matrix) for the first row of `x`.
z[-1][0]