In [1]:
import os
import gc
import psutil
from pathlib import Path
import pathlib

import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding

from scipy.spatial.distance import cosine
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from sklearn.neighbors import NearestNeighbors

device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [2]:
class PATH:
    # epoch 34668
    model = '/root/autodl-nas/model/sentence-transformers/all-MiniLM-L6-v2_new_r1.1'
    # input_dir = '/root/autodl-nas/data/k12'
    input_dir = '/root/autodl-nas/data/k12/cv_data/fold_0'
    
    output_dir = '/root/autodl-nas/data/k12/out'
    cv_dir = '/root/autodl-nas/data/k12/cv_data'
    pretrained_dir = '/root/autodl-nas/model/'
    content_dir = os.path.join(input_dir, 'content.csv')
    correlation_dir = os.path.join(input_dir, 'correlations.csv')
    submission_dir = os.path.join(input_dir, 'sample_submission.csv')
    topic_dir = os.path.join(input_dir, 'topics.csv')
    
    
class CFG:
    seed = 11
    fold = 0
    n_fold = 3
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    # cpt = '/root/autodl-nas/model/checkpoint-34668/'

## Lang

In [3]:
def get_level_features(df_topic, level_cols=['title']):
    df_hier = df_topic[list(set(level_cols + ['id', 'parent', 'level', 'has_content']))]
    highest_level = df_hier['level'].max()
    
    df_level = df_hier.query('level == 0').copy(deep=True)
    level_list = list()
    for col in level_cols:
        df_level[f'{col}_level'] = df_level[f'{col}'].apply(lambda x: [x])

    for i in tqdm(range(highest_level + 1)):
        level_list.append(df_level[df_level['has_content']])
        df_level_high = df_hier.query('level == @i+1')
        df_level = df_level_high.merge(df_level, left_on='parent', right_on='id', suffixes=['', '_parent'], how='inner')
        for col in level_cols:
            df_level[f'{col}_level'] = df_level[f'{col}_level'] + df_level[f'{col}'].apply(lambda x: [x])
        for col in df_level.columns:
            if col.endswith('_parent'):
                df_level.drop(columns=col, inplace=True)
    df = pd.concat(level_list).reset_index(drop=True)
    return df[list(set(['id'] + [f'{col}_level' for col in level_cols]))]

In [4]:
def get_topic_field(d):
    title = list(filter(lambda x: pd.notna(x), d['title_level']))
    title = ' of '.join(title[-1::-1])
    title = 'No information' if title=='' else title
    title = '[TITLE] ' + title + '. '
    description = d['description'] if pd.notna(d['description']) else 'No information'
    description = '[DESCRIPTION]' + description + '. '
    field = title + description
    return field

def get_content_field(d):
    title = d['title']
    title = 'No information' if pd.isna(title) else title
    title = '[TITLE] ' + title + '. '
    description = d['description'] if pd.notna(d['description']) else 'No information'
    description = '[DESCRIPTION]' + description + '. '
    kind = '[' + d['kind'] + '] '
    field = kind + title + description
    return field

In [5]:
def prepare_language_match(path, mode='train'):
    topic = pd.read_csv(path.topic_dir)[['id', 'language']]
    content = pd.read_csv(path.content_dir)[['id', 'language']]
    if mode == 'train':
        corr = pd.read_csv(path.correlation_dir)
    elif mode == 'valid':
        corr = pd.read_csv(path.submission_dir)
    
    topic = topic.merge(corr, left_on='id', right_on='topic_id', how='right')[['id', 'language']]
    match_dict = {}
    for language in topic['language'].unique():
        match_dict[language] = (topic.query('language==@language')[['id']], content.query('language==@language')[['id']])
    return match_dict

In [6]:
def prepare_match_features(topic, content, path):
    df_topic = pd.read_csv(path.topic_dir)
    df_content = pd.read_csv(path.content_dir)
    level = get_level_features(df_topic)
    df_topic = df_topic.merge(level, on='id', how='right')
    df_topic['field'] = df_topic.apply(lambda x: get_topic_field(x), axis=1)
    df_content['field'] = df_content.apply(lambda x: get_content_field(x), axis=1)
    topic = topic[['id']].merge(df_topic[['id', 'field']], on='id', how='left')
    content = content[['id']].merge(df_content[['id', 'field']], on='id', how='left')
    return topic, content

In [7]:
%%time
topic_content_match = prepare_language_match(PATH, mode='valid')

CPU times: user 10.3 s, sys: 1.31 s, total: 11.6 s
Wall time: 11.6 s


In [18]:
%%time
for lang in topic_content_match.keys():
    print(f'{lang}\t - topics: {len(topic_content_match[lang][0])}\t - contents: {len(topic_content_match[lang][1])}')
    topic, content = topic_content_match[lang]
    topic, content = prepare_match_features(topic, content, PATH)

gu	 - topics: 131	 - contents: 3677
en	 - topics: 5373	 - contents: 65939
es	 - topics: 2189	 - contents: 30844
fr	 - topics: 111	 - contents: 10682
hi	 - topics: 132	 - contents: 4042
fil	 - topics: 83	 - contents: 516
pt	 - topics: 76	 - contents: 10435
bn	 - topics: 191	 - contents: 2513
as	 - topics: 18	 - contents: 641
sw	 - topics: 31	 - contents: 1447
CPU times: user 525 µs, sys: 0 ns, total: 525 µs
Wall time: 419 µs


## Calc Embeddings

In [12]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

MODEL_PATH = PATH.model
OUTPUT_PATH = os.path.join(PATH.cv_dir, f"fold_{CFG.fold}")
N_NEIGHBOR = 100

In [13]:
class PlainDataset(Dataset):

    def __init__(self, df, tokenizer, label_name="") -> None:
        super().__init__()
        self.data = df[label_name].tolist()
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        text = self.data[index]
        inputs = self.tokenizer(
                text, 
                add_special_tokens = True,
                truncation='longest_first',
                max_length = 64,
                padding = 'max_length',
                return_attention_mask = True,
                return_tensors = 'pt',
        )
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        return inputs

    def __len__(self):
        return len(self.data)

In [14]:
class Convert2Embed(object):

    def __init__(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        self.model = AutoModel.from_pretrained(MODEL_PATH).cuda()

    def convert2embeddind(self, df, label_name=""):
        embed: list = []
        dataset = PlainDataset(df, tokenizer=self.tokenizer, label_name=label_name)
        dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)
        for batch in dataloader:
            batch = {k: v.cuda() for k, v in batch.items()}
            with torch.no_grad():
                embeddings = self.model(**batch, output_hidden_states=True, return_dict=True).pooler_output
                embed.append(embeddings.cpu().clone().detach().numpy())
        embed = np.concatenate(embed, axis=0)
        return embed

    def get_embed(self):
        for lang in topic_content_match.keys():
            # topic, content = topic_content_match[lang]
            # topic, content = prepare_match_features(topic, content, PATH)
            # topic_path = os.path.join(OUTPUT_PATH, "valid", f"topic_{lang}.pqt")
            # content_path = os.path.join(OUTPUT_PATH, "valid", f"content_{lang}.pqt")
            # topic.to_parquet(topic_path)
            # content.to_parquet(content_path)
            
            for t in ["content", "topic"]:
                path = os.path.join(OUTPUT_PATH, "valid", f"{t}_{lang}.pqt")
                df = pd.read_parquet(path)
                embed = self.convert2embeddind(df, label_name=f"field")
                np.save(path.replace(".pqt", ".npy"), embed)
                
        # for p in tqdm(valid_language):
        #     for t in ["content", "topics"]:
        #         path = os.path.join(OUTPUT_PATH, "valid", p, f"{t}_{p}.pqt")
        #         df = pd.read_parquet(path)
        #         embed = self.convert2embeddind(df, label_name=f"{t}_text")
        #         np.save(path.replace(".pqt", ".npy"), embed)

In [15]:
def valid():
#     with open(os.path.join(OUTPUT_PATH, "valid", "language.txt"), "r") as f:
#         valid_language = f.read().splitlines()
    recall_amount = 0
    recall_num = 0
    recall_total = {}
    for lang in topic_content_match.keys():
        ## debug
        # global df_pred, df_correlations
        content_path = os.path.join(OUTPUT_PATH, "valid", f"content_{lang}.npy")
        topics_path = os.path.join(OUTPUT_PATH, "valid", f"topic_{lang}.npy")
        correlations_path = PATH.submission_dir
        content_array = np.load(content_path)
        topics_array = np.load(topics_path)
        model = NearestNeighbors(n_neighbors=N_NEIGHBOR, metric="cosine")
        model.fit(content_array)
        d, r = model.kneighbors(topics_array)
        df_content = pd.read_parquet(content_path.replace(".npy", ".pqt"))
        df_topics = pd.read_parquet(topics_path.replace(".npy", ".pqt"))
        df_correlations = pd.read_csv(correlations_path).astype({"topic_id": str})
        # .rename({"topic_id": "topics_id", "content_ids": "content_id"}, axis=1)
        
        pred = {"topic_id": [], "content_ids": []}
        for i in range(len(df_topics)):
            r_t = r[i]
            tmp = []
            for c in r_t:
                content_id = df_content.iloc[c]["id"]
                tmp.append(content_id)
            topics_id = df_topics.iloc[i]["id"]
            pred["topic_id"].append(topics_id)
            pred["content_ids"].append(tmp)
        
        df_pred = pd.DataFrame(pred).astype({"topic_id": str})
        df_correlations['content_ids'] = df_correlations['content_ids'].apply(lambda x: list(x.split()))
        df_pred = df_pred.merge(df_correlations, on='topic_id', how='left', suffixes=['_pred', '_true'])
        
        df_pred['recall'] = df_pred.apply(lambda x: len(set(x['content_ids_true']).intersection(x['content_ids_pred']))
                                                       / len(x['content_ids_true']), 
                                          axis=1)
        recall = df_pred['recall'].mean()
        recall_total[lang] = recall
        recall_num += len(df_pred)
        recall_amount += df_pred['recall'].sum()
    print(f"Recall: {recall_amount/recall_num}")
    print(f"----------------Details----------------")
    for k, v in recall_total.items():
        print(f"Recall for language {k}: {v}")


In [None]:
%%time
if __name__ == "__main__":
    # P = Convert2Embed()
    # P.get_embed()
    N_NEIGHBOR = 50
    valid()

In [None]:
%%time
if __name__ == "__main__":
    # P = Convert2Embed()
    # P.get_embed()
    N_NEIGHBOR = 100
    valid()

In [106]:
%%time
if __name__ == "__main__":
    # P = Convert2Embed()
    # P.get_embed()
    N_NEIGHBOR = 200
    valid()

Recall: 0.8091437845276167
----------------Details----------------
Recall for language gu: 0.5162500717442461
Recall for language en: 0.9077353249369697
Recall for language es: 0.5824658411831884
Recall for language fr: 0.8474526585207699
Recall for language hi: 0.7574651578002543
Recall for language fil: 0.9781985083189902
Recall for language pt: 0.8899573744968482
Recall for language bn: 0.7173052514675551
Recall for language as: 1.0
Recall for language sw: 0.8522539288668322
CPU times: user 1min 45s, sys: 16.8 s, total: 2min 2s
Wall time: 1min 43s
