In [1]:
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from transformers import AutoTokenizer, AutoModel, AutoConfig

from scipy.spatial.distance import cosine
from sklearn.neighbors import NearestNeighbors

In [2]:
INPUT_DIR = '/root/autodl-nas/data/k12/cv_data/fold_0'
OUTPUT_PATH = '/root/autodl-nas/data/k12/out/rank'

FOLD = 0

TOPIC_DIR = os.path.join(INPUT_DIR, 'topics.csv')
CONTENT_DIR = os.path.join(INPUT_DIR, 'content.csv')
# CORR_DIR = os.path.join(INPUT_DIR, 'sample_submission.csv')
CORR_DIR = os.path.join(INPUT_DIR, 'correlations.csv')
SUB_DIR = os.path.join(INPUT_DIR, 'sample_submission.csv')

MODEL_DIR = '/root/autodl-nas/model/r1_10/checkpoint-30816'
TOKENIZER_DIR = '/root/autodl-nas/model/sentence-transformers/all-MiniLM-L6-v2_new_r1.1'

N_NEIGHBOR = 50

In [5]:
def get_topic_field(d):
    title = list(filter(lambda x: pd.notna(x), d['title_level']))
    title = ' of '.join(title[-1::-1])
    title = 'No information' if title=='' else title
    title = '[TITLE] ' + title + '. '
    description = d['description'] if pd.notna(d['description']) else 'No information'
    description = '[DESCRIPTION]' + description + '. '
    field = title + description
    return field

def get_content_field(d):
    title = d['title']
    title = 'No information' if pd.isna(title) else title
    title = '[TITLE] ' + title + '. '
    description = d['description'] if pd.notna(d['description']) else 'No information'
    description = '[DESCRIPTION]' + description + '. '
    kind = '[' + d['kind'] + '] '
    field = kind + title + description
    return field

In [39]:
class DataPreparation:
    
    def __init__(self, topic_path, content_path, submission_path):
        self.topic = pd.read_csv(topic_path)
        self.content = pd.read_csv(content_path)
        self.corr = pd.read_csv(submission_path)
        self.match_dict = None
    
    def prepare_topic(self):
        df_level = self._get_level_features(self.topic)
        self.topic = self.topic.merge(df_level, on='id', how='inner')
        self.topic['field'] = self.topic.apply(lambda x: get_topic_field(x), axis=1)
        return self.topic
    
    def prepare_content(self):
        self.content['field'] = self.content.apply(lambda x: get_content_field(x), axis=1)
        return self.content
    
    def prepare_language_match(self):
        topic = self.topic[['id', 'language']].merge(self.corr, left_on='id', right_on='topic_id', how='right')[['id', 'language']]
        match_dict = {}
        for language in topic['language'].unique():
            match_dict[language] = (topic.query('language==@language')[['id']], self.content.query('language==@language')[['id']])
        self.match_dict = match_dict
        return match_dict
    
    
    def _get_level_features(self, df_topic, level_cols=['title']):
        cols = list(set(level_cols + ['id', 'parent', 'level', 'has_content']))
        df_hier = df_topic[cols]
        
        highest_level = df_hier['level'].max()
        print(f'Highest Level: {highest_level}')

        df_level = df_hier.query('level == 0').copy(deep=True)
        level_list = list()
        for col in level_cols:
            df_level[f'{col}_level'] = df_level[f'{col}'].apply(lambda x: [x])

        for i in tqdm(range(highest_level + 1)):
            level_list.append(df_level[df_level['has_content']])
            df_level_high = df_hier.query('level == @i+1')
            df_level = df_level_high.merge(df_level, left_on='parent', right_on='id', suffixes=['', '_parent'], how='inner')
            for col in level_cols:
                df_level[f'{col}_level'] = df_level[f'{col}_level'] + df_level[f'{col}'].apply(lambda x: [x])
            for col in df_level.columns:
                if col.endswith('_parent'):
                    df_level.drop(columns=col, inplace=True)
        df = pd.concat(level_list).reset_index(drop=True)
        return df[set(['id'] + [f'{col}_level' for col in level_cols])]
    
    def prepare_rank_data(self, data_path, save_path, corr_path):
        df = pd.read_csv(data_path)
        df_corr = pd.read_csv(corr_path)
        df = df.merge(df_corr, on='topic_id', how='left', suffixes=['_pred', '_true'])
        df['content_ids_pred'] = df['content_ids_pred'].apply(lambda x: list(x.split()))
        df['content_ids_true'] = df['content_ids_true'].apply(lambda x: list(x.split()))
        df['content_id'] = df.apply(lambda x: list(set(x['content_ids_pred']+x['content_ids_true'])), axis=1)
        df['label'] = df.apply(lambda x: [1 if id in x['content_ids_true'] else 0 for id in x['content_id']], axis=1)
        df = df[['topic_id', 'content_id', 'label']].explode(['content_id', 'label'])
        df = df.merge(self.topic[['id', 'field']], left_on='topic_id', right_on='id', how='left').rename(columns={'field': 'topic'})
        df = df.merge(self.content[['id', 'field']], left_on='content_id', right_on='id', how='left').rename(columns={'field': 'content'})
        df_id = df[['topic_id', 'content_id', 'label']]
        df_field = df[['topic', 'content', 'label']]
        df_field.to_csv(save_path, index=None)
        return df_field
        
        
    
    def prepare(self):
        self.prepare_topic()
        self.prepare_content()


In [40]:
%%time
dp = DataPreparation(TOPIC_DIR, CONTENT_DIR, CORR_DIR)
dp.prepare()

Highest Level: 10


  0%|          | 0/11 [00:00<?, ?it/s]

  return df[set(['id'] + [f'{col}_level' for col in level_cols])]


CPU times: user 15.9 s, sys: 475 ms, total: 16.4 s
Wall time: 16.4 s


In [41]:
%%time
dp.prepare_rank_data('samples_r3_valid.csv', os.path.join(OUTPUT_PATH, 'samples_r3_valid_labeled.csv'), SUB_DIR)

Unnamed: 0,topic,content,label
0,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[exercise] [TITLE] 2-અંકની સંખ્યાનું વિભાજન કર...,0
1,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] દશક ઉમેરીએ ત્યારે સ્થાન કિંમતન...,0
2,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] નજીકના 100 માં ફેરવવું. [DESCR...,0
3,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[exercise] [TITLE] 100 સુધીના સરવાળાના વ્યવહાર...,0
4,[TITLE] 100 સુધીનો સરવાળો of 100 સુધીના સરવાળા...,[video] [TITLE] 1-અંકની સંખ્યા ઉમેરવા સમૂહ બના...,0
...,...,...,...
425006,[TITLE] 6.3.a.xvii Kutoa namba kamili of 6.3.a...,[exercise] [TITLE] Sehemu kujumlisha na kutoa....,0
425007,[TITLE] 6.3.a.xvii Kutoa namba kamili of 6.3.a...,[video] [TITLE] Kigawe kidogo cha shirika: vig...,0
425008,[TITLE] 6.3.a.xvii Kutoa namba kamili of 6.3.a...,[video] [TITLE] Kutoa kwa njia ya makundi (kuk...,0
425009,[TITLE] 6.3.a.xvii Kutoa namba kamili of 6.3.a...,[exercise] [TITLE] Kupata 5. [DESCRIPTION]Kuju...,0


In [43]:
%%time
dp.prepare_rank_data('samples_r3.csv', os.path.join(OUTPUT_PATH, 'samples_r3_labeled.csv'), CORR_DIR)

Unnamed: 0,topic,content,label
0,[TITLE] 12. 20: Bird Reproduction of 12: Verte...,[html5] [TITLE] 12. 19: Bird Structure and Fun...,0
1,[TITLE] 12. 20: Bird Reproduction of 12: Verte...,[html5] [TITLE] Vertebrate Reproduction. [DESC...,0
2,[TITLE] 12. 20: Bird Reproduction of 12: Verte...,[video] [TITLE] LIFE - Mudskipper Mud Wrestles...,0
3,[TITLE] 12. 20: Bird Reproduction of 12: Verte...,[video] [TITLE] How to tie up a poultry bird b...,0
4,[TITLE] 12. 20: Bird Reproduction of 12: Verte...,[video] [TITLE] Biology Evolution part 9 (Adap...,0
...,...,...,...
2679491,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Kendin Yap: Eski Ahşap Masa...,0
2679492,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Tarım: Kompost Nasıl Yapılı...,0
2679493,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[document] [TITLE] Éléphant Rose. [DESCRIPTION...,0
2679494,[TITLE] ÖAP Kullanımı of Öğütülmüş Ağaç Kullan...,[video] [TITLE] Tourner Une Pile De Compost. [...,0
