In [1]:
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import numpy as np

class Preprocessor:
    def __init__(self):
        self.data_dir = Path('/kaggle/input/AI4Code')
    
    def read_notebook(self, path):
        return (
            pd.read_json(path, dtype={'cell_type': 'category', 'source': 'str'})
                .assign(id=path.stem)
                .rename_axis('cell_id')
            )
    
    def get_notebooks_test(self):
        paths_train = list((self.data_dir / 'test').glob('*.json'))
        notebooks_test = [
            self.read_notebook(path) for path in tqdm(paths_train, desc='Test NBs')
        ]
        return notebooks_test
    
    def get_test_df(self):
        notebooks_test = self.get_notebooks_test()
        df = (
            pd.concat(notebooks_test)
                .set_index('id', append=True)
                .swaplevel()
                .sort_index(level='id', sort_remaining=False)
        )
        return df
    
    def clean_code(self, cell):
        return str(cell).replace('\\n', '\n')
    
    def sample_cells(self, cells, n=20):
        cells = [self.clean_code(cell) for cell in cells]

        if n >= len(cells): # 코드 셀이 20개 이하라면 그냥 반환
            return [cell[:200] for cell in cells]
        else:
            results = []
            step = len(cells) / n # 총 20개의 코드셀이 샘플링 되도록 스텝을 조절
            idx = 0
            while int(np.round(idx) < len(cells)):
                results.append(cells[int(np.round(idx))])
                idx += step
            assert cells[0] in results # 첫번쨰 코드셀은 반드시 들어가야 한다?
            if cells[-1] not in results: # 말전 코드셀은 반드시 들어가야 한다?
                results[-1] = cells[-1]
            return results
        
    def get_features(self, df):
        features = dict()
        df = df.sort_values('rank').reset_index(drop=True)

        for idx, sub_df in tqdm(df.groupby('id')):
            features[idx] = dict()
            total_md = sub_df[sub_df.cell_type == 'markdown'].shape[0]
            code_sub_df = sub_df[sub_df.cell_type == 'code']
            total_code = code_sub_df.shape[0]
            codes = self.sample_cells(code_sub_df.source.values, 20)
            features[idx]['total_code'] = total_code
            features[idx]['total_md'] = total_md
            features[idx]['codes'] = codes
        
        return features

In [2]:
preprocessor = Preprocessor()

test_df = preprocessor.get_test_df().reset_index()
test_df['rank'] = test_df.groupby(['id', 'cell_type']).cumcount()
test_df['pred'] = test_df.groupby(['id', 'cell_type'])['rank'].rank(pct=True)

test_fts = preprocessor.get_features(test_df)

Test NBs: 100%|██████████| 4/4 [00:00<00:00, 89.26it/s]
100%|██████████| 4/4 [00:00<00:00, 704.13it/s]


In [3]:
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import sys

class MarkdownModel(nn.Module):
    def __init__(self, model_path):
        super(MarkdownModel, self).__init__()

        self.model = AutoModel.from_pretrained(model_path)
        self.top = nn.Linear(769, 1)

    def forward(self, ids, mask, fts):
        x = self.model(ids, mask)[0]
        x = torch.cat((x[:, 0, :], fts), 1)
        x = self.top(x)
        return x

class MarkdownDataset(Dataset):
    def __init__(self, df, model_name_or_path, total_max_len, md_max_len, fts):
        super().__init__()
        
        self.df = df.reset_index(drop=True)
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len
        self.fts = fts
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    def __getitem__(self, index):
        row = self.df.iloc[index]

        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        code_inputs = self.tokenizer.batch_encode_plus(
            [str(x) for x in self.fts[row.id]['codes']],
            add_special_tokens=True,
            max_length=23,
            padding='max_length',
            truncation=True
        )

        n_md = self.fts[row.id]['total_md']
        n_code = self.fts[row.id]['total_code']
        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            fts = torch.FloatTensor([n_md / (n_md + n_code)])

        ids = inputs['input_ids']
        for x in code_inputs['input_ids']:
            ids.extend(x[:-1])
        ids = ids[:self.total_max_len]
        if len(ids) != self.total_max_len:
            ids = ids + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(ids))
        ids = torch.LongTensor(ids)

        mask = inputs['attention_mask']
        for x in code_inputs['attention_mask']:
            mask.extend(x[:-1])
        mask = mask[:self.total_max_len]
        if len(mask) != self.total_max_len:
            mask = mask + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(mask))
        mask = torch.LongTensor(mask)

        assert len(ids) == len(mask)

        return ids, mask, fts, torch.FloatTensor([row.pct_rank])

    def __len__(self):
        return self.df.shape[0]            

def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()

def validate(model, val_loader):
    model.eval()
    
    tbar = tqdm(val_loader, file=sys.stdout)
    
    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            pred = model(*inputs)

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())
    
    return np.concatenate(labels), np.concatenate(preds)

def predict(model_path, ckpt_path):
    model = MarkdownModel(model_path)
    model = model.cuda()
    model.eval()
    model.load_state_dict(torch.load(ckpt_path))
    BS = 32
    NW = 2
    MAX_LEN = 64
    test_df['pct_rank'] = 0
    test_ds = MarkdownDataset(test_df[test_df['cell_type'] == 'markdown'].reset_index(drop=True),
                              md_max_len=64,
                              total_max_len=512,
                              model_name_or_path=model_path,
                              fts=test_fts)
    test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                             pin_memory=False, drop_last=False)
    _, y_test = validate(model, test_loader)
    return y_test

In [4]:
model_path = "/kaggle/input/huggingface-code-models/graphcodebert-base"
ckpt_path = "/kaggle/input/codebert2/model_epoch_5_0.8499189849500974.bin"
y_test = predict(model_path, ckpt_path)

Some weights of the model checkpoint at /kaggle/input/huggingface-code-models/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /kaggle/input/huggingface-code-models/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You s

100%|██████████| 2/2 [00:02<00:00,  1.08s/it]


In [5]:
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test
sub_df = test_df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
sub_df.head()

Unnamed: 0,id,cell_order
0,0009d135ece78d,0a226b6a ddfd239c 8cb8d28a c6cd22db 1372ae9b e...
1,0010483c12ba9b,7f270e34 54c7cab3 fe66203e 7844d5f8 5ce8863c 4...
2,0010a919d60e4f,23607d04 b7578789 aafc3d23 bbff12d4 80e077ec b...
3,0028856e09c5b7,012c9d02 d22526d1 eb293dfc 3ae7ece3


In [6]:
sub_df.to_csv("submission1.csv", index=False)

# Pairwise: submission2.csv

In [7]:
import joblib
import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
stemmer = WordNetLemmatizer()

PRETRAINED_MODEL_PATH = '/kaggle/input/pairwisemodels/checkpoint-18000'
FINETUNED_MODEL_PATH = '/kaggle/input/pairwisemodels//my_own_model.bin'
TOKENIZER_PATH = '/kaggle/input/pairwisemodels//my_own_tokenizer'

data_dir = Path('/kaggle/input/AI4Code')

def generate_triplet(df, mode='train'):
  triplets = []
  ids = df.id.unique()
  random_drop = np.random.random(size=10000)>0.9
  count = 0

  for id, df_tmp in tqdm(df.groupby('id')):
    df_tmp_markdown = df_tmp[df_tmp['cell_type']=='markdown']

    df_tmp_code = df_tmp[df_tmp['cell_type']=='code']
    df_tmp_code_rank = df_tmp_code['rank'].values
    df_tmp_code_cell_id = df_tmp_code['cell_id'].values

    for cell_id, rank in df_tmp_markdown[['cell_id', 'rank']].values:
      labels = np.array([(r==(rank+1)) for r in df_tmp_code_rank]).astype('int')

      for cid, label in zip(df_tmp_code_cell_id, labels):
        count += 1
        if label==1:
          triplets.append( [cell_id, cid, label] )
          # triplets.append( [cid, cell_id, label] )
        elif mode == 'test':
          triplets.append( [cell_id, cid, label] )
          # triplets.append( [cid, cell_id, label] )
        elif random_drop[count%10000]:
          triplets.append( [cell_id, cid, label] )
          # triplets.append( [cid, cell_id, label] )
    
  return triplets

def preprocess_text(document):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(document))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    #return document
    
    # Lemmatization
    tokens = document.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if len(word) > 3]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

class MarkdownModel(nn.Module):
    def __init__(self):
        super(MarkdownModel, self).__init__()
        self.distill_bert = AutoModel.from_pretrained(PRETRAINED_MODEL_PATH)
        self.top = nn.Linear(512, 1)

        self.dropout = nn.Dropout(0.2)
        
    def forward(self, ids, mask):
        x = self.distill_bert(ids, mask)[0]
        x = self.dropout(x)
        x = self.top(x[:, 0, :])
        x = torch.sigmoid(x) 
        return x

class MarkdownDataset(Dataset):
    
    def __init__(self, df, max_len, mode='train'):
        super().__init__()
        self.df = df
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, do_lower_case=True)
        self.mode=mode

    def __getitem__(self, index):
        row = self.df[index]

        label = row[-1]

        txt = dict_cellid_source[row[0]] + '[SEP]' + dict_cellid_source[row[1]]

        inputs = self.tokenizer.encode_plus(
            txt,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])

        return ids, mask, torch.FloatTensor([label])

    def __len__(self):
        return len(self.df)

def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()


def validate(model, val_loader, mode='train'):
    model.eval()
    
    tbar = tqdm(val_loader, file=sys.stdout)
    
    preds = np.zeros(len(val_loader.dataset), dtype='float32')
    labels = []
    count = 0

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            pred = model(inputs[0], inputs[1]).detach().cpu().numpy().ravel()

            preds[count:count+len(pred)] = pred
            count += len(pred)
            
            if mode=='test':
              labels.append(target.detach().cpu().numpy().ravel())
    if mode=='test':
      return preds
    else:
      return np.concatenate(labels), np.concatenate(preds)

def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


In [8]:
BS = 128
NW = 8
MAX_LEN = 128

paths_test = list((data_dir / 'test').glob('*.json'))
notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]
test_df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
).reset_index()

test_df.source = test_df.source.apply(preprocess_text)
dict_cellid_source = dict(zip(test_df['cell_id'].values, test_df['source'].values))
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=False)
test_triplets = generate_triplet(test_df, mode = 'test')
test_df["pct_rank"] = 0
test_ds = MarkdownDataset(test_triplets, max_len=MAX_LEN)
test_loader = DataLoader(test_ds, batch_size=BS * 4, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)
import gc 
gc.collect()
len(test_ds), test_ds[0]

Test NBs: 100%|██████████| 4/4 [00:00<00:00, 169.52it/s]
100%|██████████| 4/4 [00:00<00:00, 388.96it/s]
  cpuset_checked))


(999,
 (tensor([  101, 25169,  2951,  4094,  2951,  4162,   102, 12324, 16371,  8737,
           2100,  7399, 11208, 12324, 25462,  2951,  6364, 12324, 13523, 24759,
           4140, 29521,  1052, 22571, 10994,  2013, 15315, 19738,  6826, 22511,
          12324,  2013, 15315, 19738,  6826, 17463,  3217,  9623,  7741, 12324,
           4781,  9289,  2121,  2013, 15315, 19738,  6826, 17463,  3217,  9623,
           7741, 12324,  4094,  2013, 15315, 19738,  6826, 17727, 10421, 12324,
           3722,  5714, 18780,  2121, 12324, 16101, 18442,  5371, 18442,  3328,
          10556, 24679,  7953,  5371, 18442,  5371, 18442,  6140,  4130,  3693,
          16101, 18442,  5371, 18442,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,    

In [9]:
model = MarkdownModel()
model = model.cuda()
model.load_state_dict(torch.load(FINETUNED_MODEL_PATH))
y_test = validate(model, test_loader, mode='test')

Some weights of the model checkpoint at /kaggle/input/pairwisemodels/checkpoint-18000 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at /kaggle/input/pairwisemodels/checkpoint-18000 and are newly initialized: ['bert

  0%|          | 0/2 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling pa

In [10]:
preds_copy = y_test
pred_vals = []
count = 0
for id, df_tmp in tqdm(test_df.groupby('id')):
    df_tmp_mark = df_tmp[df_tmp['cell_type']=='markdown']
    df_tmp_code = df_tmp[df_tmp['cell_type']!='markdown']
    df_tmp_code_rank = df_tmp_code['rank'].rank().values
    N_code = len(df_tmp_code_rank)
    N_mark = len(df_tmp_mark)
    
    preds_tmp = preds_copy[count:count+N_mark * N_code]
    
    count += N_mark * N_code
    
    for i in range(N_mark):
      pred = preds_tmp[i*N_code:i*N_code+N_code] 
    
      softmax = np.exp((pred-np.mean(pred)) *20)/np.sum(np.exp((pred-np.mean(pred)) *20)) 
    
      rank = np.sum(softmax * df_tmp_code_rank)
      pred_vals.append(rank)
    
del model
del test_triplets[:]
del dict_cellid_source
gc.collect()

100%|██████████| 4/4 [00:00<00:00, 448.19it/s]


21

In [11]:
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = pred_vals
sub_df = test_df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
sub_df.head()

Unnamed: 0,id,cell_order
0,0009d135ece78d,ddfd239c 0a226b6a 8cb8d28a c6cd22db 1372ae9b 9...
1,0010483c12ba9b,54c7cab3 7f270e34 fe66203e 7844d5f8 5ce8863c 4...
2,0010a919d60e4f,aafc3d23 b7578789 23607d04 4ae17669 bbff12d4 8...
3,0028856e09c5b7,012c9d02 d22526d1 eb293dfc 3ae7ece3


In [12]:
sub_df.to_csv("submission2.csv", index=False)

# Rank Ensemble

In [13]:
# Reading the submissions
df_1 = pd.read_csv('submission1.csv')
df_2 = pd.read_csv('submission2.csv')

# Averaging the indices and sorting the resulting submission by the aggregated ensembled indices
new_samples = []
for sample_idx in range(len(df_1)):
    # {'0a226b6a': 0, ...}
    sample_1 = {k: v for v, k in enumerate(df_1.iloc[sample_idx]['cell_order'].split(' '))}
    sample_2 = {k: v for v, k in enumerate(df_2.iloc[sample_idx]['cell_order'].split(' '))}
    for key in sample_1: 
        sample_1[key] = ((sample_1[key] * 0.748) + (sample_2[key] * 0.252))
    new_samples.append(' '.join([i[0] for i in list(sorted(sample_1.items(), key = lambda x: x[1]))]))
df_1['cell_order'] = new_samples

In [14]:
df_1.to_csv('submission.csv', index = False)
df_1

Unnamed: 0,id,cell_order
0,0009d135ece78d,0a226b6a ddfd239c 8cb8d28a c6cd22db 1372ae9b e...
1,0010483c12ba9b,7f270e34 54c7cab3 fe66203e 7844d5f8 5ce8863c 4...
2,0010a919d60e4f,23607d04 b7578789 aafc3d23 bbff12d4 80e077ec b...
3,0028856e09c5b7,012c9d02 d22526d1 eb293dfc 3ae7ece3
