In [1]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVR
from catboost import CatBoostRegressor, Pool, CatBoost

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

### Augmented Dataset

In [2]:
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
aug_train = pd.read_csv("../input/scrapedcommonlit/external_df.csv")
aug_train = aug_train.rename(columns = {"usable_external": "excerpt"})
aug_train["target"] = pd.merge(aug_train, train_data , how='inner', on=['id']).target
aug_train["standard_error"] = pd.merge(aug_train, train_data , how='inner', on=['id']).standard_error

In [3]:
#for kfold  
num_bins = 10 #int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)
aug_train.loc[:,'bins'] = pd.cut(aug_train['target'],bins=num_bins, labels=False)
#bins = train_data.bins.to_numpy()

In [4]:
# molto inefficiente, ma le righe sono poche e dovrebbe non metterci molto 

np.random.seed(137)
frac_of_std = 10

for row_idx in tqdm(range(0, aug_train.shape[0])):
    #row_idx = 1
    
    # find substring ".\n", punto a capo per separare stringone scrapato 
    substr_list = []
    fakeid_list = []
    target_list = []
    bins_list = []

    id_ = aug_train.id[row_idx]
    str_ = aug_train.excerpt[row_idx]
    target_ = aug_train.target[row_idx]
    std_ = aug_train.standard_error[row_idx]
    bins_ = aug_train.bins[row_idx]
    
    idx_str = 0 
    counter = 1 #generating fake ids for merging new dataframe

    if row_idx == 0:
        df = pd.DataFrame(list(zip(fakeid_list, substr_list, target_list, bins_list)),
                   columns =['id', 'excerpt', 'target', 'bins'])

    while idx_str > -1:
        new_id = id_ + str(counter)    
        idx_str = str_.find(".\n", int(np.random.normal(971, 117)), len(str_))
        substr_list.append(str_[:idx_str+2])
        fakeid_list.append(new_id)
        bins_list.append(bins_)
        
        new_target = np.random.normal(target_, std_/frac_of_std)
        target_list.append(new_target)
        
        str_ = str_[idx_str+2:]
        counter += 1

    substr_list = substr_list[:-1] #remove last element
    fakeid_list = fakeid_list[:-1] 
    target_list = target_list[:-1]
    bins_list = bins_list[:-1]
    #len(fakeid_list) == len(substr_list) == len(target_list) # safety check

    df_ = pd.DataFrame(list(zip(fakeid_list, substr_list, target_list, bins_list)),   
                   columns =['id', 'excerpt', 'target', 'bins']) # pivot dataframe
    #print(df_.shape) #safety check

    df = df.append(df_)
    
    
new_aug_train = df 
new_train = train_data[~train_data.id.isin(aug_train.id)]
new_train = new_train.drop(columns=['url_legal', 'license', 'standard_error'])
new_train = new_train.append(new_aug_train)
new_train = new_train.sample(frac=1)
new_train = new_train.reset_index(drop = True)

# facciamo in modo di riprodurre la stessa distribuzione di partenza per i target anche 
# nel dataset augmented 

frac_to_keep = 1 / min(new_train.bins.value_counts() / train_data.bins.value_counts())

# sono solo 10 bins, non avevo voglia di fare le cose bene .... 
train_0 = new_train[new_train.bins == 0].sample(n = round((train_data.bins.value_counts() / frac_to_keep)[0]))
train_1 = new_train[new_train.bins == 1].sample(n = round((train_data.bins.value_counts() / frac_to_keep)[1]))
train_2 = new_train[new_train.bins == 2].sample(n = round((train_data.bins.value_counts() / frac_to_keep)[2]))
train_3 = new_train[new_train.bins == 3].sample(n = round((train_data.bins.value_counts() / frac_to_keep)[3]))
train_4 = new_train[new_train.bins == 4].sample(n = round((train_data.bins.value_counts() / frac_to_keep)[4]))
train_5 = new_train[new_train.bins == 5].sample(n = round((train_data.bins.value_counts() / frac_to_keep)[5]))
train_6 = new_train[new_train.bins == 6].sample(n = round((train_data.bins.value_counts() / frac_to_keep)[6]))
train_7 = new_train[new_train.bins == 7].sample(n = round((train_data.bins.value_counts() / frac_to_keep)[7]))
train_8 = new_train[new_train.bins == 8].sample(n = round((train_data.bins.value_counts() / frac_to_keep)[8]))
train_9 = new_train[new_train.bins == 9].sample(n = round((train_data.bins.value_counts() / frac_to_keep)[9]))
train_data =  train_0.append([train_1, train_2, train_3, train_4, train_5, train_6, \
                             train_7, train_8, train_9])

train_data = train_data.sample(frac=1)
train_data = train_data.reset_index(drop = True)

100%|██████████| 325/325 [00:00<00:00, 399.22it/s]


In [5]:
"""
# previous index reset
train_data = train_data.set_index('id')
aug_train = aug_train.set_index('id')
train_data.update(aug_train)
train_data.reset_index(inplace=True)
#aug_train.reset_index(inplace = True)
""";

In [6]:
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

target = train_data['target'].to_numpy()

#for kfold  
#num_bins = 10 #int(np.floor(1 + np.log2(len(train_data))))
#train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)
bins = train_data.bins.to_numpy(dtype = np.int16)

def rmse_score(y_true,y_pred):
    return np.sqrt(mean_squared_error(y_true,y_pred))

In [7]:
config = {
    'batch_size':128,
    'max_len':256,
    'seed':42,
}

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONASSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(seed=config['seed'])

In [8]:
class CLRPDataset(nn.Module):
    def __init__(self,df,tokenizer,max_len=128):
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)  
        return encode
    
    def __len__(self):
        return len(self.excerpt)

In [9]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")
            
    MODEL_PATH = path
    model = AutoModel.from_pretrained(MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df,tokenizer,config['max_len'])
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )
        
    embeddings = list()
    with torch.no_grad():
        for i, inputs in tqdm(enumerate(dl)):
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs[0][:,0].detach().cpu().numpy()
            embeddings.extend(outputs)
    return np.array(embeddings)

In [10]:
train_embeddings1 =  get_embeddings(train_data,'../input/modelf1')
test_embeddings1 = get_embeddings(test_data,'../input/modelf1')


train_embeddings2 =  get_embeddings(train_data,'../input/modelf2')
test_embeddings2 = get_embeddings(test_data,'../input/modelf2')

train_embeddings3 =  get_embeddings(train_data,'../input/modelf3')
test_embeddings3 = get_embeddings(test_data,'../input/modelf3')

train_embeddings4 =  get_embeddings(train_data,'../input/modelf4')
test_embeddings4 = get_embeddings(test_data,'../input/modelf4')

train_embeddings5 =  get_embeddings(train_data,'../input/modelf5')
test_embeddings5 = get_embeddings(test_data,'../input/modelf5')
"""""";

cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf1 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
35it [00:35,  1.00s/it]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf1 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  5.79it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
35it [00:33,  1.03it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.33it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf3 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
35it [00:34,  1.03it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf3 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  5.63it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf4 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
35it [00:33,  1.03it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf4 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  5.38it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf5 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
35it [00:34,  1.03it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf5 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.86it/s]


## Neural Tangent Kernel

In [11]:
!pip install ../input/roberta/frozendict-2.0.2-py3-none-any.whl
!pip install ../input/roberta/neural_tangents-0.3.6-py2.py3-none-any.whl

Processing /kaggle/input/roberta/frozendict-2.0.2-py3-none-any.whl
Installing collected packages: frozendict
Successfully installed frozendict-2.0.2
Processing /kaggle/input/roberta/neural_tangents-0.3.6-py2.py3-none-any.whl
Installing collected packages: neural-tangents
Successfully installed neural-tangents-0.3.6


In [12]:
from jax import random
from neural_tangents import stax
import neural_tangents as nt

def get_preds_ntk(X,y,X_test,bins=bins,nfolds=5):
    
    kfold = StratifiedKFold(n_splits=nfolds)
    scores = list()
    preds = np.zeros((X_test.shape[0]))
    
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        ResBlock = stax.serial(
                        stax.FanOut(2),
                        stax.parallel(
                            stax.serial(
                                stax.Erf(),
                                stax.Dense(1, W_std=1.25, b_std=0.0),
                                stax.Erf(),
                                stax.Dense(1, W_std=1.25, b_std=0.0),
                                stax.Erf(),
                                stax.Dense(1, W_std=1.25, b_std=0.0),
                            ),
                            stax.Identity(),
                        ),
                        stax.FanInSum()
                    )

        init_fn, apply_fn, kernel_fn = stax.serial(
                stax.Dense(1, W_std=1.0, b_std=0),
                ResBlock, ResBlock, stax.Erf(),
                stax.Dense(1, W_std=2.5, b_std=0.1)
        )

        key = random.PRNGKey(10)
        _, params = init_fn(key, input_shape=X_train.shape)
        predict_fn = nt.predict.gradient_descent_mse_ensemble(kernel_fn,
                                                                  X_train,
                                                                  y_train[:,np.newaxis],
                                                                  diag_reg=1e-1,
                                                                  lr=1)
        
        prediction = predict_fn(x_test=X_valid, get='nngp', t=None)#model.predict(X_valid)
        score = rmse_score(prediction,y_valid)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += predict_fn(x_test=X_test, get='nngp', t=None)#model.predict(X_test)
        
    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

In [13]:
ntk_preds1 = get_preds_ntk(train_embeddings1,target,test_embeddings1)

ntk_preds2 = get_preds_ntk(train_embeddings2,target,test_embeddings2)
ntk_preds3 = get_preds_ntk(train_embeddings3,target,test_embeddings3)
ntk_preds4 = get_preds_ntk(train_embeddings4,target,test_embeddings4)
ntk_preds5 = get_preds_ntk(train_embeddings5,target,test_embeddings5)
"""""";

Fold 0 , rmse score: 0.5663396585472567
Fold 1 , rmse score: 0.5698152964930118
Fold 2 , rmse score: 0.6002650004424258
Fold 3 , rmse score: 0.5752597303446432
Fold 4 , rmse score: 0.5582475774325402
mean rmse 0.5739854526519756
Fold 0 , rmse score: 0.5755103583961869
Fold 1 , rmse score: 0.5794721355235051
Fold 2 , rmse score: 0.608997244927859
Fold 3 , rmse score: 0.5833743779899584
Fold 4 , rmse score: 0.5614220771076076
mean rmse 0.5817552387890235
Fold 0 , rmse score: 0.5318793235676887
Fold 1 , rmse score: 0.5312760357265813
Fold 2 , rmse score: 0.5609441206619032
Fold 3 , rmse score: 0.5327352236319931
Fold 4 , rmse score: 0.5212616574563316
mean rmse 0.5356192722088996
Fold 0 , rmse score: 0.5493700003215523
Fold 1 , rmse score: 0.5471054897289109
Fold 2 , rmse score: 0.5814445577407403
Fold 3 , rmse score: 0.5597801307737409
Fold 4 , rmse score: 0.5547992207297855
mean rmse 0.558499879858946
Fold 0 , rmse score: 0.5581239344485001
Fold 1 , rmse score: 0.5410996055709191
Fold 2

In [14]:
ntk_preds = (ntk_preds1 + ntk_preds2 + ntk_preds3 + ntk_preds4 + ntk_preds5)/5

In [15]:
sample.target = ntk_preds
sample.to_csv('submission.csv',index=False)