In [1]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVR
from catboost import CatBoostRegressor, Pool, CatBoost

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [2]:
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
# Drop rows whose target is exactly 0. The explicit .copy() makes the result an
# independent frame, so the 'bins' column added below is not written into a
# slice of the frame returned by read_csv.
train_data = train_data[train_data["target"] != 0].copy()

target = train_data['target'].to_numpy()

#for kfold
# Number of bins is 1 + log2(n_rows), rounded down; the bin index of each target
# is what the StratifiedKFold splits in later cells stratify on.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)
bins = train_data.bins.to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true,y_pred)
    return np.sqrt(mse)


In [3]:
# Shared settings: 'batch_size' and 'max_len' are read by get_embeddings,
# 'seed' is passed to seed_everything.
# NOTE(review): a later cell defines a function that is also called `config`;
# once that cell has run, this dict is no longer reachable under this name.
config = {
    'batch_size':128,
    'max_len':256,
    'seed':42,
}

def seed_everything(seed=42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # The interpreter variable is PYTHONHASHSEED. It is read at interpreter
    # start-up, so setting it here can only affect processes started later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, so it
    # is switched off when reproducibility is requested.
    torch.backends.cudnn.benchmark = False

# Seed all generators once, using the value stored in the config dict.
seed_everything(seed=config['seed'])

In [4]:
class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Subclasses ``torch.utils.data.Dataset`` (the class holds no parameters or
    sub-modules, so it is not an ``nn.Module``).

    Args:
        df: DataFrame with an ``excerpt`` column.
        tokenizer: Callable tokenizer; it is called with ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``.
        max_len: Length every excerpt is padded / truncated to.
    """

    def __init__(self,df,tokenizer,max_len=128):
        super().__init__()
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self,idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        # The tokenizer output is returned unchanged; get_embeddings reshapes
        # every tensor of a batch to (batch, -1) before calling the model.
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)

In [5]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed every excerpt of ``df`` with the transformer stored at ``path``.

    The embedding of an excerpt is the first-token vector of the model's
    first output (``outputs[0][:, 0]``).

    Args:
        df: DataFrame with an ``excerpt`` column.
        path: Directory handed to ``AutoModel`` / ``AutoTokenizer``.
        plot_losses: Unused; kept so existing calls keep working.
        verbose: When True (default) print the device and show a progress bar.

    Returns:
        NumPy array with one embedding row per excerpt, in ``df`` order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df,tokenizer,config['max_len'])
    dl = DataLoader(ds,
                    batch_size=config["batch_size"],
                    shuffle=False,  # keep row order aligned with df
                    num_workers=4,
                    pin_memory=True,
                    drop_last=False)

    embeddings = list()
    with torch.no_grad():
        # Iterate the loader directly so tqdm can read its length and show
        # the total number of batches.
        for inputs in tqdm(dl, disable=not verbose):
            # Flatten every tensor to (batch, -1) before the forward pass.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs[0][:,0].detach().cpu().numpy())
    return np.array(embeddings)

In [6]:
# Embed the train and test excerpts with each of the five checkpoints
# (modelf1 ... modelf5); train is always embedded before test.
train_embeddings1 = get_embeddings(df=train_data, path='../input/modelf1')
test_embeddings1 = get_embeddings(df=test_data, path='../input/modelf1')

train_embeddings2 = get_embeddings(df=train_data, path='../input/modelf2')
test_embeddings2 = get_embeddings(df=test_data, path='../input/modelf2')

train_embeddings3 = get_embeddings(df=train_data, path='../input/modelf3')
test_embeddings3 = get_embeddings(df=test_data, path='../input/modelf3')

train_embeddings4 = get_embeddings(df=train_data, path='../input/modelf4')
test_embeddings4 = get_embeddings(df=test_data, path='../input/modelf4')

train_embeddings5 = get_embeddings(df=train_data, path='../input/modelf5')
test_embeddings5 = get_embeddings(df=test_data, path='../input/modelf5')

cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf1 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
23it [00:22,  1.01it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf1 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  6.05it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
23it [00:21,  1.06it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  5.80it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf3 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
23it [00:21,  1.06it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf3 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  5.07it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf4 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
23it [00:21,  1.05it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf4 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.88it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf5 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
23it [00:21,  1.06it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf5 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.68it/s]


## neural tangent kernel

In [7]:
!pip install ../input/roberta/frozendict-2.0.2-py3-none-any.whl
!pip install ../input/roberta/neural_tangents-0.3.6-py2.py3-none-any.whl

Processing /kaggle/input/roberta/frozendict-2.0.2-py3-none-any.whl
Installing collected packages: frozendict
Successfully installed frozendict-2.0.2
Processing /kaggle/input/roberta/neural_tangents-0.3.6-py2.py3-none-any.whl
Installing collected packages: neural-tangents
Successfully installed neural-tangents-0.3.6


In [8]:
from jax import random
from neural_tangents import stax
import neural_tangents as nt

def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=10,kernel='rbf'):
    """Cross-validated kernel regression with a neural-tangents residual network.

    Despite the name (kept so existing calls keep working) no SVM is fitted:
    for every stratified fold a ``neural_tangents`` predictor is built on the
    training part, scored on the validation part and applied to ``X_test``.

    Args:
        X: Training feature matrix, indexed by row.
        y: 1-D array of training targets.
        X_test: Test feature matrix.
        bins: Stratification label per training row (defaults to the global
            ``bins`` that exists when this function is defined).
        nfolds: Number of stratified folds.
        C: Unused; leftover from an SVR version of this function.
        kernel: Unused; leftover from an SVR version of this function.

    Returns:
        1-D array of length ``X_test.shape[0]``: test predictions averaged
        over the folds.
    """
    # The architecture does not depend on the fold, so it is built once.
    ResBlock = stax.serial(
                    stax.FanOut(2),
                    stax.parallel(
                        stax.serial(
                            stax.Erf(),
                            stax.Dense(1, W_std=1.25, b_std=0.0),
                            stax.Erf(),
                            stax.Dense(1, W_std=1.25, b_std=0.0),
                            stax.Erf(),
                            stax.Dense(1, W_std=1.25, b_std=0.0),
                        ),
                        stax.Identity(),
                    ),
                    stax.FanInSum()
                )

    # Only the kernel function is needed; the predictions below use the kernel
    # directly, so no finite-width parameters are initialised.
    _, _, kernel_fn = stax.serial(
            stax.Dense(1, W_std=1.0, b_std=0),
            ResBlock, ResBlock, stax.Erf(),
            stax.Dense(1, W_std=2.5, b_std=0.1)
    )

    kfold = StratifiedKFold(n_splits=nfolds)
    scores = list()
    preds = np.zeros((X_test.shape[0]))
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        predict_fn = nt.predict.gradient_descent_mse_ensemble(kernel_fn,
                                                              X_train,
                                                              y_train[:,np.newaxis],
                                                              diag_reg=1e-1,
                                                              lr=1)
        # The targets are passed as a column, so the predictor output is
        # expected to be a column as well. Flatten it explicitly: adding a
        # (n, 1) array to the 1-D `preds` buffer would broadcast to (n, n)
        # (or fail) instead of accumulating one value per test row.
        prediction = np.asarray(predict_fn(x_test=X_valid, get='nngp', t=None)).ravel()
        score = rmse_score(prediction,y_valid)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += np.asarray(predict_fn(x_test=X_test, get='nngp', t=None)).ravel()

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

In [9]:
# Fold-averaged kernel predictions, one vector per embedding model.
svm_preds1 = get_preds_svm(X=train_embeddings1, y=target, X_test=test_embeddings1)
svm_preds2 = get_preds_svm(X=train_embeddings2, y=target, X_test=test_embeddings2)
svm_preds3 = get_preds_svm(X=train_embeddings3, y=target, X_test=test_embeddings3)
svm_preds4 = get_preds_svm(X=train_embeddings4, y=target, X_test=test_embeddings4)
svm_preds5 = get_preds_svm(X=train_embeddings5, y=target, X_test=test_embeddings5)

Fold 0 , rmse score: 0.3533070781162649
Fold 1 , rmse score: 0.3507230838219165
Fold 2 , rmse score: 0.3397112689980713
Fold 3 , rmse score: 0.3753341700861172
Fold 4 , rmse score: 0.36457406824600735
mean rmse 0.35672993385367546
Fold 0 , rmse score: 0.3664070926170059
Fold 1 , rmse score: 0.36622260558839487
Fold 2 , rmse score: 0.3406317445293064
Fold 3 , rmse score: 0.36522604334069825
Fold 4 , rmse score: 0.37621146467913735
mean rmse 0.3629397901509085
Fold 0 , rmse score: 0.32081083473943894
Fold 1 , rmse score: 0.30368101359758487
Fold 2 , rmse score: 0.257158602494956
Fold 3 , rmse score: 0.3019013086614748
Fold 4 , rmse score: 0.308016349598385
mean rmse 0.2983136218183679
Fold 0 , rmse score: 0.42375648572370384
Fold 1 , rmse score: 0.41970820210181
Fold 2 , rmse score: 0.37810095317003733
Fold 3 , rmse score: 0.41231838465252707
Fold 4 , rmse score: 0.4364009493152341
mean rmse 0.4140569949926626
Fold 0 , rmse score: 0.35939226525594736
Fold 1 , rmse score: 0.36355843732353

## Topological Features + LGBM


In [10]:
# thinking about augmentation
# !pip install nlpaug --no-index --find-links=file:///kaggle/input/roberta/nlpaug-1.1.3-py3-none-any.whl

#library for topological data analysis
!pip install pyflagser --no-index --find-links=file:///kaggle/input/roberta/pyflagser-0.4.4-cp37-cp37m-manylinux2010_x86_64.whl
!pip install giotto-tda --no-index --find-links=file:///kaggle/input/roberta/giotto_tda-0.4.0-cp37-cp37m-manylinux2010_x86_64.whl

Looking in links: file:///kaggle/input/roberta/pyflagser-0.4.4-cp37-cp37m-manylinux2010_x86_64.whl
Processing /kaggle/input/roberta/pyflagser-0.4.4-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: pyflagser
Successfully installed pyflagser-0.4.4
Looking in links: file:///kaggle/input/roberta/giotto_tda-0.4.0-cp37-cp37m-manylinux2010_x86_64.whl
Processing /kaggle/input/roberta/giotto_tda-0.4.0-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: giotto-tda
Successfully installed giotto-tda-0.4.0


In [11]:
import os
import tqdm
import random
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# some gradient boosting algos
import optuna
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, Pool, CatBoost

# Cross validation
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)

## Possible augmentation strategies
#import nlpaug.augmenter.word as naw

In [12]:
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
# Drop rows whose target is exactly 0; .copy() yields an independent frame so
# that columns can be added to it later without writing into a slice.
train_data = train_data[train_data["target"] != 0].copy()

In [13]:
target = train_data['target'].to_numpy()

#for kfold
# Number of bins is 1 + log2(n_rows), rounded down; the bin index of each
# target is the stratification label used by the fold splits below.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
# assign() returns a new frame carrying the extra column, so nothing is written
# into train_data itself (which was produced by boolean filtering).
train_data = train_data.assign(
    bins=pd.cut(train_data['target'],bins=num_bins,labels=False)
)

In [14]:
# Shared settings: 'batch_size' and 'max_len' are read by get_embeddings,
# 'seed' is passed to seed_everything.
# NOTE(review): a later cell defines a function that is also called `config`;
# once that cell has run, this dict is no longer reachable under this name.
config = {
    'batch_size':128,
    'max_len':256,
    'seed':42,
}

def seed_everything(seed=42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # The interpreter variable is PYTHONHASHSEED. It is read at interpreter
    # start-up, so setting it here can only affect processes started later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, so it
    # is switched off when reproducibility is requested.
    torch.backends.cudnn.benchmark = False

# Seed all generators once, using the value stored in the config dict.
seed_everything(seed=config['seed'])

In [15]:
class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Subclasses ``torch.utils.data.Dataset`` (the class holds no parameters or
    sub-modules, so it is not an ``nn.Module``).

    Args:
        df: DataFrame with an ``excerpt`` column.
        tokenizer: Callable tokenizer; it is called with ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``.
        max_len: Length every excerpt is padded / truncated to.
    """

    def __init__(self,df,tokenizer,max_len=128):
        super().__init__()
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self,idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        # The tokenizer output is returned unchanged; get_embeddings reshapes
        # every tensor of a batch to (batch, -1) before calling the model.
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)

In [16]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed every excerpt of ``df`` with the transformer stored at ``path``.

    The embedding of an excerpt is the first-token vector of the model's
    first output (``outputs[0][:, 0]``).

    Args:
        df: DataFrame with an ``excerpt`` column.
        path: Directory handed to ``AutoModel`` / ``AutoTokenizer``.
        plot_losses: Unused; kept so existing calls keep working.
        verbose: When True (default) print the device and show a progress bar.

    Returns:
        NumPy array with one embedding row per excerpt, in ``df`` order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df,tokenizer,config['max_len'])
    dl = DataLoader(ds,
                    batch_size=config["batch_size"],
                    shuffle=False,  # keep row order aligned with df
                    num_workers=4,
                    pin_memory=True,
                    drop_last=False)

    embeddings = list()
    with torch.no_grad():
        # Iterate the loader directly so tqdm can read its length and show
        # the total number of batches.
        for inputs in tqdm(dl, disable=not verbose):
            # Flatten every tensor to (batch, -1) before the forward pass.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs[0][:,0].detach().cpu().numpy())
    return np.array(embeddings)

In [17]:
# Embeddings from the modelf1 checkpoint; they feed the topological features
# and the LightGBM models below.
train_embeddings = get_embeddings(df=train_data, path='../input/modelf1')
test_embeddings = get_embeddings(df=test_data, path='../input/modelf1')

cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf1 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
23it [00:22,  1.04it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/modelf1 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.23it/s]


In [18]:
#giotto-tda , topological features extraction
from gtda.time_series import TakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import Amplitude, NumberOfPoints, PersistenceEntropy

def extract_features(X, time_delay=1, dimension=1, stride=10, h_dim=(0,1), n_jobs=-1):
    """Compute topological summary features for every row of ``X``.

    Pipeline: Takens embedding -> Vietoris-Rips persistence diagrams ->
    six amplitude features plus normalised and raw persistence entropy.
    Only column 0 of each descriptor is kept (presumably the first entry of
    ``h_dim`` -- TODO confirm against the giotto-tda output layout).

    Args:
        X: Input handed to ``TakensEmbedding.fit_transform``.
        time_delay, dimension, stride: Forwarded to ``TakensEmbedding``.
        h_dim: Homology dimensions for ``VietorisRipsPersistence``.
        n_jobs: Parallelism forwarded to the giotto-tda transformers.

    Returns:
        2-D array with one row per diagram and eight feature columns.
    """
    embedder = TakensEmbedding(time_delay=time_delay, dimension=dimension, stride=stride)
    point_clouds = embedder.fit_transform(X)

    persistence = VietorisRipsPersistence(homology_dimensions=h_dim, n_jobs=n_jobs)
    diagrams = persistence.fit_transform(point_clouds)

    # One amplitude feature per metric, in this fixed order.
    amplitude_metrics = ['bottleneck','wasserstein', 'betti', 'landscape', 'silhouette', 'heat']
    columns = [
        Amplitude(metric=metric, n_jobs=n_jobs).fit_transform(diagrams)[:,0]
        for metric in amplitude_metrics
    ]

    # Persistence entropy, first normalised and then raw.
    columns += [
        PersistenceEntropy(normalize=normalize, nan_fill_value=-1, n_jobs=n_jobs)
        .fit_transform(diagrams)[:,0]
        for normalize in (True, False)
    ]

    # The features were collected column-wise; transpose to (rows, features).
    return np.array(columns).T

In [19]:
# Topological features of the modelf1 embeddings; stride=2 overrides the
# default stride=10 of extract_features.
tda_features_train = extract_features(train_embeddings, stride = 2)
tda_features_test = extract_features(test_embeddings, stride = 2)

In [20]:
# standard scaler on tda features
# The scaler is fitted on the training features only.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(tda_features_train)

StandardScaler()

In [21]:
# Standardise both splits with the scaler fitted on the training features.
tda_train_norm = scaler.transform(tda_features_train)
tda_test_norm = scaler.transform(tda_features_test)

In [22]:
# Feature matrix used by the first LightGBM model below.
# Two stacking variants (embeddings + raw TDA features, embeddings + scaled TDA
# features) are kept here commented out; the active choice is the plain
# RoBERTa embeddings without any TDA columns.
#train_embeddings_final = np.hstack((train_embeddings, tda_features_train))
#test_embeddings_final = np.hstack((test_embeddings, tda_features_test))

#train_embeddings_final = np.hstack((train_embeddings, tda_train_norm))
#test_embeddings_final = np.hstack((test_embeddings, tda_test_norm))

train_embeddings_final = train_embeddings
test_embeddings_final = test_embeddings

In [23]:
# lgbm on roberta features
# Accumulator for the test predictions: every fold adds 1/nfolds of its
# prediction, so after the loop this holds the fold average.
lgbm_preds = np.zeros(test_data.shape[0])

# Parameters handed unchanged to lgb.train for every fold.
params = {
 'reg_alpha':6.147694913504962,
 'reg_lambda':  0.002457826062076097,
 'colsample_bytree': 0.3,
 'subsample': 0.8,
 'learning_rate': 1e-2,
 'max_depth': 20,
 'num_leaves': 111,
 'min_child_samples': 285,
 'random_state': 42,
 'verbose':-1,
 'n_estimators': 160000,
 'metric': 'rmse',
 'cat_smooth': 39}

nfolds=5
# NOTE(review): lgb_models is created but nothing is appended to it in this cell.
lgb_models = list()
# Folds are stratified on the 'bins' column of train_data.
kfold = StratifiedKFold(n_splits=nfolds)
for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train_data,y=train_data['bins'])):
    
    lgb_train = lgb.Dataset(train_embeddings_final[train_idx],target[train_idx].ravel())
    lgb_valid = lgb.Dataset(train_embeddings_final[valid_idx],target[valid_idx].ravel())
    
    # Both the training and the validation set are evaluated (they show up as
    # "training" and "valid_1" in the log); training stops after 800 rounds
    # without improvement and progress is logged every 10000 rounds.
    lgb_model = lgb.train(params,
                      lgb_train, 
                      valid_sets=[lgb_train,lgb_valid],
                      verbose_eval=10000,
                      early_stopping_rounds=800,
                      )

    # Add this fold's share of the averaged test prediction.
    lgbm_preds += lgb_model.predict(test_embeddings_final)/nfolds

Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[3357]	training's rmse: 0.248122	valid_1's rmse: 0.359864
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[2124]	training's rmse: 0.272074	valid_1's rmse: 0.356901
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[4020]	training's rmse: 0.237888	valid_1's rmse: 0.353208
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[1679]	training's rmse: 0.276484	valid_1's rmse: 0.386539
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[929]	training's rmse: 0.304554	valid_1's rmse: 0.363999


In [24]:
# lgbm on topological features
# Accumulator for the test predictions of the TDA-only model: every fold adds
# 1/nfolds of its prediction, so after the loop this holds the fold average.
lgbm_preds_tda = np.zeros(test_data.shape[0])

# Parameters handed unchanged to lgb.train for every fold (same values as in
# the embedding model above).
params = {
 'reg_alpha': 6.147694913504962,
 'reg_lambda': 0.002457826062076097,
 'colsample_bytree': 0.3,
 'subsample': 0.8,
 'learning_rate': 1e-2,
 'max_depth': 20,
 'num_leaves': 111,
 'min_child_samples': 285,
 'random_state': 42,
 'verbose':-1,
 'n_estimators': 160000,
 'metric': 'rmse',
 'cat_smooth': 39}

nfolds=5
# NOTE(review): lgb_models is created but nothing is appended to it in this cell.
lgb_models = list()
# Folds are stratified on the 'bins' column of train_data.
kfold = StratifiedKFold(n_splits=nfolds)
for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train_data,y=train_data['bins'])):
    
    # Features here are the standardised topological features only.
    lgb_train = lgb.Dataset(tda_train_norm[train_idx],target[train_idx].ravel())
    lgb_valid = lgb.Dataset(tda_train_norm[valid_idx],target[valid_idx].ravel())
    
    # Both the training and the validation set are evaluated (they show up as
    # "training" and "valid_1" in the log); training stops after 800 rounds
    # without improvement and progress is logged every 10000 rounds.
    lgb_model = lgb.train(params,
                      lgb_train, 
                      valid_sets=[lgb_train,lgb_valid],
                      verbose_eval=10000,
                      early_stopping_rounds=800,
                      )

    # Add this fold's share of the averaged test prediction.
    lgbm_preds_tda += lgb_model.predict(tda_test_norm)/nfolds

Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[4897]	training's rmse: 0.410148	valid_1's rmse: 0.445451
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[4294]	training's rmse: 0.4064	valid_1's rmse: 0.453956
Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.415631	valid_1's rmse: 0.41028
Early stopping, best iteration is:
[14382]	training's rmse: 0.415036	valid_1's rmse: 0.41006
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[2377]	training's rmse: 0.414467	valid_1's rmse: 0.435023
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[5173]	training's rmse: 0.413439	valid_1's rmse: 0.416422


In [25]:
### Yum Yum Ensembling
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [26]:
### Notebook forked from Yum Yum Yum
class Dataset:
    """Tokenizes excerpts on access and returns id / mask tensors.

    Args:
        excerpt: Indexable collection of texts.
        tokenizer: Callable invoked with ``max_length``, ``padding="max_length"``
            and ``truncation=True``.
        max_len: Length every text is padded / truncated to.
    """

    def __init__(self, excerpt, tokenizer, max_len):
        self.excerpt = excerpt
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of excerpts."""
        return len(self.excerpt)

    def __getitem__(self, item):
        """Return ``input_ids`` and ``attention_mask`` of one excerpt as long tensors."""
        encoded = self.tokenizer(
            str(self.excerpt[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True
        )
        # Only these two fields are forwarded to the model.
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }

    
    
def generate_predictions(model_path, max_len, batch_size=32):
    """Predict a score for every excerpt of the competition test file.

    Args:
        model_path: Directory handed to ``AutoModelForSequenceClassification``
            and ``AutoTokenizer``.
        max_len: Token length every excerpt is padded / truncated to.
        batch_size: Inference batch size (default 32, the previously
            hard-coded value).

    Returns:
        1-D NumPy array with the flattened logits, in test-file order.
    """
    # Fall back to the CPU when no GPU is present instead of failing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

    # `Dataset` is the tokenizing dataset class defined in this cell.
    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, num_workers=4, pin_memory=True, shuffle=False
    )

    final_output = []
    with torch.no_grad():
        for data in data_loader:
            data = {key: value.to(device) for key, value in data.items()}
            output = model(**data)
            final_output.extend(output.logits.detach().cpu().numpy().ravel().tolist())

    # Drop the last reference to the model first; empty_cache() can only hand
    # back memory that is no longer in use.
    del model
    torch.cuda.empty_cache()
    return np.array(final_output)


# Test predictions of the six sequence-classification checkpoints; the fifth
# one uses a shorter maximum length (192) than the others (256).
preds1 = generate_predictions(model_path="../input/a81653/", max_len=256)
preds2 = generate_predictions(model_path="../input/a81656/", max_len=256)
preds3 = generate_predictions(model_path="../input/a81657/", max_len=256)
preds4 = generate_predictions(model_path="../input/a81660/", max_len=256)
preds5 = generate_predictions(model_path="../input/a81675/", max_len=192)
preds6 = generate_predictions(model_path="../input/a87832/", max_len=256)

## RoBERTa Large - Fine tuned 

In [27]:
import pandas as pd
import numpy as np
import torch

# Ask PyTorch to release cached GPU memory before the next section starts.
torch.cuda.empty_cache()

# Training data without the rows whose target is exactly 0.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train = train.loc[train["target"] != 0]

test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [28]:
%matplotlib inline
from glob import glob
import os
import matplotlib.pyplot as plt
import json
from collections import defaultdict
import gc
gc.enable()

In [29]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)
from transformers import RobertaConfig
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup
)
from transformers import RobertaTokenizer
from transformers import RobertaModel
from IPython.display import clear_output
from tqdm import tqdm, trange

In [30]:
def convert_examples_to_features(data, tokenizer, max_len, is_test=False):
    """Tokenize one text and right-pad every field with zeros to ``max_len``.

    Args:
        data: Text to encode; newline characters are removed first (without
            inserting a space).
        tokenizer: Object exposing ``encode_plus``.
        max_len: Maximum and padded sequence length.
        is_test: Unused.

    Returns:
        Dict with ``input_ids``, ``token_type_ids`` and ``attention_mask``
        lists.
    """
    text = data.replace('\n', '')
    encoded = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True
    )
    # The same zero padding is appended to all three fields. A negative count
    # (sequence longer than max_len) yields an empty list, i.e. no padding.
    pad = [0] * (max_len - len(encoded['input_ids']))
    return {
        field: encoded[field] + pad
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    }

In [31]:
class DatasetRetriever(Dataset):
    """Dataset yielding padded token tensors (and the label outside test mode).

    Args:
        data: DataFrame with an ``excerpt`` column and, unless ``is_test`` is
            True, a ``target`` column.
        tokenizer: Tokenizer forwarded to ``convert_examples_to_features``.
        max_len: Padded sequence length.
        is_test: When True no label is read or returned.
    """

    # Fields copied from the feature dict into long tensors, in output order.
    _FEATURE_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.excerpts = self.data.excerpt.values.tolist()
        # __getitem__ reads self.targets in training mode, so it has to be
        # populated here; test data carries no target column.
        self.targets = None if is_test else self.data.target.values.tolist()
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the tensors for row ``item``; adds ``label`` unless in test mode."""
        features = convert_examples_to_features(
            self.excerpts[item], self.tokenizer,
            self.max_len, self.is_test
        )
        sample = {
            key: torch.tensor(features[key], dtype=torch.long)
            for key in self._FEATURE_KEYS
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.double)
        return sample

In [32]:
class CommonLitModel(nn.Module):
    """RoBERTa encoder followed by LayerNorm, dropout and a one-unit linear regressor.

    Args:
        model_name: Name or path handed to ``RobertaModel.from_pretrained``.
        config: Configuration object; ``hidden_size`` and ``initializer_range``
            are read from it.
        multisample_dropout: If True use five ``Dropout(0.5)`` layers whose
            regressor outputs are averaged, otherwise one ``Dropout(0.3)``.
        output_hidden_states: Forwarded to ``RobertaModel.from_pretrained``.
    """
    def __init__(
        self, 
        model_name, 
        config,  
        multisample_dropout=False,
        output_hidden_states=False
    ):
        super(CommonLitModel, self).__init__()
        self.config = config
        self.roberta = RobertaModel.from_pretrained(
            model_name, 
            output_hidden_states=output_hidden_states
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        if multisample_dropout:
            self.dropouts = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            self.dropouts = nn.ModuleList([nn.Dropout(0.3)])
        self.regressor = nn.Linear(config.hidden_size, 1)
        # Only the two layers added on top of the encoder are re-initialised.
        self._init_weights(self.layer_norm)
        self._init_weights(self.regressor)
 
    def _init_weights(self, module):
        """Initialise ``module`` in place according to its layer type.

        Linear / Embedding weights are drawn from a normal distribution with
        std ``config.initializer_range`` (bias and padding row zeroed);
        LayerNorm gets weight 1 and bias 0.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
 
    def forward(
        self, 
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        """Run the encoder and the regression head.

        Returns:
            ``(logits, *outputs[1:])`` when ``labels`` is None, otherwise
            ``(loss, logits, *outputs[1:])`` where ``loss`` is the square root
            of the MSE between the flattened logits and labels.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # NOTE(review): element 1 of the encoder output is used here; despite
        # the local name this is presumably the pooled output, not the
        # per-token sequence output -- confirm against RobertaModel.
        sequence_output = outputs[1]
        sequence_output = self.layer_norm(sequence_output)
 
        # multi-sample dropout
        # Regressor outputs of all dropout layers are summed, then averaged.
        for i, dropout in enumerate(self.dropouts):
            if i == 0:
                logits = self.regressor(dropout(sequence_output))
            else:
                logits += self.regressor(dropout(sequence_output))
        
        logits /= len(self.dropouts)
 
        # calculate loss
        loss = None
        if labels is not None:
            loss_fn = torch.nn.MSELoss()
            # Flatten and cast the logits to the label dtype before comparing.
            logits = logits.view(-1).to(labels.dtype)
            loss = torch.sqrt(loss_fn(logits, labels.view(-1)))
        
        output = (logits,) + outputs[1:]
        return ((loss,) + output) if loss is not None else output

In [33]:
def make_model(model_name='roberta-large', num_labels=1):
    """Build a ``CommonLitModel`` and the matching RoBERTa tokenizer.

    Args:
        model_name: Name or path of the pretrained RoBERTa checkpoint.
        num_labels: Value written into the configuration as ``num_labels``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    tokenizer = RobertaTokenizer.from_pretrained(model_name)

    model_config = RobertaConfig.from_pretrained(model_name)
    model_config.update({'num_labels':num_labels})

    return CommonLitModel(model_name, config=model_config), tokenizer

def make_loader(
    data, 
    tokenizer, 
    max_len,
    batch_size,
):
    """Build a sequential, non-shuffled DataLoader over ``data`` in test mode.

    Args:
        data: DataFrame passed to ``DatasetRetriever``.
        tokenizer: Tokenizer passed to ``DatasetRetriever``.
        max_len: Padded sequence length.
        batch_size: Nominal batch size; the loader uses half of it.

    Returns:
        ``DataLoader`` yielding the test batches in row order.
    """
    test_dataset = DatasetRetriever(data, tokenizer, max_len, is_test=True)
    test_sampler = SequentialSampler(test_dataset)
    test_loader = DataLoader(
        test_dataset, 
        # Half the nominal batch size, but never below 1: batch_size=1 would
        # otherwise become 0, which DataLoader rejects.
        batch_size=max(1, batch_size // 2), 
        sampler=test_sampler, 
        pin_memory=False, 
        drop_last=False, 
        num_workers=0
    )

    return test_loader

In [34]:
class Evaluator:
    """Runs a model over a data loader without gradients and collects predictions."""

    def __init__(self, model, scalar=None):
        """Store the model and the optional mixed-precision marker.

        Args:
            model: module called with ``input_ids``, ``attention_mask`` and
                ``token_type_ids`` keywords; element 0 of its return value is
                read as the predictions.
            scalar: only compared against None. When it is not None the
                forward pass runs under ``torch.cuda.amp.autocast()``.
        """
        self.model = model
        self.scalar = scalar

    def _input_device(self):
        """Return the device batches are moved to.

        Uses the device of the model's first parameter; falls back to CUDA
        when the model exposes no parameters.
        """
        try:
            return next(self.model.parameters()).device
        except (AttributeError, StopIteration):
            return torch.device('cuda')

    @staticmethod
    def _to_prediction_list(logits):
        """Convert a batch of logits into a Python list of predictions.

        A single-sample batch squeezes down to a 0-d array whose ``tolist()``
        is a bare float; it is wrapped in a list so it can be appended like
        any other batch.
        """
        values = logits.detach().cpu().numpy().squeeze()
        if values.ndim == 0:
            return [values.item()]
        return values.tolist()

    def evaluate(self, data_loader, tokenizer):
        """Predict every batch of ``data_loader`` in order.

        Args:
            data_loader: iterable of dict batches holding ``'input_ids'``,
                ``'attention_mask'`` and ``'token_type_ids'`` tensors.
            tokenizer: accepted for interface compatibility; not used.

        Returns:
            A flat list with the predictions of all batches concatenated.
        """
        preds = []
        self.model.eval()
        device = self._input_device()
        with torch.no_grad():
            for batch_data in data_loader:
                inputs = {
                    key: batch_data[key].to(device)
                    for key in ('input_ids', 'attention_mask', 'token_type_ids')
                }

                if self.scalar is not None:
                    with torch.cuda.amp.autocast():
                        outputs = self.model(**inputs)
                else:
                    outputs = self.model(**inputs)

                preds.extend(self._to_prediction_list(outputs[0]))
        return preds

In [35]:
def config(fold, model_name, load_model_path):
    """Prepare the model, tokenizer and test loader for one fold.

    NOTE(review): this function reuses the name ``config`` that cell In [3]
    binds to a settings dict, so that dict is no longer reachable once this
    cell has run.

    Args:
        fold: fold index; selects ``{load_model_path}/model{fold}.bin``.
        model_name: name or path forwarded to ``make_model``.
        load_model_path: directory holding the per-fold checkpoints.

    Returns:
        ``(model, tokenizer, test_loader, scaler)`` where ``scaler`` is
        always None.

    Raises:
        ValueError: if no CUDA device is available.
    """
    # Check for a GPU before any loading so a CPU-only host gets this error
    # instead of failing inside the checkpoint load.
    gpu_count = torch.cuda.device_count()
    if gpu_count < 1:
        raise ValueError('CPU training is not supported')

    torch.manual_seed(2021)
    torch.cuda.manual_seed(2021)
    torch.cuda.manual_seed_all(2021)

    max_len = 250
    batch_size = 8

    model, tokenizer = make_model(
        model_name=model_name,
        num_labels=1
    )
    # Deserialize onto the CPU; the model is moved to the GPU once, below.
    state_dict = torch.load(
        f'{load_model_path}/model{fold}.bin',
        map_location='cpu'
    )
    model.load_state_dict(state_dict)

    # ``test`` is a notebook-level name that must be defined by an earlier
    # cell (not visible here -- TODO confirm).
    test_loader = make_loader(
        test, tokenizer, max_len=max_len,
        batch_size=batch_size
    )

    print('Model pushed to {} GPU(s), type {}.'.format(
        gpu_count,
        torch.cuda.get_device_name(0))
    )
    model = model.cuda()

    # No scaler is created, so the returned value is None.
    scaler = None
    return (
        model, tokenizer,
        test_loader, scaler
    )

In [36]:
def run(fold=0, model_name=None, load_model_path=None):
    """Predict the test set with the checkpoint of a single fold.

    Args:
        fold: fold index forwarded to ``config``.
        model_name: name or path forwarded to ``config``.
        load_model_path: checkpoint directory forwarded to ``config``.

    Returns:
        The list of predictions produced by ``Evaluator.evaluate``.
    """
    model, tokenizer, \
        test_loader, scaler = config(fold, model_name, load_model_path)

    evaluator = Evaluator(model, scaler)

    try:
        preds = evaluator.evaluate(test_loader, tokenizer)
    finally:
        # The evaluator keeps its own reference to the model, so it has to be
        # dropped as well; otherwise the model is still alive when the CUDA
        # cache is emptied. Runs on failure too.
        del evaluator, model, tokenizer, test_loader, scaler
        gc.collect()
        torch.cuda.empty_cache()

    return preds

In [37]:
# One frame per model family; each fold contributes one column of test
# predictions. The two runs of a fold are executed back to back.
pred_df1, pred_df2 = pd.DataFrame(), pd.DataFrame()
for fold in tqdm(range(5)):
    pred_df1[f'fold{fold}'] = run(
        fold=fold,
        model_name='../input/roberta/roberta-base/roberta-base/',
        load_model_path='../input/commonlit-roberta-base-i/',
    )
    pred_df2[f'fold{fold}'] = run(
        fold=fold,
        model_name='../input/robertalarge/',
        load_model_path='../input/robertalargeitptfit/',
    )

  0%|          | 0/5 [00:00<?, ?it/s]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 20%|██        | 1/5 [00:50<03:20, 50.17s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 40%|████      | 2/5 [01:36<02:24, 48.17s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 60%|██████    | 3/5 [02:06<01:19, 39.64s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 80%|████████  | 4/5 [02:37<00:36, 36.19s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


100%|██████████| 5/5 [03:07<00:00, 37.59s/it]


In [38]:
# Average the fold columns row by row: one value per test row, as a plain list.
last_preds1 = pred_df1.mean(axis='columns').to_numpy().tolist()
last_preds2 = pred_df2.mean(axis='columns').to_numpy().tolist()

In [39]:
# Equal-weight blend of every prediction set, summed left to right.
ensemble_members = [
    svm_preds1[:,0], svm_preds2[:,0], svm_preds3[:,0], svm_preds4[:,0], svm_preds5[:,0],
    lgbm_preds, lgbm_preds_tda,
    preds1, preds2, preds3, preds4, preds5, preds6,
    last_preds1, last_preds2,
]
preds = ensemble_members[0]
for member in ensemble_members[1:]:
    # Out-of-place add: the first member is a view and must not be modified.
    preds = preds + member
preds = preds / len(ensemble_members)

In [40]:
# Write the blended predictions into the sample submission's target column
# and save the result without the index.
submission = pd.read_csv(
    "../input/commonlitreadabilityprize/sample_submission.csv"
).assign(target=preds)
submission.to_csv("submission.csv", index=False)