In [1]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVR
from catboost import CatBoostRegressor, Pool, CatBoost

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
# Short aliases for colorama's ANSI colour codes, for use in coloured print() output.
y_, r_, g_, b_, m_, c_ = (
    Fore.YELLOW,
    Fore.RED,
    Fore.GREEN,
    Fore.BLUE,
    Fore.MAGENTA,
    Fore.CYAN,
)
# Appending this to a coloured string restores the terminal's default style.
sr_ = Style.RESET_ALL

In [2]:
# Competition inputs: labelled training excerpts, unlabelled test excerpts and
# the submission template that the final cell fills in.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Discretise the continuous target into 1 + log2(n) equal-width bins so that
# StratifiedKFold can be used on a regression problem.
num_bins = int(np.floor(np.log2(len(train_data)) + 1))
train_data['bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)

# Plain numpy views used by the CV helpers below.
target = train_data['target'].to_numpy()
bins = train_data['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [3]:
# Shared settings for the embedding-extraction / SVR part of the notebook.
# NOTE(review): a later cell defines a function that is also called `config`;
# once that cell has run, this dict is no longer reachable under this name.
config = dict(
    batch_size=128,  # excerpts per DataLoader batch when extracting embeddings
    max_len=256,     # tokenizer max_length (pad / truncate to this many tokens)
    nfolds=5,        # number of CV folds
    seed=42,         # seed passed to seed_everything and the shuffled CV splits
)

def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports `PYTHONHASHSEED` and puts cuDNN in deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable Python reads is PYTHONHASHSEED. Setting it here cannot change
    # hashing in the already-running interpreter, but child processes inherit it.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the determinism requested above, so it is switched off.
    torch.backends.cudnn.benchmark = False

# Fix all RNGs up front so the shuffled CV splits below are repeatable.
seed_everything(config['seed'])

In [4]:
class CLRPDataset(Dataset):
    """Dataset that tokenizes one excerpt per item.

    Args:
        df: frame with an 'excerpt' column.
        tokenizer: callable invoked as
            `tokenizer(text, return_tensors='pt', max_length=..., padding='max_length', truncation=True)`.
        max_len: pad/truncate length. When omitted it falls back to
            `config['max_len']`, looked up once at construction time.
    """
    def __init__(self,df,tokenizer,max_len=None):
        self.excerpt = df['excerpt'].to_numpy()
        self.tokenizer = tokenizer
        # Resolve the length now instead of on every __getitem__: the global
        # name `config` is rebound to a function later in this notebook, which
        # would otherwise break item access after that cell has run.
        self.max_len = config['max_len'] if max_len is None else max_len

    def __getitem__(self,idx):
        """Return the tokenizer's encoding of excerpt `idx`, unchanged."""
        return self.tokenizer(self.excerpt[idx],return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',truncation=True)

    def __len__(self):
        """Number of excerpts."""
        return len(self.excerpt)

In [5]:
class AttentionHead(nn.Module):
    """Attention pooling: collapse dim 1 of `features` into a weighted sum.

    A score is computed for every position as `V(tanh(W(x)))`, the scores are
    soft-maxed along dim 1 and used to weight the input features.

    Args:
        in_features: size of the last dimension of the input.
        hidden_dim: width of the intermediate tanh projection.
        num_targets: accepted but not used by this module.

    Note: `out_features` is set to `hidden_dim`, whereas the pooled vector
    keeps the input's last dimension (`in_features`).
    """
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim

        # Submodule names are part of the checkpoint's state-dict keys.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Return the attention-weighted sum of `features` over dim 1."""
        scores = self.V(torch.tanh(self.W(features)))
        # Normalise across dim 1 so the weights of each sample sum to one.
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)

In [6]:
class Model(nn.Module):
    """RoBERTa backbone followed by attention pooling; used as a feature extractor.

    `forward` returns the pooled vector produced by `AttentionHead`, not a
    regression value: `dropout` and `linear` are created but never applied.
    NOTE(review): they are presumably kept so that the fine-tuned checkpoints'
    state-dict keys still match - confirm before removing.
    """
    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained('../input/roberta-base')
        self.head = AttentionHead(768,768,1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.head.out_features,1)

    def forward(self,**xb):
        """Run the backbone on the tokenizer kwargs `xb` and pool its first output."""
        token_states = self.roberta(**xb)[0]
        return self.head(token_states)

In [7]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed every excerpt in `df` with the checkpoint stored at `path`.

    Builds a fresh `Model`, loads the weights, and runs it in eval mode over
    `df` in its original row order.

    Args:
        df: frame with an 'excerpt' column.
        path: file path of a state dict compatible with `Model`.
        plot_losses: unused; kept for call compatibility.
        verbose: unused; kept for call compatibility.

    Returns:
        np.ndarray with one row of pooled features per row of `df`.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = Model()
    # map_location lets checkpoints saved from a GPU run load on the CPU
    # fallback chosen above instead of failing inside torch.load.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,  # keep embeddings aligned with df's rows
                  num_workers = 4,
                  pin_memory=(device.type == "cuda"),  # pinning only helps host-to-GPU copies
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Iterating the loader directly lets tqdm show the number of batches.
        for inputs in tqdm(dl):
            # Flatten each tensor to (batch, -1) before feeding the model;
            # presumably this removes the extra per-item dim added by the
            # tokenizer's return_tensors='pt' - confirm against CLRPDataset.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs.detach().cpu().numpy())
    return np.array(embeddings)

In [8]:
# Embed train and test with each of the five fine-tuned checkpoints
# (model0 ... model4), in the order train-then-test per checkpoint.
# Every get_embeddings call builds the model and reloads its weights.
_embeddings_by_model = []
for _model_idx in range(5):
    _ckpt_path = f'../input/clr-roberta/model{_model_idx}/model{_model_idx}.bin'
    _embeddings_by_model.append(
        (get_embeddings(train_data, _ckpt_path), get_embeddings(test_data, _ckpt_path))
    )

# Later cells refer to the results through these numbered names.
(
    (train_embeddings1, test_embeddings1),
    (train_embeddings2, test_embeddings2),
    (train_embeddings3, test_embeddings3),
    (train_embeddings4, test_embeddings4),
    (train_embeddings5, test_embeddings5),
) = _embeddings_by_model

cuda is used


23it [00:23,  1.01s/it]


cuda is used


1it [00:00,  6.55it/s]


cuda is used


23it [00:21,  1.06it/s]


cuda is used


1it [00:00,  6.49it/s]


cuda is used


23it [00:21,  1.06it/s]


cuda is used


1it [00:00,  5.47it/s]


cuda is used


23it [00:22,  1.04it/s]


cuda is used


1it [00:00,  5.07it/s]


cuda is used


23it [00:21,  1.05it/s]


cuda is used


1it [00:00,  4.81it/s]


## SVM regression (SVR) on the RoBERTa embeddings

In [9]:
def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=10,kernel='rbf'):
    """Cross-validate an SVR on `X`/`y` and return fold-averaged test predictions.

    For each stratified fold an `SVR(C, kernel, gamma='auto')` is fitted on the
    training part, its validation RMSE is printed, and its predictions for
    `X_test` are accumulated.

    Args:
        X: training feature matrix, indexable by integer index arrays.
        y: training targets aligned with `X`.
        X_test: test feature matrix.
        bins: per-row labels used to stratify the folds (defaults to the
            module-level `bins` as it was when this function was defined).
        nfolds: number of folds; drives both the split and the averaging.
        C: SVR regularisation parameter.
        kernel: SVR kernel name.

    Returns:
        1-D array with the mean of the per-fold predictions for `X_test`.

    NOTE(review): the logs printed below this cell show one clearly worse fold
    per checkpoint. If each checkpoint was fine-tuned on part of the rows in
    `X`, these CV scores are optimistic - confirm.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))

    # Split into exactly `nfolds` folds. The original used config['nfolds']
    # here while dividing by `nfolds` below, so a non-default `nfolds`
    # silently produced a wrongly scaled average.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = SVR(C=C,kernel=kernel,gamma='auto')
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return preds/nfolds

In [10]:
# One SVR cross-validation run per embedding set, in checkpoint order.
_svm_preds_by_model = [
    get_preds_svm(_train_emb, target, _test_emb)
    for _train_emb, _test_emb in (
        (train_embeddings1, test_embeddings1),
        (train_embeddings2, test_embeddings2),
        (train_embeddings3, test_embeddings3),
        (train_embeddings4, test_embeddings4),
        (train_embeddings5, test_embeddings5),
    )
]
svm_preds1, svm_preds2, svm_preds3, svm_preds4, svm_preds5 = _svm_preds_by_model

Fold 0 , rmse score: 0.47727497027807975
Fold 1 , rmse score: 0.2578880377682382
Fold 2 , rmse score: 0.2541082644861084
Fold 3 , rmse score: 0.24054325983483563
Fold 4 , rmse score: 0.2551938404392728
mean rmse 0.297001674561307
Fold 0 , rmse score: 0.2425235616562364
Fold 1 , rmse score: 0.5016510069411838
Fold 2 , rmse score: 0.23847703706225037
Fold 3 , rmse score: 0.23492862892763802
Fold 4 , rmse score: 0.24930508381707772
mean rmse 0.2933770636808773
Fold 0 , rmse score: 0.3908661185328736
Fold 1 , rmse score: 0.4135395361379846
Fold 2 , rmse score: 0.4895740066653232
Fold 3 , rmse score: 0.3756053322147557
Fold 4 , rmse score: 0.4008420352519307
mean rmse 0.4140854057605735
Fold 0 , rmse score: 0.2878583154409844
Fold 1 , rmse score: 0.27540087825175247
Fold 2 , rmse score: 0.2809937810667762
Fold 3 , rmse score: 0.45763179894261946
Fold 4 , rmse score: 0.2852643347020617
mean rmse 0.31742982168083883
Fold 0 , rmse score: 0.4013671778501825
Fold 1 , rmse score: 0.42662641489545

In [11]:
# Libraries for topological data analysis: pyflagser, then giotto-tda (imported as `gtda` below).
# --no-index makes pip ignore the package index and --find-links points it at the
# wheel files under /kaggle/input instead.
!pip install pyflagser --no-index --find-links=file:///kaggle/input/roberta/pyflagser-0.4.4-cp37-cp37m-manylinux2010_x86_64.whl
!pip install giotto-tda --no-index --find-links=file:///kaggle/input/roberta/giotto_tda-0.4.0-cp37-cp37m-manylinux2010_x86_64.whl

Looking in links: file:///kaggle/input/roberta/pyflagser-0.4.4-cp37-cp37m-manylinux2010_x86_64.whl
Processing /kaggle/input/roberta/pyflagser-0.4.4-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: pyflagser
Successfully installed pyflagser-0.4.4
Looking in links: file:///kaggle/input/roberta/giotto_tda-0.4.0-cp37-cp37m-manylinux2010_x86_64.whl
Processing /kaggle/input/roberta/giotto_tda-0.4.0-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: giotto-tda
Successfully installed giotto-tda-0.4.0


In [12]:
#giotto-tda , topological features extraction
from gtda.time_series import TakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import Amplitude, NumberOfPoints, PersistenceEntropy

def extract_features(X, time_delay=1, dimension=1, stride=10, h_dim=(0,1), n_jobs=-1):
    """Turn each row of `X` into eight topological summary features.

    Pipeline: Takens embedding of `X` -> Vietoris-Rips persistence diagrams ->
    six amplitudes (bottleneck, wasserstein, betti, landscape, silhouette,
    heat) followed by normalised and un-normalised persistence entropy.
    Only column 0 of each transformer's output is kept (presumably the first
    homology dimension listed in `h_dim` - confirm against giotto-tda).

    Args:
        X: input passed to `TakensEmbedding.fit_transform`.
        time_delay, dimension, stride: forwarded to `TakensEmbedding`.
        h_dim: homology dimensions forwarded to `VietorisRipsPersistence`.
        n_jobs: parallelism forwarded to the giotto-tda transformers.

    Returns:
        Array with one row per diagram and eight columns, in the order above.

    NOTE(review): every transformer is fitted on the `X` passed in, so train
    and test features come from separately fitted transformers when this is
    called once per split - confirm this is intended.
    """
    embedder = TakensEmbedding(time_delay=time_delay, dimension=dimension, stride=stride)
    point_clouds = embedder.fit_transform(X)

    persistence = VietorisRipsPersistence(homology_dimensions=h_dim, n_jobs=n_jobs)
    diagrams = persistence.fit_transform(point_clouds)

    columns = []

    # Amplitude of each diagram under six different metrics.
    for metric in ('bottleneck','wasserstein', 'betti', 'landscape', 'silhouette', 'heat'):
        amplitudes = Amplitude(metric=metric, n_jobs=n_jobs).fit_transform(diagrams)
        columns.append(amplitudes[:,0])

    # Persistence entropy, first normalised and then raw.
    for normalize in (True, False):
        entropy = PersistenceEntropy(normalize=normalize, nan_fill_value=-1, n_jobs=n_jobs)
        columns.append(entropy.fit_transform(diagrams)[:,0])

    # Features were collected one per list entry; transpose to (rows, features).
    return np.array(columns).T

In [13]:
# Topological features for every embedding set (train first, then test),
# using stride=2 instead of the function's default of 10.
_tda_by_model = [
    (extract_features(_train_emb, stride=2), extract_features(_test_emb, stride=2))
    for _train_emb, _test_emb in (
        (train_embeddings1, test_embeddings1),
        (train_embeddings2, test_embeddings2),
        (train_embeddings3, test_embeddings3),
        (train_embeddings4, test_embeddings4),
        (train_embeddings5, test_embeddings5),
    )
]

(
    (tda_features_train1, tda_features_test1),
    (tda_features_train2, tda_features_test2),
    (tda_features_train3, tda_features_test3),
    (tda_features_train4, tda_features_test4),
    (tda_features_train5, tda_features_test5),
) = _tda_by_model

In [14]:
# standard scaler on tda features 
from sklearn.preprocessing import StandardScaler
# One scaler per embedding set, fitted on that set's training features only.
scaler1, scaler2, scaler3, scaler4, scaler5 = [
    StandardScaler().fit(_train_features)
    for _train_features in (
        tda_features_train1,
        tda_features_train2,
        tda_features_train3,
        tda_features_train4,
        tda_features_train5,
    )
]

StandardScaler()

In [15]:
# Standardise train and test features with the scaler fitted on the matching train set.
tda_train_norm1, tda_test_norm1 = [scaler1.transform(f) for f in (tda_features_train1, tda_features_test1)]
tda_train_norm2, tda_test_norm2 = [scaler2.transform(f) for f in (tda_features_train2, tda_features_test2)]
tda_train_norm3, tda_test_norm3 = [scaler3.transform(f) for f in (tda_features_train3, tda_features_test3)]
tda_train_norm4, tda_test_norm4 = [scaler4.transform(f) for f in (tda_features_train4, tda_features_test4)]
tda_train_norm5, tda_test_norm5 = [scaler5.transform(f) for f in (tda_features_train5, tda_features_test5)]

In [16]:
### LightGBM on topological (TDA) features

# Embedding set 1: 5-fold LightGBM on the scaled TDA features; the test
# prediction is the mean of the five fold models.
lgbm_preds_tda1 = np.zeros(test_data.shape[0])

# NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect
# when a bagging frequency is also set - confirm whether bagging was intended.
params = dict(
    reg_alpha=6.147694913504962,
    reg_lambda=0.002457826062076097,
    colsample_bytree=0.3,
    subsample=0.8,
    learning_rate=1e-2,
    max_depth=20,
    num_leaves=111,
    min_child_samples=285,
    random_state=42,
    verbose=-1,
    n_estimators=160000,
    metric='rmse',
    cat_smooth=39,
)

nfolds=5
# Unshuffled stratified split on the target bins (the SVR helper shuffles instead).
kfold = StratifiedKFold(n_splits=nfolds)
for k, (train_idx, valid_idx) in enumerate(kfold.split(train_data, train_data['bins'])):

    lgb_train = lgb.Dataset(tda_train_norm1[train_idx], label=target[train_idx].ravel())
    lgb_valid = lgb.Dataset(tda_train_norm1[valid_idx], label=target[valid_idx].ravel())

    lgb_model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=800,
        verbose_eval=10000,
    )

    # Each fold contributes 1/nfolds of the final test prediction.
    lgbm_preds_tda1 += lgb_model.predict(tda_test_norm1)/nfolds
    
####

Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.404787	valid_1's rmse: 0.50171
[20000]	training's rmse: 0.404282	valid_1's rmse: 0.501281
[30000]	training's rmse: 0.404282	valid_1's rmse: 0.501281
[40000]	training's rmse: 0.404282	valid_1's rmse: 0.501281
Early stopping, best iteration is:
[48361]	training's rmse: 0.404282	valid_1's rmse: 0.501281
Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.408614	valid_1's rmse: 0.505876
[20000]	training's rmse: 0.407509	valid_1's rmse: 0.505058
[30000]	training's rmse: 0.407509	valid_1's rmse: 0.505056
Early stopping, best iteration is:
[31968]	training's rmse: 0.407509	valid_1's rmse: 0.505056
Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.416634	valid_1's rmse: 0.415634
Early stopping, best iteration is:
[11772]	training's rmse: 0.415705	valid_1's rmse: 0.415267
Training until validation scores don't improve for 800 rounds

In [17]:
# LightGBM on the scaled TDA features of embedding set 2
# (same hyper-parameters and fold scheme as for the other sets).
lgbm_preds_tda2 = np.zeros(test_data.shape[0])

params = dict(
    reg_alpha=6.147694913504962,
    reg_lambda=0.002457826062076097,
    colsample_bytree=0.3,
    subsample=0.8,
    learning_rate=1e-2,
    max_depth=20,
    num_leaves=111,
    min_child_samples=285,
    random_state=42,
    verbose=-1,
    n_estimators=160000,
    metric='rmse',
    cat_smooth=39,
)

nfolds=5
kfold = StratifiedKFold(n_splits=nfolds)
for k, (train_idx, valid_idx) in enumerate(kfold.split(train_data, train_data['bins'])):

    lgb_train = lgb.Dataset(tda_train_norm2[train_idx], label=target[train_idx].ravel())
    lgb_valid = lgb.Dataset(tda_train_norm2[valid_idx], label=target[valid_idx].ravel())

    lgb_model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=800,
        verbose_eval=10000,
    )

    # Each fold contributes 1/nfolds of the final test prediction.
    lgbm_preds_tda2 += lgb_model.predict(tda_test_norm2)/nfolds

Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.502021	valid_1's rmse: 0.656131
Early stopping, best iteration is:
[11728]	training's rmse: 0.501561	valid_1's rmse: 0.655667
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[8587]	training's rmse: 0.503213	valid_1's rmse: 0.639628
Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.53403	valid_1's rmse: 0.491294
Early stopping, best iteration is:
[11427]	training's rmse: 0.532508	valid_1's rmse: 0.490417
Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.528062	valid_1's rmse: 0.517515
Early stopping, best iteration is:
[11380]	training's rmse: 0.526618	valid_1's rmse: 0.517124
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[8814]	training's rmse: 0.510829	valid_1's rmse: 0.596892


In [18]:
# LightGBM on the scaled TDA features of embedding set 3
# (same hyper-parameters and fold scheme as for the other sets).
lgbm_preds_tda3 = np.zeros(test_data.shape[0])

params = dict(
    reg_alpha=6.147694913504962,
    reg_lambda=0.002457826062076097,
    colsample_bytree=0.3,
    subsample=0.8,
    learning_rate=1e-2,
    max_depth=20,
    num_leaves=111,
    min_child_samples=285,
    random_state=42,
    verbose=-1,
    n_estimators=160000,
    metric='rmse',
    cat_smooth=39,
)

nfolds=5
kfold = StratifiedKFold(n_splits=nfolds)
for k, (train_idx, valid_idx) in enumerate(kfold.split(train_data, train_data['bins'])):

    lgb_train = lgb.Dataset(tda_train_norm3[train_idx], label=target[train_idx].ravel())
    lgb_valid = lgb.Dataset(tda_train_norm3[valid_idx], label=target[valid_idx].ravel())

    lgb_model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=800,
        verbose_eval=10000,
    )

    # Each fold contributes 1/nfolds of the final test prediction.
    lgbm_preds_tda3 += lgb_model.predict(tda_test_norm3)/nfolds

Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[4414]	training's rmse: 0.538014	valid_1's rmse: 0.656611
Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.542316	valid_1's rmse: 0.629098
Early stopping, best iteration is:
[10688]	training's rmse: 0.542072	valid_1's rmse: 0.628995
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[6103]	training's rmse: 0.553817	valid_1's rmse: 0.562122
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[6898]	training's rmse: 0.552445	valid_1's rmse: 0.559584
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[6620]	training's rmse: 0.53712	valid_1's rmse: 0.633589


In [19]:
# LightGBM on the scaled TDA features of embedding set 4
# (same hyper-parameters and fold scheme as for the other sets).
lgbm_preds_tda4 = np.zeros(test_data.shape[0])

params = dict(
    reg_alpha=6.147694913504962,
    reg_lambda=0.002457826062076097,
    colsample_bytree=0.3,
    subsample=0.8,
    learning_rate=1e-2,
    max_depth=20,
    num_leaves=111,
    min_child_samples=285,
    random_state=42,
    verbose=-1,
    n_estimators=160000,
    metric='rmse',
    cat_smooth=39,
)

nfolds=5
kfold = StratifiedKFold(n_splits=nfolds)
for k, (train_idx, valid_idx) in enumerate(kfold.split(train_data, train_data['bins'])):

    lgb_train = lgb.Dataset(tda_train_norm4[train_idx], label=target[train_idx].ravel())
    lgb_valid = lgb.Dataset(tda_train_norm4[valid_idx], label=target[valid_idx].ravel())

    lgb_model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=800,
        verbose_eval=10000,
    )

    # Each fold contributes 1/nfolds of the final test prediction.
    lgbm_preds_tda4 += lgb_model.predict(tda_test_norm4)/nfolds

Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[9075]	training's rmse: 0.572141	valid_1's rmse: 0.789692
Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.601167	valid_1's rmse: 0.683834
Early stopping, best iteration is:
[9263]	training's rmse: 0.601181	valid_1's rmse: 0.683826
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[8286]	training's rmse: 0.605616	valid_1's rmse: 0.650489
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[6638]	training's rmse: 0.617769	valid_1's rmse: 0.621675
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[8526]	training's rmse: 0.593702	valid_1's rmse: 0.671693


In [20]:
# LightGBM on the scaled TDA features of embedding set 5
# (same hyper-parameters and fold scheme as for the other sets).
lgbm_preds_tda5 = np.zeros(test_data.shape[0])

params = dict(
    reg_alpha=6.147694913504962,
    reg_lambda=0.002457826062076097,
    colsample_bytree=0.3,
    subsample=0.8,
    learning_rate=1e-2,
    max_depth=20,
    num_leaves=111,
    min_child_samples=285,
    random_state=42,
    verbose=-1,
    n_estimators=160000,
    metric='rmse',
    cat_smooth=39,
)

nfolds=5
kfold = StratifiedKFold(n_splits=nfolds)
for k, (train_idx, valid_idx) in enumerate(kfold.split(train_data, train_data['bins'])):

    lgb_train = lgb.Dataset(tda_train_norm5[train_idx], label=target[train_idx].ravel())
    lgb_valid = lgb.Dataset(tda_train_norm5[valid_idx], label=target[valid_idx].ravel())

    lgb_model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=800,
        verbose_eval=10000,
    )

    # Each fold contributes 1/nfolds of the final test prediction.
    lgbm_preds_tda5 += lgb_model.predict(tda_test_norm5)/nfolds

Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.515625	valid_1's rmse: 0.626008
Early stopping, best iteration is:
[11538]	training's rmse: 0.515612	valid_1's rmse: 0.625981
Training until validation scores don't improve for 800 rounds
[10000]	training's rmse: 0.530877	valid_1's rmse: 0.575071
Early stopping, best iteration is:
[11924]	training's rmse: 0.529506	valid_1's rmse: 0.574773
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[3866]	training's rmse: 0.541741	valid_1's rmse: 0.538031
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[2808]	training's rmse: 0.544336	valid_1's rmse: 0.587953
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[6396]	training's rmse: 0.522228	valid_1's rmse: 0.615413


In [21]:
# Blend of the two embedding-based models: 75% mean SVR prediction plus
# 25% mean LightGBM-on-TDA prediction. The result keeps the name `svm_preds`
# because the final submission cell reads it under that name.
_svr_sum = svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5
_lgbm_tda_sum = lgbm_preds_tda1+ lgbm_preds_tda2 + lgbm_preds_tda3 + lgbm_preds_tda4 + lgbm_preds_tda5
svm_preds = 0.75*_svr_sum/5 + 0.25*_lgbm_tda_sum/5

**Part 2: the second notebook (inference with the fine-tuned RoBERTa regressors)**

In [22]:
%matplotlib inline
from glob import glob
import os
import matplotlib.pyplot as plt
import json
from collections import defaultdict
import gc
gc.enable()

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)
from transformers import RobertaConfig
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup
)
from transformers import RobertaTokenizer
from transformers import RobertaModel
from IPython.display import clear_output
from tqdm import tqdm, trange

In [24]:
def convert_examples_to_features(data, tokenizer, max_len, is_test=False):
    """Tokenize one excerpt and right-pad every field to `max_len` with zeros.

    Newlines are removed from `data` (without inserting a space) before it is
    passed to `tokenizer.encode_plus` with truncation enabled.

    Args:
        data: the excerpt text.
        tokenizer: object exposing `encode_plus`.
        max_len: target length of each returned list.
        is_test: unused; kept for call compatibility.

    Returns:
        dict with 'input_ids', 'token_type_ids' and 'attention_mask' lists.
    """
    text = data.replace('\n', '')
    encoded = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True
    )
    # The same zero filler is appended to all three fields; for the attention
    # mask that marks the padded positions as masked out.
    filler = [0] * (max_len - len(encoded['input_ids']))
    return {
        field: encoded[field] + filler
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    }

In [25]:
class DatasetRetriever(Dataset):
    """Dataset yielding padded token tensors (and the label when not in test mode).

    Args:
        data: frame with an `excerpt` column, plus a `target` column when
            `is_test` is False.
        tokenizer: tokenizer handed to `convert_examples_to_features`.
        max_len: padded sequence length.
        is_test: when True no labels are read or returned.
    """
    # Tensor fields produced for every item, in output order.
    _FEATURE_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.excerpts = self.data.excerpt.values.tolist()
        # The original never populated `targets`, so fetching any item with
        # is_test=False raised AttributeError. Labels are only read when needed.
        self.targets = None if is_test else self.data.target.values.tolist()
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, item):
        """Return long tensors for the token fields, plus a double `label` if labelled."""
        features = convert_examples_to_features(
            self.excerpts[item], self.tokenizer,
            self.max_len, self.is_test
        )
        sample = {
            key: torch.tensor(features[key], dtype=torch.long)
            for key in self._FEATURE_KEYS
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.double)
        return sample

In [26]:
class CommonLitModel(nn.Module):
    """RoBERTa encoder with a LayerNorm + (multi-sample) dropout + linear regression head.

    Args:
        model_name: name or path given to `RobertaModel.from_pretrained`.
        config: configuration object; `hidden_size` and `initializer_range` are read.
        multisample_dropout: if True use five Dropout(0.5) branches whose
            regressor outputs are averaged, otherwise a single Dropout(0.3).
        output_hidden_states: forwarded to `RobertaModel.from_pretrained`.
    """
    def __init__(
        self,
        model_name,
        config,
        multisample_dropout=False,
        output_hidden_states=False
    ):
        super().__init__()
        self.config = config
        self.roberta = RobertaModel.from_pretrained(
            model_name,
            output_hidden_states=output_hidden_states
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        dropout_rates = [0.5] * 5 if multisample_dropout else [0.3]
        self.dropouts = nn.ModuleList([nn.Dropout(rate) for rate in dropout_rates])
        self.regressor = nn.Linear(config.hidden_size, 1)
        # Re-initialise only the freshly created head modules.
        for head_module in (self.layer_norm, self.regressor):
            self._init_weights(head_module)

    def _init_weights(self, module):
        """Initialise `module` in place according to its type.

        Linear / Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the Embedding padding row are zeroed; LayerNorm is
        reset to weight 1 and bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        """Return `(logits, *encoder_outputs[1:])`, prefixed with the RMSE loss when `labels` is given."""
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 1 of the encoder output feeds the head (presumably RoBERTa's
        # pooled output - confirm against the transformers docs).
        pooled = self.layer_norm(outputs[1])

        # Multi-sample dropout: average the regressor output over all dropout branches.
        logits = None
        for dropout in self.dropouts:
            branch_logits = self.regressor(dropout(pooled))
            logits = branch_logits if logits is None else logits + branch_logits
        logits = logits / len(self.dropouts)

        if labels is None:
            return (logits,) + outputs[1:]

        # RMSE between the flattened predictions and labels, in the labels' dtype.
        logits = logits.view(-1).to(labels.dtype)
        loss = torch.sqrt(torch.nn.MSELoss()(logits, labels.view(-1)))
        return (loss, logits) + outputs[1:]

In [27]:
def make_model(model_name, num_labels=1):
    """Build a `CommonLitModel` and its tokenizer from `model_name`.

    Args:
        model_name: name or path understood by the RoBERTa `from_pretrained` loaders.
        num_labels: value written into the loaded config as `num_labels`.

    Returns:
        `(model, tokenizer)`.
    """
    roberta_config = RobertaConfig.from_pretrained(model_name)
    roberta_config.update({'num_labels':num_labels})
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    return CommonLitModel(model_name, config=roberta_config), tokenizer

def make_loader(
    data,
    tokenizer,
    max_len,
    batch_size,
):
    """Wrap `data` in a sequential, unlabelled (test-mode) DataLoader.

    Note that the loader uses HALF of `batch_size` (integer division).
    """
    dataset = DatasetRetriever(data, tokenizer, max_len, is_test=True)
    return DataLoader(
        dataset,
        batch_size=batch_size // 2,
        sampler=SequentialSampler(dataset),  # keep predictions in row order
        pin_memory=False,
        drop_last=False,
        num_workers=0
    )

In [28]:
class Evaluator:
    """Runs a model over a data loader and collects its predictions.

    Args:
        model: module called with `input_ids`, `attention_mask` and
            `token_type_ids` keyword arguments; element 0 of its output is
            taken as the prediction.
        scalar: when not None, inference runs under `torch.cuda.amp.autocast()`.
    """
    _INPUT_KEYS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, model, scalar=None):
        self.model = model
        self.scalar = scalar

    def evaluate(self, data_loader, tokenizer):
        """Return a flat list with one prediction per sample, in loader order.

        Args:
            data_loader: iterable of batches (dicts holding the input tensors).
            tokenizer: unused; kept for call compatibility.
        """
        self.model.eval()
        # Send inputs to wherever the model lives rather than assuming CUDA.
        first_param = next(self.model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

        preds = []
        with torch.no_grad():
            for batch_data in data_loader:
                model_inputs = {
                    key: batch_data[key].to(device) for key in self._INPUT_KEYS
                }

                if self.scalar is not None:
                    with torch.cuda.amp.autocast():
                        outputs = self.model(**model_inputs)
                else:
                    outputs = self.model(**model_inputs)

                # reshape(-1) always yields a 1-D array. The previous squeeze()
                # collapsed a single-sample batch to a scalar, whose tolist()
                # is a float and made `preds += ...` raise TypeError.
                preds += outputs[0].detach().cpu().numpy().reshape(-1).tolist()
        return preds

In [29]:
def config(fold, model_name, load_model_path):
    """Prepare everything needed to run inference with one fold's checkpoint.

    NOTE(review): this function reuses the name `config`, which shadows the
    settings dict defined near the top of the notebook; cells that index that
    dict cannot be re-run after this definition. The name is kept because
    `run` calls it.

    Args:
        fold: fold index; the weights are read from `{load_model_path}/model{fold}.bin`.
        model_name: name or path of the RoBERTa backbone and tokenizer.
        load_model_path: directory holding the per-fold state dicts.

    Returns:
        `(model, tokenizer, test_loader, scaler)` with the model on the GPU and
        `scaler` always None (mixed precision is disabled).

    Raises:
        ValueError: if no CUDA device is available.

    Reads the module-level `test` frame to build the loader.
    """
    # Fail before the (large) checkpoint is read rather than after.
    if torch.cuda.device_count() < 1:
        raise ValueError('CPU training is not supported')

    torch.manual_seed(2021)
    torch.cuda.manual_seed(2021)
    torch.cuda.manual_seed_all(2021)

    max_len = 250
    batch_size = 8

    model, tokenizer = make_model(
        model_name=model_name,
        num_labels=1
    )
    # The model is still on the CPU here, so load the weights onto the CPU too;
    # without map_location a GPU-saved checkpoint is first materialised on the
    # GPU and then copied, briefly holding an extra copy in GPU memory.
    model.load_state_dict(
        torch.load(f'{load_model_path}/model{fold}.bin', map_location='cpu')
    )
    test_loader = make_loader(
        test, tokenizer, max_len=max_len,
        batch_size=batch_size
    )

    print('Model pushed to {} GPU(s), type {}.'.format(
        torch.cuda.device_count(),
        torch.cuda.get_device_name(0))
    )
    model = model.cuda()

    # scaler = torch.cuda.amp.GradScaler()
    scaler = None
    return (
        model, tokenizer,
        test_loader, scaler
    )

In [30]:
def run(fold=0, model_name=None, load_model_path=None):
    """Predict the test set with one fold's checkpoint and free the model afterwards.

    Args:
        fold: fold index of the checkpoint to load.
        model_name: name or path of the RoBERTa backbone and tokenizer.
        load_model_path: directory holding the per-fold state dicts.

    Returns:
        List of predictions, one per test row.
    """
    model, tokenizer, \
        test_loader, scaler = config(fold, model_name, load_model_path)

    evaluator = Evaluator(model, scaler)
    preds = evaluator.evaluate(test_loader, tokenizer)

    # The evaluator holds a reference to the model as well; it has to go too,
    # otherwise the weights are still alive when the CUDA cache is emptied and
    # no GPU memory is actually released here.
    del evaluator, model, tokenizer, test_loader, scaler
    gc.collect()
    torch.cuda.empty_cache()

    return preds

In [31]:
#%%time

import pandas as pd
import numpy as np

train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
# `test` is read as a module-level name when the test loader is built.
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

# One prediction frame per model family; each gets one column per fold.
pred_df1 = pd.DataFrame()
pred_df2 = pd.DataFrame()
pred_df3 = pd.DataFrame()

# (target frame, column-number offset, backbone path, checkpoint directory)
_model_families = (
    (pred_df1, 0, '../input/roberta-base/', '../input/commonlit-roberta-base-i/'),
    (pred_df2, 5, '../input/robertalarge/', '../input/roberta-large-itptfit/'),
    (pred_df3, 10, '../input/robertalarge/', '../input/commonlit-roberta-large-ii/'),
)
for fold in tqdm(range(5)):
    for _frame, _offset, _backbone, _ckpt_dir in _model_families:
        _frame[f'fold{fold + _offset}'] = run(fold, _backbone, _ckpt_dir)

  0%|          | 0/5 [00:00<?, ?it/s]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 20%|██        | 1/5 [01:08<04:33, 68.46s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 40%|████      | 2/5 [02:02<03:00, 60.10s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 60%|██████    | 3/5 [02:56<01:54, 57.28s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 80%|████████  | 4/5 [03:53<00:57, 57.01s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


100%|██████████| 5/5 [04:48<00:00, 57.66s/it]


In [32]:
# Final ensemble: fold-averaged prediction of each model family times its
# weight. The weights 0.35 + 0.20 + 0.15 + 0.30 sum to 1.
_weighted_df2 = pred_df2.mean(axis=1)*0.35
_weighted_df1 = pred_df1.mean(axis=1)*0.20
_weighted_df3 = pred_df3.mean(axis=1) * 0.15
_weighted_svm = svm_preds * 0.30  # the SVR / LightGBM-TDA blend from part 1

sample['target'] = _weighted_df2 + _weighted_df1 + _weighted_df3 + _weighted_svm
sample.to_csv('submission.csv', index=False)