In [3]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1
env: TOKENIZERS_PARALLELISM=false


In [4]:
# Target columns predicted by every base model (each config gets its own copy).
_TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Tokenizers keyed by directory, so configs sharing a backbone load it only once.
_TOKENIZER_CACHE = {}


def _load_tokenizer(base):
    """Return the tokenizer stored under ``base + 'tokenizer/'``, loading it at most once."""
    tokenizer_dir = base + 'tokenizer/'
    if tokenizer_dir not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[tokenizer_dir] = AutoTokenizer.from_pretrained(tokenizer_dir)
    return _TOKENIZER_CACHE[tokenizer_dir]


class _BaseCFG:
    """Defaults shared by all base-model configurations.

    A subclass must define ``model`` (name used in checkpoint file names),
    ``path`` (directory with ``oof_df.pkl`` and fold checkpoints), ``base``
    (directory holding ``config/`` and ``tokenizer/``) and ``batch_size``.
    ``n_fold`` may be overridden. The derived attributes ``config_path``,
    ``tokenizer``, ``target_cols`` and ``trn_fold`` are filled in automatically
    when the subclass is created.
    """
    gradient_checkpointing = False
    seed = 42
    n_fold = 10
    num_workers = 4
    weight = 1.0

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        cls.config_path = cls.base + "config/config.json"
        cls.tokenizer = _load_tokenizer(cls.base)
        cls.target_cols = list(_TARGET_COLS)
        # Every trained fold takes part in inference.
        cls.trn_fold = list(range(cls.n_fold))


class CFG0(_BaseCFG):
    model = "microsoft/deberta-v3-base"
    path = "/kaggle/input/fb3-single-pytorch-model-train/20221128-161025-deberta-v3-base/"
    base = "../input/fb3models/microsoft-deberta-v3-base/"
    batch_size = 24
    n_fold = 4  # the only configuration not trained with 10 folds


class CFG1(_BaseCFG):
    model = "microsoft/deberta-v3-base"
    path = "../input/0911-deberta-v3-base/"
    base = "../input/fb3models/microsoft-deberta-v3-base/"
    batch_size = 24


class CFG2(_BaseCFG):
    model = "microsoft/deberta-v3-large"
    path = "../input/0911-deberta-v3-large/"
    base = "../input/fb3models/microsoft-deberta-v3-large/"
    batch_size = 16


class CFG3(_BaseCFG):
    model = "microsoft/deberta-v2-xlarge"
    path = "../input/0911-deberta-v2-xlarge/"
    base = "../input/fb3models/microsoft-deberta-v2-xlarge/"
    batch_size = 4


class CFG4(_BaseCFG):
    model = "microsoft/deberta-v3-base"
    path = "../input/0913-deberta-v3-base-fgm/"
    base = "../input/fb3models/microsoft-deberta-v3-base/"
    batch_size = 24


class CFG5(_BaseCFG):
    model = "microsoft/deberta-v3-large"
    path = "../input/0914-deberta-v3-large-fgm/"
    base = "../input/fb3models/microsoft-deberta-v3-large/"
    batch_size = 16


class CFG6(_BaseCFG):
    model = "microsoft/deberta-v2-xlarge"
    path = "../input/0919-deberta-v2-xlarge/"
    base = "../input/fb3models/microsoft-deberta-v2-xlarge/"
    batch_size = 4


class CFG7(_BaseCFG):
    # Checkpoints are named after the -mnli model, but config/tokenizer come
    # from the plain v2-xlarge directory (as in the original configuration).
    model = "microsoft/deberta-v2-xlarge-mnli"
    path = "../input/0919-deberta-v2-xlarge-mnli/"
    base = "../input/fb3models/microsoft-deberta-v2-xlarge/"
    batch_size = 4


class CFG8(_BaseCFG):
    model = "microsoft/deberta-v3-large"
    path = "../input/0925-deberta-v3-large-unscale/"
    base = "../input/fb3models/microsoft-deberta-v3-large/"
    batch_size = 8


class CFG9(_BaseCFG):
    model = "microsoft/deberta-v3-large"
    path = "../input/0926-deberta-v3-large-unscale/"
    base = "../input/fb3models/microsoft-deberta-v3-large/"
    batch_size = 8


class CFG10(_BaseCFG):
    model = "microsoft/deberta-v3-large"
    path = "../input/0927-deberta-v3-large-unscale/"
    base = "../input/fb3models/microsoft-deberta-v3-large/"
    batch_size = 8


# Order matters: it fixes the column order of the stacking features and the
# index used in the submission_{idx}.csv file names.
CFG_list = [CFG0, CFG1, CFG2, CFG3, CFG4, CFG5, CFG6, CFG7, CFG8, CFG9, CFG10]

In [5]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array-like of shape (n_samples, n_targets) with the true values.
        y_preds: 2-D array-like of the same shape with the predicted values.

    Returns:
        Tuple ``(mcrmse_score, scores)`` where ``scores`` is a list with the RMSE
        of each target column and ``mcrmse_score`` is the mean of that list.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.shape != y_preds.shape:
        # Without this check NumPy broadcasting could silently score mismatched inputs.
        raise ValueError(f"shape mismatch: y_trues {y_trues.shape} vs y_preds {y_preds.shape}")
    # RMSE is computed directly with NumPy: mean_squared_error(..., squared=False)
    # relies on a keyword that newer scikit-learn releases deprecate and remove.
    scores = list(np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)))
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

def get_score(y_trues, y_preds):
    """Scoring entry point: delegates to MCRMSE and returns ``(mean score, per-column scores)``."""
    return MCRMSE(y_trues, y_preds)

def get_logger(filename='inference'):
    """Return this module's logger, writing plain messages to the console and to ``{filename}.log``.

    Args:
        filename: log file path without the ``.log`` extension.

    Returns:
        The ``logging.Logger`` registered under ``__name__`` with exactly two
        handlers attached (one stream handler, one file handler).
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so calling this function
    # again (e.g. re-running the cell) would stack extra handlers and print each
    # message several times. Drop and close the previous handlers first.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Notebook-wide logger; with the default filename it also writes to inference.log.
LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs and request deterministic cuDNN.

    Also exports the seed as the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    
# Fix all RNG seeds once for the whole notebook.
seed_everything(seed=42)

In [20]:
# ====================================================
# oof
# ====================================================
# Load each base model's out-of-fold (OOF) frame, keep its remaining columns as
# stacking features, and log the model's OOF score.
# Columns shared by every OOF frame that are NOT model predictions.
repeats = ['text_id','full_text','cohesion','syntax','vocabulary','phraseology','grammar','conventions','fold']
all_oofs = []
for CFG in CFG_list:
    # Sorting by text_id and resetting the index puts every model's rows in the
    # same order, which the column-wise concatenation in the next cell relies on.
    oof_df = pd.read_pickle(CFG.path+'oof_df.pkl').sort_values("text_id").reset_index(drop=True)
    # What is left after the drop is presumably the six pred_* columns
    # (11 models x 6 = the 66 feature columns shown below) — TODO confirm.
    all_oofs.append(oof_df.drop(columns = repeats))
    labels = oof_df[CFG.target_cols].values
    preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
    score, scores = get_score(labels, preds)
    LOGGER.info(f'Model: {CFG.model} Score: {score:<.4f}  Scores: {scores}')
# NOTE: after the loop `oof_df` still holds the LAST model's frame and `CFG` the
# last configuration; later cells reuse `oof_df` as the base table for stacking.

Model: microsoft/deberta-v3-base Score: 0.4535  Scores: [0.4819319573874839, 0.4463208426293747, 0.4160686326130345, 0.45453883048349986, 0.473678143709784, 0.44875544473089224]
Model: microsoft/deberta-v3-base Score: 0.4595  Scores: [0.4933251819697969, 0.4502769020486089, 0.4195917881022107, 0.4616825211894006, 0.47917149434101597, 0.4531265111349054]
Model: microsoft/deberta-v3-large Score: 0.4553  Scores: [0.4825651519863177, 0.4513317839900914, 0.41614874855477335, 0.4570395272138213, 0.474128019225801, 0.450479894985895]
Model: microsoft/deberta-v2-xlarge Score: 0.4604  Scores: [0.4917071615090481, 0.45037595310481654, 0.41863623824576296, 0.4599613387949261, 0.48437918084586873, 0.45728281977714613]
Model: microsoft/deberta-v3-base Score: 0.4590  Scores: [0.4927071121286561, 0.449759196538063, 0.41900989606208666, 0.46100316932040336, 0.47875430698135113, 0.45294619259154856]
Model: microsoft/deberta-v3-large Score: 0.4564  Scores: [0.4858750389533292, 0.4521088407160486, 0.4164

In [21]:
# Place every model's OOF columns side by side (axis=1) in one 2-D array.
features = np.concatenate(all_oofs,axis=1)

In [23]:
features.shape

(3911, 66)

In [26]:
# Generic names col0..colN-1, one per column of the stacked feature array.
feats =[f"col{i}" for i in range(features.shape[1])]

In [27]:
# Attach the stacked features to the frame left over from the OOF loop (the last
# model's oof_df), which supplies the ids, labels and fold columns.
oof_df[feats] = features
oof_df

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,fold,pred_cohesion,...,col56,col57,col58,col59,col60,col61,col62,col63,col64,col65
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,9,2.980735,...,3.125459,3.190478,3.196349,2.759905,2.980735,2.835174,3.170818,3.228327,3.237290,2.741138
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,2,2.617372,...,2.964808,2.631736,2.597791,2.538102,2.617372,2.424425,2.787720,2.749041,2.504684,2.660148
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,1,2.927259,...,3.102823,2.969282,3.015122,2.955758,2.927259,2.872589,3.009427,2.989432,3.138277,3.040892
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,1,3.609747,...,4.027184,3.896751,3.941613,3.769177,3.609747,3.580019,3.711982,3.699846,3.702923,3.738539
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,0,2.433649,...,2.649640,2.740055,2.587462,2.310569,2.433649,2.372660,2.812410,2.606741,2.464537,2.190254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5,5,2.676923,...,2.847502,2.849612,2.645768,2.241242,2.676923,2.744390,2.670801,2.791678,2.739287,2.175364
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0,3,3.742900,...,3.556110,3.405059,3.109952,3.174611,3.742900,3.415915,3.511111,3.421340,3.148365,3.035161
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0,8,3.040246,...,3.295261,3.373523,3.541609,2.943113,3.040246,3.022676,3.148523,3.220136,3.445342,2.865385
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5,5,3.753411,...,3.922573,3.876611,3.640419,3.903100,3.753411,3.827273,3.852465,3.807061,3.847542,3.894267


In [29]:
from sklearn.linear_model import Ridge
import pandas as pd
from sklearn import model_selection
from tqdm.auto import tqdm
import sys
sys.path.append('/kaggle/input/multilabel-stratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
def create_folds(data, num_splits):
    """Add a ``kfold`` column with multilabel-stratified fold ids.

    Args:
        data: DataFrame containing the six target columns; it is modified in
            place (the ``kfold`` column is added or overwritten).
        num_splits: number of folds.

    Returns:
        The same DataFrame object, with ``kfold`` holding values ``0..num_splits-1``.
    """
    labels = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    mskf = MultilabelStratifiedKFold(n_splits=num_splits, shuffle=True, random_state=7)
    # The splitter yields POSITIONAL row indices. Writing them into an array and
    # assigning the whole column keeps the result correct even when `data` does
    # not have a default 0..n-1 index (label-based `.loc[v_]` would not be).
    fold_ids = np.full(len(data), -1, dtype=int)
    for f, (_, v_) in enumerate(mskf.split(data, data[labels].values)):
        fold_ids[v_] = f
    data["kfold"] = fold_ids

    return data

# 5 stratified folds for the stacker. create_folds returns its argument, so
# `data` and `oof_df` refer to the same DataFrame from here on.
data = create_folds(oof_df,5)

In [40]:
# Running total of the stacker's validation score; the fold loop in a later cell adds to it.
oof_score = 0.0
# Columns that must not be fed to the stacker: ids, text, labels, fold ids and every
# column whose name starts with "p" (the pred_* columns; "phraseology" is already listed).
repeats = ['text_id','full_text','cohesion','syntax','vocabulary','phraseology','grammar','conventions','fold','kfold'] + [col for col in data.columns if col.startswith("p")]
labels = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
# NOTE: `features` was the concatenated ndarray in an earlier cell; from here on it
# is rebound to the list of stacking column names.
features = [col for col in data.columns if col not in repeats]
oof_df[features]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col56,col57,col58,col59,col60,col61,col62,col63,col64,col65
0,2.963322,2.851745,3.079232,3.097821,3.035727,2.800188,2.905647,2.712398,3.098406,3.137756,...,3.125459,3.190478,3.196349,2.759905,2.980735,2.835174,3.170818,3.228327,3.237290,2.741138
1,2.843607,2.688936,2.899376,2.858508,2.490706,2.679547,2.845474,2.702387,2.963810,2.712420,...,2.964808,2.631736,2.597791,2.538102,2.617372,2.424425,2.787720,2.749041,2.504684,2.660148
2,2.877301,2.900978,3.044960,2.898928,2.955909,3.149401,2.868120,2.874635,3.114243,2.881338,...,3.102823,2.969282,3.015122,2.955758,2.927259,2.872589,3.009427,2.989432,3.138277,3.040892
3,3.709563,3.703306,3.871459,3.771405,3.729018,3.824048,3.635967,3.620107,3.801861,3.653327,...,4.027184,3.896751,3.941613,3.769177,3.609747,3.580019,3.711982,3.699846,3.702923,3.738539
4,2.401905,2.360212,2.802807,2.705976,2.706934,2.256478,2.427323,2.465084,2.687973,2.440547,...,2.649640,2.740055,2.587462,2.310569,2.433649,2.372660,2.812410,2.606741,2.464537,2.190254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3906,2.528209,2.611595,2.770499,2.813644,2.664971,2.180138,2.594011,2.495946,2.827858,2.846410,...,2.847502,2.849612,2.645768,2.241242,2.676923,2.744390,2.670801,2.791678,2.739287,2.175364
3907,3.627993,3.334414,3.623081,3.451083,3.133774,3.339598,3.523348,3.240285,3.494922,3.322443,...,3.556110,3.405059,3.109952,3.174611,3.742900,3.415915,3.511111,3.421340,3.148365,3.035161
3908,3.012970,2.960456,3.121825,3.276449,3.418097,2.793657,2.873064,2.993003,3.303614,3.369499,...,3.295261,3.373523,3.541609,2.943113,3.040246,3.022676,3.148523,3.220136,3.445342,2.865385
3909,3.921140,3.916481,3.982096,4.007552,4.094439,4.109852,3.812394,3.758009,3.962001,3.932942,...,3.922573,3.876611,3.640419,3.903100,3.753411,3.827273,3.852465,3.807061,3.847542,3.894267


In [41]:
len(all_oofs)

11

In [42]:
data[features]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col56,col57,col58,col59,col60,col61,col62,col63,col64,col65
0,2.963322,2.851745,3.079232,3.097821,3.035727,2.800188,2.905647,2.712398,3.098406,3.137756,...,3.125459,3.190478,3.196349,2.759905,2.980735,2.835174,3.170818,3.228327,3.237290,2.741138
1,2.843607,2.688936,2.899376,2.858508,2.490706,2.679547,2.845474,2.702387,2.963810,2.712420,...,2.964808,2.631736,2.597791,2.538102,2.617372,2.424425,2.787720,2.749041,2.504684,2.660148
2,2.877301,2.900978,3.044960,2.898928,2.955909,3.149401,2.868120,2.874635,3.114243,2.881338,...,3.102823,2.969282,3.015122,2.955758,2.927259,2.872589,3.009427,2.989432,3.138277,3.040892
3,3.709563,3.703306,3.871459,3.771405,3.729018,3.824048,3.635967,3.620107,3.801861,3.653327,...,4.027184,3.896751,3.941613,3.769177,3.609747,3.580019,3.711982,3.699846,3.702923,3.738539
4,2.401905,2.360212,2.802807,2.705976,2.706934,2.256478,2.427323,2.465084,2.687973,2.440547,...,2.649640,2.740055,2.587462,2.310569,2.433649,2.372660,2.812410,2.606741,2.464537,2.190254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3906,2.528209,2.611595,2.770499,2.813644,2.664971,2.180138,2.594011,2.495946,2.827858,2.846410,...,2.847502,2.849612,2.645768,2.241242,2.676923,2.744390,2.670801,2.791678,2.739287,2.175364
3907,3.627993,3.334414,3.623081,3.451083,3.133774,3.339598,3.523348,3.240285,3.494922,3.322443,...,3.556110,3.405059,3.109952,3.174611,3.742900,3.415915,3.511111,3.421340,3.148365,3.035161
3908,3.012970,2.960456,3.121825,3.276449,3.418097,2.793657,2.873064,2.993003,3.303614,3.369499,...,3.295261,3.373523,3.541609,2.943113,3.040246,3.022676,3.148523,3.220136,3.445342,2.865385
3909,3.921140,3.916481,3.982096,4.007552,4.094439,4.109852,3.812394,3.758009,3.962001,3.932942,...,3.922573,3.876611,3.640419,3.903100,3.753411,3.827273,3.852465,3.807061,3.847542,3.894267


In [43]:
# Fit one Ridge stacker per CV fold on the base models' OOF predictions, report
# train/validation scores, and save each fitted stacker as ridge{i}.pkl.
# Reset the accumulator here so re-running this cell does not keep adding to an old total.
oof_score = 0.0
for i in range(5):
    is_valid = data['kfold'] == i
    feats = data[~is_valid][features].reset_index(drop=True).values
    val = data[is_valid][features].reset_index(drop=True).values
    yt = data[~is_valid][labels].reset_index(drop=True)
    yv = data[is_valid][labels].reset_index(drop=True)
    stacker = Ridge()
    stacker.fit(feats,yt)
    train_preds = stacker.predict(feats)
    train_score = get_score(yt.values,train_preds)
    print(f"Fold: {i} ====  Train Score: {train_score}")
    val_preds = stacker.predict(val)
    val_score = get_score(yv.values,val_preds)
    print(f"Fold: {i} ====  Valid Score: {val_score}")
    # Context manager guarantees the file is flushed and closed before the
    # inference cells read it back.
    with open(f"ridge{i}.pkl", "wb") as fh:
        pickle.dump(stacker, fh)
    # val_score is (mean score, per-column scores); average the mean over the 5 folds.
    oof_score += val_score[0]/5
print(f"OOF Score: {oof_score}")

Fold: 0 ====  Train Score: (0.44108933047298976, [0.471853719006943, 0.4353774506681814, 0.39950799741801063, 0.4442589937992913, 0.46064413930874565, 0.434893682636767])
Fold: 0 ====  Valid Score: (0.4458452912195179, [0.46611777331592047, 0.44164689252119893, 0.42730461276817877, 0.4441585092596175, 0.4576129136457933, 0.43823104580639843])
Fold: 1 ====  Train Score: (0.4386519954710159, [0.46580855073284655, 0.43583772922830855, 0.4008231689293841, 0.43874632890575427, 0.45580855202239834, 0.4348876430074036])
Fold: 1 ====  Valid Score: (0.4545741201654851, [0.489406156230043, 0.43800823053151106, 0.4207828601418334, 0.4643178928542624, 0.4768438199450416, 0.438085761290219])
Fold: 2 ====  Train Score: (0.4393389163449302, [0.465822335682409, 0.4330931393706596, 0.4061281176566383, 0.44019156494411826, 0.4573467351807594, 0.433451605234997])
Fold: 2 ====  Valid Score: (0.4517206257988255, [0.48994487519813323, 0.44974400889095134, 0.3991201749361936, 0.458082451222782, 0.46979737851

In [45]:
feats.shape

(3129, 66)

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize ``text`` with ``cfg.tokenizer`` and turn every returned field into a ``torch.long`` tensor.

    Special tokens are added; no max length, padding or truncation is requested,
    so the sequence keeps its full tokenized length.
    """
    encoded = cfg.tokenizer.encode_plus(text, return_tensors=None, add_special_tokens=True)
    # Overwrite the fields in place so the tokenizer's own container type is returned.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TestDataset(Dataset):
    """Dataset over the ``full_text`` column of ``df``; each item is the tokenized text (no labels)."""

    def __init__(self, cfg, df):
        self.texts = df['full_text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # Tokenization happens lazily, one text per access.
        return prepare_input(self.cfg, self.texts[item])

In [None]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average the token embeddings along dim 1, counting only positions where the attention mask is non-zero."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_sum = (last_hidden_state * weights).sum(dim=1)
        # Clamp guards against division by zero for a fully masked sequence.
        token_count = weights.sum(dim=1).clamp(min=1e-9)
        return weighted_sum / token_count

class MaxPooling(nn.Module):
    """Element-wise maximum over dim 1, with masked positions replaced by -1e4 so they cannot win."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        is_padding = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # masked_fill returns a new tensor, leaving the input hidden states untouched.
        filled = last_hidden_state.masked_fill(is_padding, -1e4)
        return filled.max(dim=1).values
    
class MinPooling(nn.Module):
    """Element-wise minimum over dim 1, ignoring positions where the attention mask is zero."""

    def __init__(self):
        super(MinPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the per-feature minimum over the unmasked positions.

        Args:
            last_hidden_state: tensor indexed as (batch, position, feature).
            attention_mask: tensor indexed as (batch, position); zero marks padding.
        """
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        # Padding must receive a LARGE positive sentinel so it can never be the
        # minimum (mirrors the -1e4 used by MaxPooling). The previous value 1e-4
        # is tiny and won the min whenever all real values were larger.
        embeddings[input_mask_expanded == 0] = 1e4
        min_embeddings, _ = torch.min(embeddings, dim = 1)
        return min_embeddings
        

class CustomModel(nn.Module):
    """Transformer backbone followed by mean pooling and a linear head with 6 outputs."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is not None:
            self.config = AutoConfig.from_pretrained(config_path, output_hidden_states=True)
        else:
            # No saved config: start from the model's own config and disable every dropout.
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            for dropout_field in ("hidden_dropout", "hidden_dropout_prob",
                                  "attention_dropout", "attention_probs_dropout_prob"):
                setattr(self.config, dropout_field, 0.)
            LOGGER.info(self.config)
        # Architecture only (from_config) unless pretrained weights are requested.
        self.model = (AutoModel.from_pretrained(cfg.model, config=self.config)
                      if pretrained
                      else AutoModel.from_config(self.config))
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm module in place; other modules are left alone."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Pool the backbone's first output using the attention mask from ``inputs``."""
        hidden_states = self.model(**inputs)[0]
        return self.pool(hidden_states, inputs['attention_mask'])

    def forward(self, inputs):
        return self.fc(self.feature(inputs))

In [None]:
class FB3Model(nn.Module):
    """Transformer backbone, mean pooling and a 6-output linear head averaged over five dropout rates.

    Args:
        CFG: configuration object; its ``init_weight`` attribute is overwritten
            with ``"normal"`` by the constructor (kept from the original code).
        config_path: path passed to ``AutoConfig.from_pretrained``. Although the
            default is ``None``, the constructor always loads from this path, so
            callers are expected to provide it.
        pretrained: accepted for signature compatibility; not used — the backbone
            is always built from the config without loading pretrained weights.
    """

    # Weight initialisers selectable through CFG.init_weight ('normal' is handled
    # separately because it needs the config's initializer_range).
    _INITIALIZERS = {
        'xavier_uniform': nn.init.xavier_uniform_,
        'xavier_normal': nn.init.xavier_normal_,
        'kaiming_uniform': nn.init.kaiming_uniform_,
        'kaiming_normal': nn.init.kaiming_normal_,
        'orthogonal': nn.init.orthogonal_,
    }

    def __init__(self, CFG, config_path = None, pretrained = False):
        super().__init__()
        self.CFG = CFG
        self.CFG.init_weight = "normal"

        self.config = AutoConfig.from_pretrained(config_path, output_hidden_states=True)
        self.model = AutoModel.from_config(self.config)

        self.pool = MeanPooling()
        # `dropout` is not used in forward(); kept so the module layout is unchanged.
        self.dropout = nn.Dropout(0.2)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)

        self.output = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.output)

    def _init_weight_tensor(self, weight):
        """Initialise ``weight`` in place with the scheme named by ``self.CFG.init_weight``.

        An unrecognised scheme name leaves the tensor unchanged.
        """
        # Read the scheme from this instance's config. The earlier code looked up
        # a global named CFG, which only worked while the notebook happened to
        # define a matching global.
        scheme = self.CFG.init_weight
        if scheme == 'normal':
            weight.normal_(mean = 0.0, std = self.config.initializer_range)
        elif scheme in self._INITIALIZERS:
            self._INITIALIZERS[scheme](weight)

    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm module in place; other modules are left alone."""
        if isinstance(module, nn.Linear):
            self._init_weight_tensor(module.weight.data)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            self._init_weight_tensor(module.weight.data)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Pool the backbone's first output using the attention mask from ``inputs``."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return self.pool(last_hidden_states, inputs['attention_mask'])

    def forward(self, inputs):
        """Average the head's output over the five dropout layers (identity in eval mode)."""
        embeds = self.feature(inputs)
        dropouts = (self.dropout1, self.dropout2, self.dropout3, self.dropout4, self.dropout5)
        # Accumulate left to right, in the same order as the original explicit sum.
        logits = self.output(dropouts[0](embeds))
        for drop in dropouts[1:]:
            logits = logits + self.output(drop(embeds))
        return logits / 5

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` without gradients.

    The model is switched to eval mode and moved to ``device``; each batch's
    tensors are moved to ``device`` in place. Returns the per-batch outputs
    concatenated along the first axis as a NumPy array.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    with torch.no_grad():
        for inputs in tqdm(test_loader, total=len(test_loader)):
            for name in list(inputs.keys()):
                inputs[name] = inputs[name].to(device)
            batch_outputs.append(model(inputs).to('cpu').numpy())
    return np.concatenate(batch_outputs)
# Run every base model on the test set and write one submission_{_idx}.csv per configuration.
for _idx, CFG in enumerate(CFG_list):
    # Both frames are re-read on every iteration because they are modified and deleted below.
    test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
    submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')
    # sort by length to speed up inference
    test['tokenize_length'] = [len(CFG.tokenizer(text)['input_ids']) for text in test['full_text'].values]
    test = test.sort_values('tokenize_length', ascending=True).reset_index(drop=True)

    test_dataset = TestDataset(CFG, test)
    # shuffle=False keeps the predictions aligned with the (length-sorted) rows of `test`;
    # each batch is padded only to its own longest sequence.
    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    predictions = []
    for fold in CFG.trn_fold:
        # Only the first configuration's checkpoints are loaded into FB3Model;
        # all other configurations use CustomModel.
        if _idx == 0:
            model = FB3Model(CFG,config_path=CFG.config_path,pretrained = False)
        else:
            model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
        # Checkpoints are loaded onto the CPU first; inference_fn moves the model to `device`.
        state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
        model.load_state_dict(state['model'])
        prediction = inference_fn(test_loader, model, device)
        predictions.append(prediction)
        # Free host and GPU memory before the next fold's model is built.
        del model, state, prediction; gc.collect()
        torch.cuda.empty_cache()
    # Average the folds' predictions for this configuration.
    predictions = np.mean(predictions, axis=0)
    test[CFG.target_cols] = predictions
    # Merging on text_id maps the length-sorted predictions back onto the rows of sample_submission.
    submission = submission.drop(columns=CFG.target_cols).merge(test[['text_id'] + CFG.target_cols], on='text_id', how='left')
    #display(submission.head())
    submission[['text_id'] + CFG.target_cols].to_csv(f'submission_{_idx}.csv', index=False)
    del test, submission, predictions, test_dataset, test_loader; gc.collect()
    torch.cuda.empty_cache() 

In [None]:
# Reload the raw test/submission frames (the inference loop deleted its copies),
# then read back each base model's predictions without the id column.
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')
(sub0, sub1, sub2, sub3, sub4, sub5,
 sub6, sub7, sub8, sub9, sub10) = [
    pd.read_csv(f"submission_{model_idx}.csv").drop(columns="text_id")
    for model_idx in range(11)
]

In [None]:
# Per-model prediction frames, in the same order as CFG_list.
all_subs = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10]

In [None]:
# Stacking inference: concatenate the base models' predictions column-wise and
# average the outputs of the five saved Ridge stackers.
# Assumes the resulting column order matches the OOF features the stackers were
# fitted on (model order of CFG_list, then target order) — TODO confirm.
total_predictions = pd.concat(all_subs,axis=1)
final_preds = []
for i in range(5):
    # NOTE: unpickling can execute arbitrary code; only load the ridge{i}.pkl
    # files written by the training cell of this notebook.
    with open(f'ridge{i}.pkl','rb') as fh:
        ridge = pickle.load(fh)
    # The stackers were fitted on plain arrays, so pass an array here as well
    # (avoids a feature-name mismatch against the DataFrame's column labels).
    predictions = ridge.predict(total_predictions.values)
    final_preds.append(predictions)
ens = np.mean(final_preds, axis=0)

In [None]:
# Write the ensembled predictions into the sample-submission frame and save it.
# CFG1.target_cols is used only for the column names. Rows of `ens` are assumed to
# be in the same order as `submission` (the per-model CSVs were merged onto
# sample_submission) — TODO confirm.
submission[CFG1.target_cols] = ens
display(submission.head())
submission.to_csv('submission.csv', index=False)