In [1]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
# Short aliases for colorama foreground colours and the style reset code.
# None of these aliases is referenced by the cells visible in this notebook.
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

In [2]:
# Load the competition tables: labelled training excerpts, unlabelled test
# excerpts and the submission template.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Discretise the continuous target into equal-width bins and keep the integer
# bin index as an extra column.
num_bins = 10 #int(np.floor(1 + np.log2(len(train_data))))
target_bin_index = pd.cut(train_data['target'], bins=num_bins, labels=False)
train_data['bins'] = target_bin_index

# Plain NumPy views of the regression target and its bin index.
target = train_data['target'].to_numpy()
bins = train_data['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, in the same order as ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true,y_pred)
    return np.sqrt(mse)

In [3]:
# Notebook-wide settings, looked up by key in the cells below.
config = {
    'batch_size':128,  # DataLoader batch size in get_embeddings
    'max_len':256,  # tokenizer max_length (pad/truncate length) in CLRPDataset
    'nfolds':5,  # not read by any active cell visible in this notebook
    'seed':42,  # passed to seed_everything right after its definition
}

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic
    mode.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed)
    # The variable is spelled PYTHONHASHSEED; the former 'PYTHONASSEED' key was
    # a typo that nothing reads. Python consumes this variable at interpreter
    # start-up, so setting it here only affects processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one (safe without a GPU).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels by timing them, which can change
    # the selected algorithm between runs; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

In [4]:
class CLRPDataset(Dataset):
    """Dataset that tokenizes one excerpt per item.

    Args:
        df: Table with an ``excerpt`` column holding the raw texts.
        tokenizer: Callable applied to a single excerpt; its return value is
            handed back unchanged by ``__getitem__``.
    """

    def __init__(self,df,tokenizer):
        self.tokenizer = tokenizer
        self.excerpt = df['excerpt'].to_numpy()

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)

    def __getitem__(self,idx):
        """Tokenize the excerpt at ``idx``, padded/truncated to ``config['max_len']``."""
        text = self.excerpt[idx]
        tokenizer_kwargs = dict(
            return_tensors='pt',
            max_length=config['max_len'],
            padding='max_length',
            truncation=True,
        )
        return self.tokenizer(text, **tokenizer_kwargs)

In [5]:
class AttentionHead(nn.Module):
    """Attention pooling that collapses dimension 1 of the input features.

    A score is computed for every position along dimension 1 as
    ``V(tanh(W(features)))``, the scores are softmax-normalised over that
    dimension, and the features are summed with those weights.

    Args:
        in_features: Size of the last dimension of the input features.
        hidden_dim: Width of the intermediate scoring layer ``W``.
        num_targets: Accepted for interface compatibility; not used.

    Attributes:
        in_features: Same as the constructor argument.
        middle_features: Same as ``hidden_dim``.
        out_features: Size of the last dimension returned by ``forward``.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # forward() returns a weighted sum of the *input* features, so its
        # width is in_features, not hidden_dim. Reporting hidden_dim here only
        # worked while both happened to be equal and would mis-size any layer
        # built from out_features otherwise.
        self.out_features = in_features

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dimension 1."""
        att = torch.tanh(self.W(features))

        # One scalar score per position.
        score = self.V(att)

        # Normalise the scores across positions (dimension 1).
        attention_weights = torch.softmax(score, dim=1)

        # The size-1 trailing dimension of the weights broadcasts over features.
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

In [6]:
class Model(nn.Module):
    """RoBERTa encoder followed by attention pooling.

    ``forward`` returns the pooled vector produced by the attention head; the
    dropout and linear layers are constructed but not applied there.
    """

    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained('../input/roberta-base')
        self.head = AttentionHead(768,768,1)
        # Not called in forward(). Presumably kept so the module's parameters
        # line up with the checkpoints loaded via load_state_dict -- confirm
        # before removing.
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.head.out_features,1)

    def forward(self,**xb):
        """Encode the tokenizer outputs ``xb`` and pool them with the attention head."""
        encoder_output = self.roberta(**xb)
        first_output = encoder_output[0]
        return self.head(first_output)

In [7]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Compute one pooled embedding per excerpt with a saved checkpoint.

    Builds a fresh ``Model``, loads the weights stored at ``path``, tokenizes
    ``df['excerpt']`` through ``CLRPDataset`` and runs the model in eval mode
    without gradients.

    Args:
        df: Table with an ``excerpt`` column.
        path: Location of the checkpoint passed to ``torch.load``.
        plot_losses: Unused; kept for interface compatibility.
        verbose: When true (the default) print the selected device and show a
            progress bar.

    Returns:
        ``np.ndarray`` holding the model outputs of all batches, in the row
        order of ``df``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = Model()
    # map_location lets a checkpoint that was saved from a GPU be restored on
    # a CPU-only machine, which the device fallback above otherwise allows.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  # Pinned memory only helps host-to-GPU copies.
                  pin_memory=(device.type == "cuda"),
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Iterating the loader directly lets tqdm read its length for a total.
        for inputs in tqdm(dl, disable=not verbose):
            # The dataset returns tensors with an extra leading dimension from
            # return_tensors='pt'; flatten each to (batch, -1) for the model.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs.detach().cpu().numpy()
            embeddings.extend(outputs)
    return np.array(embeddings)

In [8]:
# Embed the train and test excerpts with each of the five saved checkpoints
# (model0 ... model4). For every checkpoint the train set is embedded first,
# then the test set.
CHECKPOINT_TEMPLATE = '../input/d/maunish/clr-roberta/model{fold}/model{fold}.bin'
N_CHECKPOINTS = 5

fold_train_embeddings = []
fold_test_embeddings = []
for fold in range(N_CHECKPOINTS):
    checkpoint_path = CHECKPOINT_TEMPLATE.format(fold=fold)
    fold_train_embeddings.append(get_embeddings(train_data, checkpoint_path))
    fold_test_embeddings.append(get_embeddings(test_data, checkpoint_path))

# Keep the numbered names that the averaging cell further down reads.
(train_embeddings1, train_embeddings2, train_embeddings3,
 train_embeddings4, train_embeddings5) = fold_train_embeddings
(test_embeddings1, test_embeddings2, test_embeddings3,
 test_embeddings4, test_embeddings5) = fold_test_embeddings

cuda is used


23it [00:23,  1.01s/it]


cuda is used


1it [00:00,  6.60it/s]


cuda is used


23it [00:21,  1.06it/s]


cuda is used


1it [00:00,  6.44it/s]


cuda is used


23it [00:21,  1.06it/s]


cuda is used


1it [00:00,  6.15it/s]


cuda is used


23it [00:21,  1.06it/s]


cuda is used


1it [00:00,  5.87it/s]


cuda is used


23it [00:21,  1.05it/s]


cuda is used


1it [00:00,  4.97it/s]


In [9]:
# Element-wise mean of the five per-checkpoint embeddings, added in order.
train_fold_stack = (train_embeddings1, train_embeddings2, train_embeddings3,
                    train_embeddings4, train_embeddings5)
test_fold_stack = (test_embeddings1, test_embeddings2, test_embeddings3,
                   test_embeddings4, test_embeddings5)

train_embeddings = sum(train_fold_stack) / 5
test_embeddings = sum(test_fold_stack) / 5

### UMAP

In [10]:
#import umap # nonlinear dimensionality reduction
#from sklearn.preprocessing import StandardScaler

In [11]:
#from sklearn.model_selection import train_test_split
#X_train, X_val, y_train, y_val = train_test_split(train_embeddings, target, test_size=0.05, random_state=22)

In [12]:
#umap_emb = umap.UMAP(n_neighbors= 700, min_dist = 0.1, n_components = 2, metric = 'cosine', target_metric = 'l1' , n_epochs = 1500).fit(X_train, y= y_train)
#umap_emb = umap.UMAP(n_neighbors= 700, min_dist = 0.1, n_components = 2, metric = 'cosine', target_metric = 'l1' , n_epochs = 1500, random_state=137).fit(train_embeddings, y= target)

In [13]:
"""
fig, ax = plt.subplots(1, figsize=(14, 10))
#plt.scatter(*umap_emb.embedding_.T,y = np.zeros(umap_emb.embedding_.shape[0]),  s=3, c=target, cmap='Spectral', alpha=1.0)
plt.scatter(*umap_emb.embedding_.T,  s=3, c=target, cmap='Spectral', alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
ax.patch.set_facecolor('black')
fg_color = 'black'
cbar = plt.colorbar()
plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color=fg_color)
""";

In [14]:
#val_emb = umap_emb.transform(X_val)
#val_embedding = umap_emb.transform(test_embeddings)

In [15]:
"""
fig, ax = plt.subplots(1, figsize=(14, 10))
#plt.scatter(*umap_emb.embedding_.T,y = np.zeros(umap_emb.embedding_.shape[0]), s=3, c=target, cmap='Spectral', alpha=1.0)

#plt.scatter(*val_embedding.T,y = np.zeros(val_embedding.shape[0]), s=30, alpha=1.0)
plt.scatter(*val_emb.T, s=60, c = y_val, vmin = target.min(), vmax = target.max(),cmap='Spectral', alpha=1.0)
plt.scatter(*umap_emb.embedding_.T,  s=3, c=y_train, cmap='Spectral', alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
ax.patch.set_facecolor('black')
fg_color = 'black'
cbar = plt.colorbar()
plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color=fg_color)
""";

In [16]:
"""
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

regr = RandomForestRegressor(n_estimators = 1000, max_depth=6, random_state=0, n_jobs = -1)
regr.fit(umap_emb.embedding_, y_train)

y_pred = regr.predict(val_emb)

np.sqrt(mean_squared_error(y_val, y_pred))

# 0.39 best, n_components = 700, min_dist = 0.1
""";

## Inference

In [17]:
#from sklearn.ensemble import RandomForestRegressor

In [18]:
"""
X_train = umap_emb.embedding_
y_train = target 

## Augmentation for tree models 
X_train = np.append(X_train, np.reshape((X_train[:,0]*X_train[:,1]),(-1,1)), axis = 1)
X_train = np.append(X_train, np.reshape((X_train[:,0]*X_train[:,0]),(-1,1)), axis = 1)
X_train = np.append(X_train, np.reshape((X_train[:,1]*X_train[:,1]),(-1,1)), axis = 1)

def mean_normalization(df):
    result = df.copy()
    for feature_name in df.columns:
        mean_value = df[feature_name].mean()
        std_value = df[feature_name].std()
        result[feature_name] = (df[feature_name] - mean_value) / std_value
    return result
    
X_train_df = pd.DataFrame(X_train)
X_train_norm = mean_normalization(X_train_df)
X_train_norm.head()

X_test = val_embedding

#
X_test = np.append(X_test, np.reshape((X_test[:,0]*X_test[:,1]),(-1,1)), axis = 1)
X_test = np.append(X_test, np.reshape((X_test[:,0]*X_test[:,0]),(-1,1)), axis = 1)
X_test = np.append(X_test, np.reshape((X_test[:,1]*X_test[:,1]),(-1,1)), axis = 1)

X_test_df = pd.DataFrame(X_test)
X_test_norm = mean_normalization(X_test_df)
X_test_norm.head()


from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_train_norm, target, test_size=0.10, random_state=22)



from sklearn.metrics import mean_squared_error
regr = RandomForestRegressor(n_estimators = 1000, max_depth=5, random_state=0, n_jobs = -1)
regr.fit(X_train, y_train)

y_pred = regr.predict(X_val)

np.sqrt(mean_squared_error(y_val, y_pred))
""";

In [19]:
# K-fold CV experiment with a random forest on the UMAP embedding.
# The whole cell is disabled: it is wrapped in a string literal, so nothing
# below executes.
# NOTE(review): if this is ever re-enabled, be aware that it
#   - rebinds `train_data` (the training DataFrame) to the UMAP embedding,
#   - fits and scores the regressor on `bins` rather than on `target`,
#   - reads `umap_emb` and `val_embedding`, which are only assigned in
#     commented-out cells.
"""
from sklearn.metrics import mean_squared_error

rfr_preds =  np.zeros(test_data.shape[0])
nfolds=5
kfold = StratifiedKFold(n_splits=nfolds)
train_data = umap_emb.embedding_
for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train_data,y=bins)):
    
    X_train, X_val = train_data[train_idx], train_data[valid_idx]
    y_train, y_val = bins[train_idx], bins[valid_idx]

    rfr_model = RandomForestRegressor(max_depth=6, n_estimators = 1000, random_state=0, n_jobs = -1)
    rfr_model.fit(X_train, y_train)
    cv_pred = rfr_model.predict(X_val)
    cv_score = np.sqrt(mean_squared_error(cv_pred, y_val))
    print(cv_score)
    rfr_preds += rfr_model.predict(val_embedding)/nfolds
    """;

In [20]:
#regr = RandomForestRegressor(max_depth=6, n_estimators = 1000, random_state=100, n_jobs = -1)
#regr.fit(umap_emb.embedding_, target)

In [21]:
#y_hat = regr.predict(val_embedding)
#y_hat

In [22]:
"""
fig, ax = plt.subplots(1, figsize=(14, 10))
plt.scatter(*umap_emb.embedding_.T, s=3, c=target, cmap='Spectral', alpha=1.0)
plt.scatter(*val_embedding.T, s=60, c = y_test, cmap='Spectral', vmin = target.min(), vmax = target.max(), alpha=1.0)
#plt.scatter(X_test_norm.values[:,0],X_test_norm.values[:,1], s=60, c = y_test, vmin = target.min(), vmax = target.max(),cmap='Spectral', alpha=1.0)
#plt.scatter(X_train_norm.values[:,0],X_train_norm.values[:,1], s=3, c=target, cmap='Spectral', alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
ax.patch.set_facecolor('black')
fg_color = 'black'
cbar = plt.colorbar()
plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color=fg_color)
""";

In [23]:
import umap
from sklearn.ensemble import RandomForestRegressor# nonlinear dimensionality reduction
from tqdm.notebook import tqdm
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [24]:
# Fit a 2-D UMAP on the averaged train embeddings, passing the regression
# target as `y`, then project the test embeddings with the fitted model.
umap_settings = dict(
    n_neighbors=750,
    min_dist=0.7,
    n_components=2,
    metric='cosine',
    target_metric='l2',
    n_epochs=1500,
    random_state=0,
)
umap_emb_ = umap.UMAP(**umap_settings).fit(train_embeddings, y=target)
val_embedding_ = umap_emb_.transform(test_embeddings)

In [25]:
# Standardise both projections with statistics taken from the train
# projection only.
train_projection = umap_emb_.embedding_
scaler = StandardScaler().fit(train_projection)

scaled_umap_emb = scaler.transform(train_projection)
scaled_val_emb = scaler.transform(val_embedding_)

In [26]:
# RBF-kernel support vector regression on the scaled 2-D projection; the
# predictions for the test projection become the submission values.
svr_ = SVR(C=1, kernel="rbf").fit(scaled_umap_emb, target)
y_hat = svr_.predict(scaled_val_emb)

In [27]:
# Disabled experiment (wrapped in a string literal, so it does not run):
# refit UMAP with 100 different random_state values, train a random forest on
# each embedding and average the test predictions.
# NOTE(review): if re-enabled it overwrites the `y_hat` produced by the SVR
# cell above, which is what the submission cell writes out.
"""
y_hat = np.zeros(test_data.shape[0])
n_times = 100 
for i in tqdm(range(0, n_times)):
    umap_emb = umap.UMAP(n_neighbors= 700, min_dist = 0.1, n_components = 2, metric = 'cosine', target_metric = 'l1' , n_epochs = 1500, random_state=i).fit(train_embeddings, y= target)
    val_embedding = umap_emb.transform(test_embeddings)
    regr = RandomForestRegressor(max_depth=6, n_estimators = 1000, random_state=100, n_jobs = -1)
    regr.fit(umap_emb.embedding_, target)
    y_hat += regr.predict(val_embedding)
    
y_hat = y_hat/n_times
""";

In [28]:
# Put the predictions into the submission template and write it without the
# index column.
submission_path = 'submission.csv'
sample['target'] = y_hat
sample.to_csv(submission_path, index=False)

In [29]:
y_hat

array([-0.46821733, -0.34707585, -0.54230431, -2.15246364, -1.83822487,
       -1.31840768,  0.85924359])