# library

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Sep 24 13:22:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    27W /  70W |  15100MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [18]:
 !pip install transformers[sentencepiece]



In [19]:
import numpy as np
import pandas as pd

from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold as SFK

import torch
from torch import tensor
from torch.nn import Module
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import transformers
from transformers import AutoTokenizer, AutoModel

from transformers import AdamW


# settings

In [20]:
def set_random_seed(random_seed):
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True

set_random_seed(100)

# hyper parameter

In [21]:
n_fold = 5
num_epochs = 4
delta_val = 200
#delta_val = 10
gamma = 2.0
epsilon = 0.4
alpha = 0.25
batch_size = 16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
max_length = 256
save_path = '/content/drive/MyDrive/deep_learning'

#model_version = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
model_version = '/content/drive/MyDrive/deep_learning/signate_nlp/sapbert-xlm/'
#model_version = 'xlm-roberta-base'
#model_version = 'TransQuest/monotransquest-da-en_de-wiki'
do_lower_case = True
print(DEVICE)

cuda


# simple EDA

In [22]:
train_df = pd.read_csv('/content/drive/MyDrive/deep_learning/signate_nlp/train.csv', encoding = "ISO-8859-1")
test_df = pd.read_csv('/content/drive/MyDrive/deep_learning/signate_nlp/test.csv', encoding = "ISO-8859-1")
print('train_data　{}'.format(len(train_df)))
print('target 0:{} 1:{}'.format(len(train_df[train_df['judgement']==0]),len(train_df[train_df['judgement']==1])))
print(train_df.isna().sum())
print('test_data　{}'.format(len(test_df)))
print(test_df.isna().sum())
train_df


train_data　27145
target 0:26515 1:630
id              0
title           0
abstract     4390
judgement       0
dtype: int64
test_data　40834
id             0
title          0
abstract    6546
dtype: int64


Unnamed: 0,id,title,abstract,judgement
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0
...,...,...,...,...
27140,27140,The amyloidogenic pathway of amyloid precursor...,Amyloid beta-protein (A beta) is the main cons...,0
27141,27141,Technologic developments in radiotherapy and s...,We present a review of current technological p...,0
27142,27142,Novel screening cascade identifies MKK4 as key...,Phosphorylation of Tau at serine 422 promotes ...,0
27143,27143,Visualization of the gall bladder on F-18 FDOP...,The ability to label dihydroxyphenylalanine (D...,0


In [30]:
batch_size

16

# data set

In [23]:
class SRWS_data:
    
    def __init__(self,tokenizer, df, is_train = True):
        df['abstract'] = df['abstract'].fillna("")
        df['all_text'] = df['title'] + ' ' + df['abstract']
        #df['all_text'] = df['title']
        df['len'] = df['all_text'].map(lambda x: len(x.split(' ')))
        print(df['len'].max(),
             df['len'].mean(),
             df['len'].median())
        inputs = tokenizer(
            df['all_text'].values.tolist(),
            max_length=max_length,
            truncation=True,
            padding=True,
            return_tensors="pt")
        
        self.input_ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']
        
        if is_train:
            target = np.array(df['judgement'].values.tolist())*(1-epsilon) + epsilon/2
            self.target = tensor(target).float()
            self.cpu_target = np.array(df['judgement'].values.tolist())
        
        
        

# model

In [24]:
class SRWS_roberta_Model(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.model = AutoModel.from_pretrained(model_version)
        
        '''
        self.roberta.config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.1,
                       "layer_norm_eps": 1e-7})      
        '''
        self.attention = nn.Sequential(            
            nn.Linear(768, 512),            
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )        

        self.regressor = nn.Sequential(                        
            nn.Linear(768, 1)                        
        )
        self.dropout = nn.Dropout(p = 0.5)
        
        self.sigmoid = nn.Sigmoid()
        

    def forward(self, input_ids, attention_mask):
        model_output = self.model(input_ids=input_ids,
                                     attention_mask=attention_mask)
        # There are a total of 13 layers of hidden states.
        # 1 for the embedding layer, and 12 for the 12 Roberta layers.
        # We take the hidden states from the last Roberta layer.
        last_layer_hidden_states = model_output.last_hidden_state

        # The number of cells is MAX_LEN.
        # The size of the hidden state of each cell is 768 (for roberta-base).
        # In order to condense hidden states of all cells to a context vector,
        # we compute a weighted average of the hidden states of all cells.
        # We compute the weight of each cell, using the attention neural network.
        weights = self.attention(last_layer_hidden_states)
                
        # weights.shape is BATCH_SIZE x MAX_LEN x 1
        # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x 768        
        # Now we compute context_vector as the weighted average.
        # context_vector.shape is BATCH_SIZE x 768
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)        
        context_vector = self.dropout(context_vector)
        # Now we reduce the context vector to the prediction score.
        return torch.sigmoid(self.regressor(context_vector))
        #return self.sigmoid(roberta_output.logits).squeeze()

# optim

In [25]:
def create_optimizer(model):
    named_parameters = list(model.named_parameters())    
    
    roberta_parameters = named_parameters[:197]    
    attention_parameters = named_parameters[199:203]
    regressor_parameters = named_parameters[203:]
        
    attention_group = [params for (name, params) in attention_parameters]
    regressor_group = [params for (name, params) in regressor_parameters]

    parameters = []
    parameters.append({"params": attention_group})
    parameters.append({"params": regressor_group})

    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if "bias" in name else 1e-5

        lr = 1e-6

        if layer_num >= 69:        
            lr = 1e-5

        if layer_num >= 133:
            lr = 2e-5

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return AdamW(parameters)

In [26]:
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|"""
    model.eval()

    result = np.zeros(len(data_loader.dataset))    
    index = 0
    
    with torch.no_grad():
        for batch_num, (input_ids, attention_mask) in enumerate(data_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
                        
            pred = model(input_ids, attention_mask)                        

            result[index : index + pred.shape[0]] = pred.flatten().to("cpu")
            index += pred.shape[0]

    return result

In [27]:
def f_beta_score(model, data_loader, target, threshold):
    
    pred = predict(model, data_loader)
    pred = np.where(pred > threshold,1,0)
    return fbeta_score(target.tolist(), pred, beta = 7.0)

In [28]:
def create_best_thresholds(target, preds):
    
    best_score = 0
    best_threshold = 0
    n = 10000
    for threshold in [i*1/n for i in range(n)]:
        
        score = fbeta_score(target, np.where(preds > threshold, 1, 0), beta = 7.0)
        if best_score < score:
            
            best_score = score
            best_threshold = threshold
            
    return best_threshold, best_score

# train

In [None]:

def stratified_CV_data(n_fold,train_x,train_y):
    
    sfk=SFK(n_splits=n_fold, random_state=None, shuffle=False)
    return sfk.split(train_x,train_y)

def train():
    sub_df = pd.read_csv('/content/drive/MyDrive/deep_learning/signate_nlp/sample_submit.csv', encoding = "ISO-8859-1")
    tokenizer = AutoTokenizer.from_pretrained(model_version,  use_fast = True)
    train_data = SRWS_data(tokenizer, train_df)
    test_data = SRWS_data(tokenizer, test_df, is_train = False)
    
    del tokenizer
    
    

    test_set = TensorDataset(
            test_data.input_ids,
            test_data.attention_mask,
        )
    
    test_loader = DataLoader(
            test_set,
            batch_size=batch_size,
            drop_last=False, 
            shuffle=False, 
            num_workers=4)

    
    for fold, (tr_index, val_index) in enumerate(stratified_CV_data(n_fold, train_data.cpu_target, train_data.cpu_target)):
        
        tr_set = TensorDataset(
            train_data.input_ids[tr_index],
            train_data.attention_mask[tr_index],
            train_data.target[tr_index]
        )
        tr_loader = DataLoader(
            tr_set,
            batch_size=batch_size,
            drop_last=False, 
            shuffle=True, 
            num_workers=4)
        
        val_set = TensorDataset(
            train_data.input_ids[val_index],
            train_data.attention_mask[val_index],
            )
        val_loader = DataLoader(
            val_set,
            batch_size=batch_size,
            drop_last=False, 
            shuffle=False, 
            num_workers=4)   
        
        best_score = -1000000000
        criterion = nn.BCELoss()
        model = SRWS_roberta_Model().to(DEVICE)
        model.train()
        optim = create_optimizer(model)
        #optim = AdamW(model.parameters(),lr=2e-5,weight_decay=1e-5)
        
        for epoch in range(num_epochs):
        
            for batch_num, (input_ids, attention_mask, target) in enumerate(tr_loader):
                
                input_ids = input_ids.to(DEVICE)
                attention_mask = attention_mask.to(DEVICE)
                target = target.to(DEVICE)
                optim.zero_grad()
                preds = model(input_ids, attention_mask).reshape(-1)
                loss = (-(target*((1-preds)**gamma)*torch.log(preds) + (1-target)*((preds)**gamma)*torch.log(1-preds))).mean()
                #loss = criterion(preds, target)
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                optim.step()
                #preds = np.where(np.array(preds.to('cpu').detach()) > threshold, 1, 0)
                #score = fbeta_score(np.array(target.to('cpu').detach()), preds, beta = 7.0)
                #print(score)
               
                if batch_num % delta_val == 0:
                    
                    #score = f_beta_score(model, val_loader, train_data.cpu_target[val_index], threshold)
                    
                    with torch.no_grad():
                        
                        preds = np.array(predict(model, val_loader).reshape(-1))
                        target = train_data.cpu_target[val_index].reshape(-1)
                        loss = (-(target*((1-preds)**gamma)*np.log(preds) + (1-target)*((preds)**gamma)*np.log(1-preds))).mean()
                        #loss = (-target*np.log(preds) - (1-target)*np.log(1-preds)).mean()
                        threshold, score = create_best_thresholds(target, preds)
                    
                    #if score > best_score:
                    if best_score < score:
                        
                        best_score = score
                        torch.save(model.state_dict(), '/content/drive/MyDrive/deep_learning/model')
                        best_threshold = threshold

                    print('fold:{} epoch:{} batch:{} loss:{} score:{} best_loss:{}'.format(fold, epoch, batch_num, loss, score, best_score))
                    
                  
           
            with torch.no_grad():
                model.eval()
                preds = np.array(predict(model, val_loader).reshape(-1))
                target = train_data.cpu_target[val_index].reshape(-1)
                loss = (-(target*((1-preds)**gamma)*np.log(preds) + (1-target)*((preds)**gamma)*np.log(1-preds))).mean()                    
                #loss = (-target*np.log(preds) - (1-target)*np.log(1-preds)).mean()
                threshold, score = create_best_thresholds(target, preds)

            #if score > best_score:
            if best_score < score:
                        
                best_score = score
                torch.save(model.state_dict(), '/content/drive/MyDrive/deep_learning/model')
                best_threshold = threshold

            print('fold:{} epoch:{} batch:{} loss:{} score:{} best_loss:{}'.format(fold, epoch, batch_num, loss, score, best_score))
                
            
                
        
        with torch.no_grad():

            model.load_state_dict(torch.load( '/content/drive/MyDrive/deep_learning/model'))
            model.eval()
            preds = np.array(predict(model, test_loader).reshape(-1))
            sub_df['fold'.format(fold)] = np.where(preds > threshold, 1.0, 0.0)
            


        
         
    return sub_df

sub_df = train()
sub_df.to_csv('/content/drive/MyDrive/deep_learning/sub.csv', index = False, header = False)

            
        
    
    
    
    
    

1622 199.08134094676737 216.0
1543 199.46757603957485 215.0
fold:0 epoch:0 batch:0 loss:0.10889065444209892 score:0.5430566330488751 best_loss:0.5430566330488751


In [None]:
def submit():
    
    for i in range(n_fold):
        
        model = SRWS_roberta_Model().to(DEVICE)
        model.load_state_dict(torch.load('./{}'.format(i)))
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        test_data = SRWS_data(tokenizer, test_df, is_train=False)
        del tokenizer
        test_set = TensorDataset(
                test_data.input_ids,
                test_data.attention_mask,
                )
        test_loader = DataLoader(
                test_set,
                batch_size=32,
                drop_last=False, 
                shuffle=False, 
                num_workers=4)
        preds = np.zeros((len(test_df)))
        print(len(test_df))
        with torch.no_grad():
            
            preds = preds + predict(model, test_loader)/n_fold
            print(len(preds))
        del model
        
    sub = pd.DataFrame({'id':[i+27145 for i in range(40834)], 'ans':preds})
    sub['ans'] = sub['ans'].map(lambda x: 1 if x > threshold else 0)
    sub.to_csv('./submission.csv', index=False, header=False)
#submit()
        

In [None]:
import gensim

# Load Google's pre-trained Word2Vec model.
#model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
