In [None]:
!pip install transformers pytorch_lightning

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.4 MB/s 
[?25hCollecting pytorch_lightning
  Downloading pytorch_lightning-1.6.0-py3-none-any.whl (582 kB)
[K     |████████████████████████████████| 582 kB 44.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 35.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 4.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |███████████████████████

In [None]:
# Install the Cloud TPU client and the torch_xla 1.8 wheel built for CPython 3.7 (cp37).
# NOTE(review): the wheel presumably has to match the runtime's Python and torch
# versions — confirm against the Colab runtime before running.
! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

In [None]:
import torch
import torch.nn as nn
import os
import numpy as np
from torch.utils.data import Dataset,DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM,get_constant_schedule_with_warmup, AdamW
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.core.lightning import LightningModule
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import KFold

# Single seed for the whole run: passed to Lightning's seed_everything here and
# reused as the KFold random_state in the cross-validation cell.
seed=45
seed_everything(seed)

Global seed set to 45


45

In [None]:
# Load the labelled splits and the test set from the Colab working directory.
train_data = pd.read_csv("/content/train.csv")
dev_data = pd.read_csv("/content/dev.csv")

# Train and dev are merged into one training pool (validation folds are carved
# out later by KFold). `DataFrame.append` is deprecated and was removed in
# pandas 2.0; `pd.concat` is the supported equivalent. `ignore_index=True` gives
# the merged frame a unique 0..n-1 index instead of repeating the dev labels.
train_data = pd.concat([train_data, dev_data], ignore_index=True)

test_data = pd.read_csv("/content/test.csv")

In [None]:
class DatasetRetriever(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``input_column``. ``target_column`` is optional: when it is
        absent the dataset yields unlabelled items (used for prediction).
    model_type : str
        Hugging Face checkpoint whose tokenizer is loaded when ``tokenizer`` is
        not supplied.
    target_column : str
        Name of the label column.
    input_column : str
        Name of the text column.
    tokenizer : callable, optional
        Already-loaded tokenizer to reuse, so that several datasets do not each
        reload it from the hub/cache.
    max_length : int
        Every sequence is padded or truncated to exactly this many tokens.

    Items
    -----
    ``(input_ids, attention_mask, target)`` when the label column is present,
    otherwise ``(input_ids, attention_mask)``. Both tensors are 1-D ``long``
    tensors of length ``max_length``; ``target`` is the raw label value.
    """

    def __init__(self, data, model_type="aubmindlab/bert-base-arabertv02", target_column="class_label",
                 input_column="tweet_text", tokenizer=None, max_length=155):
        self.data_len = len(data)
        self.input_text = data[input_column].values

        # Decide whether labels exist BEFORE touching the column; reading it
        # unconditionally raised KeyError for unlabelled frames.
        self.label_included = target_column in data
        # Original (misspelled) attribute name kept as an alias for existing users.
        self.label_incuded = self.label_included
        self.label = data[target_column].values if self.label_included else None

        self.max_length = max_length
        self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained(model_type)

    def encode(self, text):
        """Tokenize ``text`` and return ``(input_ids, attention_mask)`` tensors."""
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            # Explicit replacements for the deprecated `pad_to_max_length=True`;
            # truncation is requested explicitly so every item has the same
            # length and can be stacked by the default collate function.
            padding="max_length",
            truncation=True,
        )
        input_ids = torch.tensor(encoded["input_ids"], dtype=torch.long)
        attention_mask = torch.tensor(encoded["attention_mask"], dtype=torch.long)
        return input_ids, attention_mask

    def __len__(self):
        return self.data_len

    def __getitem__(self, idx):
        input_ids, attention_mask = self.encode(self.input_text[idx])
        if not self.label_included:
            return input_ids, attention_mask
        return input_ids, attention_mask, self.label[idx]

In [None]:
class Model(LightningModule):
    """Binary text classifier: transformer encoder, mean+max pooling, linear head.

    Parameters
    ----------
    model_type : str
        Hugging Face checkpoint used for the encoder.
    lr : float
        Learning rate reached after warmup.
    weight_decay : float
        AdamW weight decay.
    warmup_steps : int
        Number of optimizer steps over which the learning rate ramps up.
    dropout : float
        Dropout probability applied to the pooled representation.

    ``forward`` takes ``(input_ids, attention_mask)`` and returns one
    probability per example, shape ``(batch,)``.
    """

    def __init__(self, model_type="aubmindlab/bert-base-arabertv02", lr=1e-5,
                 weight_decay=0.01, warmup_steps=100, dropout=0.2):
        super().__init__()
        # Imported here because the notebook's import cell only brings in the
        # masked-LM class. That class returns vocabulary logits, not hidden
        # states, so the bare encoder is what a pooled classifier needs.
        from transformers import AutoModel

        self.lr = lr
        self.weight_decay = weight_decay
        self.warmup_steps = warmup_steps

        self.transformer = AutoModel.from_pretrained(model_type)
        self.dropout = nn.Dropout(dropout)
        # Mean-pool and max-pool vectors are concatenated, hence 2 * hidden_size.
        self.classifier = nn.Linear(2 * self.transformer.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        """Return the positive-class probability for each example in the batch."""
        input_ids, attention_masks = inputs
        # Index 0 is the last hidden state for both tuple and ModelOutput returns.
        hidden = self.transformer(input_ids=input_ids, attention_mask=attention_masks)[0]

        # Pool over real tokens only; inputs are padded to a fixed length, so an
        # unmasked mean/max would be dominated by padding positions.
        token_mask = attention_masks.unsqueeze(-1).bool()
        token_count = token_mask.sum(dim=1).clamp(min=1)
        avg_pool = hidden.masked_fill(~token_mask, 0.0).sum(dim=1) / token_count
        max_pool = hidden.masked_fill(~token_mask, torch.finfo(hidden.dtype).min).max(dim=1).values

        pooled = self.dropout(torch.cat((avg_pool, max_pool), dim=1))
        logits = self.classifier(pooled).squeeze(-1)
        return self.sigmoid(logits)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, target = batch
        prediction = self([input_ids, attention_mask])
        loss = self.loss_fn(prediction, target)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, target = batch
        prediction = self([input_ids, attention_mask])
        loss = self.loss_fn(prediction, target)
        self.log('val_loss', loss)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return CPU probabilities; any element after the first two is ignored."""
        input_ids, attention_mask = batch[0], batch[1]
        return self([input_ids, attention_mask]).detach().cpu()

    def configure_optimizers(self):
        # Use this module's own parameters (the original read a global `model`).
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        lr_scheduler = get_constant_schedule_with_warmup(optimizer, self.warmup_steps)
        # The warmup is expressed in optimizer steps, so the scheduler must be
        # stepped per batch; Lightning's default interval is per epoch, which
        # would leave the learning rate at 0 for the whole first epoch.
        return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}]

    def loss_fn(self, prediction, target):
        """Binary cross-entropy between probabilities and 0/1 targets."""
        # BCE needs a float target with the same shape as the prediction;
        # integer label columns are collated as long tensors.
        target = target.to(dtype=prediction.dtype).reshape(prediction.shape)
        return nn.functional.binary_cross_entropy(prediction, target)

In [None]:
N_SPLITS = 5
TRAIN_BATCH_SIZE = 64
EVAL_BATCH_SIZE = 2 * TRAIN_BATCH_SIZE  # no gradients are kept, so a larger batch fits

cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)

# Out-of-fold predictions for every training row, and test predictions
# averaged over the folds.
valid_predictions = np.zeros(len(train_data), dtype=float)
test_predictions = np.zeros(len(test_data), dtype=float)

# The test set is identical for every fold, so build its loader once.
test_loader = DataLoader(DatasetRetriever(test_data), shuffle=False, batch_size=EVAL_BATCH_SIZE)


def _predict(trainer, model, loader):
    """Run ``trainer.predict`` and return all batch outputs as one NumPy array."""
    batches = trainer.predict(model, loader)
    # Concatenate in torch and move to CPU explicitly instead of handing a
    # list of (possibly device-resident) tensors to NumPy.
    return torch.cat([torch.as_tensor(batch) for batch in batches]).cpu().numpy()


for cv_idx, (train_idx, valid_idx) in enumerate(cv.split(train_data)):
    fold_train = train_data.iloc[train_idx]
    fold_valid = train_data.iloc[valid_idx]

    model = Model()
    train_loader = DataLoader(DatasetRetriever(fold_train), shuffle=True, batch_size=TRAIN_BATCH_SIZE)
    valid_loader = DataLoader(DatasetRetriever(fold_valid), shuffle=False, batch_size=EVAL_BATCH_SIZE)

    # NOTE(review): a list passed to `tpu_cores` selects a single TPU core by
    # index, which needs a working TPU runtime — confirm this is the intended
    # hardware setup.
    trainer = Trainer(
        max_epochs=20,
        tpu_cores=[8],
        logger=False,
        # `enable_checkpointing` is the replacement for the deprecated
        # `checkpoint_callback` argument.
        enable_checkpointing=False,
        callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=2)],
    )
    trainer.fit(model, train_loader, valid_loader)

    # Persist the fold's weights right after training so they are kept even if
    # a later prediction step fails.
    # NOTE(review): the file name says "xlm-roberta-base" although Model() is
    # built with its default checkpoint; name kept in case other code loads it.
    trainer.save_checkpoint(f'xlm-roberta-base_cv{cv_idx}.ckpt')

    # Predict on the validation fold without its label column.
    valid_predict_loader = DataLoader(
        DatasetRetriever(fold_valid.drop(columns="class_label")),
        shuffle=False,
        batch_size=EVAL_BATCH_SIZE,
    )
    valid_predictions[valid_idx] = _predict(trainer, model, valid_predict_loader)
    test_predictions += _predict(trainer, model, test_loader) / N_SPLITS

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

MisconfigurationException: ignored