**Importing the libraries we'll be using for this project.**

In [1]:
import platform
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm
import wandb
from pathlib import Path

import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
import torch.utils.data as data_utils #We're going to be making a custom dataset so we can't just rely on Dataset from previous line.

from sklearn.model_selection import GroupShuffleSplit

import os
import gc
import re

Defining a Config class that'll help in keeping track of 'global' parameters.

In [2]:
if torch.cuda.is_available(): #Checking to see if it can use cuda. If it does then we'll use that device.
    print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
    DEVICE = torch.device('cuda:0')
else: #If we can't use cuda then defaults to using the cpu.
    print("\n[INFO] GPU not found. Using CPU: {}\n".format(platform.processor()))
    DEVICE = torch.device('cpu')

class Config:
    data_dir = Path('../input/feedback-prize-effectiveness')
    MODEL = transformers.DistilBertModel.from_pretrained('../input/transformers/distilbert-base-uncased') #We're not supposed to use the internet for this competition so we're getting the model from the transformers library Kaggle provides
    TOKENIZER = transformers.DistilBertTokenizerFast.from_pretrained('../input/transformers/distilbert-base-uncased') #Same comment as above
    MAX_LEN = 256 #Our longest text chunk is 836 words long but the 99% quantile is 222. So for efficiency it makes sense to limit our input length.
    TRAIN_BS = 32 
    VALID_BS = 32
    T_0 = 50
    η_min = 1e-5
    LR = 3e-5
    NB_EPOCHS = 100
    scaler = GradScaler()

[INFO] Using GPU: Tesla P100-PCIE-16GB



Some weights of the model checkpoint at ../input/transformers/distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Now I'm going to make the necessary calls to link this notebook with my wandb account

In [3]:
# WANDB_CONFIG = {
#     'TRAIN_BS': Config.TRAIN_BS,
#     'VALID_BS': Config.VALID_BS,
#     'N_EPOCHS': Config.NB_EPOCHS,
#     'ARCH': Config.MODEL,
#     'MAX_LEN': Config.MAX_LEN,
#     'LR': Config.LR,
#     'NUM_WORKERS': 2,
#     'OPTIM': "AdamW",
#     'LOSS': "MSELoss",
#     'DEVICE': "cuda",
#     'T_0': 20,
#     'η_min': 1e-4,
#     'infra': "Kaggle",
#     'competition': 'feedbackprize',
#     '_wandb_kernel': 'tanaym'
# }

# def wandb_log(**kwargs):
#     """
#     Logs a key-value pair to W&B
#     """
#     for k, v in kwargs.items():
#         wandb.log({k: v})

# # Start W&B logging
# # W&B Login
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# wb_key = user_secrets.get_secret("WANDB_API_KEY")

# wandb.login(key=wb_key)

# run = wandb.init(
#     project='pytorch_feedbackprize',
#     config=WANDB_CONFIG,
#     group='nlp',
#     job_type='train',
# )

Loading in the data and transforming it to be useable by the model.

In [4]:
def csv_to_df(path):
    return pd.read_csv(path)


def cleanup_text(text):
    words = re.sub(pattern = '[^a-zA-Z]',repl = ' ', string = text)
    words = words.lower()
    return words

df = csv_to_df(Config.data_dir / 'train.csv') #Reading in the training data and making it into a dataframe

#Adding the discourse type to the front of the text and using the sep token so distilbert knows what it's trying to rate it as
df['input'] = df['discourse_type'] + Config.TOKENIZER.sep_token + df['discourse_text']

In [5]:
temp = pd.Series([len(x.split()) for x in df['discourse_text']]) #Making a Series out of the discourse_text lengths.
print(temp.describe()) #Using describe to get info on the lengths.
temp.quantile(0.99) #99% of the data has a length less than this.
del(temp)

count    36765.000000
mean        44.654073
std         46.669682
min          1.000000
25%         16.000000
50%         28.000000
75%         57.000000
max        836.000000
dtype: float64


# Creating the training and validation sets, and the data loader.

In [6]:
temp = df.apply(lambda x: Config.TOKENIZER.encode_plus( #Tokenizing each sentence and storing the result as a dataframe
    cleanup_text(str(x['input'])),
    None,
    add_special_tokens=True,
    max_length=Config.MAX_LEN,
    padding="max_length",
    return_token_type_ids=True,
    truncation=True
).values(), axis = 1, result_type = 'expand') 

temp = temp.rename(columns = {0:'ids',1:'token_type_ids', 2:'mask'}) #The output is 3 columns and we name them for later use
temp = temp.drop('token_type_ids', axis = 1) #We drop token type ids since we don't use them in our model

df = df.join(temp) #We join this temporary dataframe with our actual dataframe


temp = np.stack(np.array(df['ids'])) #We are taking all of the id elements in df and making them into an array so we can then make it into a tensor
temp2 = np.stack(np.array(df['mask'])) #Doing the same for mask
ids_tensor = torch.tensor(temp).to(DEVICE) #Making id array into a tensor and storing them in the GPU
mask_tensor = torch.tensor(temp2).to(DEVICE) #Same line as above but for mask

In [7]:
distil_model = Config.MODEL.to(DEVICE) #Loading in the distilbert model and adding it to the GPU

with torch.no_grad(): #We aren't going to be doing back prop on distilbert for efficiency so we're turning gradients off
    for i in range(0,ids_tensor.shape[0],1000): #To avoid running out of memory on the GPU we pass our inputs into distilbert 1000 tensors at a time
        if(i == 0): #We are storing the resulting tensors in temp and concatenating as we go along
            temp = distil_model(ids_tensor[i:i+1000,:], mask_tensor[i:i+1000,:],return_dict = False)[0][:,0,:]
        else:    
            temp = torch.cat((temp, distil_model(ids_tensor[i:i+1000,:], mask_tensor[i:i+1000,:],return_dict = False)[0][:,0,:]))    

In [8]:
NVALID = 0.1 #We want a validation/train split of 1/9
splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)
#It's creating "splitter" which stores the desired split info

train_ind, val_ind = next(splitter.split(df, groups=df["essay_id"]))
#Then, when we call .split() on splitter, it makes an iterator that has n_splits number of iterations. In each iteration,
#it splits df into training and validation indices using the group info provided. For our purposes we only need one.


#We need reset_index because otherwise it'll have gaps in the index. Drop = True because otherwise it'll create a new column we don't need.
train_df = df.loc[train_ind].reset_index(drop = True) #Now that we have the indices from the previous line, we can make a training set
val_df = df.loc[val_ind].reset_index(drop = True) #Making a validation set using the complement to the previous indices

train_tensor = temp[train_ind, :] #Using the indices of train_ind to make a train_tensor set
val_tensor = temp[val_ind, :] #Doing the same but for a validation set

We are going to use cross entropy loss so we need to convert our categories into tensors for the true value.

In [9]:
temp = []
for i, sample in train_df.iterrows():
    if (sample['discourse_effectiveness'] == 'Ineffective'):
            temp.append(torch.tensor([1, 0, 0]))
    elif (sample['discourse_effectiveness'] ==  'Adequate'):
            temp.append(torch.tensor([0, 1, 0]))
    elif (sample['discourse_effectiveness'] ==  'Effective'):
            temp.append(torch.tensor([0, 0, 1]))
train_targets = torch.stack(temp)


temp = []
for i, sample in val_df.iterrows():
    if (sample['discourse_effectiveness'] == 'Ineffective'):
            temp.append(torch.tensor([1, 0, 0]))
    elif (sample['discourse_effectiveness'] ==  'Adequate'):
            temp.append(torch.tensor([0, 1, 0]))
    elif (sample['discourse_effectiveness'] ==  'Effective'):
            temp.append(torch.tensor([0, 0, 1]))
val_targets = torch.stack(temp)

# Creating the model we'll be using

In [10]:
class FeedbackPrizeModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.drop = nn.Dropout(0.3)
        self.drop2 = nn.Dropout(0.3)
        
        self.lin = nn.Linear(768, 384) #Input should be 768 dimensional and output should be 3 dimensional.
        self.lin2 = nn.Linear(384, 3)
        
        self.softmax = nn.Softmax(dim = 1)
    
    def forward(self, input_tensor):
        output = self.drop(input_tensor)
        output = self.lin(output)
        output = self.drop2(output)
        output = self.lin2(output)
        
        output = self.softmax(output)
        return output


# Creating a Trainer class that will make training easier.
Still working on it.

In [11]:
class Trainer:
    def __init__(self, config, dataloaders, optimizer, model, loss_fns, scheduler, device="cuda:0"):
        self.train_loader, self.valid_loader = dataloaders
        self.train_loss_fn, self.valid_loss_fn = loss_fns
        self.scheduler = scheduler
        self.optimizer = optimizer
        self.model = model
        self.device = torch.device(device)
        self.config = config

    def train_one_epoch(self):
        """
        Trains the model for 1 epoch
        """
        self.model.train() #Turns on certain layers like the drop layer which are deactivated during evaluation.
        train_pbar = tqdm(enumerate(self.train_loader), total=len(self.train_loader)) #Simply making a progress bar for tqdm
        train_preds, train_targets = [], [] #Initializing empty lists so we can store the training results at each iteration

        for idx, cache in train_pbar: #It's going to iterate through each minibatch
            
            input_tensors = self._convert_if_not_tensor(cache[0], dtype=torch.float) #Ensuring that our input tensors are actually tensors
            targets = self._convert_if_not_tensor(cache[1], dtype=torch.float) #Same but for the target tensors
            
            
            with autocast(enabled=True): #Using autocast to make training faster
                outputs = self.model(input_tensors)
                
                loss = self.train_loss_fn(outputs, targets)
                loss_itm = loss.item()
                
#                 wandb_log( #Logging the loss at each minibatch to wandb for visualization4
#                     train_batch_loss = loss_itm
#                 )
                
                train_pbar.set_description('loss: {:.2f}'.format(loss_itm)) #Using tqdm to format an output text to keep track of where we are in the loop and the loss

                Config.scaler.scale(loss).backward()
                Config.scaler.step(self.optimizer)
                Config.scaler.update()
                self.optimizer.zero_grad()
                self.scheduler.step()
                            

            train_targets.append(targets)
            train_preds.append(outputs)
        
        # Tidy
        del outputs, targets, input_tensors, loss
        gc.collect()
        torch.cuda.empty_cache()
        
        return train_preds, train_targets

    @torch.no_grad() #Ensuring no_grad is turned on whenever we call valid_one_epoch
    def valid_one_epoch(self):
        """
        Validates the model for 1 epoch
        """
        self.model.eval() #Setting it to eval mode so dropout is turned off
        valid_pbar = tqdm(enumerate(self.valid_loader), total=len(self.valid_loader))
        valid_preds, valid_targets = [], []

        for idx, cache in valid_pbar:
            input_tensors = self._convert_if_not_tensor(cache[0], dtype=torch.float)
            targets = self._convert_if_not_tensor(cache[1], dtype=torch.float)

            outputs = self.model(input_tensors)
            valid_loss = self.valid_loss_fn(outputs, targets)
            
#             wandb_log(
#                 valid_batch_loss = valid_loss.item()
#             )
            
            valid_pbar.set_description(desc=f"val_loss: {valid_loss.item():.4f}")

            valid_targets.append(targets)
            valid_preds.append(outputs)

        # Tidy
        del outputs, targets, input_tensors, valid_loss
        gc.collect()
        torch.cuda.empty_cache()
        
        return valid_preds, valid_targets



    def fit(self, epochs: int = 10, output_dir: str = "/kaggle/working/", custom_name: str = 'model.pth'):
        """
        Low-effort alternative for doing the complete training and validation process
        """
        best_loss = int(1e+7) #We want to keep track of our best loss so we set it to be arbitrarily high as the start
        
        for epx in range(epochs):
            print(f"{'='*20} Epoch: {epx+1} / {epochs} {'='*20}")

            train_preds, train_targets = self.train_one_epoch() #Loading in the list of predictions and targets from train_one_epoch
            train_preds = torch.cat(train_preds) #We are turning the list of prediction tensors into one tensor so we can apply the built in cross-entropy function
            train_targets = torch.cat(train_targets) #Same but for the target tensors

            train_error = self.train_loss_fn(train_targets, train_preds) #Computing the overall cross entropy loss for the epoch
            
            print(f"Training loss: {train_error:.4f}")
    
            valid_preds, valid_targets = self.valid_one_epoch() #This is the same process as before but for the validation epoch
            valid_preds = torch.cat(valid_preds)
            valid_targets = torch.cat(valid_targets)
            
            valid_error = self.valid_loss_fn(valid_targets, valid_preds)
            
            print(f"Validation loss: {valid_error:.4f}")
            
#             wandb_log( #Storing our resulting errors to wandb
#                 train_error = train_error,
#                 valid_error = valid_error
#             )
            
            
            if valid_error < best_loss: #Updating what our current best model is and saving a copy of it, overriding the previous one.
                best_loss = valid_error
                self.save_model(output_dir, custom_name)
                print(f"Saved model with val_loss: {best_loss:.4f}")
            
    def save_model(self, path, name, verbose=False):
        """
        Saves the model at the provided destination
        """
        try:
            if not os.path.exists(path):
                os.makedirs(path)
        except:
            print("Errors encountered while making the output directory")

        torch.save(self.model.state_dict(), os.path.join(path, name))
        if verbose:
            print(f"Model Saved at: {os.path.join(path, name)}")

    def _convert_if_not_tensor(self, x, dtype):
        if self._tensor_check(x):
            return x.to(self.device, dtype=dtype)
        else:
            return torch.tensor(x, dtype=dtype, device=self.device)

    def _tensor_check(self, x):
        return isinstance(x, torch.Tensor)

In [12]:
def yield_optimizer(model):
    """
    Returns optimizer for specific parameters
    """
    param_optimizer = list(model.named_parameters()) #Gets a list of named parameters. So this is a list of tuples.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] #A list of names for parameters that we don't want to decay over time.
    optimizer_parameters = [ #This is creating a list of dictionaries that the AdamW optimizer will use for initialization.
        {
            "params": [ #Stores the names of named parameters not listed in no_decay. These will have a decay weight of 0.003.
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0003,
        },
        {
            "params": [ #Stores the names of named parameters that also appear in no_decay.
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    return torch.optim.AdamW(optimizer_parameters, lr=Config.LR)

In [13]:
# Training Code
if __name__ == '__main__':
    
    train = data_utils.TensorDataset(train_tensor, train_targets) #Since our dataset is just tensors consisting of tensors, we need to use TensorDataset to create our dataset.
    train_loader = DataLoader(train, batch_size=Config.TRAIN_BS, shuffle=True) #Creating our dataloader using our training dataset
        
    val = data_utils.TensorDataset(val_tensor, val_targets) #Same but for validation set 
    valid_loader = DataLoader(val, batch_size=Config.VALID_BS, shuffle=False)
    
    
    model = FeedbackPrizeModel().to(DEVICE) #This is to ensure our model parameters are available to the GPU for use.
    optimizer = yield_optimizer(model) #Initializes the AdamW optimizer we'll be using.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts( #Initializing the scheduler using the AdamW optimizer.
        optimizer, 
        T_0=Config.T_0, #After 20 steps the LR stops decaying.
        eta_min=Config.η_min #The lowest the LR will drop to
    )
    train_loss_fn, valid_loss_fn = nn.CrossEntropyLoss(), nn.CrossEntropyLoss() #Setting which loss function we want to use. Cross Entropy Loss in both cases.
    
#     wandb.watch(model, criterion=train_loss_fn)
    
    trainer = Trainer(
        config = Config,
        dataloaders = (train_loader, valid_loader),
        loss_fns = (train_loss_fn, valid_loss_fn),
        optimizer = optimizer,
        model = model,
        scheduler = scheduler,
    ) #Initializing our trainer

    best_pred = trainer.fit(
        epochs = Config.NB_EPOCHS,
        custom_name = f"feedbackprize_distilbert.bin"
    ) #Beginning the training



  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 1.0210


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9828
Saved model with val_loss: 0.9828


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9647


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9591
Saved model with val_loss: 0.9591


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9466


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9495
Saved model with val_loss: 0.9495


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9384


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9448
Saved model with val_loss: 0.9448


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9325


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9387
Saved model with val_loss: 0.9387


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9299


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9378
Saved model with val_loss: 0.9378


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9264


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9374
Saved model with val_loss: 0.9374


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9248


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9349
Saved model with val_loss: 0.9349


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9226


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9350


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9216


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9335
Saved model with val_loss: 0.9335


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9193


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9303
Saved model with val_loss: 0.9303


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9186


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9366


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9177


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9324


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9168


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9293
Saved model with val_loss: 0.9293


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9166


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9293
Saved model with val_loss: 0.9293


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9165


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9291
Saved model with val_loss: 0.9291


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9145


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9285
Saved model with val_loss: 0.9285


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9146


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9271
Saved model with val_loss: 0.9271


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9147


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9286


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9137


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9274


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9140


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9294


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9141


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9282


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9138


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9269
Saved model with val_loss: 0.9269


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9138


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9257
Saved model with val_loss: 0.9257


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9116


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9255
Saved model with val_loss: 0.9255


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9117


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9254
Saved model with val_loss: 0.9254


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9119


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9280


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9118


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9270


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9119


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9246
Saved model with val_loss: 0.9246


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9117


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9255


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9120


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9250


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9117


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9266


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9112


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9245
Saved model with val_loss: 0.9245


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9101


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9259


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9108


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9255


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9105


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9266


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9098


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9241
Saved model with val_loss: 0.9241


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9096


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9234
Saved model with val_loss: 0.9234


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9106


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9248


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9104


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9234


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9094


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9255


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9093


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9236


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9101


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9240


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9090


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9250


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9097


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9242


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9090


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9235


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9099


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9238


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9090


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9253


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9099


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9235


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9091


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9232
Saved model with val_loss: 0.9232


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9089


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9249


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9083


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9247


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9096


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9244


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9090


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9258


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9107


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9254


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9099


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9219
Saved model with val_loss: 0.9219


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9092


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9256


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9088


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9210
Saved model with val_loss: 0.9210


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9080


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9213


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9069


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9204
Saved model with val_loss: 0.9204


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9067


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9207


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9065


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9214


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9058


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9209


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9052


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9209


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9054


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9192
Saved model with val_loss: 0.9192


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9057


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9195


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9053


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9184
Saved model with val_loss: 0.9184


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9061


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9191


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9053


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9220


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9055


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9201


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9047


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9199


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9038


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9182
Saved model with val_loss: 0.9182


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9045


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9181
Saved model with val_loss: 0.9181


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9044


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9185


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9046


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9187


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9044


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9176
Saved model with val_loss: 0.9176


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9038


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9176
Saved model with val_loss: 0.9176


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9050


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9189


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9057


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9165
Saved model with val_loss: 0.9165


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9045


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9182


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9034


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9180


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9042


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9174


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9053


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9179


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9047


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9176


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9052


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9172


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9042


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9166


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9033


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9166


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9035


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9179


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9034


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9179


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9027


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9178


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9036


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9184


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9032


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9188


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9039


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9179


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9033


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9173


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9041


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9174


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9032


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9176


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9033


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9175


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9038


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9185


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9027


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9185


  0%|          | 0/1034 [00:00<?, ?it/s]

Training loss: 0.9040


  0%|          | 0/115 [00:00<?, ?it/s]

Validation loss: 0.9183


In [14]:
# run.finish()

# Using our trained model to make predictions on the test set

In [15]:
test_df = csv_to_df(Config.data_dir / 'test.csv') #Getting the test data and making it into a dataframe
test_df['input'] = test_df['discourse_type'] + Config.TOKENIZER.sep_token + test_df['discourse_text'] #Similar to what we did to the training data
test_df['Ineffective'] = np.nan #Creating the test_df columns that'll store our output and setting it to nan as a placeholder
test_df['Adequate'] = np.nan
test_df['Effective'] = np.nan

In [16]:
temp = test_df.apply(lambda x: Config.TOKENIZER.encode_plus( #Tokenizing each sentence and storing the result as a dataframe
    cleanup_text(str(x['input'])),
    None,
    add_special_tokens=True,
    max_length=Config.MAX_LEN,
    padding="max_length",
    return_token_type_ids=True,
    truncation=True
).values(), axis = 1, result_type = 'expand') 



temp = temp.rename(columns = {0:'ids',1:'token_type_ids', 2:'mask'}) #The output is 3 columns and we name them for later use
temp = temp.drop('token_type_ids', axis = 1) #We drop token type ids since we don't use them in our model

test_df = test_df.join(temp) #We join this temporary dataframe with our actual dataframe


temp = np.stack(np.array(test_df['ids'])) #We are taking all of the id elements in df and making them into an array so we can then make it into a tensor
temp2 = np.stack(np.array(test_df['mask'])) #Doing the same for mask
ids_tensor = torch.tensor(temp).to(DEVICE) #Making id array into a tensor and storing them in the GPU
mask_tensor = torch.tensor(temp2).to(DEVICE) #Same line as above but for mask

with torch.no_grad(): #We aren't going to be doing back prop on distilbert for efficiency so we're turning gradients off
    for i in range(0,ids_tensor.shape[0],1000): #To avoid running out of memory on the GPU we pass our inputs into distilbert 1000 tensors at a time
        if(i == 0): #We are storing the resulting tensors in temp and concatenating as we go along
            temp = distil_model(ids_tensor[i:i+1000,:], mask_tensor[i:i+1000,:],return_dict = False)[0][:,0,:]
        else:    
            temp = torch.cat((temp, distil_model(ids_tensor[i:i+1000,:], mask_tensor[i:i+1000,:],return_dict = False)[0][:,0,:]))

test_tensor = temp

In [17]:
model = FeedbackPrizeModel()
model.load_state_dict(torch.load('./feedbackprize_distilbert.bin'))
model.to(DEVICE)
model.eval()


with torch.no_grad(): #We aren't going to be doing back prop on distilbert for efficiency so we're turning gradients off
    for idx in tqdm(range(test_tensor.shape[0])):
        x = test_tensor[idx, :]
        outputs = model(x.unsqueeze(0))
        outputs = outputs.flatten().tolist()
        
        test_df.at[idx, 'Ineffective'] = outputs[0]
        test_df.at[idx, 'Adequate'] = outputs[1]
        test_df.at[idx, 'Effective'] = outputs[2]

  0%|          | 0/10 [00:00<?, ?it/s]

In [18]:
test_df = test_df.drop(['essay_id','discourse_text', 'discourse_type', 'input', 'ids', 'mask'], axis = 1)

In [19]:
test_df.to_csv("/kaggle/working/submission.csv", index = False)