In [1]:
import gc
import os
import numpy as np

import torch
from sklearn.model_selection import train_test_split
from torch import cuda
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModelForTokenClassification, AutoTokenizer

from helpers.dataset import TextDataset
from helpers.download_external_model import get_model_from_external
from helpers.model import ModelHandler
from helpers.process_data import process_dir, process_train_df
from helpers.tokenise import tokenise
from helpers.evaluation import score_feedback_comp

# Set up config

# GPU selection. CUDA_VISIBLE_DEVICES is a comma-separated list of device
# *indices*, not a device count. A single-GPU machine (e.g. Kaggle) exposes its
# GPU as index 0; using "1" would hide it, cuda.is_available() below would
# return False and the whole notebook would silently run on the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Model version for tracking (used in checkpoint and submission file names)
VERSION = 1

# Download base trained model from internet or use cached
DOWNLOAD_MODEL = False
# Directory holding the tokeniser files as well as config.json / pytorch_model.bin
TOKENISER_PATH = 'tokeniser'

# Root directory of the competition data (train.csv, train/, test/)
DATA_PATH = 'data'

MODEL_NAME = 'google/bigbird-roberta-base'

# torch config for model training and evaluation
config = {
    'model_name': MODEL_NAME,
    # Passed to TextDataset as tokeniser_kwargs
    'tokeniser_kwargs': {
        'max_length': 1024,
        'return_offsets_mapping': True,
        'padding': 'max_length',
        'truncation': True,
    },
    # Fraction of the training texts kept for training; the rest is held out
    'train_batch_split': 0.85,
    'train_batch_size': 10,
    'valid_batch_size': 10,
    'epochs': 5,
    # One entry per epoch. Only the first is used directly in this notebook
    # (to build the optimizer); the rest are presumably applied by
    # ModelHandler -- TODO confirm.
    'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    # Presumably the gradient-clipping threshold used by ModelHandler -- TODO confirm
    'max_grad_norm': 10,
    'device': 'cuda' if cuda.is_available() else 'cpu',
}

In [18]:

# Keyword arguments forwarded to get_model_from_external
config_kwargs = dict(num_labels=15)
model_kwargs = dict(add_prefix_space=True)

# Only fetch the pretrained model when explicitly requested; later cells load
# the tokeniser and model files from TOKENISER_PATH either way.
if DOWNLOAD_MODEL:
    get_model_from_external(
        model_name=MODEL_NAME,
        model_path=TOKENISER_PATH,
        config_kwargs=config_kwargs,
        model_kwargs=model_kwargs,
    )

In [None]:
# Load the training table (it provides the 'id' and 'discourse_type' columns
# used for scoring further down) and show a quick preview.
train_csv_path = f'{DATA_PATH}/train.csv'
train_df = process_train_df(train_df_path=train_csv_path)
print(train_df.shape)
print(train_df.head())

In [6]:
# Build one frame per text directory (process_dir presumably reads every
# document found in the directory -- TODO confirm against helpers.process_data).
test_texts_df = process_dir(f'{DATA_PATH}/test')
train_texts_df = process_dir(f'{DATA_PATH}/train')

# Cache the processed texts next to the source data. The output paths are
# derived from DATA_PATH so that pointing the notebook at another data
# directory does not read from one place and write to another.
train_texts_df.to_csv(f'{DATA_PATH}/train_texts.csv')
test_texts_df.to_csv(f'{DATA_PATH}/test_texts.csv')

In [8]:
# Attach the per-text 'entities' column used below to build the label maps.
# NOTE: this rebinds train_texts_df to the tokenised frame, so re-run the
# process_dir cell first if this cell ever needs to be executed again.
train_texts_df = tokenise(texts_df=train_texts_df, train_df=train_df)

In [9]:
# Collect every distinct entity label that appears in any training text.
all_entities = set().union(*train_texts_df['entities'])

# Enumerate the labels in sorted order. Iterating a set of strings directly
# yields a different order in every Python process (string hash randomisation),
# which would assign different ids per kernel session and make a checkpoint
# saved in one session decode to the wrong labels in another.
# Assumes the labels are mutually comparable (e.g. all strings).
discourse_labels_to_ids = {label: idx for idx, label in enumerate(sorted(all_entities))}
discourse_ids_to_labels = {idx: label for label, idx in discourse_labels_to_ids.items()}

In [10]:
# Hold out part of the training texts for validation. Despite the names,
# train_x is the training split and train_y is the held-out validation split
# (they are not features/labels).
# random_state pins the split: without it every run draws a different
# validation set, so scores are not comparable between runs and a model
# resumed from a saved checkpoint could be validated on texts it trained on.
train_x, train_y = train_test_split(
    train_texts_df,
    train_size=config['train_batch_split'],
    random_state=42,
)

In [11]:

# Tokeniser is loaded from the local directory, not from the hub
tokeniser = AutoTokenizer.from_pretrained(TOKENISER_PATH)

# Training split: receives the label -> id map and is built with validate=False
training_set = TextDataset(
    data=train_x.reset_index(drop=True),
    tokeniser=tokeniser,
    tokeniser_kwargs=config['tokeniser_kwargs'],
    discourse_label_to_id=discourse_labels_to_ids,
    validate=False,
)

# Held-out split: no label map is passed and validate=True
testing_set = TextDataset(
    data=train_y.reset_index(drop=True),
    tokeniser=tokeniser,
    tokeniser_kwargs=config['tokeniser_kwargs'],
    validate=True,
)

In [None]:

# DataLoader settings: only the training loader shuffles its samples
train_params = dict(
    batch_size=config['train_batch_size'],
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)
test_params = dict(
    batch_size=config['valid_batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

# Loaders for the training split and the held-out split
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Test data: built like the held-out split (validate=True, no label map)
# and served with the same loader settings
test_texts_set = TextDataset(
    data=test_texts_df,
    tokeniser=tokeniser,
    tokeniser_kwargs=config['tokeniser_kwargs'],
    validate=True,
)
test_texts_loader = DataLoader(test_texts_set, **test_params)

In [13]:
# Create Model: configuration and weights both come from the local
# TOKENISER_PATH directory.
config_model = AutoConfig.from_pretrained(f'{TOKENISER_PATH}/config.json')
model = AutoModelForTokenClassification.from_pretrained(
    f'{TOKENISER_PATH}/pytorch_model.bin',
    config=config_model,
)
model.to(config['device'])

# To resume from a checkpoint written by the training loop, uncomment:
# model_state_dict = torch.load(f'bigbird_v{VERSION}.pt')
# model.load_state_dict(model_state_dict)

# The optimizer starts at the first entry of the learning-rate list
initial_lr = config['learning_rates'][0]
optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr)

In [None]:
# Bundle model, optimizer and run configuration; the handler drives both
# training and prediction in the cells below.
model_handler = ModelHandler(
    config=config,
    model=model,
    optimizer=optimizer,
)

In [None]:
# Run training
# The checkpoint file is overwritten after every epoch, so it always holds the
# weights of the most recently completed epoch.
checkpoint_path = f'bigbird_v{VERSION}.pt'
for epoch in range(config['epochs']):
    model_handler.train(epoch, training_loader=training_loader, max_batch_iter=500)

    # Free cached GPU memory and unreachable Python objects between epochs
    torch.cuda.empty_cache()
    gc.collect()

    # Save model state
    torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Clean memory before running inference
torch.cuda.empty_cache()
gc.collect()

# Get predictions for the held-out split (train_y is the validation part of
# the train/validation split, not a label vector)
df_pred = model_handler.get_all_predictions(train_y, testing_loader, discourse_ids_to_labels)

# Get labels: ground-truth rows for the held-out texts only
valid = train_df.loc[train_df['id'].isin(train_y.id)]

# Per-class scores, averaged at the end
f1s = []

# Calculate score for each class
# NOTE(review): only classes that occur in the predictions are scored; a class
# that is never predicted is left out of the average -- confirm this is intended.
classes = df_pred['class'].unique()
for c in classes:
    # Filter into a separate frame. Rebinding df_pred here would leave only the
    # first class in it, so every later class would be scored against an empty
    # prediction frame.
    df_pred_class = df_pred.loc[df_pred['class']==c].copy()
    df_true = valid.loc[valid['discourse_type']==c].copy()
    f1 = score_feedback_comp(df_pred_class, df_true)
    print(c,f1)
    f1s.append(f1)

# Overall score: unweighted mean of the per-class scores
print('\nOverall',np.mean(f1s))

In [None]:
# Predict on the test texts and write the result to a versioned CSV file
submission_path = f"submission_{VERSION}.csv"
sub = model_handler.get_all_predictions(
    test_texts_df,
    test_texts_loader,
    discourse_ids_to_labels,
)
sub.to_csv(submission_path, index=False)