In [1]:
# !pip3 install scikit-learn
# !pip3 install torch torchvision torchaudio
# !pip3 install simpletransformers

In [2]:
import pandas as pd
from functools import partial
import sklearn 
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
import datetime
# Logging: INFO globally, but keep the "transformers" logger down to warnings
# so its per-step messages do not flood the cell output.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Display: never truncate long text columns when a DataFrame is rendered.
pd.set_option('display.max_colwidth', None)

# Torch: select the 'file_system' strategy for sharing tensors across processes.
torch.multiprocessing.set_sharing_strategy('file_system')

# Remember whether a GPU is visible; create_model() forwards this as use_cuda.
cuda_available = torch.cuda.is_available()
print("Is cuda available?", cuda_available)

Is cuda available? True


# load and prepare data

In [3]:
# Locations of the pre-processed train / test splits (relative to the notebook).
train_path = 'data/train_clean_concat_200.csv'
test_path = 'data/test_clean_concat_200.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Quick sanity check: (rows, columns) of each split.
print('Number of issues: ', train.shape, test.shape)

Number of issues:  (696679, 2) (80518, 2)


# set the classification layer and training parameters

In [4]:
# Model identity: the two leading arguments create_model() hands to ClassificationModel.
name, ver = 'roberta', 'roberta-base'

# Optimisation settings. `drp` is passed to create_model() but not used there.
lr, drp = 3e-5, 0
epochs = 4

# Batch sizes and the maximum sequence length forwarded to create_model().
batch_t = batch_e = 100
max_seq = 200

# Directory (without trailing slash) under which the trained model is written.
output_name = 'outputs/' + name

def create_model(name, ver, lr, drp, epochs, batch_t, batch_e, max_seq):
    """Build a 3-label simpletransformers ``ClassificationModel``.

    Args:
        name: Model type, passed as the first argument of ``ClassificationModel``.
        ver: Checkpoint name, passed as the second argument.
        lr: Learning rate.
        drp: Accepted for call compatibility but currently unused.
            NOTE(review): presumably a dropout rate; it is not forwarded to
            the model, so changing it has no effect -- confirm intent.
        epochs: Number of training epochs.
        batch_t: Training batch size.
        batch_e: Evaluation batch size.
        max_seq: Maximum sequence length.

    Returns:
        The configured ``ClassificationModel``.

    Note:
        Reads the notebook globals ``output_name`` (output directory) and
        ``cuda_available`` (whether to run on GPU).
    """
    model_args = ClassificationArgs()

    # Optimisation settings.
    model_args.learning_rate = lr
    model_args.num_train_epochs = epochs
    # The two batch sizes used to be assigned crosswise (batch_t -> eval,
    # batch_e -> train). The parameter names suggest t = train, e = eval.
    model_args.train_batch_size = batch_t
    model_args.eval_batch_size = batch_e
    model_args.max_seq_length = max_seq

    # NOTE(review): hard-coded to two GPUs -- confirm this matches the host.
    model_args.n_gpu = 2

    # Output handling: always write into (and overwrite) the same directory,
    # and rebuild features from the input data instead of reusing a cache.
    model_args.output_dir = output_name + '/'
    model_args.overwrite_output_dir = True
    model_args.reprocess_input_data = True
    model_args.preprocess_inputs = True

    # No step-based and no per-epoch checkpoints.
    model_args.save_steps = -1
    model_args.save_model_every_epoch = False

    return ClassificationModel(
        name,
        ver,
        args=model_args,
        num_labels=3,
        use_cuda=cuda_available,
    )

# define evaluation metrics

In [5]:
def calc(p1, p2, func, **kwargs):
    """Call ``func`` on the pair ``(p1, p2)`` and return its result.

    Lets a two-argument scoring function be pre-bound, together with its
    keyword options, via ``functools.partial(calc, func=..., **options)``.

    Args:
        p1: First positional argument forwarded to ``func``.
        p2: Second positional argument forwarded to ``func``.
        func: Callable invoked as ``func(p1, p2, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded unchanged to ``func``.

    Returns:
        Whatever ``func`` returns.
    """
    positional = (p1, p2)
    return func(*positional, **kwargs)

def _scorer(metric_fn, **options):
    """Bind ``metric_fn`` and its keyword options to ``calc`` as a two-argument callable."""
    return partial(calc, func=metric_fn, **options)


# Extra metrics passed as keyword arguments to train_model() / eval_model().
# Each key becomes an entry in the results dict returned by eval_model().
metrics_recom = {
    "accuracy": _scorer(sklearn.metrics.accuracy_score),
    # precision
    "p_micro": _scorer(sklearn.metrics.precision_score, average='micro'),
    "p_macro": _scorer(sklearn.metrics.precision_score, average='macro'),
    "p_w": _scorer(sklearn.metrics.precision_score, average='weighted'),
    # recall
    "r_micro": _scorer(sklearn.metrics.recall_score, average='micro'),
    "r_macro": _scorer(sklearn.metrics.recall_score, average='macro'),
    "r_w": _scorer(sklearn.metrics.recall_score, average='weighted'),
    # F1
    "f_micro": _scorer(sklearn.metrics.f1_score, average='micro'),
    "f_macro": _scorer(sklearn.metrics.f1_score, average='macro'),
    "f_w": _scorer(sklearn.metrics.f1_score, average='weighted'),
    # full per-class report, returned as a nested dict rather than text
    "classificationReport": _scorer(sklearn.metrics.classification_report, output_dict=True),
}

# train the model

In [6]:
# Instantiate the classifier from the hyper-parameters defined earlier.
model = create_model(name, ver, lr, drp, epochs, batch_t, batch_e, max_seq)

# Reference timestamp; the evaluation cell subtracts it to report the duration.
start = datetime.datetime.now()

banner_rule = '-'*5
start_stamp = datetime.datetime.today().strftime('%d/%m/%Y-%H:%M')
print(banner_rule, name, ', start time:', start_stamp, banner_rule)

# Fine-tune on the training split; the cell displays train_model's return value.
model.train_model(train_df=train, **metrics_recom)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

----- roberta , start time: 21/02/2022-21:36 -----


INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/696679 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_200_3_2


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/6967 [00:00<?, ?it/s]



Running Epoch 1 of 4:   0%|          | 0/6967 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/6967 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/6967 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/roberta_ep4_processed_len200/.


(27868, 0.3226250217546024)

# evaluate the model

In [7]:
# Time elapsed since `start` was recorded in the training cell, split into h/m/s.
elapsed_seconds = (datetime.datetime.now() - start).total_seconds()
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)
duration_text = '%dh:%dm:%ds' % (hours, minutes, seconds)
print('-'*5,  name, ', duration is:', duration_text, '-'*5, '\n\n')

# Score the held-out split with the built-in metrics plus those in metrics_recom.
results, model_outputs, wrong_pred = model.eval_model(test, verbose=True, **metrics_recom)

# Display the metrics dict as the cell output.
results

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


----- roberta , duration is: 4h:20m:43s ----- 




  0%|          | 0/80518 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_200_3_2


Running Evaluation:   0%|          | 0/806 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.7746695099579798, 'accuracy': 0.87161876847413, 'p_micro': 0.87161876847413, 'p_macro': 0.8293039327691621, 'p_w': 0.8705323901384183, 'r_micro': 0.87161876847413, 'r_macro': 0.8151699661701185, 'r_w': 0.87161876847413, 'f_micro': 0.87161876847413, 'f_macro': 0.8218442012533379, 'f_w': 0.8709611393974825, 'classificationReport': {'0.0': {'precision': 0.8939259039422268, 'recall': 0.8971654090548054, 'f1-score': 0.8955427268898194, 'support': 40288}, '1.0': {'precision': 0.8740105933464263, 'recall': 0.8846188597415896, 'f1-score': 0.8792827313685282, 'support': 33203}, '2.0': {'precision': 0.719975301018833, 'recall': 0.6637256297139604, 'f1-score': 0.690707145501666, 'support': 7027}, 'accuracy': 0.87161876847413, 'macro avg': {'precision': 0.8293039327691621, 'recall': 0.8151699661701185, 'f1-score': 0.8218442012533379, 'support': 80518}, 'weighted avg': {'precision': 0.8705323901384183, 'recall': 0.87161876847413,

{'mcc': 0.7746695099579798,
 'accuracy': 0.87161876847413,
 'p_micro': 0.87161876847413,
 'p_macro': 0.8293039327691621,
 'p_w': 0.8705323901384183,
 'r_micro': 0.87161876847413,
 'r_macro': 0.8151699661701185,
 'r_w': 0.87161876847413,
 'f_micro': 0.87161876847413,
 'f_macro': 0.8218442012533379,
 'f_w': 0.8709611393974825,
 'classificationReport': {'0.0': {'precision': 0.8939259039422268,
   'recall': 0.8971654090548054,
   'f1-score': 0.8955427268898194,
   'support': 40288},
  '1.0': {'precision': 0.8740105933464263,
   'recall': 0.8846188597415896,
   'f1-score': 0.8792827313685282,
   'support': 33203},
  '2.0': {'precision': 0.719975301018833,
   'recall': 0.6637256297139604,
   'f1-score': 0.690707145501666,
   'support': 7027},
  'accuracy': 0.87161876847413,
  'macro avg': {'precision': 0.8293039327691621,
   'recall': 0.8151699661701185,
   'f1-score': 0.8218442012533379,
   'support': 80518},
  'weighted avg': {'precision': 0.8705323901384183,
   'recall': 0.87161876847413,