In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import wandb
import logging

In [2]:
# Notebook-wide logging: INFO for everything by default ...
logging.basicConfig(level=logging.INFO)

# ... except the "transformers" logger, which is restricted to ERROR and above.
logging.getLogger("transformers").setLevel(logging.ERROR)
transformers_logger = logging.getLogger("transformers")

In [3]:
# Paths of the training and development triple files (tab-separated).
trainfile = "triples.train.10m.utf8.simpletransformers.tsv"
devfile = "mydevset_triples.tsv"

# The files are read without a header row; the three columns are named here.
triple_columns = ["text_a", "text_b", "labels"]

# Only the first 2,000,000 rows of the training file are read into memory.
# NOTE(review): the training cell in this notebook passes the file paths
# (trainfile / devfile), not these DataFrames -- confirm they are still needed.
train_df = pd.read_csv(
    trainfile,
    sep="\t",
    names=triple_columns,
    nrows=2_000_000,
)
eval_df = pd.read_csv(
    devfile,
    sep="\t",
    names=triple_columns,
)

In [4]:
# Training configuration for the classification model.
# Options are grouped by concern; every value is unchanged from the original
# flat list, and the duplicated `dataloader_num_workers = 1` assignment has
# been removed.
model_args = ClassificationArgs()

# --- Lazy loading of the input file ---------------------------------------
# Column indices follow the file layout: text_a, text_b, labels.
model_args.lazy_loading = True
model_args.lazy_loading_start_line = 0
model_args.lazy_text_a_column = 0
model_args.lazy_text_b_column = 1
model_args.lazy_labels_column = 2
model_args.labels_list = [0, 1]

# --- Input processing / caching -------------------------------------------
model_args.max_seq_length = 512
model_args.multiprocessing_chunksize = 5000
model_args.dataloader_num_workers = 1
model_args.no_cache = False
model_args.reprocess_input_data = True

# --- Optimisation ---------------------------------------------------------
# train_batch_size * gradient_accumulation_steps = 2 * 8 = 16
# (presumably the effective batch per optimizer step -- TODO confirm).
model_args.num_train_epochs = 1
model_args.learning_rate = 2.74e-5
model_args.train_batch_size = 2
model_args.gradient_accumulation_steps = 8
model_args.train_custom_parameter_only = False
model_args.manual_seed = 4
model_args.n_gpu = 1

# --- Evaluation during training --------------------------------------------
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = 3000
model_args.eval_batch_size = 16

# --- Output / checkpointing -----------------------------------------------
model_args.output_dir = "outputs-biobert"
model_args.overwrite_output_dir = True
model_args.save_eval_checkpoints = True
model_args.save_model_every_epoch = False
model_args.no_save = False

In [5]:
def evalAcc(x,y):
    """Compute accuracy of rounded ``y`` against ``x`` and log it to W&B.

    Each element of ``y`` is passed through the built-in ``round`` before
    being compared with ``x`` via ``sklearn.metrics.accuracy_score``.

    Args:
        x: Passed as the first argument of ``accuracy_score``
            (presumably the gold labels -- verify against the caller).
        y: Iterable of numeric values; each one is rounded before scoring.

    Returns:
        The accuracy value returned by ``accuracy_score``.

    Side effects:
        Logs ``{"accuracy": <value>}`` to the active wandb run.
    """
    rounded = list(map(round, y))
    score = sklearn.metrics.accuracy_score(x, rounded)
    wandb.log({"accuracy": score})
    return score

In [6]:
# Start the W&B run; evalAcc logs to it during training.
wandb.init()

# NOTE(review): the output recorded with this cell ends in
# "RuntimeError: CUDA error: out of memory" -- confirm that GPU index 1 is
# free (or adjust the memory-related settings) before re-running.
model = ClassificationModel(
    "bert",
    "monologg/biobert_v1.0_pubmed_pmc",
    args=model_args,
    cuda_device=1,
)

# File paths (not DataFrames) are handed over for both training and
# evaluation data; evalAcc is registered under the metric name "accuracy".
model.train_model(trainfile, eval_df=devfile, accuracy=evalAcc)


def _rounded_accuracy(targets, scores):
    """Same accuracy computation as evalAcc, but without logging to W&B."""
    return sklearn.metrics.accuracy_score(targets, [round(s) for s in scores])


# Final evaluation on the dev file; the first element of the returned
# sequence is the metrics mapping.
final_eval = model.eval_model(devfile, accuracy=_rounded_accuracy)
metrics = final_eval[0]
print(f"ACC: {metrics['accuracy']}")
wandb.log(metrics)

INFO:wandb.run_manager:system metrics and metadata threads started
INFO:wandb.run_manager:checking resume status, waiting at most 10 seconds
INFO:wandb.run_manager:resuming run from id: UnVuOnYxOjE0eWk2MzhnOnVuY2F0ZWdvcml6ZWQ6a2hpdHVyYXM=
INFO:wandb.run_manager:upserting run before process can begin, waiting at most 10 seconds
INFO:wandb.run_manager:saving pip packages
INFO:wandb.run_manager:initializing streaming files api
INFO:wandb.run_manager:unblocking file change observer, beginning sync with W&B servers
INFO:wandb.run_manager:file/dir modified: /home/faessler/work/Research/trec-covid/wandb/run-20200802_172823-14yi638g/config.yaml
INFO:wandb.run_manager:file/dir created: /home/faessler/work/Research/trec-covid/wandb/run-20200802_172823-14yi638g/wandb-metadata.json
INFO:wandb.run_manager:file/dir created: /home/faessler/work/Research/trec-covid/wandb/run-20200802_172823-14yi638g/wandb-history.jsonl
INFO:wandb.run_manager:file/dir created: /home/faessler/work/Research/trec-covid/wa

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


RuntimeError: CUDA error: out of memory

INFO:wandb.run_manager:shutting down system stats and metadata service
INFO:wandb.run_manager:file/dir modified: /home/faessler/work/Research/trec-covid/wandb/run-20200802_172823-14yi638g/wandb-events.jsonl
INFO:wandb.run_manager:stopping streaming files and file change observer
INFO:wandb.run_manager:file/dir modified: /home/faessler/work/Research/trec-covid/wandb/run-20200802_172823-14yi638g/wandb-metadata.json
