# Setting up the GPU Environment

#### Install Dependencies and Restart Runtime

In [1]:
!pip install -q transformers
!pip install -q simpletransformers

[K     |████████████████████████████████| 5.5 MB 14.2 MB/s 
[K     |████████████████████████████████| 7.6 MB 61.5 MB/s 
[K     |████████████████████████████████| 163 kB 71.4 MB/s 
[K     |████████████████████████████████| 250 kB 11.2 MB/s 
[K     |████████████████████████████████| 1.3 MB 48.7 MB/s 
[K     |████████████████████████████████| 441 kB 8.5 MB/s 
[K     |████████████████████████████████| 9.2 MB 53.3 MB/s 
[K     |████████████████████████████████| 1.9 MB 47.0 MB/s 
[K     |████████████████████████████████| 43 kB 2.3 MB/s 
[K     |████████████████████████████████| 182 kB 70.6 MB/s 
[K     |████████████████████████████████| 166 kB 61.5 MB/s 
[K     |████████████████████████████████| 63 kB 2.0 MB/s 
[K     |████████████████████████████████| 166 kB 66.3 MB/s 
[K     |████████████████████████████████| 162 kB 73.6 MB/s 
[K     |████████████████████████████████| 162 kB 77.1 MB/s 
[K     |████████████████████████████████| 158 kB 74.1 MB/s 
[K     |████████████████████

You might see the error `ERROR: google-colab X.X.X has requirement ipykernel~=X.X, but you'll have ipykernel X.X.X which is incompatible` after installing the dependencies. **This is normal** and is caused by the `simpletransformers` library.

The **solution** is to **restart the execution environment** now. Go to the menu `Runtime` > `Restart runtime`, then continue from the next section to load and process the data.

# Importing Custom Data

In [2]:
import pandas as pd

In [3]:
# Mount Google Drive inside the Colab VM so the CSV files stored there can be
# read like local files.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [5]:
# Read the three splits (train / validation / test) from Drive in one pass.
# The order of the paths matches the order of the names being assigned.
SPLIT_CSV_PATHS = (
    '/content/drive/MyDrive/Project/combined_train_data.csv',
    '/content/drive/MyDrive/Project/combined_valid_data.csv',
    '/content/drive/MyDrive/Project/combined_test_data.csv',
)
train_df, dev_df, test_df = map(pd.read_csv, SPLIT_CSV_PATHS)

# Training and Testing the Model

#### Set up the Training Arguments

In [6]:
# Options passed to NERModel via `args=`. The keyword form builds exactly the
# same mapping (same keys, values and insertion order) as a dict literal.
train_args = dict(
    # Data handling.
    reprocess_input_data=True,
    overwrite_output_dir=True,
    sliding_window=True,
    max_seq_length=64,
    # Training schedule.
    num_train_epochs=10,
    train_batch_size=32,
    fp16=True,
    # Output locations: absolute paths directly under the filesystem root.
    output_dir='/outputs/',
    best_model_dir='/outputs/best_model/',
    # Evaluation while training; the eval split is supplied to train_model().
    evaluate_during_training=True,
)

The following line of code saves (to the variable `custom_labels`) a list of the unique NER tags/labels found in the training data.

In [7]:
# Distinct tag values of the training split, in order of first appearance.
label_column = train_df['labels']
custom_labels = label_column.drop_duplicates().tolist()
print(custom_labels)

['O', 'B-Disease', 'I-Disease', 'B-Chemical', 'I-Chemical']


#### Train the Model


In [8]:
# Missing-value count per column, for the train, dev and test splits (in that order).
tuple(frame.isna().sum() for frame in (train_df, dev_df, test_df))

(Unnamed: 0      0
 words          74
 labels          0
 sentence_id     0
 dtype: int64, Unnamed: 0     0
 words          9
 labels         0
 sentence_id    0
 dtype: int64, Unnamed: 0      0
 words          21
 labels          0
 sentence_id     0
 dtype: int64)

In [9]:
# Keep only the three columns used downstream; this drops the stray
# 'Unnamed: 0' column that came in with the CSV files.
NER_COLUMNS = ['words', 'labels', 'sentence_id']
train_df, dev_df, test_df = [
    frame[NER_COLUMNS] for frame in (train_df, dev_df, test_df)
]

In [10]:
# Collect the sentence_id of every row whose `words` value is not a string
# (the NaN entries reported by the isna() check above), so that whole
# sentences containing such a token can be dropped.
def _sentence_ids_with_bad_words(frame):
    """Return the sentence_id of each row of `frame` whose word is not a str.

    Rows are selected with a boolean mask rather than by enumerate() position,
    so the result is correct whatever the frame's index looks like (e.g. after
    earlier filtering has left gaps in it). One id is returned per offending
    row, so an id may appear more than once.
    """
    is_bad = frame['words'].map(lambda word: not isinstance(word, str)).astype(bool)
    return frame.loc[is_bad, 'sentence_id'].tolist()


# The ids of all three splits are pooled into one list, in train/dev/test order.
# NOTE(review): if sentence_id values are not unique across splits, an id
# flagged in one split will also remove the same-numbered sentence from the
# other splits when this list is used for filtering -- confirm.
remove_ids = []
for split_frame in (train_df, dev_df, test_df):
    remove_ids.extend(_sentence_ids_with_bad_words(split_frame))

In [None]:
# Inspect the sentence ids flagged for removal (one entry per offending row).
remove_ids

In [12]:
# Drop every sentence whose id was flagged in `remove_ids`, in all three splits.
# NOTE(review): the same pooled id list is applied to each split -- confirm
# that sentence ids do not collide between splits.
train_df, dev_df, test_df = [
    frame.loc[~frame['sentence_id'].isin(remove_ids)]
    for frame in (train_df, dev_df, test_df)
]

In [13]:
# (rows, columns) of each split after cleaning, in train, test, dev order.
tuple(frame.shape for frame in (train_df, test_df, dev_df))

((1652664, 3), (328194, 3), (220196, 3))

In [14]:
from simpletransformers.ner import NERModel
from transformers import AutoTokenizer
import pandas as pd
import logging

# Root logger at DEBUG, with the `transformers` logger capped at WARNING.
logging.basicConfig(level=logging.DEBUG)
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)

# The active checkpoint is SciBERT (uncased, scivocab), loaded as a 'bert' model.
# Other checkpoints left commented out in the original cell:
#   NERModel('luke', 'studio-ousia/luke-base', labels=custom_labels, args=train_args)
#   NERModel('bert', 'dmis-lab/biobert-base-cased-v1.2', labels=custom_labels, args=train_args)
MODEL_TYPE = 'bert'
MODEL_NAME = 'allenai/scibert_scivocab_uncased'
model = NERModel(MODEL_TYPE, MODEL_NAME, labels=custom_labels, args=train_args)



Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

Downloading:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [15]:

# Fine-tune on the training split; the dev split is passed as eval_data for the
# evaluation that runs during training.
# Early-stopping reference: https://simpletransformers.ai/docs/tips-and-tricks/#using-early-stopping
# NOTE(review): `train_args` sets no early-stopping option, so presumably all
# configured epochs run -- confirm whether early stopping was intended.
model.train_model(train_df, eval_data=dev_df)

# Evaluate on the held-out test split. `result` holds eval_loss, precision,
# recall and f1_score (not an accuracy score).
result, model_outputs, preds_list = model.eval_model(test_df)

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/1740 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/1740 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/1740 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/1740 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/1740 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/1740 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/1740 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/1740 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/1740 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/1740 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/928 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1391 [00:00<?, ?it/s]

"luke":(LukeConfig, LukeForTokenClassification,LukeTokenizer),

In [16]:
# Show the test-set metrics returned by model.eval_model().
result

{'eval_loss': 0.12989646806697566,
 'precision': 0.8459028669234061,
 'recall': 0.8305761252035082,
 'f1_score': 0.838169435832207}

# Using the Model (Running Inference)

In [19]:
# To try a random sentence from the test split instead of the hand-written one:
#   import random
#   test_id = random.choice(test_df['sentence_id'].unique())
#   sample = test_df[test_df.sentence_id == test_id].words.str.cat(sep=' ')
#   print(sample)
test_text = 'cholera has severe trauma and dengue is caused by mosquiotes and can be treated with amaxin'

samples = [test_text]
predictions, _ = model.predict(samples)

# For each input: a header line with its position, then one {word: tag}
# mapping per line.
for idx in range(len(samples)):
    print(f'{idx}: ')
    for tagged_word in predictions[idx]:
        print(tagged_word)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

0: 
{'cholera': 'B-Disease'}
{'has': 'O'}
{'severe': 'O'}
{'trauma': 'O'}
{'and': 'O'}
{'dengue': 'B-Disease'}
{'is': 'O'}
{'caused': 'O'}
{'by': 'O'}
{'mosquiotes': 'O'}
{'and': 'O'}
{'can': 'O'}
{'be': 'O'}
{'treated': 'O'}
{'with': 'O'}
{'amaxin': 'B-Chemical'}
