In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings 
import datasets
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import re
from Classifier import data_cleaning # created method to clean and extract data

In [2]:
import torch

In [3]:
# from transformers import AutoTokenizer, BioGptModel
from transformers import BioGptTokenizer, BioGptModel

In [4]:
# tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")

In [5]:
model = BioGptModel.from_pretrained("microsoft/biogpt")

Some weights of the model checkpoint at microsoft/biogpt were not used when initializing BioGptModel: ['output_projection.weight']
- This IS expected if you are initializing BioGptModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BioGptModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
#-----------------------------------------------------
# Loading dataset
#-----------------------------------------------------
OGD_FakeSet = pd.read_csv("../data/OGD_FakeSet.csv")
#-----------------------------------------------------
# Cleaning dataset
#-----------------------------------------------------
df = data_cleaning(OGD_FakeSet)
#-----------------------------------------------------
# Vectorizing text in findings features
#-----------------------------------------------------
df['findings']

0      The patient has Barrett's oesophagus. It is a ...
1      There is a polyp in the antrum which is sessil...
2      The patient has inflammation in the second par...
3                   Normal gastroscopy to the duodenum. 
4      There is an ulcer in the second part of the du...
                             ...                        
995    The patient has a polyp in the second part of ...
996    There is a nodule in the second part of the du...
997    The patient has a 8mm nodule in the GOJ which ...
998               Normal gastroscopy to the duodenum. NA
999    list("The stricture will need to be dilatated ...
Name: findings, Length: 1000, dtype: object

In [7]:
findings = df['findings']

In [8]:
findings.shape

(1000,)

In [9]:
type(findings)

pandas.core.series.Series

In [10]:
def reverse(row):
    row = row[::-1]
    return row

findings_reverse = findings.apply(reverse)

In [11]:
findings_reverse[0]

".htworg pylop fo ksir eht esaercni dna hcamots eht ni htworgrevo lairetcab fo ksir eht esaercni nac siht sa ,ragus hcum oot gnimusnoc diova ot desivda eb dluohs tneitap ehT  .trofmocsid ro niap sa hcus ,eludon eht htiw detaicossa smotpmys yna eganam ot noitacidem debircserp eb yam tneitap ehT :PU WOLLOF .spylop erom gnipoleved fo ksir eht ecuder ot noitacidem debircserp eb yam tneitap ehT  .nalp tnemtaert dednemmocer eht ot gnirehda fo ecnatropmi eht dna sisongaid eht fo demrofni eb dluohs tneitap ehT :NOITADNEMMOCER AN ..ypocsodne ecnallievrus erutuf ni dia ot oottat a htiw dekram saw pylop ehT.ecnaraeppa ralunarg a htiw ,suotamede dna demalfni si seludon eht gnidnuorrus asocum ehT .nrettap tip lamronba na htiw deklats si hcihw munedoud eht fo trap driht eht ni pylop a si erehT .ylno tnemges trohS .nees osla saw  gninetihw-oteca fo ssol oN .tnemges gnol a si tI .sugahposeo s'tterraB sah tneitap ehT"

In [12]:
sentences = pd.concat([findings, findings_reverse]) 

In [13]:
sentences.shape

(2000,)

In [14]:
sentences.head()

0    The patient has Barrett's oesophagus. It is a ...
1    There is a polyp in the antrum which is sessil...
2    The patient has inflammation in the second par...
3                 Normal gastroscopy to the duodenum. 
4    There is an ulcer in the second part of the du...
Name: findings, dtype: object

In [15]:
sentences = sentences.to_frame()

In [16]:
type(sentences)

pandas.core.frame.DataFrame

In [17]:
sentences.head()

Unnamed: 0,findings
0,The patient has Barrett's oesophagus. It is a ...
1,There is a polyp in the antrum which is sessil...
2,The patient has inflammation in the second par...
3,Normal gastroscopy to the duodenum.
4,There is an ulcer in the second part of the du...


In [18]:
sentences['label'] = 0

In [19]:
sentences.head()

Unnamed: 0,findings,label
0,The patient has Barrett's oesophagus. It is a ...,0
1,There is a polyp in the antrum which is sessil...,0
2,The patient has inflammation in the second par...,0
3,Normal gastroscopy to the duodenum.,0
4,There is an ulcer in the second part of the du...,0


In [20]:
sentences.reset_index(drop=True, inplace=True) 

In [21]:
sentences.tail()

Unnamed: 0,findings,label
1995,.sag fo noitcudorp eht gnisaercni yb spylop er...,0
1996,".eludon eht evomer ot yregrus sa hcus ,tnemtae...",0
1997,.tnemtaert dna noitaulave rehtruf rof tsigoloc...,0
1998,AN .munedoud eht ot ypocsortsag lamroN,0
1999,"\r ,""noollab ERC a htiw mm 5 ot detalid"" ,""noo...",0


In [22]:
for index in range(0,1000):
    sentences.at[index,'label']=1

In [23]:
sentences.head()

Unnamed: 0,findings,label
0,The patient has Barrett's oesophagus. It is a ...,1
1,There is a polyp in the antrum which is sessil...,1
2,The patient has inflammation in the second par...,1
3,Normal gastroscopy to the duodenum.,1
4,There is an ulcer in the second part of the du...,1


In [24]:
sentences.loc[999:1000]

Unnamed: 0,findings,label
999,"list(""The stricture will need to be dilatated ...",1
1000,.htworg pylop fo ksir eht esaercni dna hcamots...,0


In [25]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(sentences,test_size=0.3,random_state=1)

In [26]:
train.reset_index(drop=True)
train.shape

(1400, 2)

In [27]:
test.reset_index(drop=True)
test.shape

(600, 2)

In [28]:
train_dataset = datasets.Dataset.from_pandas(train)
test_dataset = datasets.Dataset.from_pandas(test)

In [29]:
train_dataset

Dataset({
    features: ['findings', 'label', '__index_level_0__'],
    num_rows: 1400
})

In [30]:
train_dataset = train_dataset.remove_columns(["__index_level_0__"])
train_dataset

Dataset({
    features: ['findings', 'label'],
    num_rows: 1400
})

In [31]:
test_dataset = test_dataset.remove_columns(["__index_level_0__"])
test_dataset

Dataset({
    features: ['findings', 'label'],
    num_rows: 600
})

In [32]:
Dict = datasets.DatasetDict({"train":train_dataset,"test":test_dataset})

In [33]:
Dict

DatasetDict({
    train: Dataset({
        features: ['findings', 'label'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['findings', 'label'],
        num_rows: 600
    })
})

In [34]:
type(Dict)

datasets.dataset_dict.DatasetDict

In [83]:
text = findings[0]
inp = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inp['input_ids'].size()}")

Input tensor shape: torch.Size([1, 182])


In [89]:
Dict['train'][0]['findings']

' ..ecnacifingis suoibud fo hctap telni lacivrec a si erehT'

In [93]:
out = model(**tokenizer(Dict['train'][1]['findings'], padding = True, truncation = True, return_tensors="pt"))

In [94]:
out.last_hidden_state

tensor([[[-0.1223,  0.0990, -0.9851,  ..., -0.8175,  0.1483, -0.5747],
         [ 0.1783,  0.0219, -0.3594,  ..., -0.2498,  0.2242,  0.7763],
         [ 1.0239,  0.1950, -1.0766,  ..., -2.7160, -0.2078,  0.5325],
         ...,
         [ 0.2696,  0.8153,  0.2468,  ...,  1.7727,  0.5821, -0.1390],
         [ 0.9711,  1.7036,  0.7942,  ..., -1.0668,  0.5000,  1.0490],
         [-0.2661,  0.1247, -0.5845,  ..., -0.2103,  0.2842, -0.2851]]],
       grad_fn=<NativeLayerNormBackward0>)

In [67]:
inp

{'input_ids': tensor([[    2,    18,   125,    57, 15676,   121, 18456,     4,   212,    21,
            14,   281,  2112,     4,   668,   435,     5, 11136,    13,     9,
         21366,  8268,    17,    72,   724,     4,  5603,  2112,   129,     4,
           271,    21,    14, 15032,    10,     6,  1202,   672,     5,     6,
          9691,    46,    21, 19947,  7031,    15,    32,  1447, 22054,   673,
             4,    18,  2626,  3476,     6,  4463,    21, 12803,     8, 26797,
             7,    15,    14,  7623, 13369,  6126,  6626, 10158,    18, 15032,
            17,  1663,    15,    14, 22667,  4495,    13,  3482,    10,   894,
          2876,  5423, 34150,  3272,  2630, 17081, 27061,  8382, 10395,    20,
            18,   125,   282,    33,  5099,     5,     6,   227,     8,     6,
           923,     5, 25809,    13,     6,  1728,    53,  4529,     4,    18,
           125,    63,    33,  4272,  1817,    13,   897,     6,   105,     5,
          1096,    77,  7965,     4, 2

In [36]:
output = model(**inp)

In [37]:
output.last_hidden_state

tensor([[[-0.1223,  0.0990, -0.9851,  ..., -0.8175,  0.1483, -0.5747],
         [ 0.1783,  0.0219, -0.3594,  ..., -0.2498,  0.2242,  0.7763],
         [ 1.0239,  0.1950, -1.0766,  ..., -2.7160, -0.2078,  0.5325],
         ...,
         [-1.7015, -0.2715,  1.2106,  ..., -1.4069,  3.0727, -1.0163],
         [ 0.7524,  0.1573,  1.7277,  ..., -2.8870,  1.4384, -1.5730],
         [-0.3130,  0.0777, -0.5479,  ..., -0.2827,  0.3639, -0.2331]]],
       grad_fn=<NativeLayerNormBackward0>)

In [80]:
def tokenize(var):
    return tokenizer(var["findings"], padding = True, truncation = True, return_tensors="pt")

In [81]:
# sentences_encoded = Dict.map(tokenize, batched=True, batch_size=None)
sentences_encoded = Dict.map(tokenize)

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [82]:
sentences_encoded['train'][0]

{'findings': ' ..ecnacifingis suoibud fo hctap telni lacivrec a si erehT',
 'label': 0,
 'input_ids': [[2,
   34150,
   1792,
   30399,
   3947,
   3469,
   21,
   4249,
   581,
   38289,
   636,
   29096,
   696,
   704,
   16689,
   6664,
   26934,
   8199,
   6517,
   407,
   457,
   14,
   13949,
   7375,
   696,
   157]],
 'attention_mask': [[1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1]]}

In [74]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [75]:
inp = {k:v.to(device) for k,v in sentences_encoded['train'][0].items() if k in tokenizer.model_input_names}

AttributeError: 'list' object has no attribute 'to'

In [72]:
inp['attention_mask']

[[1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]]

In [73]:
model(**input)

AttributeError: 'list' object has no attribute 'size'

In [58]:
model(**sentences_encoded['train'][0])

TypeError: BioGptModel.forward() got an unexpected keyword argument 'label'

In [40]:
# outputs = model(**sentences_encoded)

In [41]:
# sentences_encoded

In [42]:
sentences_encoded.set_format("torch",columns=["input_ids", "attention_mask", "label"])

In [43]:
# def extract_hidden_states(batch):
#     last_hidden_state = model(**batch).last_hidden_state
# # #     with torch.no_grad():
        

In [44]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def extract_hidden_states(batch):
    inputs = {k:v.to(device) for k,v in batch.items() if k in tokenizer.model_input_names}
    # Place model inputs on the GPU
    # Extract last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state   
    # Return vector for [CLS] token
    return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

In [45]:
# sentences_hidden = sentences_encoded.map(extract_hidden_states)

In [46]:
# Why format tensors? Probably to package many tensors
# Let's format one tensor and input into model and extract hidden state

In [47]:
# To pull last hiidden state, we needed the bare BioGptModel

In [48]:
# last_hidden_state = model(**sentences_encoded).last_hidden_state

In [49]:
# last_hidden_state.shape

In [50]:
# def extract_hidden_states(batch):
# with torch.no_grad():
#     last_hidden_state = model(**inputs).last_hidden_state

In [51]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# def extract_hidden_states(batch):
#     inputs = {k:v.to(device) for k,v in batch.items() if k in tokenizer.model_input_names}
#     # Place model inputs on the GPU
#     # Extract last hidden states
#     with torch.no_grad():
#         last_hidden_state = model(**inputs).last_hidden_state   
#     # Return vector for [CLS] token
#     return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

In [52]:
# unpack how to extract one hidden state, line by line
# sentences_hidden = sentences_encoded.map(extract_hidden_states, batched=True)

In [53]:
# inputs = tokenizer(text, return_tensors="pt")
# print(f"Input tensor shape: {inputs['input_ids'].size()}")