In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import re
from Classifier import data_cleaning # created method to clean and extract data

In [3]:
import torch

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [5]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

In [6]:
model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt")

In [7]:
#-----------------------------------------------------
# Loading dataset
#-----------------------------------------------------
OGD_FakeSet = pd.read_csv("../data/OGD_FakeSet.csv")
#-----------------------------------------------------
# Cleaning dataset
#-----------------------------------------------------
df = data_cleaning(OGD_FakeSet)
#-----------------------------------------------------
# Vectorizing text in findings features
#-----------------------------------------------------
df['findings']

0      The patient has Barrett's oesophagus. It is a ...
1      There is a polyp in the antrum which is sessil...
2      The patient has inflammation in the second par...
3                   Normal gastroscopy to the duodenum. 
4      There is an ulcer in the second part of the du...
                             ...                        
995    The patient has a polyp in the second part of ...
996    There is a nodule in the second part of the du...
997    The patient has a 8mm nodule in the GOJ which ...
998               Normal gastroscopy to the duodenum. NA
999    list("The stricture will need to be dilatated ...
Name: findings, Length: 1000, dtype: object

In [8]:
findings = df['findings']

In [9]:
findings.shape

(1000,)

In [10]:
type(findings)

pandas.core.series.Series

In [11]:
findings[0][::-1]

".htworg pylop fo ksir eht esaercni dna hcamots eht ni htworgrevo lairetcab fo ksir eht esaercni nac siht sa ,ragus hcum oot gnimusnoc diova ot desivda eb dluohs tneitap ehT  .trofmocsid ro niap sa hcus ,eludon eht htiw detaicossa smotpmys yna eganam ot noitacidem debircserp eb yam tneitap ehT :PU WOLLOF .spylop erom gnipoleved fo ksir eht ecuder ot noitacidem debircserp eb yam tneitap ehT  .nalp tnemtaert dednemmocer eht ot gnirehda fo ecnatropmi eht dna sisongaid eht fo demrofni eb dluohs tneitap ehT :NOITADNEMMOCER AN ..ypocsodne ecnallievrus erutuf ni dia ot oottat a htiw dekram saw pylop ehT.ecnaraeppa ralunarg a htiw ,suotamede dna demalfni si seludon eht gnidnuorrus asocum ehT .nrettap tip lamronba na htiw deklats si hcihw munedoud eht fo trap driht eht ni pylop a si erehT .ylno tnemges trohS .nees osla saw  gninetihw-oteca fo ssol oN .tnemges gnol a si tI .sugahposeo s'tterraB sah tneitap ehT"

In [12]:
def reverse(row):
    row = row[::-1]
    return row

findings_reverse = findings.apply(reverse)

In [13]:
findings_reverse[0]

".htworg pylop fo ksir eht esaercni dna hcamots eht ni htworgrevo lairetcab fo ksir eht esaercni nac siht sa ,ragus hcum oot gnimusnoc diova ot desivda eb dluohs tneitap ehT  .trofmocsid ro niap sa hcus ,eludon eht htiw detaicossa smotpmys yna eganam ot noitacidem debircserp eb yam tneitap ehT :PU WOLLOF .spylop erom gnipoleved fo ksir eht ecuder ot noitacidem debircserp eb yam tneitap ehT  .nalp tnemtaert dednemmocer eht ot gnirehda fo ecnatropmi eht dna sisongaid eht fo demrofni eb dluohs tneitap ehT :NOITADNEMMOCER AN ..ypocsodne ecnallievrus erutuf ni dia ot oottat a htiw dekram saw pylop ehT.ecnaraeppa ralunarg a htiw ,suotamede dna demalfni si seludon eht gnidnuorrus asocum ehT .nrettap tip lamronba na htiw deklats si hcihw munedoud eht fo trap driht eht ni pylop a si erehT .ylno tnemges trohS .nees osla saw  gninetihw-oteca fo ssol oN .tnemges gnol a si tI .sugahposeo s'tterraB sah tneitap ehT"

In [14]:
inputs = tokenizer(findings_reverse[0], return_tensors="pt")

In [15]:
findings[0]

"The patient has Barrett's oesophagus. It is a long segment. No loss of aceto-whitening  was also seen. Short segment only. There is a polyp in the third part of the duodenum which is stalked with an abnormal pit pattern. The mucosa surrounding the nodules is inflamed and edematous, with a granular appearance.The polyp was marked with a tattoo to aid in future surveillance endoscopy.. NA RECOMMENDATION: The patient should be informed of the diagnosis and the importance of adhering to the recommended treatment plan.  The patient may be prescribed medication to reduce the risk of developing more polyps. FOLLOW UP: The patient may be prescribed medication to manage any symptoms associated with the nodule, such as pain or discomfort.  The patient should be advised to avoid consuming too much sugar, as this can increase the risk of bacterial overgrowth in the stomach and increase the risk of polyp growth."

In [16]:
inputs2 = tokenizer(findings[0], return_tensors="pt")

In [17]:
inputs['input_ids'].shape

torch.Size([1, 424])

In [18]:
inputs2

{'input_ids': tensor([[    2,    18,   125,    57, 15676,   121, 18456,     4,   212,    21,
            14,   281,  2112,     4,   668,   435,     5, 11136,    13,     9,
         21366,  8268,    17,    72,   724,     4,  5603,  2112,   129,     4,
           271,    21,    14, 15032,    10,     6,  1202,   672,     5,     6,
          9691,    46,    21, 19947,  7031,    15,    32,  1447, 22054,   673,
             4,    18,  2626,  3476,     6,  4463,    21, 12803,     8, 26797,
             7,    15,    14,  7623, 13369,  6126,  6626, 10158,    18, 15032,
            17,  1663,    15,    14, 22667,  4495,    13,  3482,    10,   894,
          2876,  5423, 34150,  3272,  2630, 17081, 27061,  8382, 10395,    20,
            18,   125,   282,    33,  5099,     5,     6,   227,     8,     6,
           923,     5, 25809,    13,     6,  1728,    53,  4529,     4,    18,
           125,    63,    33,  4272,  1817,    13,   897,     6,   105,     5,
          1096,    77,  7965,     4, 2

In [19]:
outputs = model(**inputs)

In [20]:
sentences = pd.concat([findings, findings_reverse]) 

In [21]:
sentences.shape

(2000,)

In [22]:
sentences.head()

0    The patient has Barrett's oesophagus. It is a ...
1    There is a polyp in the antrum which is sessil...
2    The patient has inflammation in the second par...
3                 Normal gastroscopy to the duodenum. 
4    There is an ulcer in the second part of the du...
Name: findings, dtype: object

In [23]:
sentences = sentences.to_frame()

In [24]:
type(sentences)

pandas.core.frame.DataFrame

In [25]:
sentences.head()

Unnamed: 0,findings
0,The patient has Barrett's oesophagus. It is a ...
1,There is a polyp in the antrum which is sessil...
2,The patient has inflammation in the second par...
3,Normal gastroscopy to the duodenum.
4,There is an ulcer in the second part of the du...


In [26]:
sentences['label'] = 0

In [27]:
sentences.head()

Unnamed: 0,findings,label
0,The patient has Barrett's oesophagus. It is a ...,0
1,There is a polyp in the antrum which is sessil...,0
2,The patient has inflammation in the second par...,0
3,Normal gastroscopy to the duodenum.,0
4,There is an ulcer in the second part of the du...,0


In [28]:
sentences.reset_index(drop=True, inplace=True) 

In [29]:
sentences.tail()

Unnamed: 0,findings,label
1995,.sag fo noitcudorp eht gnisaercni yb spylop er...,0
1996,".eludon eht evomer ot yregrus sa hcus ,tnemtae...",0
1997,.tnemtaert dna noitaulave rehtruf rof tsigoloc...,0
1998,AN .munedoud eht ot ypocsortsag lamroN,0
1999,"\r ,""noollab ERC a htiw mm 5 ot detalid"" ,""noo...",0


In [30]:
for index in range(0,1000):
    sentences.at[index,'label']=1

In [31]:
sentences.head()

Unnamed: 0,findings,label
0,The patient has Barrett's oesophagus. It is a ...,1
1,There is a polyp in the antrum which is sessil...,1
2,The patient has inflammation in the second par...,1
3,Normal gastroscopy to the duodenum.,1
4,There is an ulcer in the second part of the du...,1


In [32]:
sentences.loc[999:1000]

Unnamed: 0,findings,label
999,"list(""The stricture will need to be dilatated ...",1
1000,.htworg pylop fo ksir eht esaercni dna hcamots...,0


In [33]:
sentences["findings"][0]

"The patient has Barrett's oesophagus. It is a long segment. No loss of aceto-whitening  was also seen. Short segment only. There is a polyp in the third part of the duodenum which is stalked with an abnormal pit pattern. The mucosa surrounding the nodules is inflamed and edematous, with a granular appearance.The polyp was marked with a tattoo to aid in future surveillance endoscopy.. NA RECOMMENDATION: The patient should be informed of the diagnosis and the importance of adhering to the recommended treatment plan.  The patient may be prescribed medication to reduce the risk of developing more polyps. FOLLOW UP: The patient may be prescribed medication to manage any symptoms associated with the nodule, such as pain or discomfort.  The patient should be advised to avoid consuming too much sugar, as this can increase the risk of bacterial overgrowth in the stomach and increase the risk of polyp growth."

In [34]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(sentences,test_size=0.3,random_state=1)

In [35]:
train

Unnamed: 0,findings,label
1194,..ecnacifingis suoibud fo hctap telni lacivre...,0
45,The patient has inflammation in the body which...,1
1477,.spylop erom gnipoleved fo ksir eht ecuder ot ...,0
1293,.reclu eht fo gnilaeh eht yaled dna dica hcamo...,0
1736,.munedoud eht ot ypocsortsag lamroN,0
...,...,...
1791,.spylop erom gnipoleved fo ksir eht esaercni n...,0
1096,.yletaidemmi redivorp erachtlaeh rieht tcatnoc...,0
1932,.htworg pylop fo ksir eht esaercni dna hcamots...,0
235,Lax cardia with small hiatus hernia.,1


In [38]:
import datasets
train_dataset = datasets.Dataset.from_pandas(train)
test_dataset = datasets.Dataset.from_pandas(test)

In [39]:
train_dataset

Dataset({
    features: ['findings', 'label', '__index_level_0__'],
    num_rows: 1400
})

In [40]:
test_dataset

Dataset({
    features: ['findings', 'label', '__index_level_0__'],
    num_rows: 600
})

In [41]:
import datasets

Dict = datasets.DatasetDict({"train":train_dataset,"test":test_dataset})

In [42]:
Dict

DatasetDict({
    train: Dataset({
        features: ['findings', 'label', '__index_level_0__'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['findings', 'label', '__index_level_0__'],
        num_rows: 600
    })
})

In [43]:
type(Dict)

datasets.dataset_dict.DatasetDict

In [44]:
def tokenize(batch):
    return tokenizer(batch["findings"], padding = True, truncation = True)

In [45]:
sentences_encoded = Dict.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [46]:
sentences_encoded

DatasetDict({
    train: Dataset({
        features: ['findings', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['findings', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 600
    })
})

In [52]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def extract_hidden_states(batch):
    inputs = {k:v.to(device) for k,v in batch.items() if k in tokenizer.model_input_names}
    # Place model inputs on the GPU
    # Extract last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state   
    # Return vector for [CLS] token
    return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}          
    

In [53]:
sentences_encoded.set_format("torch",columns=["input_ids", "attention_mask", "label"])

In [54]:
type(sentences_encoded)

datasets.dataset_dict.DatasetDict

In [None]:
sentences_hidden = sentences_encoded.map(extract_hidden_states, batched=True)

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]