In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings 
import datasets
import torch
import os
import re
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from Classifier import data_cleaning # created method to clean and extract data
from sklearn.model_selection import train_test_split

In [2]:
# from transformers import AutoTokenizer, BioGptModel
from transformers import BioGptTokenizer, BioGptModel
# tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")

In [3]:
model = BioGptModel.from_pretrained("microsoft/biogpt")

Some weights of the model checkpoint at microsoft/biogpt were not used when initializing BioGptModel: ['output_projection.weight']
- This IS expected if you are initializing BioGptModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BioGptModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
#-----------------------------------------------------
# Loading dataset
#-----------------------------------------------------
OGD_FakeSet = pd.read_csv("../data/OGD_FakeSet.csv")
#-----------------------------------------------------
# Cleaning dataset
#-----------------------------------------------------
df = data_cleaning(OGD_FakeSet)
#-----------------------------------------------------
# Vectorizing text in findings features
#-----------------------------------------------------
df['findings']

0      The patient has Barrett's oesophagus. It is a ...
1      There is a polyp in the antrum which is sessil...
2      The patient has inflammation in the second par...
3                   Normal gastroscopy to the duodenum. 
4      There is an ulcer in the second part of the du...
                             ...                        
995    The patient has a polyp in the second part of ...
996    There is a nodule in the second part of the du...
997    The patient has a 8mm nodule in the GOJ which ...
998               Normal gastroscopy to the duodenum. NA
999    list("The stricture will need to be dilatated ...
Name: findings, Length: 1000, dtype: object

In [5]:
findings = df['findings']
findings.shape

(1000,)

In [6]:
def reverse(row):
    row = row[::-1]
    return row

findings_reverse = findings.apply(reverse)

findings_reverse[0]

".htworg pylop fo ksir eht esaercni dna hcamots eht ni htworgrevo lairetcab fo ksir eht esaercni nac siht sa ,ragus hcum oot gnimusnoc diova ot desivda eb dluohs tneitap ehT  .trofmocsid ro niap sa hcus ,eludon eht htiw detaicossa smotpmys yna eganam ot noitacidem debircserp eb yam tneitap ehT :PU WOLLOF .spylop erom gnipoleved fo ksir eht ecuder ot noitacidem debircserp eb yam tneitap ehT  .nalp tnemtaert dednemmocer eht ot gnirehda fo ecnatropmi eht dna sisongaid eht fo demrofni eb dluohs tneitap ehT :NOITADNEMMOCER AN ..ypocsodne ecnallievrus erutuf ni dia ot oottat a htiw dekram saw pylop ehT.ecnaraeppa ralunarg a htiw ,suotamede dna demalfni si seludon eht gnidnuorrus asocum ehT .nrettap tip lamronba na htiw deklats si hcihw munedoud eht fo trap driht eht ni pylop a si erehT .ylno tnemges trohS .nees osla saw  gninetihw-oteca fo ssol oN .tnemges gnol a si tI .sugahposeo s'tterraB sah tneitap ehT"

In [7]:
sentences = pd.concat([findings, findings_reverse]) 
sentences.shape

(2000,)

In [8]:
sentences = sentences.to_frame()
type(sentences)

pandas.core.frame.DataFrame

In [9]:
sentences['label'] = 0
sentences.reset_index(drop=True, inplace=True) 
for index in range(0,1000):
    sentences.at[index,'label']=1
sentences.loc[999:1000]

Unnamed: 0,findings,label
999,"list(""The stricture will need to be dilatated ...",1
1000,.htworg pylop fo ksir eht esaercni dna hcamots...,0


In [11]:
train, test = train_test_split(sentences,test_size=0.3,random_state=1)
train.reset_index(drop=True)
train.shape

(1400, 2)

In [12]:
test.reset_index(drop=True)
test.shape

(600, 2)

In [13]:
train_dataset = datasets.Dataset.from_pandas(train)
test_dataset = datasets.Dataset.from_pandas(test)
train_dataset

Dataset({
    features: ['findings', 'label', '__index_level_0__'],
    num_rows: 1400
})

In [14]:
test_dataset = test_dataset.remove_columns(["__index_level_0__"])
test_dataset

Dataset({
    features: ['findings', 'label'],
    num_rows: 600
})

In [15]:
Dict = datasets.DatasetDict({"train":train_dataset,"test":test_dataset})
Dict

DatasetDict({
    train: Dataset({
        features: ['findings', 'label', '__index_level_0__'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['findings', 'label'],
        num_rows: 600
    })
})

In [16]:
out = model(**tokenizer(Dict['train'][0]['findings'], padding = True, truncation = True, return_tensors="pt"))
out.last_hidden_state

tensor([[[-0.1223,  0.0990, -0.9851,  ..., -0.8175,  0.1483, -0.5747],
         [-1.0426, -1.1295,  0.0735,  ..., -2.7267,  0.5722,  0.0941],
         [-2.5400, -1.2967,  0.7207,  ..., -2.8811,  3.1484, -1.2877],
         ...,
         [ 0.4616, -1.1923,  1.6625,  ..., -1.1666, -1.2519,  0.3580],
         [ 1.2963, -0.4984,  1.7329,  ..., -1.6832,  1.4270,  0.6104],
         [ 0.3591, -1.6327,  2.6605,  ..., -0.2729,  1.7900, -0.1149]]],
       grad_fn=<NativeLayerNormBackward0>)