In [1]:
import numpy as np
import pandas as pd
import re


In [2]:
df = pd.read_csv("../data/UpperGI.csv")

In [3]:
df = df.rename({"out": "text"}, axis=1)

In [4]:
def regex_indications(string):
    hospital_reg = r"\.*INDICATIONS FOR PROCEDURE:.*"
    line = re.findall(hospital_reg, string)[0]
    retrn_string =  line.replace(',',':').split(":")[1]
    if retrn_string[-1:] == "\r":
        retrn_string= retrn_string[:-1]
    if retrn_string[-8:] == "FINDINGS":
        return retrn_string[:-8]
    else:
        return retrn_string
    
df["indications"] = df['text'].apply(regex_indications)

In [5]:
def regex_extent(string):
    hospital_reg = r"\.*Extent of Exam:.*"
    line = re.findall(hospital_reg, string)[0]
    retrn_string =  line.replace(',',':').split(":")[1]
    if retrn_string[-1:] == "\r":
        return retrn_string[:-1]
    else:
        return retrn_string
df["extent_of_exam"] = df['text'].apply(regex_extent)

In [6]:
def regex_findings(string):
    hospital_reg = r"\.*FINDINGS:.*"
    line = re.findall(hospital_reg, string)[0][10:]
    return line
df["findings"] = df['text'].apply(regex_findings)

In [14]:
from transformers import pipeline

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

2023-03-10 11:19:39.617521: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-10 11:19:40.817522: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/opt/conda/lib
2023-03-10 11:19:40.817700: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object

In [16]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt")

In [10]:
generator_untrained = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [11]:
generator_untrained("FINDINGS:", max_new_tokens = 30)

[{'generated_text': 'FINDINGS: The results of this study suggest that the use of a combination of the two methods of analysis is a useful approach to the study of the relationship between the'}]

In [12]:
def tokenize_function(example):
    return tokenizer(example['all'], padding="max_length", truncation=True, return_tensors="pt")

In [13]:
import datasets

In [14]:
from datasets import load_dataset

In [15]:
df['token'] = df.findings.map(tokenize_function)


TypeError: string indices must be integers

In [16]:
new_data = df.indications + df.extent_of_exam + df.findings

In [17]:
new_data = new_data.to_frame('all')

In [None]:
new_small_data = new_data[:1000]
new_small_eval_data = new_data[1000:2000]

In [None]:
train_dataset = datasets.Dataset.from_dict(new_small_data)
val_dataset = datasets.Dataset.from_dict(new_small_eval_data)
my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"test":val_dataset})

In [None]:
tokenized_train_dataset = train_dataset.map(tokenize_function)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['all'])

tokenized_eval_dataset = val_dataset.map(tokenize_function)
tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(['all'])

In [None]:
tokenized_dataset_dict = datasets.DatasetDict({"train":tokenized_train_dataset,"test":tokenized_eval_dataset})

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_dict['train'],
    eval_dataset=tokenized_dataset_dict['test'],
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

# Generate random combinations of extent_of_exam and indications

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [9]:
!pwd

/home/jupyter/open_medicine/notebooks


In [11]:
model_path = '/home/jupyter/open_medicine/notebooks/checkpoint-34500/'

In [12]:
model2 = AutoModelForCausalLM.from_pretrained(model_path, from_tf= False)

In [17]:
generator2 = pipeline('text-generation', model=model2, tokenizer=tokenizer)

In [93]:
prompt_new = (df.indications.map(lambda x: 'INDICATIONS FOR PROCEDURE: ' + x.strip())) + (df.extent_of_exam.map(lambda x: 'Extent of Exam: ' + x.strip()))
prompt_new = prompt_new.map(lambda x: x + ' FINDINGS: ').drop_duplicates().sort_values()


In [94]:
prompt_new

721     INDICATIONS FOR PROCEDURE: Abdominal Pain .Ext...
1525    INDICATIONS FOR PROCEDURE: Abdominal Pain .Ext...
2388    INDICATIONS FOR PROCEDURE: Abdominal Pain .Ext...
1610    INDICATIONS FOR PROCEDURE: Abdominal Pain .Ext...
797     INDICATIONS FOR PROCEDURE: Abdominal Pain .Ext...
                              ...                        
3566    INDICATIONS FOR PROCEDURE: ongoing diarhoea in...
625     INDICATIONS FOR PROCEDURE: ongoing diarhoea in...
623     INDICATIONS FOR PROCEDURE: ongoing diarhoea in...
2534    INDICATIONS FOR PROCEDURE: ongoing diarhoea in...
205     INDICATIONS FOR PROCEDURE: ongoing diarhoea in...
Length: 1127, dtype: object

In [28]:
prompt = (df.indications.map(lambda x: x.strip())  + df.extent_of_exam.map(lambda x: x.strip()) + ' FINDINGS:').drop_duplicates()

In [114]:
import random
size = 50
prompt_selected = random.sample(range(1125), size)

In [None]:
findings = [generator2(prompt_new.iloc[i], max_new_tokens = 30, min_new_tokens = 20) for i in prompt_selected]

In [100]:
def get_findings(text):
    start = 'FINDINGS:'
    return text[text.find(start) + len(start):].strip()
    

In [113]:

finding_text = [get_findings(findings[i][0]['generated_text']) for i in range(size)]

 
set(finding_text), len(set(finding_text))

({'Normal gastroscopy to the duodenum.',
  'The patient has a polyp in the GOJ which is stalked.It has an abnormal pit pattern. The oesophageal mucosa appears thickened with a nodular',
  'There is a nodule in the GOJ which is stalked. A nodular lesion is seen in the oesophagus, with surrounding erythema and nodularity of the',
  'There is a nodule in the GOJ which is stalked. NA FOLLOW UP: The patient should be advised to avoid consuming too much sugar',
  'There is a nodule in the third part of the duodenum which is benign-looking. The nodular lesions appear to be sessile, with a smooth surface and',
  'There is a nodule in the third part of the duodenum which is stalked. NA FOLLOW UP: The patient should be advised to avoid consuming'},
 6)

In [35]:
df_findings = pd.DataFrame({'finding': findings})

# Save the DataFrame as a CSV file
findings_csv = df.to_csv('findings50.csv', index=False)


In [24]:
l = len("INDICATIONS FOR PROCEDURE: ")

indication_list = [findings[i][0]['generated_text'].split('Extent of Exam: ')[0][l:].strip() for i in range(len(findings))]
extent_list = [findings[i][0]['generated_text'].split('Extent of Exam: ')[1].split('FINDINGS:')[0].strip() for i in range(len(findings))]
finding_list = [findings[i][0]['generated_text'].split('Extent of Exam: ')[1].split('FINDINGS:')[1].strip() for i in range(len(findings))]

gen_1 = pd.DataFrame({'Indication': indication_list, 'Extent of Exam': extent_list, 'Findings': finding_list})



In [45]:
gen_1['prompt'] =gen_1.Indication + gen_1['Extent of Exam']
gen_1['prompt']

0                                    Abdominal Pain .D1
1                                    Abdominal Pain .D2
2                     Abdominal Pain .Failed intubation
3                                   Abdominal Pain .GOJ
4                            Abdominal Pain .Oesophagus
5                               Abdominal Pain .Pylorus
6                          Abdominal Pain .Stomach body
7         Abdominal Pain Anaemia/Low Iron or VitaminsD1
8         Abdominal Pain Anaemia/Low Iron or VitaminsD2
9     Abdominal Pain Anaemia/Low Iron or VitaminsFai...
10       Abdominal Pain Anaemia/Low Iron or VitaminsGOJ
11    Abdominal Pain Anaemia/Low Iron or VitaminsOes...
12    Abdominal Pain Anaemia/Low Iron or VitaminsPyl...
13    Abdominal Pain Anaemia/Low Iron or VitaminsSto...
14                            Abdominal Pain BloatingD1
15                            Abdominal Pain BloatingD2
16             Abdominal Pain BloatingFailed intubation
17                           Abdominal Pain Bloa

In [46]:
gen_1.sort_values(by = 'prompt').reset_index()

Unnamed: 0,index,Indication,Extent of Exam,Findings,prompt
0,0,Abdominal Pain .,D1,There is a nodule in the GOJ which is stalked....,Abdominal Pain .D1
1,1,Abdominal Pain .,D2,There is a nodule in the GOJ which is stalked....,Abdominal Pain .D2
2,2,Abdominal Pain .,Failed intubation,There is a nodule in the GOJ which is stalked....,Abdominal Pain .Failed intubation
3,3,Abdominal Pain .,GOJ,There is a nodule in the GOJ which is stalked....,Abdominal Pain .GOJ
4,4,Abdominal Pain .,Oesophagus,There is a nodule in the GOJ which is stalked....,Abdominal Pain .Oesophagus
5,5,Abdominal Pain .,Pylorus,There is a nodule in the GOJ which is stalked....,Abdominal Pain .Pylorus
6,6,Abdominal Pain .,Stomach body,There is a nodule in the GOJ which is stalked....,Abdominal Pain .Stomach body
7,7,Abdominal Pain Anaemia/Low Iron or Vitamins,D1,There is a nodule in the GOJ which is stalked....,Abdominal Pain Anaemia/Low Iron or VitaminsD1
8,8,Abdominal Pain Anaemia/Low Iron or Vitamins,D2,There is a nodule in the GOJ which is stalked....,Abdominal Pain Anaemia/Low Iron or VitaminsD2
9,9,Abdominal Pain Anaemia/Low Iron or Vitamins,Failed intubation,There is a nodule in the third part of the duo...,Abdominal Pain Anaemia/Low Iron or VitaminsFai...


In [28]:
df_fake = pd.concat([df['indications'].map(lambda x: x.strip()), df['extent_of_exam'].map(lambda x: x.strip()), df['findings'].map(lambda x: x.strip())], axis=1)

In [30]:
df_fake['prompt'] = df.indications.map(lambda x: x.strip()) + df.extent_of_exam.map(lambda x: x.strip())

In [41]:
df_fake = df_fake.sort_values(by = 'prompt').reset_index()

In [42]:
df_fake.head()

Unnamed: 0,index,indications,extent_of_exam,findings,prompt
0,31916,Abdominal Pain .,D1,There is an ulcer in the GOJ which is scarredI...,Abdominal Pain .D1
1,28528,Abdominal Pain .,D1,Barrett's is present. No loss of aceto-whiteni...,Abdominal Pain .D1
2,19029,Abdominal Pain .,D1,The patient has a polyp in the second part of ...,Abdominal Pain .D1
3,11459,Abdominal Pain .,D1,Normal gastroscopy to the duodenum.,Abdominal Pain .D1
4,46456,Abdominal Pain .,D1,The patient has Barrett's oesophagus. Short se...,Abdominal Pain .D1


In [47]:
grouped = df_fake.groupby('prompt')['findings'].apply(lambda x: pd.Series(x.values[:3])).unstack()

# Rename the columns
grouped.columns = ['Findings 1', 'Findings 2', 'Findings 3']

# Reset the index
grouped = grouped.reset_index()


In [50]:
test_gen = gen_1.merge(grouped, on = 'prompt', how = 'left')

In [110]:
test_gen.drop(columns = ['prompt', 'Findings 3', 'Findings 2'])


Unnamed: 0,Indication,Extent of Exam,Findings,Findings 1
0,Abdominal Pain .,D1,There is a nodule in the GOJ which is stalked....,There is an ulcer in the GOJ which is scarredI...
1,Abdominal Pain .,D2,There is a nodule in the GOJ which is stalked....,Normal gastroscopy to the duodenum.
2,Abdominal Pain .,Failed intubation,There is a nodule in the GOJ which is stalked....,The patient has a 8mm nodule in the fundus whi...
3,Abdominal Pain .,GOJ,There is a nodule in the GOJ which is stalked....,The patient has inflammation in the second par...
4,Abdominal Pain .,Oesophagus,There is a nodule in the GOJ which is stalked....,The patient has a 6mm nodule in the third part...
5,Abdominal Pain .,Pylorus,There is a nodule in the GOJ which is stalked....,Normal gastroscopy to the duodenum.
6,Abdominal Pain .,Stomach body,There is a nodule in the GOJ which is stalked....,The patient has a polyp in the second part of ...
7,Abdominal Pain Anaemia/Low Iron or Vitamins,D1,There is a nodule in the GOJ which is stalked....,Normal gastroscopy to the duodenum.
8,Abdominal Pain Anaemia/Low Iron or Vitamins,D2,There is a nodule in the GOJ which is stalked....,There is an ulcer in the duodenal bulb which i...
9,Abdominal Pain Anaemia/Low Iron or Vitamins,Failed intubation,There is a nodule in the third part of the duo...,The patient has a 8mm nodule in the third part...


('Abdominal Pain .D1', 'Abdominal Pain .D1')

In [103]:
import spacy

nlp = spacy.load('en_core_web_md') # Load pre-trained word embedding model

def get_cosine_similarity(text1, text2):
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    return doc1.similarity(doc2) # Compute cosine similarity between two documents

# # Example usage
# text1 = "The patient has mild inflammation in the stomach"
# text2 = "There is mild inflammation in the patient's stomach"
# similarity_score = get_cosine_similarity(text1, text2)
# print(similarity_score)



In [107]:
get_cosine_similarity(test_gen.Findings[0], test_gen['Findings 2'][20])

0.7808509741643225