# Load libraries

In [7]:
#------------------------------------------------------
# Regular modules
#------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random
import re
#------------------------------------------------------
# To avoid warnings
#------------------------------------------------------
import os
import warnings 
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
#------------------------------------------------------
# Not regular modules
#------------------------------------------------------
import torch
#from torch import datasets
import umap
import spacy
from transformers import pipeline, set_seed
from transformers import AutoTokenizer, AutoModelForCausalLM


# Load dataset

In [8]:
df = pd.read_csv('../data/df_xy.csv', index_col=False)

In [10]:
df['prompts'] =  df.text.map(lambda x: x.split("FINDINGS:")[0])
df['findings'] =  df.text.map(lambda x: x.split("FINDINGS:")[1])


In [39]:
df['extent_of_exam'] = df.prompts.map(lambda x: x.split("INDICATIONS FOR PROCEDURE:")[1].split('Extent of Exam:')[1])
df['indications'] = df.prompts.map(lambda x: x.split("INDICATIONS FOR PROCEDURE:")[0].split('Extent of Exam:')[0])

In [None]:
df.

# Test fine-tuned model loading on hugging face

In [12]:


tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

model = AutoModelForCausalLM.from_pretrained("tombrooks248/EndoGPT")

In [63]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, 
                     min_length = 30,
                     max_length = 120, 
                     temperature = 1.0,
                     num_return_sequences = 1,
                     do_sample=True,
                     top_k = 
                    )

In [64]:
output = generator('INDICATIONS FOR PROCEDURE: Ongoing reflux symptoms.Extent of Exam: D1 FINDINGS: ' )

In [65]:
for i, sample in enumerate(output):
    print(f"Sample {i}: {sample['generated_text']}\n")

Sample 0: INDICATIONS FOR PROCEDURE: Ongoing reflux symptoms.Extent of Exam: D1 FINDINGS:  There is a polyp in the GOJ which is stalked with a normal pit pattern. A nodular lesion with surrounding erythema and nodularity of the overlying mucosa is observed in the mid-esophagus, suggestive of a malignant esophageal neoplasm.The polyp was removed with minimal thermal injury to surrounding tissue using a cold snare technique.. NA



In [16]:
output[0]['generated_text']

'INDICATIONS FOR PROCEDURE: Ongoing reflux symptoms.Extent of Exam: D1 FINDINGS:  The patient has a stricture in the antrum which is stiffScope not held by stricture. The gastric stricture appears to be caused by mucosal changes, despite the initial impression of extrinsic compression.During biopsy, it was noted that the lesion had a friable surface and bled easily..'

In [44]:
df.columns

Index(['Unnamed: 0', 'text', 'prompts', 'findings', 'Indexes',
       'extent_of_exam', 'indications'],
      dtype='object')

---
## Similarity Metrics
---

### we will take 2 approaches:
- intuitive approach based on plagiarism approaches
- numerical metric based on word-embedding based similarities

### Identify repeated sentences

In [17]:
def calculate_appearance(string):
    data_df = df
    data_df["Indexes"]= df["text"].str.find(string)
    return len(data_df[data_df["Indexes"] > 0])
    
def clause_appearance(string):
    total_appearances = []
    split_string = string.replace(",",".").split(".")
    for string in split_string:
        total_appearances.append(calculate_appearance(string))
        
    return total_appearances



In [19]:
df_sample = df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,text,prompts,findings
5106,5106,Dr. Morley Dr. Burns FG2 INDICATIONS FOR PR...,Dr. Morley Dr. Burns FG2 INDICATIONS FOR PR...,The endoscopic findings reveal an ultra-short...
21464,21464,Dr. Daugherty Dr. Nguyen FG6 INDICATIONS FO...,Dr. Daugherty Dr. Nguyen FG6 INDICATIONS FO...,There is a nodule in the body which is benign...
6055,6055,Dr. Spencer Dr. Hall FG4 INDICATIONS FOR PR...,Dr. Spencer Dr. Hall FG4 INDICATIONS FOR PR...,The patient has inflammation in the fundus wh...
32155,32156,Dr. Zumdome Dr. Nguyen FG6 INDICATIONS FOR ...,Dr. Zumdome Dr. Nguyen FG6 INDICATIONS FOR ...,Normal gastroscopy to the duodenum.
41900,41902,Dr. Le Dr. Ortiz FG3 INDICATIONS FOR PROCED...,Dr. Le Dr. Ortiz FG3 INDICATIONS FOR PROCED...,Normal gastroscopy to the duodenum.


In [20]:
df_sample['gen_findings'] = df_sample.prompts.map(generator)

In [22]:
df_sample['gen_findings'] = df_sample.gen_findings.map(lambda x: x[0]['generated_text'].split("FINDINGS:")[1])

In [23]:
df_sample["in_corpus?"] = df_sample.gen_findings.map(calculate_appearance)
df_sample["clause_in_corpus?"] = df_sample.gen_findings.map(clause_appearance)

### Similarity based on text embedding using Spacy

In [25]:
nlp = spacy.load('en_core_web_sm') # Load pre-trained word embedding model

def cosine_similarity(text1, text2):
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    return doc1.similarity(doc2) # Compute cosine similarity between two documents



In [26]:
def test_similarity(text, real_findings=df_sample.findings):
    similarity = [cosine_similarity(text, real) for real in real_findings]
    return max(similarity)

In [27]:
df_sample['similarity'] = df_sample.gen_findings.apply(test_similarity)

In [31]:
df_sample.prompts[]

Unnamed: 0.1,Unnamed: 0,text,prompts,findings,gen_findings,in_corpus?,clause_in_corpus?,similarity
5106,5106,Dr. Morley Dr. Burns FG2 INDICATIONS FOR PR...,Dr. Morley Dr. Burns FG2 INDICATIONS FOR PR...,The endoscopic findings reveal an ultra-short...,There is a nodule in the body which is sessil...,0,"[97, 113, 384, 20, 20, 0, 0]",0.900487
21464,21464,Dr. Daugherty Dr. Nguyen FG6 INDICATIONS FO...,Dr. Daugherty Dr. Nguyen FG6 INDICATIONS FO...,There is a nodule in the body which is benign...,There is a nodule in the second part of the d...,99,"[99, 0]",0.712294
6055,6055,Dr. Spencer Dr. Hall FG4 INDICATIONS FOR PR...,Dr. Spencer Dr. Hall FG4 INDICATIONS FOR PR...,The patient has inflammation in the fundus wh...,The patient has a polyp in the body which is ...,0,"[29, 56, 56, 349, 0, 0]",0.825763
32155,32156,Dr. Zumdome Dr. Nguyen FG6 INDICATIONS FOR ...,Dr. Zumdome Dr. Nguyen FG6 INDICATIONS FOR ...,Normal gastroscopy to the duodenum.,The patient has a stricture in the duodenal b...,0,"[1, 0, 85, 85, 2597, 1292, 0, 12834]",0.853395
41900,41902,Dr. Le Dr. Ortiz FG3 INDICATIONS FOR PROCED...,Dr. Le Dr. Ortiz FG3 INDICATIONS FOR PROCED...,Normal gastroscopy to the duodenum.,There is a nodule in the stomach which is ben...,0,"[83, 113, 384, 10, 0, 0, 0]",0.926916


In [45]:
df_sample.prompts.iloc[1]

' Dr. Daugherty Dr. Nguyen  FG6  INDICATIONS FOR PROCEDURE: Abdominal Pain Small Bowel Biopsy Positive coeliac serology Extent of Exam:  Stomach body  '

In [47]:


sample_output = df_sample.drop(columns = ['Unnamed: 0','text','clause_in_corpus?', 'prompts'])

In [48]:
sample_output

Unnamed: 0,findings,gen_findings,in_corpus?,similarity
5106,The endoscopic findings reveal an ultra-short...,There is a nodule in the body which is sessil...,0,0.900487
21464,There is a nodule in the body which is benign...,There is a nodule in the second part of the d...,99,0.712294
6055,The patient has inflammation in the fundus wh...,The patient has a polyp in the body which is ...,0,0.825763
32155,Normal gastroscopy to the duodenum.,The patient has a stricture in the duodenal b...,0,0.853395
41900,Normal gastroscopy to the duodenum.,There is a nodule in the stomach which is ben...,0,0.926916
