In [None]:
!pip3 install openai langchain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip3 install --upgrade google-cloud-secret-manager

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install --upgrade weaviate-client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import auth
from google.cloud import secretmanager_v1beta1 as secretmanager

import os

In [None]:
# Interactive Colab sign-in; presumably supplies the Google Cloud credentials
# that the Secret Manager client used later in this notebook relies on — TODO confirm.
auth.authenticate_user()

In [None]:
!gcloud config set project muspelheim

Updated property [core/project].


In [None]:
def access_secret_version(project_id, secret_id, version_id):
    """Read one version of a Secret Manager secret and return it as text.

    Args:
        project_id: Project segment of the secret's resource name.
        secret_id: Secret segment of the resource name.
        version_id: Version segment of the resource name.

    Returns:
        The secret payload decoded from bytes as UTF-8.
    """
    sm_client = secretmanager.SecretManagerServiceClient()
    # The API addresses a secret version by its full resource name.
    lookup = {
        "name": f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"
    }
    return sm_client.access_secret_version(request=lookup).payload.data.decode("UTF-8")

In [None]:
# Read version "2" of the "openai_key" secret from the "muspelheim" project and
# export it as OPENAI_API_KEY so later cells can read it from the environment.
# NOTE(review): the variable is spelled OPENAPI_KEY (not OPENAI_KEY); left as-is
# in case other cells reference it.
OPENAPI_KEY = access_secret_version("muspelheim", "openai_key", "2")
os.environ['OPENAI_API_KEY']=OPENAPI_KEY

In [113]:
# The Weaviate API key is intentionally not stored in the notebook. If the
# cluster requires API-key auth, load WEAVIATE_KEY from Secret Manager (for
# example with access_secret_version) instead of pasting it here.
# SECURITY: a literal key was previously committed on this line; treat it as
# compromised and rotate it.
WEAVIATE_CLUSTER = "https://pgx-weaviate-hackathon-e702ovox.weaviate.network"

In [None]:
import json
import weaviate
from bs4 import BeautifulSoup
from pprint import pprint
from langchain.prompts import PromptTemplate
from langchain import OpenAI
from langchain.chains import LLMChain, SimpleSequentialChain

In [None]:
# Shared LLM handle; guideline_to_algorithmn calls it with the rendered prompt.
llm = OpenAI(temperature=0.1, 
             model_name='gpt-3.5-turbo')



In [114]:
# Connect to the Weaviate cluster. The OpenAI key is forwarded as a request
# header; presumably the cluster's OpenAI modules use it — TODO confirm.
# NOTE(review): os.getenv returns None if OPENAI_API_KEY was never set, so run
# the secret-loading cell first.
client = weaviate.Client(
    url=WEAVIATE_CLUSTER,
    # API-key auth is disabled here; re-enable if the cluster requires it.
    # auth_client_secret=weaviate.auth.AuthApiKey(api_key=WEAVIATE_KEY),
    additional_headers={
        "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")
    }
)

# Last expression of the cell: displays the readiness check result.
client.is_ready()

True

In [112]:
# Weaviate class definition for PharmGKB guidelines.
# The property names below match the keys produced by transform_guideline,
# plus "algorithm", which is added to each record after the LLM step.
pharmgkb_guideline_schema = {
  "class": "Guideline",
  "description": "A collection of clinical guidelines collected by PharmGKB",
  "vectorizer": "text2vec-openai",
  "moduleConfig": {
    # Settings for the vectorizer module named in "vectorizer" above.
    "text2vec-openai": {
      "model": "ada",
      "modelVersion": "002",
      "type": "text"
    },
    # Settings for the generative module used by the with_generate query later on.
    "generative-openai": {
        "model": "gpt-3.5-turbo",
    }
  },
  "properties": [
    {
      "name": "guideline_id",
      "description": "PharmGKB Guideline ID",
      "dataType": ["text"]
    },
    {
      "name": "name",
      "description": "Name of PharmGKB Guideline",
      "dataType": ["text"]
    },
    {
      "name": "relatedChemicals",
      "description": "A list of chemicals that are related to the guideline",
      "dataType": ["text[]"]
    },
    {
      "name": "relatedGenes",
      "description": "A list of genes that are related to the guideline",
      "dataType": ["text[]"]
    },
    {
      "name": "guideline",
      "description": "The full text of the guideline in Markdown format",
      "dataType": ["text"]
    },
    {
      "name": "algorithm",
      "description": "The full text of the algorithm",
      "dataType": ["text"]
    }
  ]
}


In [115]:
# Reset the schema, then register the Guideline class.
# WARNING: delete_all() is called with no filter, so it drops every class in
# the schema — presumably together with the stored objects; do not run this
# against a cluster that holds data you need.
client.schema.delete_all()
client.schema.get()

client.schema.create_class(pharmgkb_guideline_schema)

# Last expression of the cell: displays the schema as stored by the server.
client.schema.get()

{'classes': [{'class': 'Guideline',
   'description': 'A collection of clinical guidelines collected by PharmGKB',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'generative-openai': {'model': 'gpt-3.5-turbo'},
    'text2vec-openai': {'model': 'ada',
     'modelVersion': '002',
     'type': 'text',
     'vectorizeClassName': True}},
   'properties': [{'dataType': ['text'],
     'description': 'PharmGKB Guideline ID',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'guideline_id',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': 'Name of PharmGKB Guideline',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName

In [None]:
def html_to_text(html_string):
  """Return the text content of an HTML string, parsed with "html.parser"."""
  return BeautifulSoup(html_string, "html.parser").get_text()

In [None]:
def transform_guideline(content):
  """Flatten one loaded guideline file into a record for the Guideline class.

  Args:
    content: Parsed JSON whose 'guideline' entry carries 'id', 'name',
      'relatedChemicals' (items with 'name'), 'relatedGenes' (items with
      'symbol') and 'summaryMarkdown' (with 'html').

  Returns:
    A dict with the keys guideline_id, name, relatedChemicals, relatedGenes
    and guideline, in that order. The guideline text is the summary HTML
    converted to plain text with html_to_text.
  """
  source = content['guideline']
  # The dict literal evaluates its values top to bottom, so the fields are
  # read in the same order as the keys are listed.
  return {
    "guideline_id": source['id'],
    "name": source['name'],
    "relatedChemicals": [chemical['name'] for chemical in source['relatedChemicals']],
    "relatedGenes": [gene['symbol'] for gene in source['relatedGenes']],
    "guideline": html_to_text(source['summaryMarkdown']['html']),
  }


In [None]:
guidelines = []

# NOTE: Google Drive must already be mounted at /content/drive for this path to exist.
# Despite the ".json" suffix, this path is listed as a directory of per-guideline files.
guidelines_directory = "/content/drive/MyDrive/pgx/guidelineAnnotations.json"
# os.listdir returns entries in arbitrary order; sort so the record order
# (and everything derived from it downstream) is reproducible across runs.
files = sorted(f for f in os.listdir(guidelines_directory) if f.endswith(".json"))

for file in files:
  filename = os.path.join(guidelines_directory, file)
  # Read with an explicit encoding so the result does not depend on the locale.
  with open(filename, encoding="utf-8") as f:
    content = json.load(f)
  # Keep only guidelines with a truthy "recommendation" flag; a file without
  # the flag is skipped instead of raising KeyError.
  if content['guideline'].get("recommendation"):
    guidelines.append(transform_guideline(content))

In [None]:
def guideline_to_algorithmn(guideline):
  """Ask the shared `llm` to rewrite a guideline statement as an algorithm.

  Args:
    guideline: Text substituted into the prompt's {guideline} placeholder.

  Returns:
    Whatever `llm` returns for the rendered prompt.
  """
  prompt_text = PromptTemplate(
    template="Please turn the following guideline into an algorithm for an LLM: {guideline}",
    input_variables=["guideline"],
  ).format(guideline=guideline)

  # Echo the exact prompt before sending it, so the cell output records it.
  print(f"\n{prompt_text}:")
  return llm(prompt_text)

In [107]:
# Records that successfully received an LLM-generated algorithm.
guidelines_with_algo = []
# (guideline_id, error) pairs for records that failed, so they can be
# inspected or retried instead of silently disappearing.
failed_guidelines = []
for guideline in guidelines:
  try:
    # The statement prefixes the guideline text with the drugs and genes it covers.
    guideline_stmt = """
    This guideline applies to the following medications: {}
    This guideline applies to the following genes: {}

    Guideline:
    {}
    """.format(", ".join(guideline['relatedChemicals']),
               ", ".join(guideline['relatedGenes']),
               guideline['guideline'])
    algo = guideline_to_algorithmn(guideline_stmt)
    # The record is updated in place, so entries of `guidelines` gain the key too.
    guideline['algorithm'] = algo
    guidelines_with_algo.append(guideline)
  except Exception as e:
    # Best effort: keep going, but report which guideline was dropped.
    failed_id = guideline.get('guideline_id')
    failed_guidelines.append((failed_id, e))
    print(f"Failed to build algorithm for guideline {failed_id}: {e!r}")

if failed_guidelines:
  print(f"{len(failed_guidelines)} of {len(guidelines)} guidelines failed; see failed_guidelines")


Please turn the following guideline into an algorithm for an LLM: 
    This guideline applies to the following medications: clomipramine
    This guideline applies to the following genes: CYP2C19

    Guideline:
    The Royal Dutch Pharmacists Association - Pharmacogenetics Working Group (DPWG) recommends to avoid clomipramine in CYP2C19 ultrarapid metabolizer (UM) for Indication OBSESSIVE COMPULSIVE DISORDER or ANXIETY DISORDERS.

    :

Please turn the following guideline into an algorithm for an LLM: 
    This guideline applies to the following medications: warfarin
    This guideline applies to the following genes: VKORC1

    Guideline:
    Patients with the VKORC1 rs9923231 TT genotype (-1639 AA genotype) should be given 60% of the standard initial dose of warfarin. The genotype-specific initial dose and maintenance dose can be calculated using an algorithm. There are no recommendation for patients with the VKORC1 rs9923231 CT genotype (-1639 AG genotype).

    :

Please turn th




Please turn the following guideline into an algorithm for an LLM: 
    This guideline applies to the following medications: warfarin
    This guideline applies to the following genes: CYP2C9

    Guideline:
    Reduce warfarin dose in CYP2C9 poor and intermediate metabolizers (PM and IM) and patients with CYP2C9*1/*3, *2/*3, *2/*2 or *3/*3 genotype. "The genotype-specific initial dose and maintenance dose can be calculated using an algorithm, as used in EU-PACT: see Algorithms coumarins"

    :

Please turn the following guideline into an algorithm for an LLM: 
    This guideline applies to the following medications: nortriptyline
    This guideline applies to the following genes: CYP2D6

    Guideline:
    The CPIC Dosing Guideline update for nortriptyline recommends a 25% dose reduction for CYP2D6 intermediate metabolizers. For CYP2D6 ultrarapid or poor metabolizers, an alternative drug should be considered. If nortriptyline is warranted, consider a 50% dose reduction in CYP2D6 poor

In [110]:
# Spot-check the first ten records, including the generated algorithm text.
pprint(guidelines_with_algo[:10])

[{'algorithm': '1. Check if the medication being prescribed is clomipramine.\n'
               '2. Check if the patient has been tested for the CYP2C19 gene.\n'
               '3. If the patient is an ultrarapid metabolizer (UM) for '
               'CYP2C19, then avoid prescribing clomipramine for the '
               'indications of Obsessive Compulsive Disorder or Anxiety '
               'Disorders.\n'
               '4. If the patient is not an ultrarapid metabolizer (UM) for '
               'CYP2C19, then clomipramine can be prescribed for the '
               'indications of Obsessive Compulsive Disorder or Anxiety '
               'Disorders.',
  'guideline': 'The Royal Dutch Pharmacists Association - Pharmacogenetics '
               'Working Group (DPWG) recommends to avoid clomipramine in '
               'CYP2C19 ultrarapid metabolizer (UM) for Indication OBSESSIVE '
               'COMPULSIVE DISORDER or ANXIETY DISORDERS.\n',
  'guideline_id': 'PA166184528',
  'name': 'A

In [None]:
# Number of guidelines that received an algorithm (failed ones are never appended).
len(guidelines_with_algo)

134

In [108]:
# CSV snapshot of the enriched guidelines; the path is on Google Drive, so the drive must be mounted.
guidelines_with_algo_filename = '/content/drive/MyDrive/pgx/guidelines_with_algo_v2.csv'

In [109]:
import csv

# Fail with a clear message instead of an IndexError when there is nothing to export.
if not guidelines_with_algo:
  raise ValueError("guidelines_with_algo is empty; nothing to write")

# Column order comes from the first record.
header = list(guidelines_with_algo[0].keys())

# newline='' lets the csv module control line endings; the encoding is fixed
# so the file does not depend on the runtime locale.
with open(guidelines_with_algo_filename, 'w', newline='', encoding='utf-8') as csvfile:
  # DictWriter places each value under its own key, so a record whose keys are
  # in a different order cannot end up in the wrong columns. A record with a
  # key that is not in the header raises ValueError rather than being written
  # misaligned.
  writer = csv.DictWriter(csvfile, fieldnames=header)
  writer.writeheader()
  writer.writerows(guidelines_with_algo)

In [116]:
# Configure the client's batch importer before the upload cell runs.
# NOTE(review): with dynamic=True the batch size presumably adapts at runtime,
# so batch_size=5 would only be the starting value — confirm against the client docs.
client.batch.configure(
    batch_size=5, 
    dynamic=True,
    timeout_retries=3,
)

<weaviate.batch.crud_batch.Batch at 0x7fed7453beb0>

In [117]:
import time

# Seconds to wait after queueing each object. Presumably this throttles the
# OpenAI-backed vectorizer — TODO confirm; lower it if rate limits allow.
SECONDS_BETWEEN_OBJECTS = 5
# Print a progress line every this many objects.
PROGRESS_EVERY = 5

counter=0
# Report progress against the list actually being imported. The two lists
# differ in length whenever algorithm generation failed for some guidelines.
total_to_import = len(guidelines_with_algo)

with client.batch as batch:
  for guideline in guidelines_with_algo:
      if counter % PROGRESS_EVERY == 0:
          print(f"Import {counter} / {total_to_import} ")

      batch.add_data_object(guideline, "Guideline")
      counter = counter+1
      time.sleep(SECONDS_BETWEEN_OBJECTS)


Import 0 / 134 
Import 5 / 134 
Import 10 / 134 
Import 15 / 134 
Import 20 / 134 
Import 25 / 134 
Import 30 / 134 
Import 35 / 134 
Import 40 / 134 
Import 45 / 134 
Import 50 / 134 
Import 55 / 134 
Import 60 / 134 
Import 65 / 134 
Import 70 / 134 
Import 75 / 134 
Import 80 / 134 
Import 85 / 134 
Import 90 / 134 
Import 95 / 134 
Import 100 / 134 
Import 105 / 134 
Import 110 / 134 
Import 115 / 134 
Import 120 / 134 
Import 125 / 134 
Import 130 / 134 


In [118]:
# Ask Weaviate for the object count of the Guideline class to confirm the import.
result = (
    client.query.aggregate("Guideline")
    .with_fields("meta { count }")
    .do()
)
print("Object count: ", result["data"]["Aggregate"]["Guideline"], "\n")

Object count:  [{'meta': {'count': 134}}] 



In [None]:
# Sanity check: fetch a single Guideline object and print the selected properties.
# Indexing with [0] assumes the class holds at least one object.
test_guideline = (
    client.query
    .get("Guideline", ["name", "relatedGenes", "guideline", "relatedChemicals"])
    .with_limit(1)
    .do()
)["data"]["Get"]["Guideline"][0]

print(test_guideline)


{'guideline': 'The Canadian Pharmacogenomics Network for Drug Safety (CPNDS) clinical recommendation group has published guidelines for the use of pharmacogenetic testing for TPMT gene variants when prescribing cisplatin in pediatric cancer patients. They recommend testing for the TPMT alleles *2, *3A, *3B or *3C in all pediatric cancer patients due to the association of these alleles with an increased risk of cisplatin-induced ototoxicity.\n', 'name': 'Annotation of CPNDS Guideline for cisplatin and TPMT', 'relatedChemicals': ['cisplatin'], 'relatedGenes': ['TPMT']}


In [None]:
# Semantic search: describe a patient in free text and retrieve the two
# closest Guideline objects, including their generated algorithm.
nearText = {"concepts": ["Patient is a poor metabolizer of CYP2C19. Patient is taking citalopram."]}

result = (
    client.query
    .get("Guideline", ["name", "relatedGenes", "guideline", "relatedChemicals", "algorithm"])
    .with_near_text(nearText)
    .with_limit(2)
    .do()
)

In [None]:
# Show the Guideline objects returned by the nearText query.
pprint(result['data']['Get']['Guideline'])

[{'algorithm': '1. Check if the patient is a CYP2C19 ultrarapid metabolizer.\n'
               '2. If the patient is a CYP2C19 ultrarapid metabolizer, '
               'recommend an alternative drug not predominantly metabolized by '
               'CYP2C19.\n'
               '3. If the patient is not a CYP2C19 ultrarapid metabolizer, '
               'check if the patient is a CYP2C19 poor metabolizer.\n'
               '4. If the patient is a CYP2C19 poor metabolizer, consider a '
               '50% reduction of recommended starting dose.\n'
               '5. Titrate the dose to response or select an alternative drug '
               'not predominantly metabolized by CYP2C19.\n'
               '6. Monitor the patient for adverse effects and adjust the dose '
               'as necessary.',
  'guideline': 'The CPIC Dosing Guideline for the selective serotonin reuptake '
               'inhibitors citalopram and escitalopram recommends an '
               'alternative drug not predom

In [None]:
# NOTE(review): `results` is not assigned in any cell visible in this notebook
# (the queries above assign `result`); on a fresh kernel this presumably raises
# NameError — confirm, then remove the cell or restore the cell that defined it.
results

["1. Input: Patient's CYP2C19 metabolizer status (intermediate or poor) and the Dutch Pharmacogenetics Working Group (DPWG) document containing specified daily doses for citalopram.\n\n2. Check if the patient's CYP2C19 metabolizer status is either intermediate or poor.\n   a. If the patient's status is not intermediate or poor, proceed with the standard citalopram dosing recommendations.\n   b. If the patient's status is intermediate or poor, proceed to step 3.\n\n3. Retrieve the specified daily doses for citalopram from the DPWG document.\n\n4. Determine the maximum daily dose for the patient based on their CYP2C19 metabolizer status:\n   a. If the patient is an intermediate metabolizer, assign the maximum daily dose for intermediate metabolizers from the DPWG document.\n   b. If the patient is a poor metabolizer, assign the maximum daily dose for poor metabolizers from the DPWG document.\n\n5. Output: The recommended maximum daily dose of citalopram for the patient based on their CYP

In [None]:
# Prompt for the generative query below; {algorithm} presumably gets filled
# with each object's `algorithm` property by Weaviate — TODO confirm.
prompt = "Turn the following guideline into an algorithm: {algorithm}"

In [None]:
# Retrieve the closest Guideline and run the generative prompt on it.
# NOTE(review): the recorded output of this query is an error
# ("extend generate: wrong parameters") with no data, so the call as written
# did not work against this cluster — check the with_generate arguments and
# the generative module configuration.
result = (
  client.query
  .get("Guideline", ["algorithm"])
  .with_near_text(nearText)
  .with_limit(1)
  .with_generate(single_prompt=prompt)
).do()

In [None]:
# Print the whole response, so an `errors` entry is visible as well as `data`.
pprint(result)

{'data': {'Get': {'Guideline': None}},
 'errors': [{'locations': [{'column': 6, 'line': 1}],
             'message': 'explorer: get class: extend: extend generate: wrong '
                        'parameters',
             'path': ['Get', 'Guideline']}]}


In [None]:
# Mount Google Drive so that /content/drive/... paths resolve.
# NOTE(review): this cell sits at the end of the notebook, but the cells that
# read and write under /content/drive/MyDrive/pgx need the mount first — run
# it (or move it) before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
