# Setup

In [None]:
!pip install openai
!pip install kor
!pip install langchain

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

import openai
from langchain.chat_models import ChatOpenAI
from kor import create_extraction_chain, Object, Text



# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv()) # read local .env file



# Dataset Preparation

In [None]:
# Mount Google Drive into the Colab filesystem so the dataset under
# /content/drive can be read by the following cells.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the screening sheet from the mounted Drive, keeping only the three
# columns used below. Rows without a Title or an Abstract are dropped; after
# that the only column that can still hold NaN is 'Exclusion Code', so
# fillna('Include') labels every record without an exclusion code as 'Include'.
data = (
    pd.read_excel(
        '/content/drive/MyDrive/Roche/04. ETNBC Utils - Sent to CapeStart.xlsx',
        usecols=['Title', 'Abstract', 'Exclusion Code'],
    )
    .dropna(axis=0, subset=['Title', 'Abstract'])
    .fillna('Include')
)

# Text handed to the classifier: title followed by abstract. They are joined
# with a space so the last word of the title does not fuse with the first
# word of the abstract.
data['Info'] = data['Title'].astype(str) + ' ' + data['Abstract'].astype(str)

# Working frame: classifier input text plus its gold label.
proc_data = data[['Info', 'Exclusion Code']].copy()

# Prompting

In [None]:
# Guideline text for each exclusion criterion. Each value is a dict-like
# string that is interpolated verbatim into the schema description sent to
# the model. The pieces are joined by implicit string concatenation so the
# prompt contains single spaces only (no source-code indentation).
disease = (
    "{'DISEASE': 'if the given text contains TNBC stage I (only), "
    "Unresectable stage III TNBC, "
    "Metastatic disease (stage IV), and "
    "Patients treated in the adjuvant settings "
    "should be excluded'}"
)
study_design = (
    "{'STUDY DESIGN': 'If the given text does not include Full economic evaluations, "
    "including cost-consequence analyses, cost-minimization analyses, "
    "cost-effectiveness analyses, cost-utility analyses, and cost-benefit analyses, "
    "then these should be excluded'}"
)
# The OUTCOME rule states its consequence explicitly, like the other three
# criteria, so the guideline is a complete instruction.
outcome = (
    "{'OUTCOME': 'If the given text does not include Cost benefit/comparison, "
    "Costs per utility/clinical outcome, ICER, "
    "then these should be excluded'}"
)
intervention = (
    "{'INTERVENTION': 'if the given text contains Non-pharmacological therapies, "
    "Pharmacological interventions to manage chemotherapy side-effects should be excluded'}"
)

# (guideline text, label) pairs; the label is both the class name offered to
# the model and the name of the prediction column created for the criterion.
exclusion_criteria = [
    (disease, 'DISEASE'),
    (study_design, 'STUDY DESIGN'),
    (outcome, 'OUTCOME'),
    (intervention, 'INTERVENTION'),
]

In [None]:
# Evaluation run: classify a sample of records against every exclusion
# criterion with a few-shot kor extraction chain, derive an include/exclude
# decision per record and write the result to Drive.
eval_sample = 10   # rows per 'Exclusion Code' group that are classified and scored
train_sample = 7   # few-shot examples per class supplied to each chain

# NOTE(review): no API key is configured in the visible cells; presumably
# OPENAI_API_KEY is expected in the environment - confirm.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-16k",
    temperature=0,
    max_tokens=2000,
)


def _predicted_label(result):
    """Return the 'predicted' value of a chain result, or None when it is absent.

    `result` is the mapping returned by `chain.run`. A response without the
    data -> classification -> predicted nesting yields None instead of raising
    KeyError, so a single unusable response does not abort the whole run.
    """
    classification = (result.get('data') or {}).get('classification')
    if isinstance(classification, dict):
        return classification.get('predicted')
    return None


def _is_include(column):
    """Flag the entries of a prediction column that say 'Include' (ignoring case and padding)."""
    return column.astype(str).str.strip().str.lower() == 'include'


# Rows to evaluate: the first `eval_sample` rows of every 'Exclusion Code' group.
eval_rows = proc_data.groupby('Exclusion Code').head(eval_sample)
# Few-shot examples are drawn only from rows that are NOT evaluated; otherwise
# the model would be scored on records it was shown together with their label.
# NOTE(review): a class with no more than `eval_sample` rows contributes no
# examples under this split.
train_pool = proc_data.drop(index=eval_rows.index)
eval_df = eval_rows.reset_index(drop=True).copy()

for criteria, label in exclusion_criteria:
    print(label)

    # (text, gold label) pairs for this criterion and for 'Include'.
    train = (
        train_pool[train_pool['Exclusion Code'].isin([label, 'Include'])]
        .groupby('Exclusion Code')
        .head(train_sample)
        .reset_index(drop=True)
        .to_records(index=False)
        .tolist()
    )

    # The leading whitespace of the two continuation lines below is part of
    # the prompt text and is kept unchanged.
    schema = Object(
        id="classification",
        description=(
            f'''find the apt category of the given medical based documents; \
                      And classify the documents belongs to which category by following the guidelines.;
                      "guidelines": ```{criteria}```'''
        ),
        attributes=[
            Text(
                id="predicted",
                description=f"category to classify: ```{label}, Include```",
                examples=train,
            )
        ],
        many=False,
    )

    chain = create_extraction_chain(llm, schema, encoder_or_encoder_class='json')
    eval_df[label] = eval_df['Info'].apply(lambda txt: _predicted_label(chain.run(txt)))

    n_missing = int(eval_df[label].isna().sum())
    if n_missing:
        print(f'{label}: {n_missing} response(s) had no parsable prediction')

# Actual: 1 when the gold label is 'Include', 0 for any exclusion code.
eval_df['Actual'] = (eval_df['Exclusion Code'] == 'Include').astype(int)

# Predicted: 1 only when every criterion classifier answered 'Include'.
# A missing or unexpected answer counts as "not Include" for that criterion.
criterion_columns = [label for _, label in exclusion_criteria]
eval_df['Predicted'] = eval_df[criterion_columns].apply(_is_include).all(axis=1).astype(int)

eval_df.to_csv(f'/content/drive/MyDrive/Roche/prediction {eval_sample}.csv', index=False)


# Demo

In [None]:
# Demo: print the raw extraction result of every criterion chain for a couple
# of records per 'Exclusion Code' group.
eval_sample = 2   # rows per 'Exclusion Code' group shown in the demo
n_examples = 2    # few-shot examples per class supplied to each chain

llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-16k",
    temperature=0,
    max_tokens=2000,
)

# Rows to show: the first `eval_sample` rows of every 'Exclusion Code' group.
demo_rows = proc_data.groupby('Exclusion Code').head(eval_sample)
# Few-shot examples come from the remaining rows, so the demo does not simply
# classify the very records the model was shown together with their label.
demo_pool = proc_data.drop(index=demo_rows.index)
demo_df = demo_rows.reset_index(drop=True).copy()

for criteria, label in exclusion_criteria:
    print(label)

    # (text, gold label) pairs for this criterion and for 'Include'.
    train = (
        demo_pool[demo_pool['Exclusion Code'].isin([label, 'Include'])]
        .groupby('Exclusion Code')
        .head(n_examples)
        .reset_index(drop=True)
        .to_records(index=False)
        .tolist()
    )

    # The leading whitespace of the two continuation lines below is part of
    # the prompt text and is kept unchanged.
    schema = Object(
        id="classification",
        description=(
            f'''find the apt category of the given medical based documents; \
                      And classify the documents belongs to which category by following the guidelines.;
                      "guidelines": ```{criteria}```'''
        ),
        attributes=[
            Text(
                id="predicted",
                description=f"category to classify: ```{label}, Include```",
                examples=train,
            )
        ],
        many=False,
    )

    chain = create_extraction_chain(llm, schema, encoder_or_encoder_class='json')
    # Show the extracted data for each demo record.
    for text in demo_df['Info']:
        print(chain.run(text)['data'])

