In [1]:
import pandas as pd
#import boto3
#import anthropic_bedrock
#from anthropic_bedrock import AnthropicBedrock
from tqdm import tqdm
import hashlib
import argparse
import csv
import re
import json
import configparser
import os
import numpy as np
from vllm import LLM, SamplingParams

from tqdm import tqdm
tqdm.pandas()

In [None]:
#Reading the claude qac file
all_qac=pd.read_csv('all_qac_triplets.csv').drop_duplicates(subset=['original_ques','context','filename'])[['original_ques','context','filename']]
#all_qac=pd.read_csv('all_qac_triplets.csv')
all_qac.head(5)

In [None]:
print(len(all_qac))

In [None]:
docs=set(['/'.join(name.split('/')[:4]) for name in all_qac['filename']])
print(docs)

In [None]:
all_docs={}
for name in all_qac['filename']:
  temp='/'.join(name.split('/')[:4])
  if temp in all_docs.keys():
      all_docs[temp]+=1
  else:
      all_docs[temp]=1
all_docs      

In [None]:
all_qac['ID'] = all_qac['filename'].apply(lambda x: x.split("/")[-1].replace(".txt",""))
all_qac.head()

In [None]:
#Getting the unique contexts
context_id = all_qac.drop_duplicates(subset=['context','ID'])
unique_context = context_id["context"].tolist()
unique_id = context_id['ID'].tolist()

In [None]:
context_id.head()

In [None]:
#Prompt for generating the qac from llama
def prompt2(alltext):
  return f'''I'm going to give you a part of clinical trial protocol document. Then I'm going to ask you to generate question-answer pairs using the given document as context. Here is the document:

<document>

{alltext}

</document>

Genrate one or more question answer pairs based on the quote.

The question should “Q”, followed by a space and it’s index such as “ 1:”.

Each question should be followed by an answer, starting with "Answer:". Do not include or reference quoted content verbatim in the answer. Don't say "According to Context" when answering. .

Thus, the format of your overall response should look like what's shown between the tags. Make sure to follow the formatting and spacing exactly.

<example>

Q1: What is the indication for the study treatment?

Answer: The indication is treatment of relapsed/refractory DLBCL, PMBCL, TFL, and HGBCL after ≥2 lines of systemic therapy.

Q2: What type of study is KTE-C19-101?

Answer: KTE-C19-101 is an open-label, multicenter Phase 1-2 study. [4]

Q2: What is being evaluated in the study?

Answer: The study is evaluating safety and efficacy of axicabtagene ciloleucel.

</example>

Include less than or equal to 15 such question-answer pairs based on the context provided , whenever, possible. The questions-answers should be strictly from the context and follow the format provided.'''

In [1]:
def system_prompt(prompt):
    #https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
    return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant. Your carefully follow instructions.<|eot_id|><|start_header_id|>user<|end_header_id|>

{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'''

In [None]:
#Output generation
prompts=[]
for text in unique_context:
    prompt=system_prompt(prompt2(text))
    prompts.append(prompt)
llm = LLM(model="/lockbox/models/Meta-Llama-3-8B-Instruct", gpu_memory_utilization = 0.8)
sampling_params = SamplingParams(temperature=0.05, top_p=0.95, top_k = 40, max_tokens = 2048,)
out=[]
try:
    generated_outputs = llm.generate(prompts, sampling_params)
    for output in generated_outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        out.append(generated_text) 
except Exception:
            print(dir, ':', Exception)  


In [None]:
#adding the output to the text file
for i in range(len(out)):
    temp=out[i]
    filename=f'./Test_Documents/all_context_output/out{i}.txt'
    with open(filename,'w') as f:
        f.write(temp)

In [None]:
#Creating the json from the text files

def createJson(dir):
    QnAs = []
    all_dir = [x[0] for x in os.walk(dir) if not x[0].endswith('.ipynb_checkpoints')]

    for dir in tqdm(all_dir):
        print('<directory> ', dir, ' </directory>')

        # Loop through all response directories
        for file_name in tqdm(os.listdir(dir)):
            file_path = os.path.join(dir, file_name)
            
            # Skip non .txt and empty files
            if file_name.endswith(".txt") and os.stat(file_path).st_size != 0:
                index=int(re.search(r'\d+', file_name).group())
                #print('<file> ', file_path, ' </file>')

                # Check if output_file_path is a file
                if os.path.isfile(file_path):
                    QnAs.append({'filename':context_id.iloc[index]['filename'],'context':unique_context[index], 'qas': []})

                    with open(file_path, "r") as file:
                        alltext = file.read()

                        # Replace A1: with Answer: [Keep only one format]
                        pattern = r"\nA\d+: "
                        alltext = re.sub(pattern, "\nAnswer: ", alltext)
                        p = re.compile(r'Q \d+: (.+?)\nAnswer: (.+?)(?=\nQ \d+: |$)', re.DOTALL)
                        p2 = re.compile(r'Q\d+: (.+?)\nAnswer: (.+?)(?=\nQ\d+: |$)', re.DOTALL)

                        
                        qas = []

                        for m in p.finditer(alltext):
                            question = m.group(1).strip()
                            answer = m.group(2).strip()
                            qas.append({'question': question, 'answer': answer})
                        
                        for m in p2.finditer(alltext):
                            question = m.group(1).strip()
                            answer = m.group(2).strip()
                            qas.append({'question': question, 'answer': answer})    

                        QnAs[-1]['qas'] = qas

    return QnAs



In [None]:
directory_path='./Test_Documents/all_context_output/'
json_data = createJson(directory_path)

# Save JSON data to a file
with open('output_all_qac.json', 'w') as f:
    json.dump(json_data, f, indent=2)

print("JSON data saved successfully.")

In [2]:
import json
with open('output_all_qac.json', 'r') as f:
    json_data=json.load(f)

In [23]:

# Function to remove duplicate questions across all files
import random
from collections import defaultdict

def remove_duplicate_questions(data):
    # Dictionary to store seen questions across all files
    seen_questions = defaultdict(list)
    all_files={}
    # Iterate through each entry in the data
    for entry in data:
        qas = entry['qas']
        
        # Filter out duplicates
        unique_questions = set()
        unique_qas = []
        for qa in qas:
            question = qa['question']
            if question not in unique_questions:
                unique_qas.append(qa)
                unique_questions.add(question)

        # Update the entry with unique QAs
        entry['qas'] = unique_qas
        if(len(unique_qas)==0 and len(qas)!=0):
            print(entry)
        # Store each unique question with its entry
        for qa in unique_qas:
            seen_questions[qa['question']].append(entry)

    # Randomly select one entry per unique question
    sum=set()
    for question, entries in seen_questions.items():
        for entry in entries:
          sum.add(entry['filename'])
        
    print(len(sum))
    processed_data = []
    for question, entries in seen_questions.items():
        num=len(entries)-1
        rand=random.randint(0,num)
        for i in range(num+1):
            if(i!=rand):
                all_qac=entries[i]['qas']
                {entries[i]['qas'].remove(qa) for qa in all_qac if qa['question']==question}

            all_files[entries[i]['filename']]=entries[i]
    for key,value in all_files.items():
            processed_data.append(value) 
                
    return processed_data

# Example JSON data


# Remove duplicate questions
processed_data = remove_duplicate_questions(json_data)

# Print the processed data
#print(processed_data)


14820


In [9]:
print(len(json_data))

15589


In [24]:
print(len(processed_data))

14820


In [36]:
rows=[]
sum=0
for temp in processed_data:
    filename=temp['filename']
    num_ques=len(temp['qas'])
    sum=sum+num_ques
    rows.append([filename,num_ques])

df_testqa_num=pd.DataFrame(rows,columns=['Filename','Num_qa'])
df_testqa_num.head()

Unnamed: 0,Filename,Num_qa
0,CT-Documents-Hub/50-CQAGen/Pending/NSCLC-Paper...,5
1,CT-Documents-Hub/50-CQAGen/Pending/NSCLC-Paper...,2
2,CT-Documents-Hub/50-CQAGen/Pending/CART-Paper/...,14
3,CT-Documents-Hub/50-CQAGen/Pending/NSCLC-Paper...,7
4,CT-Documents-Hub/50-CQAGen/Pending/NSCLC-Paper...,8


In [37]:
df_testqa_num.to_csv("tesqa_count_cleaned.csv")

In [38]:
print(sum)

142209


In [32]:
with open('output_all_qac_cleaned.json', 'w') as f:
    json.dump(processed_data, f, indent=2)


In [None]:
df_count.to_csv('new_qac_count.csv')