# Code for reproducing the experiment of "Exploring the use of Language Models to Enhance Datasets Explainability via Documentation"

In [1]:
## Dependencies
# In order to extract the tables, you need java installed in your system
# (only required by the optional table-extraction cell, which imports tabula).
!java -version

# Install the requirements.txt of the repo
!pip install -r requirements.txt
# NOTE(review): the recorded output shows requirements.txt resolving transformers==4.25.1;
# the next line then installs transformers from the GitHub main branch on top of it.
# Confirm which version the experiment actually needs and pin it.
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
# NOTE(review): none of the packages below are version-pinned, so a fresh run may
# pull releases that differ from the ones used for the published results.
!pip install -q datasets loralib sentencepiece 
!pip -q install bitsandbytes accelerate
!pip -q install langchain
!pip -q install tiktoken
!pip -q install openai
!pip -q install faiss-gpu



java version "20.0.1" 2023-04-18
Java(TM) SE Runtime Environment (build 20.0.1+9-29)
Java HotSpot(TM) 64-Bit Server VM (build 20.0.1+9-29, mixed mode, sharing)
Collecting transformers==4.25.1
  Using cached transformers-4.25.1-py3-none-any.whl (5.8 MB)
INFO: pip is looking at multiple versions of beautifulsoup4 to determine which version is compatible with other requirements. This could take a while.
Collecting beautifulsoup4==4.11.1
  Using cached beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)
INFO: pip is looking at multiple versions of backoff to determine which version is compatible with other requirements. This could take a while.
Collecting backoff==2.2.1
  Using cached backoff-2.2.1-py3-none-any.whl (15 kB)
INFO: pip is looking at multiple versions of backcall to determine which version is compatible with other requirements. This could take a while.
Collecting backcall==0.2.0
  Using cached backcall-0.2.0-py2.py3-none-any.whl (11 kB)
INFO: pip is looking at multiple versions of

In [2]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, GenerationConfig, pipeline
from langchain import PromptTemplate, LLMChain

import torch

## Configurable parameters
The following cell contains the parameters you need to configure in order to use this notebook.
In summary:

1 - Point `documentPath` to the .txt file of the dataset documentation you want to analyze
2 - Set a name for the output file via `outputfileName`
3 - Set your API key from OpenAI or Hugging Face

Also:

4 - You can select the model to use. (Be aware that FLAN-UL2 needs at least an NVIDIA A100 GPU)

Once all the cells have finished running, you will find a file with the results in the root folder.

In [3]:
import os
from dotenv import load_dotenv
# Load environment variables (e.g. from a local .env file) so the os.getenv calls below can read the API keys.
load_dotenv()


## Variables to configure
## Select the cleaned dataset file you want to process
documentPath = "sources/Nature-Scientific-Data/A whole-body FDG.txt"

## Set the name of the generated output file with the results (in .xlsx)
outputfileName = "Whole-body Results"

## Table Extraction
## Set to 0 if no table extraction is needed
## Tables are already extracted in the source files, so keep it to 0.
extract_tables = 0
## The PDF where the tables are.
## NOTE(review): pdf_path is commented out, but the table-extraction cell reads it;
## uncomment and adjust it before setting extract_tables = 1.
# pdf_path ="sources/Nature-Scientific-Data/A whole-body FDG-PET:CT.pdf"

## To adapt the approach to your domain specific use-cases, configure your own semantic dictionary
## "title" is the file name of documentPath (including its extension) and is interpolated into the questions.
context = {
    "title": documentPath.split("/")[-1],  
    "gathering": ["collection","gathering", "acquisition"],
    "annotation":["labeling", "annotation"],
    "statistics": ["Characteristics", "Statistics", "Features"],
}

## NOTE: You will need an API key for OpenAI to use text-davinci-003, or a Hugging Face API token (free) to download FLAN-UL2
api_key=os.getenv("OPEN_AI_API_KEY")
api_key_t5=os.getenv("HUGGINGFACEHUB_API_TOKEN")

## Select the model to use during the experiment
## NOTE: Using text-davinci-003 does not require special hardware. You can execute it in your local setup
model = "text-davinci-003" 
## NOTE: Using FLAN-UL2 requires at least 40GB of VRAM on your system.
## So, FLAN-UL2 will not work on a common local setup. The experiments have been done using an Nvidia A100 GPU.
#model = "UL2" 


In [4]:
## Import the libraries
from langchain.llms import OpenAI, Cohere
from langchain.chat_models import ChatOpenAI

# Build the LangChain client (`LLMClient`) for the model selected in the configuration cell,
# and set how many chunks (`retrieved_docs`) each similarity search should return.
# `model` is only read here, never rebound, so this cell can be re-run safely.
if model == "UL2":

    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration
    model_id = 'google/flan-ul2'# go for a smaller model if you dont have the VRAM
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # The first positional argument of `pipeline` is the task name, not the model.
    # Passing the checkpoint id lets the pipeline load the weights itself, so that
    # `device_map` and the 8-bit `model_kwargs` are actually applied at load time.
    pipe = pipeline(
        "text2text-generation",
        model=model_id,
        tokenizer=tokenizer,
        max_length=128,
        do_sample=False,  # greedy decoding: the deterministic equivalent of temperature 0
        device_map="auto",
        model_kwargs={"load_in_8bit": True},
    )

    LLMClient = HuggingFacePipeline(pipeline=pipe)
    retrieved_docs = 10

elif model == "text-davinci-003":
    LLMClient = OpenAI(model_name=model, openai_api_key=api_key,temperature=0)
    retrieved_docs = 10

else:
    # Fail here instead of with a NameError on LLMClient several cells later.
    raise ValueError(
        f"Unsupported model {model!r}: expected 'UL2' or 'text-davinci-003'"
    )



# Prepare Data
First we prepare the data. For this example we do similarity search over a vector database, but these documents could be fetched in any manner (the point of this notebook is to highlight what to do AFTER you fetch the documents).
### Text and Question preparation

In [5]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate

In [6]:
import re

# Read the whole dataset documentation into memory
with open(documentPath) as doc_file:
    documentation = doc_file.read()

# Embedding client and token-based splitter used to chunk and index the text
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap=10)

# Semantic dictionary: store in context['gathering'] the synonym that occurs
# most often in the documentation (the first one wins on ties).
semdict = {"gathering": ["gathering", "collection", "acquisition"]}
gathering = {"gathering": 0}
for words in semdict.values():
    for word in words:
        print(word)
        count = len(re.findall(word, documentation))
        print(f"{word}: {count}")
        if count > gathering["gathering"]:
            gathering["gathering"], context['gathering'] = count, word

# Split the text, flatten newlines inside each chunk, and index the chunks;
# each chunk's "source" metadata is its position in `texts`.
texts = [chunk.replace('\n', ' ') for chunk in text_splitter.split_text(documentation)]
docsearch = FAISS.from_texts(
    texts,
    embeddings,
    metadatas=[{"source": position} for position in range(len(texts))],
)

Created a chunk of size 284, which is longer than the specified 200
Created a chunk of size 564, which is longer than the specified 200
Created a chunk of size 294, which is longer than the specified 200
Created a chunk of size 477, which is longer than the specified 200
Created a chunk of size 247, which is longer than the specified 200
Created a chunk of size 294, which is longer than the specified 200
Created a chunk of size 867, which is longer than the specified 200


gathering
gathering: 0
collection
collection: 2
acquisition
acquisition: 5


### Prompt types preparation

In [9]:
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import LLMChain

# This cell builds every chain used by the extraction cells below, all on top of LLMClient.

## Incontext prompt and refine prompt
# chain_refine: "refine" QA chain that also returns its refine steps.
chain_refine = load_qa_chain(LLMClient, chain_type="refine",return_refine_steps=True)
# incontext_prompt: "stuff" QA chain; this is the chain most extraction cells call.
incontext_prompt = load_qa_chain(LLMClient, chain_type="stuff")

## Prompt type to reduce the context size, for LLM with lower contexts window.
chain_reduce = load_qa_chain(LLMClient, chain_type="map_reduce", return_intermediate_steps=True)

## Custom instruction prompt type (Classification and Parsing)
# Template with three slots: a free-form instruction, the retrieved context and the question.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
{instruction}
###
Context: 
{context}
###
Question: {question}
###
Helpful answer:
"""
instruction_prompt = PromptTemplate(
    input_variables=["instruction","context","question"],
    template=prompt_template,
)
# Plain LLMChain: its answer is returned under the 'text' key (see the localization cell).
chain_instruction_simple = LLMChain(llm=LLMClient, prompt=instruction_prompt)

## Table prompt to transform parsed tables in natural text
# NOTE: prompt_template is rebound here; instruction_prompt above already captured the first template.
prompt_template = """Given the following table in HTML, and the given context related the table: Translate the content of the table into natural language.
###
Context: 
{context}
###
Table: {table}
###
Table translation:
"""
table_prompt = PromptTemplate(
    input_variables=["context","table"],
    template=prompt_template,
)
# Used only by the optional table-extraction cell.
chain_table = LLMChain(llm=LLMClient, prompt=table_prompt)

### Table Extraction and preparation

Tables already extracted for this dataset, so not necessary to execute the following cell

In [10]:
# Extract tables
# Optional step: parse the tables of the source PDF, ask the LLM to verbalise each
# one, add the resulting texts to the vector index and append them to the
# documentation file. Requires `pdf_path` to be defined in the configuration cell.
# NOTE: the documentation file is opened in append mode, so re-running this cell
# appends the table texts again.
if (extract_tables == 1):
    import tabula ## You need to have the Java Tabula installed in the environment
    table_texts = []
    dfs = tabula.read_pdf(pdf_path, pages='all')
    for idx, table in enumerate(dfs):
        query = "Table "+str(idx+1)+":"
        # Retrieve the passages that mention this table to give the LLM some context.
        # NOTE(review): `docs` is a list of Document objects and is formatted into the
        # prompt as-is; consider joining their page_content instead.
        docs = docsearch.similarity_search(query, k=4)
        # The table prompt announces an HTML table, so serialise the DataFrame as HTML.
        result = chain_table({"context":docs,"table":table.to_html()})
        print(query + " "+ result['text'])
        table_texts.append(query + " "+ result['text'])
    if table_texts:
        # One metadata entry per added text; "source" ids continue after the chunks
        # already indexed so they do not collide with existing ones.
        docsearch.add_texts(
            table_texts,
            metadatas=[{"source": len(texts) + i} for i in range(len(table_texts))],
        )
        with open(documentPath, 'a') as f:
            for line in table_texts:
                f.write(f"{line}\n")

# Extraction process

## Uses

In [11]:
# Init results
# Accumulator for the extraction: every later cell appends one dict per extracted
# field (keys: "id", "questions", "promptStrategy", "result").
# Re-running this cell discards everything collected so far.
results = []

### Description and Type

In [12]:
# Purposes
# Retrieve the chunks closest to the question and answer it with the in-context chain.
question = """Which are the purpose or purposes of the dataset?
                """
docs = docsearch.similarity_search(question, k=retrieved_docs)
result = incontext_prompt(
    {"input_documents": docs, "question": question},
    return_only_outputs=True,
)
specificPart = dict(
    id='metadata.description.purposes',
    questions=[question],
    promptStrategy="refine",
    result=result['output_text'],
)
results.append(specificPart)
print(specificPart['result'])

Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.


 The purpose of the dataset is to provide a publicly available dataset of annotated Positron Emission Tomography/Computed Tomography (PET/CT) studies for deep learning-based automated analysis of PET/CT data.


In [229]:
# Tasks: Get the tasks of the dataset with the docs of the purposes.
## We get context from previous answer
# The question doubles as the similarity-search query and as a closed-list classification prompt.
specificPart = {
    "id":'metadata.description.tasks', 
    "questions":["""Which of the following tasks is the dataset inteded for?:

            text-classification, question-answering, text-generation, token-classification, translation,
            fill-mask, text-retrieval, conditional-text-generation, sequence-modeling, summarization, other,
            structure-prediction, information-retrieval, text2text-generation, zero-shot-retrieval,
            zero-shot-information-retrieval, automatic-speech-recognition, image-classification, speech-processing,
            text-scoring, audio-classification, conversational, question-generation, image-to-text, data-to-text,
            classification, object-detection, multiple-choice, text-mining, image-segmentation, dialog-response-generation,
            named-entity-recognition, sentiment-analysis, machine-translation, tabular-to-text, table-to-text, simplification,
            sentence-similarity, zero-shot-classification, visual-question-answering, text_classification, time-series-forecasting,
            computer-vision, feature-extraction, symbolic-regression, topic modeling, one liner summary, email subject, meeting title,
            text-to-structured, reasoning, paraphrasing, paraphrase, code-generation, tts, image-retrieval, image-captioning,
            language-modelling, video-captionning, neural-machine-translation, transkation, text-generation-other-common-sense-inference,
            text-generation-other-discourse-analysis, text-to-tabular, text-generation-other-code-modeling, other-text-search

            If you are not sure answer with just with "others".
            Please, answer only with the one or some of the provided tasks separated by commas. """],
    "promptStrategy": "classification"
}
# NOTE(review): k is hard-coded to 9 here while the other cells use retrieved_docs — confirm this is intended.
# `docs` from this search is reused by the tags cell that follows.
docs = docsearch.similarity_search(specificPart["questions"][0], k=9)
result = incontext_prompt({"input_documents": docs, "question": specificPart['questions'][0]},return_only_outputs=True)
specificPart['result'] = result['output_text']
results.append(specificPart)

In [230]:
# Generate the tags of the dataset
## No new retrieval here: the `docs` retrieved by the previous (tasks) cell are reused as context
question = """Given the context information can you generate a set of representative keywords of it? Please provide the tags comma separated."""
result = incontext_prompt(
    {"input_documents": docs, "question": question},
    return_only_outputs=True,
)
specificPart = dict(
    id='metadata.description.tags',
    questions=[question],
    promptStrategy="simple",
    result=result['output_text'],
)
results.append(specificPart)

In [231]:
# Gaps
# Ask which gaps the dataset intends to fill, using the chunks closest to the question.
question = """Which are the gaps the  """+ context['title']+ """ dataset intend to fill?
                """
docs = docsearch.similarity_search(question, k=retrieved_docs)
chain_inputs = {"input_documents": docs, "question": question, "token_max": 1800}
result = incontext_prompt(chain_inputs, return_only_outputs=True)
specificPart = dict(
    id='metadata.description.gaps',
    questions=[question],
    promptStrategy="reduce",
    result=result['output_text'],
)
results.append(specificPart)

Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.


### Recommendations

In [232]:
## Recommendations
# Both the recommended and the non-recommended uses follow the same
# retrieve-then-answer steps, so they are driven from one (id, question) table.
application_questions = [
    # Recommended
    (
        'metadata.applications.recommended',
        """For which applications the """+ context['title']+ """   dataset is recommended?""",
    ),
    # Non-Recommended
    (
        'metadata.applications.non_recommended',
        """Is there any non-recommneded application for the """+ context['title']+ """   dataset? If you are not sure, or there is any non-recommended use of the dataset metioned in the context, just answer with "no".""",
    ),
]
for part_id, question in application_questions:
    docs = docsearch.similarity_search(question, k=retrieved_docs)
    result = incontext_prompt(
        {"input_documents": docs, "question": question},
        return_only_outputs=True,
    )
    specificPart = dict(
        id=part_id,
        questions=[question],
        promptStrategy="refine",
        result=result['output_text'],
    )
    results.append(specificPart)


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.


### ML benchmarks

In [233]:
from langchain.output_parsers import CommaSeparatedListOutputParser

parser = CommaSeparatedListOutputParser()
## Benchmarking
# Step 1: ask whether the dataset was tested with any ML technique.
# Step 2 (only on a "yes"): ask for the model names, then ask for the metrics
# of each model, and record the concatenated "<model>: <metrics>" answers.
question1 = """Has the  """+ context['title']+ """  dataset been tested using any Machine learning technique?
                Answer only with a YES or NO
                If you are not sure answer with UNSURE
                """
docs = docsearch.similarity_search(question1, k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": question1},return_only_outputs=True)
print(result['output_text'])

# Case-insensitive check for an affirmative answer
if "yes" in result['output_text'].lower():
    questionModelNames = """Which are the name of the models used to test the dataset? If there is more than one, please provide a list of the models name comma separated
                   example answer: Model 1, Model 2, Model 3 
                    
                    """
    # The follow-up questions reuse the chunks retrieved for question1.
    result_sub = incontext_prompt({"input_documents": docs, "question": questionModelNames},return_only_outputs=True)
    models_parsed = parser.parse(result_sub['output_text'])
    print (result_sub['output_text'])
    print(models_parsed)
    result_complete = ""
    # The loop variable is deliberately NOT called `model`: that name holds the
    # configured LLM identifier and clobbering it breaks re-running the LLM cell.
    for model_name in models_parsed:
        # Metrics
        questionMetrics = """Which are the metrics mentioned in the context of the """+model_name+""" approach?
                       If there are no metric just answer with "no metrics"
                        """
        result_sub = incontext_prompt({"input_documents": docs, "question": questionMetrics},return_only_outputs=True)
        result_complete = result_complete + " " + model_name +": " + result_sub["output_text"]
        print(result_complete)

    specificPart = {
        "id":'metadata.applications.benchmarking.modelName', 
        "questions":[question1,questionModelNames],
        "promptStrategy": "simple",
        "result": result_complete
        }
    results.append(specificPart)
else:
    # No benchmark reported (answer was NO / UNSURE / anything else)
    specificPart = {
    "id":'metadata.applications.benchmarking.modelName', 
    "questions":[question1],
    "promptStrategy": "simple",
    "result": 'not provided'
    }
    results.append(specificPart)



    

 YES
 nnUNet13
['nnUNet13']
 nnUNet13:  Dice score, false positive volume, and false negative volume.


## Contributors

In [234]:
## Authors
# Ask for the authors' names and affiliations using the chunks closest to the question.
question = """Who are the authors of the """+ context['title']+ """ dataset? Please, answer only with the authors name and affiliation separated by commas."""
docs = docsearch.similarity_search(question, k=retrieved_docs)
result = incontext_prompt(
    {"input_documents": docs, "question": question},
    return_only_outputs=True,
)
specificPart = dict(
    id='metadata.authoring.authors',
    questions=[question],
    promptStrategy="simple",
    result=result['output_text'],
)
results.append(specificPart)

In [235]:

## Funders
# Three chained questions: (1) who funded the dataset, then, feeding that answer
# back as context, (2) the funder type and (3) the grant identifiers.
# Each record stores the exact question that was sent to the LLM, so the saved
# results can be traced back to the prompt that produced them.
question = """Is there any organization which supported or funded the creation of the dataset?"""
docs = docsearch.similarity_search(question, k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": question},return_only_outputs=True)

## Extract the funder's name
results.append(
    {
    "id":'metadata.authoring.fundersName', 
    "questions":[question],
    "promptStrategy": "in-context",
    "result": result['output_text']
})

## Try to guess funder's type
# `result` keeps the funders answer; follow-up answers go to `result_sub`.
question = """The organization mentioned in this context:

CONTEXT: """+ result['output_text'] + """

Are of type public, private organizations?
If you are not sure answer with just with "unknown" """
result_sub = incontext_prompt({"input_documents": docs, "question": question},return_only_outputs=True)

results.append(
    {
    "id":'metadata.authoring.fundersType', 
    "questions":[question],
    "promptStrategy": "chained",
    "result": result_sub['output_text']
})


## Extracting grantor ID
# NOTE(review): the literal below ends right after `not provided` — the closing
# double quote of the quoted answer is consumed by the string terminator.
question = """Given the context information:

    CONTEXT: """+ result['output_text'] + """

    Is there any ID or reference of the grants provided by the funders?
    
    If you are not sure, answer "not provided"""
result_sub = incontext_prompt({"input_documents": docs, "question": question},return_only_outputs=True)

results.append(
    {
    "id":'metadata.authoring.grantsID', 
    "questions":[question],
    "promptStrategy": "chained",
    "result": result_sub['output_text']
})


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.


In [236]:
## Maintainers
# Each section below follows the same steps: retrieve the `retrieved_docs` chunks
# closest to the question, answer with the in-context chain, record the answer.
specificPart = {
    "id":'metadata.authoring.maintainers', 
    "questions":["""Who are the maintainers of the  """+ context['title']+ """ dataset?
                """],
    "promptStrategy": "reduce"
}
docs = docsearch.similarity_search(specificPart["questions"][0], k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": specificPart["questions"][0], "token_max":1800}, return_only_outputs=True)
specificPart["result"] = result['output_text']
results.append(specificPart)

## Contribution Guidelines
specificPart= {
    "id":'metadata.authoring.contribution_guidelines', 
    "questions":["""Which are the contribution guidelines of the  """+ context['title']+ """ dataset? If you are not sure, or there is no contribution guidelines just answer with "no".
                """],
    "promptStrategy": "simple"
}
docs = docsearch.similarity_search(specificPart["questions"][0],k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": specificPart["questions"][0]},return_only_outputs=True)
specificPart["result"] = result['output_text']
results.append(specificPart)

# Erratum
# The question now asks about errata; it previously repeated the data-retention
# question, so the erratum field was filled with a retention answer.
specificPart = {
    "id":'metadata.authoring.erratum', 
    "questions":["""Is there any erratum of the  """+ context['title']+ """ dataset? If you are not sure, or there is no erratum just answer with "no".
                """],
    "promptStrategy": "simple"
}
docs = docsearch.similarity_search(specificPart["questions"][0], k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": specificPart["questions"][0]},return_only_outputs=True)
specificPart["result"] = result['output_text']
results.append(specificPart)

# Retention
specificPart = {
    "id":'metadata.authoring.data_retention', 
    "questions":["""Is there any data retention policies policiy of the  """+ context['title']+ """ dataset? If you are not sure, or there is no retention policy just answer with "no".
                """],
    "promptStrategy": "simple"
}
# Pass k explicitly like every other search, instead of relying on the library default.
docs = docsearch.similarity_search(specificPart["questions"][0], k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": specificPart["questions"][0]},return_only_outputs=True)
specificPart["result"] = result['output_text']
results.append(specificPart)


## Distribution

In [237]:
concepts = []
## Distribution section
# Each concept is described by the id of the field it fills, the question to ask
# and the prompting strategy ("simple" or "reduce") to answer it with.
concepts.append({
    "id":'metadata.distribution.data_repository', 
    "questions":["""Is there a link to the a repository containing the data? If you are not sure, or there is no link to the repository just answer with "no"."""],
    "promptStrategy": "simple"
})
concepts.append({
    "id":'metadata.distribution.license', 
    "questions":["""Which is the license of the  """ +context['title']+"""  dataset. If you are not sure, or there is mention to a license of the dataset in the context, just answer with "no".
                """],
    "promptStrategy": "simple"
})
concepts.append({
    "id":'metadata.distribution.rights_of_data', 
    "questions":["""Which are the rights of the stand-alone dataset?
                """],
    "promptStrategy": "simple"
})
concepts.append({
    "id":'metadata.distribution.rights_of_model', 
    "questions":["""Which are the rights of the models trained with this data?
                """],
    "promptStrategy": "simple"
})
# NOTE(review): the question below contains a stray "{" before the title,
# probably a leftover from a template — left untouched to keep the prompt as-is.
concepts.append({
    "id":'metadata.distribution.attribution_credits', 
    "questions":["""Is there any attribution notice that have to be used to use the {  """+ context['title']+ """ dataset?
                """],
    "promptStrategy": "simple"
})
concepts.append( {
    "id":'metadata.distribution.designated_third_parties', 
    "questions":["""Are there third parties in charge of the license or distribution of  """+ context['title']+ """ dataset?
                """],
    "promptStrategy": "simple"
})
concepts.append( {
    "id":'metadata.distribution.deprecation_policy', 
    "questions":["""Is there any deprecation plan or policy of the """+ context['title']+ """  dataset?
                """],
    "promptStrategy": "simple"
})
## Doing the similarity search
for dslConcept in concepts:
    question = dslConcept["questions"][0]
    strategy = dslConcept["promptStrategy"]
    ## We perform for each question a semantic similarity
    docs = docsearch.similarity_search(question, k=retrieved_docs)
    ## Selecting prompting strategy
    if strategy == "simple":
        result = incontext_prompt({"input_documents": docs, "question": question},return_only_outputs=True)
    elif strategy == "reduce":
        print("reduce")
        result = chain_reduce({"input_documents": docs, "question": question}, return_only_outputs=True)
    else:
        # Without this guard an unknown strategy would silently record the
        # answer of the previous concept (stale `result`).
        raise ValueError(f"Unsupported promptStrategy {strategy!r} for {dslConcept['id']}")

    specificPart = {
    "id": dslConcept['id'], 
    "questions":[question],
    # Record the strategy that was actually used.
    "promptStrategy": strategy,
    "result": result['output_text']
    }
    results.append(specificPart)



## Composition

In this part we get a general rationale using the reduce method (the explanation can be spread across the document). From these explanations we try to infer the file distribution and the description of each file (by parsing the answer). Going deeper is difficult, and that information can instead be extracted by analyzing the data directly.

In [238]:
concepts = []
## Composition section
# Each concept is described by the id of the field it fills, the question to ask
# and the prompting strategy ("simple" or "refine") to answer it with.
concepts.append({
    "id":'composition.rationale', 
    "questions":["""Which is the format os each file of the dataset?"""],
    "promptStrategy": "simple"
})
concepts.append({
    "id":'composition.instances_files', 
    "questions":["""Can you enumerate the different files the dataset composed of?
                """],
    "promptStrategy": "simple"
})
concepts.append({
    "id":'composition.instances_files.description', 
    "questions":["""Can you provide a description of each files the dataset is composed of?
                """],
    "promptStrategy": "simple"
})
concepts.append({
    "id":'composition.instances_files.attributes', 
    "questions":["""Can you enumerate the different attributes present in the dataset? 
                """],
    "promptStrategy": "simple"
})
concepts.append({
    "id":'composition.instances_files.statistics', 
    "questions":["""Are there relevant statistics or distributions of the dataset? 
                """],
    "promptStrategy": "simple"
})

concepts.append( {
    "id":'composition.consistency_rules', 
    "questions":["""Has the data any explicit consistency rule?
                """],
    "promptStrategy": "simple"
})
concepts.append( {
    "id":'composition.data_splits', 
    "questions":["""The paper mentions any recommended data split of the dataset?
                """],
    "promptStrategy": "simple"
})
## Doing the similarity search
for dslConcept in concepts:
    question = dslConcept["questions"][0]
    strategy = dslConcept["promptStrategy"]
    ## We perform for each question a semantic similarity
    docs = docsearch.similarity_search(question, k=retrieved_docs)
    ## Selecting prompting strategy
    if strategy == "simple":
        result = incontext_prompt({"input_documents": docs, "question": question},return_only_outputs=True)
    elif strategy == "refine":
        print("refine")
        result = chain_refine({"input_documents": docs, "question": question}, return_only_outputs=True)
    else:
        # Without this guard an unknown strategy would silently record the
        # answer of the previous concept (stale `result`).
        raise ValueError(f"Unsupported promptStrategy {strategy!r} for {dslConcept['id']}")

    specificPart = {
    "id": dslConcept['id'], 
    "questions":[question],
    # Record the strategy that was actually used.
    "promptStrategy": strategy,
    "result": result['output_text']
    }
    results.append(specificPart)


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.


## Gathering

In [239]:
## Provenance gathering section
# Ask for a rationale of how the data was collected and prepared.
question = """Provide a rationale about how the data of """+context['title']+""" has been collected and prepared. """
docs = docsearch.similarity_search(question, k=retrieved_docs)
# The chain must actually be run here: with the call commented out, `result`
# still held the answer of the last question of the previous cell, and that
# unrelated answer was stored as the curation rationale.
# The in-context chain matches the "simple" strategy recorded below.
result = incontext_prompt({"input_documents": docs, "question": question},return_only_outputs=True)
specificPart = {
"id": 'provenance.curation_rationale', 
"questions":[question],
"promptStrategy": "simple",
"result": result['output_text']
}
results.append(specificPart)


In [240]:
## Description and Type
### Rationale
# First get a summary of the gathering process from the retrieved chunks...
question = """Provide a summary of how the data of the dataset has been collected? Please avoid mention the annotation process or data preparation processes"""
docs = docsearch.similarity_search(question, k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": question},return_only_outputs=True)
specificPart = {
"id": 'provenance.gathering.description', 
"questions":[question],
"promptStrategy": "simple",
"result": result['output_text']
}
results.append(specificPart)


### Gathering type
# ...then classify that summary (not the original chunks) into one of the listed types.
question = """Which of the following types corresponds to the gathering process mentioned in the context?

Types: Web API, Web Scrapping, Sensors, Manual Human Curator, Software collection, Surveys, Observations, Interviews, Focus groups, Document analysis, Secondary data analysis, Physical data collection, Self-reporting, Experiments, Direct measurement, Interviews, Document analysis, Secondary data analysis, Physical data collection, Self-reporting, Experiments, Direct measurement, Customer feedback data, Audio or video recordings, Image data, Biometric data, Medical or health data, Financial data, Geographic or spatial data, Time series data, User-generated content data.

Answer with "Others", if you are unsure. Please answer with only the type"""
# Document.metadata is a mapping, so pass an empty dict rather than a list.
summary_doc = Document(page_content=result['output_text'], metadata={})
result = incontext_prompt({"input_documents": [summary_doc], "question": question},return_only_outputs=True)
specificPart = {
"id": 'provenance.gathering.type', 
"questions":[question],
"promptStrategy": "simple",
"result": result['output_text']
}
results.append(specificPart)


In [241]:
## Localization
# Both questions use the custom instruction chain, whose answer comes back under
# the 'text' key (not 'output_text' as with the QA chains).
# NOTE(review): `docs` is a list of Document objects and is formatted into the
# prompt's {context} slot as-is; consider joining their page_content instead.

### Gathering Timeframe
question = "Which are the timeframe when the data was collected? "
instruction = "If present, answer only with the collection timeframe of the data. If your are not sure, or there is no mention, just answers 'not provided'"

docs = docsearch.similarity_search(question, k=retrieved_docs)
result = chain_instruction_simple({"instruction": instruction, "context": docs, "question": question},return_only_outputs=True)

specificPart = {
"id": 'provenance.gathering.timeframe', 
"questions":[question],
"promptStrategy": "simple",
"result": result['text']
}
results.append(specificPart)

### Gathering geolocalization
question = """Which are the places where data has been collected?"""
# The instruction must ask for places: it used to be a copy of the timeframe
# instruction, which contradicted the question.
instruction = "If present, answer only with the places where the data was collected. If your are not sure, or there is no mention, just answers 'not provided'"
docs = docsearch.similarity_search(question, k=retrieved_docs)
result = chain_instruction_simple({"instruction": instruction, "context": docs, "question": question},return_only_outputs=True)
specificPart = {
"id": 'provenance.gathering.location', 
"questions":[question],
"promptStrategy": "simple",
"result": result['text']
}
results.append(specificPart)



In [242]:
## Sources
# Two questions answered the same way: retrieve the closest chunks for the
# question, ask the in-context chain, and store the answer under its id.

# NOTE(review): `instruction` is assigned here but is not passed to either
# call below; kept so the notebook state is unchanged.
instruction = "Answer solely with the name of the source"

for source_id, question in (
    # Data sources
    ('provenance.gathering.source_description', "Which is the source of the data during the collection process?"),
    # Infrastructure
    ('provenance.gathering.source_infra', "Which tools or infrastructure has been used during the collection process?"),
):
    docs = docsearch.similarity_search(question, k=retrieved_docs)
    result = incontext_prompt({"input_documents": docs, "question": question}, return_only_outputs=True)
    specificPart = dict(
        id=source_id,
        questions=[question],
        promptStrategy="simple",
        result=result['output_text'],
    )
    results.append(specificPart)

Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.


In [243]:
# Gathering team
# First two entries: who collected the data, and which kind of team it was.
for team_id, question in (
    ('provenance.gathering.team.description', "Who was the team who collect the data?"),
    ('provenance.gathering.team.type', """The data was collected by an internal team, an external team, or crowdsourcing team?"""),
):
    docs = docsearch.similarity_search(question, k=retrieved_docs)
    result = incontext_prompt({"input_documents": docs, "question": question}, return_only_outputs=True)
    specificPart = dict(
        id=team_id,
        questions=[question],
        promptStrategy="simple",
        result=result['output_text'],
    )
    results.append(specificPart)

# Team demographics: the question embeds the team-type answer obtained just above
# (`result` still holds the last answer of the loop).
question = "Are the any demographic information of " + result['output_text'] + "?"
docs = docsearch.similarity_search(question, k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": question}, return_only_outputs=True)
specificPart = dict(
    id='provenance.gathering.team.demographics',
    questions=[question],
    promptStrategy="simple",
    result=result['output_text'],
)
results.append(specificPart)

## Annotation
In this section we use a reduce approach to get a general description of the process, we classify it into one of the categories using that answer, and then we extract the validation, the demographics and the tooling used in the process.

In [244]:
## Description and type
### Description: short summary of how the dataset was annotated
question = """How the data of the  """+context['title']+""" has been annotated or labelled? Provide a short summary of the annotation process"""
docs = docsearch.similarity_search(question, k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": question},return_only_outputs=True)
specificPart = {
    "id": 'provenance.labeling.description',
    "questions": [question],
    "promptStrategy": "simple",
    "result": result['output_text']
}
results.append(specificPart)


# Type: classify the description obtained above into a closed list of categories.
question = """ Which  of the following category corresponds to the annotation
               process mentioned in the context? 
               
            Categories: Bounding boxes, Lines and splines, Semantinc Segmentation, 3D cuboids, Polygonal segmentation, Landmark and key-point, Image and video annotations, Entity annotation, Content and textual categorization
               
            If you are not sure, answer with 'others'. Please answer only with the categories provided in the context. """
# The description answer is wrapped as the only context document. `metadata` is
# passed as a dict: Document declares it as a dict, and a list is only accepted
# by lenient validators.
result = incontext_prompt(
    {"input_documents": [Document(page_content=result['output_text'], metadata={})], "question": question},
    return_only_outputs=True,
)

specificPart = {
    "id": 'provenance.labeling.type',
    "questions": [question],
    "promptStrategy": "classification",
    "result": result['output_text']
}
results.append(specificPart)



# Labels: enumerate the dataset labels with a description of each one
question = """
Which are the specific labels of the dataset? Can you enumerate it an provide a description of each one?"""
docs = docsearch.similarity_search(question, k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": question},return_only_outputs=True)
specificPart = {
    "id": 'provenance.labeling.labels.description',
    "questions": [question],
    "promptStrategy": "simple",
    "result": result['output_text']
}
results.append(specificPart)


In [245]:
# Annotation team: who annotated, what kind of team, and its demographics.
# Each question is answered independently from its own retrieved chunks.
for team_id, question in (
    # Team
    ('provenance.labeling.team.description', """Who has annotated the data?"""),
    # Team type
    ('provenance.labeling.team.type', """The data was annotated by an internal team, an external team, or crowdsourcing team?"""),
    # Team demographics
    ('provenance.labeling.team.demographics', """Is there any demographic information about the team who annotate the data?"""),
):
    docs = docsearch.similarity_search(question, k=retrieved_docs)
    result = incontext_prompt({"input_documents": docs, "question": question}, return_only_outputs=True)
    specificPart = dict(
        id=team_id,
        questions=[question],
        promptStrategy="simple",
        result=result['output_text'],
    )
    results.append(specificPart)




In [246]:
# Infrastructure and validation of the annotation process.
# Both answers are stored in `results` and echoed in the cell output.
for labeling_id, question in (
    # Annotation tool
    ('provenance.labeling.infrastructure.tool', """Which tool has been used to annotate the dataset?"""),
    # Validation
    ('provenance.labeling.validation.description', """How the quality of the labels have been validated?"""),
):
    docs = docsearch.similarity_search(question, k=retrieved_docs)
    result = incontext_prompt({"input_documents": docs, "question": question}, return_only_outputs=True)
    specificPart = dict(
        id=labeling_id,
        questions=[question],
        promptStrategy="simple",
        result=result['output_text'],
    )
    print(result['output_text'])
    results.append(specificPart)

 The dataset was annotated using the NORA image analysis platform, University of Freiburg, Germany.


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.


 The labels were validated by an experienced radiologist (S.G., 10 years of experience in hybrid imaging) using dedicated software (NORA image analysis platform, University of Freiburg, Germany). In case of uncertainty regarding lesion definition, the specific PET/CT studies were reviewed in consensus with the radiologist and nuclear medicine physician who prepared the initial clinical report. To this end CT and corresponding PET volumes were displayed side by side or as an overlay and tumor lesions showing elevated FDG-uptake (visually above blood-pool levels) were segmented in a slice-per-slice manner resulting in 3D binary segmentation masks.


## Preprocess

## Data preparation

Here we use a parsing strategy. We ask for an enumerated list and parse it; then we ask for a general description of each process using a reduce technique (it may be explained across the document), and we use this answer as context for a classification task (guessing the type). So: parsing, description and classification. The parsed label is used as the ID.

TO DO: If the answer is unknown, the process breaks. See how to fix it, by trying to obtain a more constrained structure or by adding some "safeguard" for when the answer is unknown, such as "please answer with UNKNOWN if you are not sure".

In [247]:
from langchain.output_parsers import CommaSeparatedListOutputParser
parser = CommaSeparatedListOutputParser()

# Number of preprocess slots written per run. For results consistency the parsed
# list is padded or truncated to this length; this may be removed for full results.
expected_preprocesses = 5

# Step 1 (parsing): ask for a comma-separated list of short process labels.
question_general = """Can you enumerate each processes applied to the data to prepare and preprocess the dataset? Avoid answering with the collection process or the annotation process. Plase provide a list of the processes in a short label and comma separated?

Example Answer: Data Generation, Data Augmentation, Filtering"""
# Retrieve chunks with the preprocessing question itself. Previously this used
# `question`, which still held the last question of the preceding cell.
docs = docsearch.similarity_search(question_general, k=retrieved_docs)
result = incontext_prompt({"input_documents": docs, "question": question_general},return_only_outputs=True)

parsed = parser.parse(result['output_text'])

# Pad with the " " placeholder, then cut, so the list has exactly the expected length.
parsed = (parsed + [" "] * expected_preprocesses)[:expected_preprocesses]

for parsed_one in parsed:
    if (parsed_one == " "):
        # Placeholder slot: store empty id / description / type entries.
        specificPartID = {
            "id": 'provenance.preprocesses.id',
            "questions": "",
            "promptStrategy": "parsing",
            "result": ""
        }
        results.append(specificPartID)

        specificPartDesc = {
            "id": 'provenance.preprocesses.description',
            "questions": "",
            "promptStrategy": "parsing",
            "result": ""
        }
        results.append(specificPartDesc)

        specificPartType = {
            "id": 'provenance.preprocesses.type',
            "questions": "",
            "promptStrategy": "simple",
            "result": ""
        }
        results.append(specificPartType)
    else:
        # Step 2 (description): retrieval here uses a fixed k=9 rather than retrieved_docs.
        questionDesc = "Can you provide a short description of the "+parsed_one+" process?"
        docs = docsearch.similarity_search(questionDesc, k=9)
        result_description = incontext_prompt({"input_documents": docs, "question": questionDesc},return_only_outputs=True)

        # Step 3 (type): classified against the same retrieved chunks (`docs`)
        # that were used for the description.
        questionType = """ Which  of the following category corresponds to
                    the """+parsed_one+""" process?
                    
                    Categories: Missing Values, Data Annotation, Data Augmentation, Outlier Filtering, Remove Duplicates, Data reduction, Sampling, Data Normalization, Others
                    
                    If you are not sure, answer with 'Others' """

        result_type = incontext_prompt({"input_documents": docs, "question": questionType},return_only_outputs=True)

        # Processes classified as annotation are skipped (no rows are appended for
        # them). The answer is stripped before comparing because model answers can
        # carry surrounding whitespace, which made the exact match miss.
        if (result_type['output_text'].strip() != "Data Annotation"):
            specificPartID = {
                "id": 'provenance.preprocesses.id',
                "questions": [question_general],
                "promptStrategy": "parsing",
                "result": parsed_one
            }
            results.append(specificPartID)

            specificPartDesc = {
                "id": 'provenance.preprocesses.description',
                "questions": [questionDesc],
                "promptStrategy": "parsing",
                "result": result_description['output_text']
            }
            results.append(specificPartDesc)

            specificPartType = {
                "id": 'provenance.preprocesses.type',
                "questions": [questionType],
                "promptStrategy": "simple",
                "result": result_type['output_text']
            }
            results.append(specificPartType)

Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.


## Social Concerns
Here the questions are split by type of social issue. In this case the authors have not classified the risks of their dataset according to our classification, so we ask one question per topic.

In [248]:
# One question per social-concern topic. Each is answered from its own retrieved
# chunks and stored under the matching `social_concerns.*` id.
for concern_id, question in (
    ('social_concerns.bias_social_issue.description', "Is there any potentia bias in the data?"),
    ('social_concerns.representative_social_issue.description', "Are there any social group that could be misrepresented in the dataset?"),
    ('social_concerns.imbalance_social_issue.description', "Are there any imbalance issue  in the dataset?"),
    ('social_concerns.sensitive_social_issue.description', "Are there sensitive data, or data that can be offensive for people in the dataset?"),
    ('social_concerns.privacy_social_issue.description', "Is there any privacy issues on the data?"),
):
    docs = docsearch.similarity_search(question, k=retrieved_docs)
    result = incontext_prompt({"input_documents": docs, "question": question}, return_only_outputs=True)
    specificPart = dict(
        id=concern_id,
        questions=[question],
        promptStrategy="simple",
        result=result['output_text'],
    )
    results.append(specificPart)

# Save results

In [249]:
import os

import pandas as pd

# Collect every stored answer (one dict per entry in `results`) into a table
# and write it as an Excel file under ./results/.
df = pd.DataFrame(results)
# Create the output folder first: to_excel does not create missing directories,
# so a fresh checkout without ./results would fail after all the queries have run.
os.makedirs("./results", exist_ok=True)
df.to_excel("./results/"+outputfileName+".xlsx")