In [1]:
# %pip install llama-index
# %pip install transformers accelerate bitsandbytes
# %pip install llama-index-llms-huggingface
# %pip install llama-index-embeddings-huggingface
# %pip install llama-index-program-openai
# %pip install llama-index-agent-openai
# %pip install rouge
# %pip install transformers accelerate

Setup

Loading Stage

In [2]:
from llama_index.core.response.notebook_utils import display_response
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
# Read the contents of the local 'data' directory (where the PDF files are stored)
# into `documents`; the index built later in the notebook is created from this list.
documents = SimpleDirectoryReader("data").load_data()

Indexing Stage

LLM

In [3]:
# Hugging Face access token used to download the gated Llama-2 weights.
# Read it from the HF_TOKEN environment variable instead of hard-coding it in the
# notebook, so the secret never ends up in a saved or shared copy of this file.
# When the variable is unset the value is None, which leaves authentication to
# whatever credentials the Hugging Face libraries find on their own.
import os

hf_token = os.environ.get("HF_TOKEN")

In [4]:
import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

# Quantization settings handed to the model loader below (via model_kwargs):
# 4-bit loading, "nf4" quantization type, double quantization enabled,
# and float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Wrap the Llama-2 7B chat checkpoint as the LLM object used by the rest of the notebook.
llm = HuggingFaceLLM(
    # Model and tokenizer come from the same Hugging Face repository
    model_name="meta-llama/Llama-2-7b-chat-hf",
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    # Template that surrounds the query text with [INST] ... [/INST] markers
    query_wrapper_prompt=PromptTemplate(" [INST] {query_str} [/INST] "),
    context_window=3900,
    # The access token is supplied to both the model and the tokenizer loaders;
    # the model additionally receives the quantization settings defined above
    model_kwargs={"token": hf_token, "quantization_config": quantization_config},
    tokenizer_kwargs={"token": hf_token},
    device_map="auto",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Embedding Model

In [5]:
from llama_index.core import Settings

# Register the LLM and the embedding model on the shared Settings object.
# NOTE(review): the "local:" prefix presumably selects a locally run Hugging Face
# embedding model (the cell output shows its files being downloaded) — confirm.
Settings.llm = llm
Settings.embed_model = "local:BAAI/bge-small-en-v1.5"

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Storing Stage

In [6]:
from llama_index.core import VectorStoreIndex

# Build the vector index from the loaded documents; the query functions defined
# later in the notebook create their query engines from this object.
vector_index = VectorStoreIndex.from_documents(documents)

Query Stage

In [7]:
# Evaluation set, part 1: each key is a question and each value is its reference answer.
ground_truths_part1 = {
        "What is the main purpose of the Cider model developed by the Global Policy Lab at U.C. Berkeley?": "The main purpose of the Cider model is to predict subscriber-level poverty from mobile phone metadata for targeting social assistance or humanitarian aid.",
        "During which period was the Cider model implemented in Togo to target humanitarian cash transfers?": "The Cider model was implemented in Togo from November 2020 to August 2021.",
        "What are the four aspects of a data incident in humanitarian response?": "The four aspects of a data incident are: (i) a threat source, (ii) a threat event, (iii) a vulnerability, and (iv) an adverse impact.",
        "What is the recommendation for handling sensitive information in the Cider model regarding mobile phone metadata?": "The recommendation is to collect informed consent from individuals and keep data encrypted whenever possible, with phone numbers always stored encrypted.",
        "What is the primary goal of the Humanitarian Data and Trust Initiative (HDTI) initiated in September 2020?": "The primary goal of the HDTI is to examine and build a common understanding around the objectives, risks, and constraints related to responsible data sharing between humanitarian organizations and donors.",
        "According to the \"A Principled Framework for Responsible Data Sharing,\" what should humanitarian organizations ensure regarding affected populations?": "Humanitarian organizations should ensure that data sharing processes are designed and implemented in ways that put the rights and needs of affected populations at the center throughout the data lifecycle.",
        "What are the key takeaways of the Data Impact Assessments (DIAs) mentioned in the \"DATA IMPACT ASSESSMENTS\" document?": "Key takeaways include determining the potential benefits and risks associated with data management, minimizing risks while maximizing benefits, and conducting assessments before and during data management activities to inform project planning and design.",
        "What are the five steps in responding to data incidents as outlined in the \"DATA INCIDENT MANAGEMENT\" document?": "The five steps are: (i) notification, (ii) classification, (iii) treatment, (iv) closure, and (v) learning.",
        "What is the primary output of the dialogue process initiated by HDTI related to responsible data sharing?": "The primary output of the dialogue process is the framework for responsible data sharing between humanitarian organizations and donors.",
        "What are the examples of common data management activities mentioned in the \"DATA IMPACT ASSESSMENTS\" document?": "Examples of common data management activities include registration exercises and protection monitoring.",
        "What is the main purpose of the Flood Anticipatory Action Trigger in Bangladesh?": "The main purpose is to use the GloFAS river discharge forecasts and water level forecasts by the Bangladesh Flood Forecast and Warning Centre (FFWC) to trigger humanitarian actions in anticipation of floods.",
        "What recommendation was made regarding the flooding constraint in the Flood Anticipatory Action Trigger in Nepal?": "The recommendation was to review the constraint that flooding will occur only during the monsoon season (June - September).",
        "What is the intended use of the predictive model in the Anticipatory Action Trigger for Drought in Niger?": "The intended use is to detect increased drought risk in Niger and trigger the release of funds from the Central Emergency Response Fund in anticipation of drought-related humanitarian needs.",
        "What is the threshold for triggering the Anticipatory Action for Drought in Burkina Faso based on?": "The threshold is dependent on geographical coverage and below-average rainfall probability.",
        "What does data responsibility in humanitarian action encompass?": "Data responsibility in humanitarian action encompasses both data protection (safeguards that preserve the right to privacy with respect to the processing of personal data) and data security (measures that aim to preserve the confidentiality, availability, and integrity of personal and non-personal data).",
        "What are the common data-related benefits in Cash and Voucher Assistance (CVA)?": "Common data-related benefits in CVA include improved understanding of beneficiary priorities, needs and preferences; improved targeting of assistance, including deduplication of beneficiary lists; increased efficiency and effectiveness of different delivery mechanisms; and enhanced transparency and accountability.",
        "What should be conducted for all CVA interventions to determine the expected risks, harms, and benefits?": "Data impact assessments should be conducted for all CVA interventions.",
        "According to the Data Responsibility in Public-Private Partnerships document, what should humanitarian organizations establish to improve data responsibility in CVA?": "Humanitarian organizations should establish an information-sharing protocol specific to CVA programs and data-sharing agreements for the exchange of personal data.",
        "What should the CVA data ecosystem map include?": "The CVA data ecosystem map should include a summary of major data management activities, including the scale, scope, and types of data being processed, the stakeholders involved, the data flows between different actors, and processes and platforms in use in the delivery of CVA in a given context.",
        "What is the purpose of the OCHA Data Responsibility Guidelines?": "The purpose of the OCHA Data Responsibility Guidelines is to offer a set of principles, processes, and tools that support actions for data responsibility in OCHA’s work, addressing how OCHA should implement the IASC Operational Guidance on Data Responsibility at various levels of a response.",
}

# Evaluation set, part 2: each key is a question and each value is its reference answer.
ground_truths_part2 = { "What is the primary goal of the INFORM Risk Index?": "The primary goal of the INFORM Risk Index is to identify countries at risk of humanitarian crises that could overwhelm national capacity to respond, supporting decisions about prevention, preparedness, and response.",
        "What key challenge did OCHA face when developing the Anticipatory Action Framework for South Sudan in 2022?": "The key challenge was the lack of sufficient forecast data and the complex operating environment, which required more time to prepare and respond, making traditional forecast models insufficient.",
        "What is the significance of data security in humanitarian action as highlighted in the \"Guidance Note on Data Security in Operational Data Management\"?": "Data security is crucial for safeguarding the confidentiality, integrity, and availability of data, preventing its accidental or intentional loss, destruction, alteration, acquisition, or disclosure, and ensuring the effective delivery of assistance and protection to affected people.",
        "What are the two common threats to data security in humanitarian action mentioned in the \"Guidance Note on Data Security in Operational Data Management\"?": "The two common threats are digital surveillance and interception, and unauthorized or malicious activity.",
        "According to the \"Islamic Perspectives on English Literature\" document, what argument does the author make regarding the integration of Islamic perspectives into English literature?": "The author argues that integrating Islamic perspectives into English literature can rescue the subject from irrelevance and elimination in Muslim societies, making it more applicable and relevant to Muslim readers.",
        "What innovative approach did OCHA take to anticipatory action in South Sudan in 2022 due to insufficient forecast data?": "OCHA piloted a lighter and nimbler approach to anticipatory action, allowing CERF allocations to be provided earlier based on a high likelihood of additional shocks compounding existing humanitarian needs.",
        "What are the main components of data responsibility as discussed in the \"Guidance Note on Data Security in Operational Data Management\"?": "The main components of data responsibility are data protection (safeguards that preserve the right to privacy with respect to the processing of personal data) and data security (measures that aim to preserve the confidentiality, availability, and integrity of personal and non-personal data).",
        "What was the impact of the anticipatory action taken in Bentiu, South Sudan, ahead of the October 2022 floods?": "The anticipatory action included constructing and reinforcing over 55 kilometers of dykes, which protected vital access roads, homes, and the airstrip, preventing 100,000 people from having to be evacuated and further displaced.",
        "In the \"Islamic Perspectives on English Literature\" document, how does the author suggest English literature can remain relevant in Muslim-majority countries?": "The author suggests that interpreting English literature through the lens of Islam can establish its relevance, making it more applicable to Muslim readers and grounding it in Muslim societies.",
        "What ethical considerations are highlighted in the \"INFORM Risk Index\" document regarding the use of the index?": "Ethical considerations include the potential misuse of the index by non-humanitarian actors, the need for monitoring the use of the index, the impact of missing data, and the risk of stigmatizing populations leading to less support.",
        "What methodology was used to calculate the number of people potentially exposed to flooding in Somalia in the 2024 HNRP?": "The methodology involved analyzing Daily FloodScan (1998-2022) & WorldPop (2020 UN Adjusted) raster data, reclassifying flood fraction values, and multiplying these by population estimates to create yearly seasonal flood exposure rasters. These were then aggregated at the administrative level to obtain population exposure estimates.",
        "What are the four workstreams of the OCHA Centre for Humanitarian Data?": "The four workstreams are data services, data responsibility, data literacy, and predictive analytics.",
        "How does observational rainfall data capture precipitation, and what are the common sources of such data?": "Observational rainfall data captures precipitation through measurements at weather stations or estimations via satellite or radar imagery. Common sources include CHIRPS and ARC2 datasets.",
        "What themes are commonly explored in English literature influenced by classical Greek philosophy?": "Common themes include explorations of ethical and moral issues, reflections on human life, political scheming, moral decisions, and social ethics, influenced by philosophers like Socrates, Plato, and Aristotle.",
        "How does \"Patriarchal Dominance in English Literature\" describe the portrayal of women in Shakespeare's plays?": "Women in Shakespeare's plays are often depicted as victims of patriarchal control or as transgressors who defy gender norms, such as Lady Macbeth in \"Macbeth\" and Desdemona in \"Othello.\"",
}

# Evaluation set, part 3: each key is a question and each value is its reference answer.
ground_truths_part3 = { "What are the philosophical themes explored in William Wordsworth’s \"Lyrical Ballads\" and Henry David Thoreau’s \"Walden\"?": "\"Lyrical Ballads\" reflects the idealist’s reverence for and pursuit of nature, while \"Walden\" urges a return to simplicity and an idealized natural living, emphasizing the connection between nature and the human spirit.",
        "What is the significance of the work \"A Room of One's Own\" by Virginia Woolf?": "\"A Room of One's Own\" argues for the importance of economic and intellectual independence for women writers, discussing the lack of female representation in literature and the challenges women writers faced in a male-dominated literary tradition.",
        "How does predictive analytics support humanitarian decision-making according to the \"PREDICTIVE ANALYTICS IN HUMANITARIAN RESPONSE\" document?": "Predictive analytics supports decision-making by anticipating events or characteristics of events (likelihood, severity, magnitude, or duration), using methodologies such as weather and climate models, epidemic models, and analysis of historical data to project humanitarian needs.",
        "What are some limitations of observational rainfall data as discussed in the \"Observational Rainfall Data\" document?": "Limitations include underreporting in certain geographical terrains, fewer gauge stations in rural and developing areas leading to less accurate data, potential deterioration and maintenance issues with stations, and difficulties in differentiating rain induced by specific atmospheric phenomena.",
        "What role does feminist literature play according to \"Patriarchal Dominance in English Literature\"?": "Feminist literature analyzes and deconstructs the portrayal of women in literature, calls attention to gender inequality, and seeks to challenge and subvert patriarchal norms. Examples include works by Virginia Woolf and Charlotte Perkins Gilman.",
        "What was the primary focus of the OCHA Anticipatory Action Framework for the Philippines?": "The primary focus was to predict the percentage of houses that will be severely damaged per municipality in the Philippines due to typhoons, allowing OCHA to begin a response before a typhoon hit.",
        "What is the significance of the 1 in 4-year return period in the CADC Anticipatory Action framework?": "The 1 in 4-year return period is used as a threshold for activation, meaning the framework is designed to activate if the seasonal rainfall is forecasted to be at least at this deficit level, indicating significant drought conditions.",
        "What is the role of ECMWF in the trigger mechanism for the CADC Anticipatory Action?": "ECMWF provides the seasonal forecast data, which is used to monitor and predict seasonal rainfall deficits that trigger the anticipatory action in the Central American Dry Corridor.",
        "What are the main components of data security in operational data management according to the \"Tip Sheet on Data Security in Operational Data Management\"?": "The main components include good password management, using antivirus/anti-malware software, keeping software and operating systems up-to-date, avoiding phishing scams, responsible use of mobile devices, and safeguarding sensitive data through practices like data minimization and encryption.",
        "What are some of the complexities and divergent views around the hijab discussed in \"The Hijab: Islam, Women and the Politics of Clothing\"?": "The complexities include different national contexts of hijab protests, such as in Iran and France, and diverse personal and political reasons for wearing or opposing the hijab. Divergent views are highlighted, such as seeing the hijab as either oppressive or a form of empowerment.",
        "How did Benjamin Franklin contribute to the development of libraries in the American colonies?": "Benjamin Franklin proposed the idea of creating a common library for the Junto group, where members could pool their books for communal use, leading to the establishment of the American Philosophical Society and contributing to the development of public libraries.",
        "What was the purpose of the peer review of the Typhoon impact model used in the Philippines?": "The purpose was to evaluate the trigger mechanism used in OCHA’s anticipatory action framework, ensuring that the model accurately predicts typhoon impacts and facilitates timely humanitarian responses.",
        "What ethical considerations are highlighted in the \"Trigger for OCHA Anticipatory Action on Drought in Chad\" document?": "Ethical considerations include addressing false negatives and positives, insufficient data, lack of trust and ownership by local authorities, transparency issues, and ensuring that no action is taken when predicted impacts do not occur.",
        "What were some of the data sources used in the Typhoon impact model for the Philippines?": "Data sources included typhoon track and rainfall forecasts, and input data from the Netherlands Red Cross 510 data and digital team, which were used to predict housing damage.",
        "According to \"Swallowing and spitting out the red pill,\" what are some of the vulnerability and radicalization pathways for young men in the manosphere?": "Vulnerability and radicalization pathways include exposure to online communities that promote misogynistic and extremist ideologies, the influence of personal grievances and social isolation, and the allure of belonging to a group that validates their experiences and frustrations."
    }

In [2]:
# %pip install rouge-score nltk

In [20]:
import pandas as pd
import nltk
from sklearn.metrics import accuracy_score, precision_score, f1_score
import concurrent.futures
from tqdm import tqdm

In [21]:
# Download the tokenizer data that nltk.word_tokenize needs.
# Older NLTK releases load the 'punkt' package, while NLTK 3.9+ looks for
# 'punkt_tab' instead; fetching both keeps this notebook runnable on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [24]:
# Helper that builds the default query engine
def get_query_engine():
    """Return a query engine created from ``vector_index`` with default arguments."""
    engine = vector_index.as_query_engine()
    return engine

# Answer a question with the default query engine (reported as "Llama-2")
def get_llama2_response(question):
    """Return the response text for ``question`` from the default query engine.

    The engine comes from ``get_query_engine()``, i.e. it is built on
    ``vector_index``, so this path also goes through the index.
    """
    return get_query_engine().query(question).response

# Answer a question with the "refine" query engine (reported as "RAG-based Llama-2")
def get_rag_llama2_response(question):
    """Return the response text for ``question`` using ``response_mode="refine"``.

    A new query engine is created from ``vector_index`` on every call.
    """
    refine_engine = vector_index.as_query_engine(response_mode="refine")
    return refine_engine.query(question).response

# Token-overlap scores between a reference answer and a generated answer
def calculate_metrics(reference, hypothesis):
    """Score ``hypothesis`` against ``reference`` using unique-token overlap.

    Both strings are tokenized with ``nltk.word_tokenize`` and reduced to sets,
    so repeated tokens count once and word order is ignored.

    Args:
        reference: The ground-truth answer text.
        hypothesis: The generated answer text.

    Returns:
        A tuple ``(accuracy, precision, f1)``:
        * accuracy  - share of unique reference tokens that also occur in the
          hypothesis (this is what is usually called recall; the name is kept
          because the result columns use it).
        * precision - share of unique hypothesis tokens that also occur in the
          reference.
        * f1        - harmonic mean of the two values above.
        Any score whose denominator would be zero (empty reference, empty
        hypothesis, or no overlap at all) is reported as 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    reference_set = set(nltk.word_tokenize(reference))
    hypothesis_set = set(nltk.word_tokenize(hypothesis))

    true_positives = len(reference_set & hypothesis_set)

    # len(hypothesis_set) equals true positives + false positives, and
    # len(reference_set) equals true positives + false negatives.
    accuracy = true_positives / len(reference_set) if reference_set else 0.0
    precision = true_positives / len(hypothesis_set) if hypothesis_set else 0.0

    denominator = precision + accuracy
    f1 = 2 * (precision * accuracy) / denominator if denominator != 0 else 0.0

    return accuracy, precision, f1

# Function to process ground truths and collect responses from both query paths
def process_ground_truths(ground_truths, max_workers=None):
    """Query both response functions for every question in ``ground_truths``.

    Every question is submitted to ``get_llama2_response`` and to
    ``get_rag_llama2_response`` on a thread pool; one progress bar per path
    advances as the corresponding futures complete.

    Args:
        ground_truths: Mapping of question -> reference answer.
        max_workers: Optional size of the thread pool. ``None`` keeps the
            executor's default.
            NOTE(review): both response functions query the same index/model
            from several threads at once — confirm that this is safe for the
            configured LLM, or pass ``max_workers=1`` to serialize the calls.

    Returns:
        A tuple ``(llama2_responses, rag_llama2_responses, true_answers)``.
        The two dicts map question -> response text and ``true_answers`` is the
        list of reference answers. All three follow the iteration order of
        ``ground_truths``, so they line up with each other regardless of the
        order in which the worker threads happen to finish.

    Raises:
        Any exception raised by a response function is re-raised here when the
        corresponding future's result is read.
    """
    questions = list(ground_truths)
    llama2_by_question = {}
    rag_by_question = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_llama2 = {executor.submit(get_llama2_response, question): question for question in questions}
        future_rag_llama2 = {executor.submit(get_rag_llama2_response, question): question for question in questions}

        for future in tqdm(concurrent.futures.as_completed(future_llama2), total=len(questions), desc="Llama-2 Responses"):
            llama2_by_question[future_llama2[future]] = future.result()

        for future in tqdm(concurrent.futures.as_completed(future_rag_llama2), total=len(questions), desc="RAG-based Llama-2 Responses"):
            rag_by_question[future_rag_llama2[future]] = future.result()

    # Results arrive in completion order; rebuild them in the caller's order.
    llama2_responses = {question: llama2_by_question[question] for question in questions}
    rag_llama2_responses = {question: rag_by_question[question] for question in questions}
    true_answers = [ground_truths[question] for question in questions]

    return llama2_responses, rag_llama2_responses, true_answers

# Build the per-question results table for both response sets
def calculate_all_metrics(ground_truths, llama2_responses, rag_llama2_responses):
    """Assemble a DataFrame with responses and overlap scores for each question.

    For every question in ``ground_truths`` the matching response is looked up in
    ``llama2_responses`` and ``rag_llama2_responses`` and each one is scored
    against the reference answer with ``calculate_metrics``.

    Returns:
        A ``pandas.DataFrame`` with one row per question and ten columns: the
        query, the ground truth, and for each of the two response sets the
        response text plus its accuracy, precision and F1 score.
    """
    column_names = (
        "Query",
        "Ground Truth",
        "Llama-2 Response",
        "Llama-2 Accuracy",
        "Llama-2 Precision",
        "Llama-2 F1 Score",
        "RAG-based Llama-2 Response",
        "RAG-based Llama-2 Accuracy",
        "RAG-based Llama-2 Precision",
        "RAG-based Llama-2 F1 Score",
    )
    columns = {name: [] for name in column_names}

    for query, reference in ground_truths.items():
        baseline_text = llama2_responses[query]
        rag_text = rag_llama2_responses[query]

        # Score each response against the same reference answer
        base_acc, base_prec, base_f1 = calculate_metrics(reference, baseline_text)
        rag_acc, rag_prec, rag_f1 = calculate_metrics(reference, rag_text)

        row = (
            query,
            reference,
            baseline_text,
            base_acc,
            base_prec,
            base_f1,
            rag_text,
            rag_acc,
            rag_prec,
            rag_f1,
        )
        for name, value in zip(column_names, row):
            columns[name].append(value)

    return pd.DataFrame(columns)

# Calculate average scores
def calculate_average(scores):
    """Return the arithmetic mean of ``scores``.

    Args:
        scores: Any iterable of numbers (list, tuple, generator, ...).

    Returns:
        The mean of the values, or 0.0 when ``scores`` is empty — the same
        fallback the metric calculation uses for an undefined F1 — instead of
        raising ``ZeroDivisionError``.
    """
    # Materialize once so generators and other one-shot iterables work too.
    values = list(scores)
    if not values:
        return 0.0
    return sum(values) / len(values)

# Run the evaluation for every part listed in ground_truths_parts and keep one
# DataFrame of responses and scores per part.
# NOTE(review): only ground_truths_part3 is listed here, although
# ground_truths_part1 and ground_truths_part2 are defined as well — confirm
# whether they should be added. Because the loop counts from 1 over this list,
# the progress message reports "Part 1" for whichever part is listed first.
ground_truths_parts = [ground_truths_part3]
df_list = []
for i, ground_truths in enumerate(ground_truths_parts, start=1):
    print(f"Processing Part {i} of ground truths...")
    # true_answers is returned here but not used again in this cell
    llama2_responses, rag_llama2_responses, true_answers = process_ground_truths(ground_truths)
    df = calculate_all_metrics(ground_truths, llama2_responses, rag_llama2_responses)
    df_list.append(df)

# Stack the per-part DataFrames into one table with a fresh 0..n-1 index
combined_df = pd.concat(df_list, ignore_index=True)

# Write the combined table to a CSV file in the working directory (no index column)
combined_df.to_csv('responses_and_metrics.csv', index=False)

# Print the first rows of the combined table for verification
print(combined_df.head())

Processing Part 1 of ground truths...


Llama-2 Responses: 100%|██████████| 15/15 [08:30<00:00, 34.01s/it]
RAG-based Llama-2 Responses: 100%|██████████| 15/15 [13:27<00:00, 53.81s/it]

                                               Query  \
0  What are the philosophical themes explored in ...   
1  What is the significance of the work "A Room o...   
2  How does predictive analytics support humanita...   
3  What are some limitations of observational rai...   
4  What role does feminist literature play accord...   

                                        Ground Truth  \
0  "Lyrical Ballads" reflects the idealist’s reve...   
1  "A Room of One's Own" argues for the importanc...   
2  Predictive analytics supports decision-making ...   
3  Limitations include underreporting in certain ...   
4  Feminist literature analyzes and deconstructs ...   

                                    Llama-2 Response  Llama-2 Accuracy  \
0  Based on the provided context information, Wil...          0.515152   
1  The work "A Room of One's Own" by Virginia Woo...          0.718750   
2  According to the "PREDICTIVE ANALYTICS IN HUMA...          0.441176   
3  Based on the context inform




In [None]:
# query_engine = vector_index.as_query_engine()
# llama_2_response = query_engine.query("What are the key factors that humanitarian organizations should consider when deciding whether to share non-personal data?")

In [None]:
# query_engine = vector_index.as_query_engine(similarity_top_k=5)
# response = query_engine.query("What are the key factors that humanitarian organizations should consider when deciding whether to share non-personal data?")

Change in Response Mode: In the "refine" response mode, the model iteratively refines the initial response based on additional context.

In [None]:
# query_engine = vector_index.as_query_engine(response_mode="refine")
# rag_1_response = query_engine.query("What are the key factors that humanitarian organizations should consider when deciding whether to share non-personal data?")

Change in Response Mode: The "compact" response mode works like "refine", but first packs as many retrieved chunks as will fit into each prompt, so the answer is produced with fewer LLM calls.

In [None]:
# query_engine = vector_index.as_query_engine(response_mode="compact")
# rag_2_response = query_engine.query("What are the key factors that humanitarian organizations should consider when deciding whether to share non-personal data?")

Change in Response Mode: In the "tree_summarize" response mode, the model organizes the retrieved information in a hierarchical or tree-like structure. It generates a summary that captures the main points and their relationships, often providing a more structured overview of the topic.

In [None]:
# query_engine = vector_index.as_query_engine(response_mode="tree_summarize")
# rag_3_response = query_engine.query("What are the key factors that humanitarian organizations should consider when deciding whether to share non-personal data?")