# Information Retrieval


>
> Goal: We have to find the best method of retrieving info from the Climate Policy Radar's Huggingface dataset based on a user's query or information need.
>

>
> Specify your goal:
> 1. Select chunks with exact keyword match with the query
> 2. Select chunks with same semantic meaning to query
> 3. Rerank the chunks based on their relevance to the query
> 4. Define a threshold whether the chunks would be useful to the user or not
> 5. Hyperlinks
>

## 1. Preperations

### 1.1 Import libraries and functions

In [1]:
# pip install fuzzywuzzy
# pip install rank_bm25

In [None]:
import importlib
import retrieval
importlib.reload(retrieval)



<module 'retrieval' from 'c:\\Users\\User\\Documents\\DS205\\group-6-final-project\\retrieval.py'>

In [None]:
# Import necessary modules
import sys
import os
from pathlib import Path

# Get the absolute path of the project root directory
notebook_dir = Path(os.getcwd())  
project_root = notebook_dir.parent.parent  # Go up TWO levels instead of one

# Add project root to Python path
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))
    print(f"Added {project_root} to sys.path")

In [1]:

from dotenv import load_dotenv
import os
from transformers import AutoTokenizer, AutoModel
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors, Word2Vec
from gensim.utils import simple_preprocess
from sqlalchemy import create_engine, text

from scripts.retrival.retrieval_support import boolean_search, bm25_search, fuzzy_search, vector_search, df_with_similarity_score, hybrid_scoring
from functions import generate_word2vec_embedding_for_text, generate_embeddings_for_text

ModuleNotFoundError: No module named 'fuzzywuzzy'

Downloading climatebert model and tokenizer

In [4]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

EMBEDDING_MODEL_LOCAL_DIR = os.getenv('EMBEDDING_MODEL_LOCAL_DIR')
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")

climatebert_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
climatebert_model = AutoModel.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)

Some weights of RobertaModel were not initialized from the model checkpoint at local_model/climatebert/distilroberta-base-climate-f and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading word2vec model

In [5]:
custom_w2v = Word2Vec.load("./local_model/custom_word2vec_768.model")

### 1.2 Introduce a prompt

#### 1.2.1 Define a prompt and exact keywords

In [6]:
# Introducing a prompt based on ASCOR CP1.a
prompt = "Does the country have a decarbonisation strategy to meet Paris Agreement that they are implementing or in the national legislation?"
keywords = prompt.split(" ")

#### 1.2.2 Generate similar words to keywords

In [11]:
# Generate similar words using word2vec model to prompt's keywords and store them for keyword search
keywords = simple_preprocess(prompt)
similar_words = []

# For each keyword, try to find similar words
for keyword in keywords:
    try:
        # Only get similar words if keyword exists in vocabulary
        if keyword in custom_w2v.wv:
            similar = custom_w2v.wv.most_similar(keyword, topn=5)  # Get top 5 similar words
            similar_words.extend([word for word, score in similar])
    except KeyError:
        # Skip words not in vocabulary
        continue

# Combine original keywords with similar words
all_search_terms = list(set(keywords + similar_words))

print("Original keywords:", keywords)
print("\nExpanded keywords:", all_search_terms)

Original keywords: ['does', 'the', 'country', 'have', 'decarbonisation', 'strategy', 'to', 'meet', 'paris', 'agreement', 'that', 'they', 'are', 'implementing', 'or', 'in', 'the', 'national', 'legislation']

Expanded keywords: ['have', 'supported', 'another', 'yet', 'electrification', 'followed', 'award', 'in', 'reduced', 'based', 'up', 'regulation', 'paris', 'political', 'achieved', 'hydrogen', 'fees', 'agreement', 'or', 'draft', 'contract', 'are', 'ministerial', 'subject', 'national', 'typology', 'well', 'developed', 'they', 'kw', 'meet', 'body', 'articles', 'reason', 'going', 'country', 'continue', 'determination', 'decarbonisation', 'zero', 'prediction', 'once', 'non', 'nations', 'results', 'completed', 'against', 'met', 'that', 'depends', 'stipulated', 'increases', 'implementing', 'procurements', 'encourage', 'page', 'plan', 'intensity', 'motivate', 'will', 'rise', 'revised', 'packages', 'limited', 'action', 'feedstock', 'development', 'now', 'social', 'dimension', 'built', 'prepar

Generate embeddings for the prompt

In [12]:
# Convert prompt into embeddings
prompt_w2v_embeddings = generate_word2vec_embedding_for_text(prompt, custom_w2v)

prompt_climatebert_embeddings = generate_embeddings_for_text(prompt, climatebert_model, climatebert_tokenizer)

### 1.3 Load the dataframe from the database

In [13]:

from sqlalchemy import create_engine, text
import os
engine = create_engine(os.getenv("DB_URL"))

df = pd.read_sql("SELECT * FROM document_embeddings", engine)
df.head()

Unnamed: 0,id,document_id,document_title,country_code,original_text,source_hyperlink,climatebert_embedding,word2vec_embedding
0,2316,CCLW.document.i00000002.n0000,National Energy and Climate Plan 2019 Draft,ALB,0.3,https://www.energy-community.org/dam/jcr:a0c2b...,"[-0.025991503,0.0861391,0.0032455176,-0.087798...","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
1,2317,CCLW.document.i00000002.n0000,National Energy and Climate Plan 2019 Draft,ALB,0.4,https://www.energy-community.org/dam/jcr:a0c2b...,"[-0.020988133,0.07871499,0.007967811,-0.090249...","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
2,2318,CCLW.document.i00000002.n0000,National Energy and Climate Plan 2019 Draft,ALB,0.0,https://www.energy-community.org/dam/jcr:a0c2b...,"[-0.019849952,0.077195846,0.0019134246,-0.0781...","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
3,2319,CCLW.document.i00000002.n0000,National Energy and Climate Plan 2019 Draft,ALB,0.0,https://www.energy-community.org/dam/jcr:a0c2b...,"[-0.019849952,0.077195846,0.0019134246,-0.0781...","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
4,2320,CCLW.document.i00000002.n0000,National Energy and Climate Plan 2019 Draft,ALB,0.0,https://www.energy-community.org/dam/jcr:a0c2b...,"[-0.019849952,0.077195846,0.0019134246,-0.0781...","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."


## 2. Retrieving Relevant Chunks

### 2.1 Keyword-based Retrieval
### Goal 1: select chunks with exact keyword match with the query

We can retrieve the chunks that contain the exact keyword match with the prompt

Retrieve top k chunks from all techniques and see which answer provides the most sensible context to the prompt

In [14]:
top_k_boolean_chunks = boolean_search(all_search_terms, df, k=25)
relevant_boolean = top_k_boolean_chunks[['original_text', 'boolean_score']]
print('Relevant chunks based on boolean search:')
relevant_boolean.head(5)


Relevant chunks based on boolean search:


Unnamed: 0,original_text,boolean_score
4666,- Environmental impacts of hydropower developm...,0.171717
4667,- Including sustainability principles in hydro...,0.161616
4782,Climate change models predict that there will ...,0.161616
2995,This section specifically provides an assessme...,0.151515
2997,Achieving a substantial decarbonization of the...,0.151515


In [15]:
top_k_bm25_chunks = bm25_search(all_search_terms, df, k=25)
relevant_bm25 = top_k_bm25_chunks[['original_text', 'bm25_score']]
print('Relevant chunks based on BM25 search:')
relevant_bm25.head(5)

Relevant chunks based on BM25 search:


Unnamed: 0,original_text,bm25_score
3377,Business Investment Development Strategy (BIDS),1.0
3848,ADAPTATION STRATEGY AND ACTION PLAN,0.973747
4775,c. 'Rearrange' disturbed forest ecosystems so ...,0.915038
4726,a. Extend or renew native species that are exp...,0.881522
4155,BiH is working on the project 'Advance the Nat...,0.853265


In [16]:
top_k_fuzzy_chunks = fuzzy_search(prompt, df, k=50)
relevant_fuzzy = top_k_fuzzy_chunks[['original_text', 'fuzzy_score']]
print('Relevant chunks based on fuzzy search:')
relevant_fuzzy.head(5)

Relevant chunks based on fuzzy search:


Unnamed: 0,original_text,fuzzy_score
3434,the,1.0
3159,Decarbonisation /\nremovals,0.77
3847,5 THE,0.75
3613,for the,0.6
4138,BiH has demonstrated its commitment to partici...,0.56


As we can see, the retrieved chunks from boolean search or bm25 ranking makes more sense in answering the question in the prompt. Fuzzy string matching, on the other side, yields mostly numbers. Though its stregth lies in its ability to identify some pattern mathcing, it might not be useful in this case.

## 2.2 Semantic Retrieval

### Goal 2: select chunks with same semantic meaning to prompt

We can retrieve the chunks that contain the same semantic meaning to the given prompt. Or, we can use the ones that already have the keywords, and compare similarity score (if not, there is a disconnect on keyword and semantic search. If not, we can compare if the semantic performed better or worse from the keywprd search). Based on the chunks retrieved, we can assess whether Word2Vec or ClimateBERT performs better

Method: Pure embeddings comparison of query and chunks, (reranking based on weighted score) fusion retrieval, (prompt engineering)adaptive retrieval, (reranking) RSE, langchain Q&A, Contextual compression

In [17]:
# Vector search
# 1. Get the embeddings of the prompt
# 2. Get the embeddings of the chunks
# 3. Calculate the cosine similarity between the prompt and the chunks
# 4. Get the top k chunks with the highest cosine similarity
# 5. Return the top k chunks with the highest cosine similarity

#Make this fucntion adaptable for both transformer and word2vec embeddings

In [29]:

climatebert_results = vector_search(
    prompt_embeddings=np.array(prompt_climatebert_embeddings),
    embedding_type='climatebert',
    top_k=25
)

print("Top 25 results using ClimateBERT:")
print(climatebert_results[['original_text', 'similarity_score']].head(25))

w2v_results = vector_search(
    prompt_embeddings=np.array(prompt_w2v_embeddings),
    embedding_type='word2vec',
    top_k=25
)

print("\nTop 25 results using Word2Vec:")
print(w2v_results[['original_text', 'similarity_score']].head(25))

Top 25 results using ClimateBERT:
                                          original_text  similarity_score
3478  Table 11: Overview table of key policies affec...          1.000000
3997           Initial National Determined Contribution          0.959088
3759                                     Project board:          0.952326
3285               No budget calculated for the moment.          0.950437
3995  Initial National Communication Report under th...          0.946192
3410                          Overall policy documents:          0.945382
3973                      Designated National Authority          0.942615
3633                                        Secretariat          0.939853
4572  21UNCC - Article: How Hydropower Can Help Clim...          0.938768
1925                              Public administration          0.938397
4025  Second National Communication Report under the...          0.938183
3782                    Existing policies and measures.          0.935045
3476

In [30]:

df_similarity_score = df_with_similarity_score(
    prompt_embeddings_w2v=np.array(prompt_w2v_embeddings),
    prompt_embeddings_climatebert=np.array(prompt_climatebert_embeddings),
    top_k=None
)
df_similarity_score.head(5)

bm25_df = bm25_search(all_search_terms, df_similarity_score, k=None)
bm25_df.head(5)

Unnamed: 0,document_id,country_code,document_title,original_text,source_hyperlink,w2v_score,climatebert_score,avg_score,bm25_score
4092,CCLW.document.i00000004.n0000,BIH,Climate Change Adaptation and Low Emissions Gr...,The Paris Agreement on Climate Change is based...,https://unfccc.int/sites/default/files/resourc...,0.993667,0.550285,0.771976,1.0
4155,CCLW.document.i00000004.n0000,BIH,Climate Change Adaptation and Low Emissions Gr...,BiH is working on the project 'Advance the Nat...,https://unfccc.int/sites/default/files/resourc...,0.986941,0.725626,0.856283,0.892933
4614,CCLW.document.i00000004.n0000,BIH,Climate Change Adaptation and Low Emissions Gr...,The risks associated with climate change have ...,https://unfccc.int/sites/default/files/resourc...,0.994369,0.605252,0.79981,0.874398
3705,CCLW.document.i00000002.n0000,ALB,National Energy and Climate Plan 2019 Draft,Energy Regulatory Authority (ERE): The Energy ...,https://www.energy-community.org/dam/jcr:a0c2b...,0.985716,0.755189,0.870453,0.859087
3377,CCLW.document.i00000002.n0000,ALB,National Energy and Climate Plan 2019 Draft,Business Investment Development Strategy (BIDS),https://www.energy-community.org/dam/jcr:a0c2b...,0.937189,0.868619,0.902904,0.841708


Interestingly, the Word2vec model seems to retrieve more useful information than climateBERT model, contrary to our expectations. This may reflect weaknesses in embeddings generation or the model itself. 

However, we have yet to exactly identify is the answers make sense or not, which will be tested in the LLM evaluation phase.

(Extra: if have time)

### Goal 3: Rank the chunks based on their relevance to the prompt

We can rerank the chunks based on their relevance to the prompt. Much like the widely knwon hybrid search, we will sum the sparse score (from the chosen keyword technique) and dense score (from embeddings) with weghted parameter alpha. 


In [32]:
# Example usage of hybrid scoring
try:
    hybrid_results = hybrid_scoring(bm25_df, alpha=0.5)
    print("Top results using hybrid scoring:")
    print(hybrid_results[['original_text', 'hybrid_score']].head(50))
except KeyError as e:
    print(f"Error: Missing required column - {e}")

Top results using hybrid scoring:
                                          original_text  hybrid_score
3377    Business Investment Development Strategy (BIDS)      0.855164
4658  - Hydropower development should be part of a b...      0.847423
3848                ADAPTATION STRATEGY AND ACTION PLAN      0.824155
4155  BiH is working on the project 'Advance the Nat...      0.809280
3705  Energy Regulatory Authority (ERE): The Energy ...      0.807138
4136  Under the UNFCCC, Bosnia and Herzegovina is co...      0.800100
4154  At the meeting of the Ministerial Council of t...      0.783631
4092  The Paris Agreement on Climate Change is based...      0.775142
3533  Figure 6: Energy intensity (Source: National S...      0.773565
4775  c. 'Rearrange' disturbed forest ecosystems so ...      0.761036
4614  The risks associated with climate change have ...      0.739825
3004  Energy Efficiency Fund: The EE Law mandates th...      0.734070
3423  . National Energy Efficiency Action Plan 2010-... 

In [None]:

# def tune_alpha():
#     """
#     Tune alpha to find the best combination of sparse and dense scores
#     """
#     # Create a list of alpha values to test
#     alpha_values = [i/10 for i in range(0, 11)]
    
#     # Initialize a dictionary to store the results
#     results = {}
    
#     # Loop through each alpha value
#     for alpha in alpha_values:
#         # Compute the hybrid score
#         df = hybrid_scoring(alpha)
        
#         # Store the results
#         results[alpha] = df['hybrid_score'].mean()
    
#     return results

In [None]:
# Score all chunks based on its similarity with the prompts (keyword score and vector score)
# then, move on to adding them with a weighted parameter alpha