# Information Retrieval


>
> Goal: We have to find the best method of retrieving info from the Climate Policy Radar's Huggingface dataset based on a user's query or information need.
>

>
> Specify your goal:
> 1. Select chunks with exact keyword match with the query
> 2. Select chunks with same semantic meaning to query
> 3. Rerank the chunks based on their relevance to the query
> 4. Define a threshold whether the chunks would be useful to the user or not
> 5. Hyperlinks
>

## 1. Preperations

### 1.1 Import libraries and functions

In [1]:
# pip install fuzzywuzzy
# pip install rank_bm25

In [2]:
import importlib
import retrieval
importlib.reload(retrieval)



<module 'retrieval' from '/Users/jessiefung/Desktop/DS205/group-6-final-project/retrieval.py'>

In [3]:

from dotenv import load_dotenv
import os
from transformers import AutoTokenizer, AutoModel
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors, Word2Vec
from gensim.utils import simple_preprocess
from sqlalchemy import create_engine, text

from retrieval import boolean_search, bm25_search, fuzzy_search, vector_search, df_with_similarity_score, hybrid_scoring
from functions import generate_word2vec_embedding_for_text, generate_embeddings_for_text

Downloading climatebert model and tokenizer

In [4]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

EMBEDDING_MODEL_LOCAL_DIR = os.getenv('EMBEDDING_MODEL_LOCAL_DIR')
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")

climatebert_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)
climatebert_model = AutoModel.from_pretrained(EMBEDDING_MODEL_LOCAL_DIR)

Some weights of RobertaModel were not initialized from the model checkpoint at local_model/climatebert/distilroberta-base-climate-f and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading word2vec model

In [5]:
custom_w2v = Word2Vec.load("./local_model/custom_word2vec_768.model")

### 1.2 Introduce a prompt

#### 1.2.1 Define a prompt and exact keywords

In [6]:
# Introducing a prompt based on ASCOR CP1.a
prompt = "Does the country have a decarbonisation strategy to meet Paris Agreement that they are implementing or in the national legislation?"
keywords = prompt.split(" ")

#### 1.2.2 Generate similar words to keywords

In [7]:
# Generate similar words using word2vec model to prompt's keywords and store them for keyword search
keywords = simple_preprocess(prompt)
similar_words = []

# For each keyword, try to find similar words
for keyword in keywords:
    try:
        # Only get similar words if keyword exists in vocabulary
        if keyword in custom_w2v.wv:
            similar = custom_w2v.wv.most_similar(keyword, topn=5)  # Get top 5 similar words
            similar_words.extend([word for word, score in similar])
    except KeyError:
        # Skip words not in vocabulary
        continue

# Combine original keywords with similar words
all_search_terms = list(set(keywords + similar_words))

print("Original keywords:", keywords)
print("\nExpanded keywords:", all_search_terms)

Original keywords: ['does', 'the', 'country', 'have', 'decarbonisation', 'strategy', 'to', 'meet', 'paris', 'agreement', 'that', 'they', 'are', 'implementing', 'or', 'in', 'the', 'national', 'legislation']

Expanded keywords: ['other', 'legislation', 'under', 'evaluation', 'stages', 'finance', 'resilience', 'these', 'procedure', 'after', 'specify', 'does', 'sustain', 'considerations', 'annual', 'include', 'notification', 'policy', 'fallow', 'specific', 'level', 'considered', 'requirements', 'period', 'strategy', 'or', 'works', 'which', 'proven', 'performance', 'legislative', 'paris', 'in', 'third', 'implementing', 'agency', 'depositary', 'consent', 'program', 'medium', 'that', 'submit', 'up', 'credit', 'relevant', 'project', 'than', 'maintenance', 'exceed', 'ceiling', 'researches', 'inspection', 'specifications', 'species', 'parameters', 'country', 'forest', 'meet', 'technologies', 'if', 'may', 'carry', 'sphere', 'goal', 'each', 'coordination', 'are', 'prejudice', 'have', 'biodiversity

Generate embeddings for the prompt

In [8]:
# Convert prompt into embeddings
prompt_w2v_embeddings = generate_word2vec_embedding_for_text(prompt, custom_w2v)

prompt_climatebert_embeddings = generate_embeddings_for_text(prompt, climatebert_model, climatebert_tokenizer)

### 1.3 Load the dataframe from the database

In [9]:

from sqlalchemy import create_engine, text
import os
engine = create_engine(os.getenv("DB_URL"))

df = pd.read_sql("SELECT * FROM document_embeddings", engine)
df.head()

Unnamed: 0,id,document_id,document_title,country_code,original_text,source_hyperlink,climatebert_embedding,word2vec_embedding
0,1,CCLW.document.i00000002.n0000,National Energy and Climate Plan 2019 Draft,ALB,Modelling Scenario Considered Type of Instrument,https://www.energy-community.org/dam/jcr:a0c2b...,"[-0.019798215,0.1386577,-0.003902942,-0.052189...","[-0.013072814,-0.06213421,0.010352968,0.044705..."
1,2,CCLW.document.i00000002.n0000,National Energy and Climate Plan 2019 Draft,ALB,EE targets based on Article 3 of Directive 201...,https://www.energy-community.org/dam/jcr:a0c2b...,"[0.025648404,0.13442199,0.007120967,-0.0026666...","[0.009358346,-0.0819107,0.031888146,0.05424338..."
2,3,CCLW.document.i00000002.n0000,National Energy and Climate Plan 2019 Draft,ALB,· Energy savings goal referring to final energ...,https://www.energy-community.org/dam/jcr:a0c2b...,"[-0.024586704,0.12194323,-0.01636592,-0.096064...","[0.011089353,-0.12704337,0.071228385,0.0793931..."
3,4,CCLW.document.i00000002.n0000,National Energy and Climate Plan 2019 Draft,ALB,Reducing energy intensity of GDP by 18% by 2030.,https://www.energy-community.org/dam/jcr:a0c2b...,"[-0.02417453,0.11885291,-0.015361838,-0.095164...","[0.01414451,-0.12517907,0.050284933,0.04082997..."
4,5,CCLW.document.i00000002.n0000,National Energy and Climate Plan 2019 Draft,ALB,(2) the cumulative amount of end-use energy sa...,https://www.energy-community.org/dam/jcr:a0c2b...,"[0.06390555,0.119599454,0.004860446,-0.131328,...","[-0.00065909955,-0.14912702,0.08619301,0.10798..."


## 2. Retrieving Relevant Chunks

### 2.1 Keyword-based Retrieval
### Goal 1: select chunks with exact keyword match with the query

We can retrieve the chunks that contain the exact keyword match with the prompt

Retrieve top k chunks from all techniques and see which answer provides the most sensible context to the prompt

In [10]:
top_k_boolean_chunks = boolean_search(all_search_terms, df, k=25)
relevant_boolean = top_k_boolean_chunks[['original_text', 'boolean_score']]
print('Relevant chunks based on boolean search:')
relevant_boolean.head(5)


Relevant chunks based on boolean search:


Unnamed: 0,original_text,boolean_score
253,Albania is a signatory to the United Nations F...,0.234694
2922,Existing national plans that have been conside...,0.234694
8003,Applying the Instruction No. 6527 of 24.12.200...,0.22449
3910,- Determining the minimum annual amount of bio...,0.193878
9858,Achieving a substantial decarbonization of the...,0.193878


In [11]:
top_k_bm25_chunks = bm25_search(all_search_terms, df, k=25)
relevant_bm25 = top_k_bm25_chunks[['original_text', 'bm25_score']]
print('Relevant chunks based on BM25 search:')
relevant_bm25.head(5)

Relevant chunks based on BM25 search:


Unnamed: 0,original_text,bm25_score
15484,The National Environmental Agency,1.0
3541,"National Environmental Agency, Regional Enviro...",0.909425
13981,These are activities that are carried out rout...,0.868807
145,· Application of new technologies in transmiss...,0.852757
14846,While a project has first been classified as r...,0.806711


In [12]:
top_k_fuzzy_chunks = fuzzy_search(prompt, df, k=50)
relevant_fuzzy = top_k_fuzzy_chunks[['original_text', 'fuzzy_score']]
print('Relevant chunks based on fuzzy search:')
relevant_fuzzy.head(5)

Relevant chunks based on fuzzy search:


Unnamed: 0,original_text,fuzzy_score
6994,Decarbonisation,1.0
14883,National,1.0
12763,the,1.0
4312,1. Decarbonisation,0.94
7987,"Decarbonisation, RES",0.88


As we can see, the retrieved chunks from boolean search or bm25 ranking makes more sense in answering the question in the prompt. Fuzzy string matching, on the other side, yields mostly numbers. Though its stregth lies in its ability to identify some pattern mathcing, it might not be useful in this case.

## 2.2 Semantic Retrieval

### Goal 2: select chunks with same semantic meaning to prompt

We can retrieve the chunks that contain the same semantic meaning to the given prompt. Or, we can use the ones that already have the keywords, and compare similarity score (if not, there is a disconnect on keyword and semantic search. If not, we can compare if the semantic performed better or worse from the keywprd search). Based on the chunks retrieved, we can assess whether Word2Vec or ClimateBERT performs better

Method: Pure embeddings comparison of query and chunks, (reranking based on weighted score) fusion retrieval, (prompt engineering)adaptive retrieval, (reranking) RSE, langchain Q&A, Contextual compression

In [13]:
# Vector search
# 1. Get the embeddings of the prompt
# 2. Get the embeddings of the chunks
# 3. Calculate the cosine similarity between the prompt and the chunks
# 4. Get the top k chunks with the highest cosine similarity
# 5. Return the top k chunks with the highest cosine similarity

#Make this fucntion adaptable for both transformer and word2vec embeddings

In [14]:

climatebert_results = vector_search(
    prompt_embeddings=np.array(prompt_climatebert_embeddings),
    embedding_type='climatebert',
    top_k=25
)

print("Top 25 results using ClimateBERT:")
print(climatebert_results[['original_text', 'similarity_score']].head(25))

w2v_results = vector_search(
    prompt_embeddings=np.array(prompt_w2v_embeddings),
    embedding_type='word2vec',
    top_k=25
)

print("\nTop 25 results using Word2Vec:")
print(w2v_results[['original_text', 'similarity_score']].head(25))

Top 25 results using ClimateBERT:
                                           original_text  similarity_score
232                  Policy context of the national plan          1.000000
15625                                      adaptation is          0.995640
558    Key policies are the INDC and the National Str...          0.993790
2517   Table 11: Overview table of key policies affec...          0.992464
12209  Table 11: Overview table of key policies affec...          0.989046
581    Albania's RES Plan is the key policy and is ha...          0.978735
330    National energy system and policy context of t...          0.978496
3009                           National Energy Strategy:          0.978432
13986  Unable to provide information on the status of...          0.977479
1746                 No report available on the progress          0.976261
3708                No report available on the progress.          0.973428
297                     National Climate Change Strategy          

In [15]:

df_similarity_score = df_with_similarity_score(
    prompt_embeddings_w2v=np.array(prompt_w2v_embeddings),
    prompt_embeddings_climatebert=np.array(prompt_climatebert_embeddings),
    top_k=None
)
df_similarity_score.head(5)

bm25_df = bm25_search(all_search_terms, df_similarity_score, k=None)
bm25_df.head(5)

Unnamed: 0,document_id,country_code,document_title,original_text,source_hyperlink,w2v_score,climatebert_score,avg_score,bm25_score
14846,CCLW.document.i00000964.n0000,ALB,Albania’s National Adaptation Plan First - pro...,While a project has first been classified as r...,https://napglobalnetwork.org/wp-content/upload...,0.998778,0.762567,0.880673,1.0
145,CCLW.document.i00000002.n0000,ALB,National Energy and Climate Plan 2019 Draft,· Application of new technologies in transmiss...,https://www.energy-community.org/dam/jcr:a0c2b...,0.992594,0.74853,0.870562,0.873906
3541,CCLW.document.i00000002.n0000,ALB,National Energy and Climate Plan 2019 Draft,"National Environmental Agency, Regional Enviro...",https://www.energy-community.org/dam/jcr:a0c2b...,0.953775,0.627351,0.790563,0.872762
13981,CCLW.document.i00000964.n0000,ALB,Albania’s National Adaptation Plan First - pro...,These are activities that are carried out rout...,https://napglobalnetwork.org/wp-content/upload...,0.998319,0.739317,0.868818,0.841213
15,CCLW.document.i00000002.n0000,ALB,National Energy and Climate Plan 2019 Draft,· Renovation of the stock of public buildings ...,https://www.energy-community.org/dam/jcr:a0c2b...,0.996294,0.799702,0.897998,0.830055


Interestingly, the Word2vec model seems to retrieve more useful information than climateBERT model, contrary to our expectations. This may reflect weaknesses in embeddings generation or the model itself. 

However, we have yet to exactly identify is the answers make sense or not, which will be tested in the LLM evaluation phase.

(Extra: if have time)

### Goal 3: Rank the chunks based on their relevance to the prompt

We can rerank the chunks based on their relevance to the prompt. Much like the widely knwon hybrid search, we will sum the sparse score (from the chosen keyword technique) and dense score (from embeddings) with weghted parameter alpha. 


In [16]:
# Example usage of hybrid scoring
try:
    hybrid_results = hybrid_scoring(bm25_df, alpha=0.5)
    print("Top results using hybrid scoring:")
    print(hybrid_results[['original_text', 'hybrid_score']].head(50))
except KeyError as e:
    print(f"Error: Missing required column - {e}")

Top results using hybrid scoring:
                                           original_text  hybrid_score
14846  While a project has first been classified as r...      0.881284
15     · Renovation of the stock of public buildings ...      0.814878
145    · Application of new technologies in transmiss...      0.811218
15656  Enabling Albania to prepare its Third National...      0.791114
13981  These are activities that are carried out rout...      0.790265
2142   It can be summarized that there is policy and ...      0.786155
495    · 'planned policies and measures' means option...      0.764342
13952  · fulfill reporting requirements under the UNF...      0.762451
15484                  The National Environmental Agency      0.761986
166    It is the objective to transform Albania towar...      0.761645
3541   National Environmental Agency, Regional Enviro...      0.750056
15205  . the beneficiary ownership of project has oft...      0.747027
15321  2. Mainstreaming climate change into

In [17]:

# def tune_alpha():
#     """
#     Tune alpha to find the best combination of sparse and dense scores
#     """
#     # Create a list of alpha values to test
#     alpha_values = [i/10 for i in range(0, 11)]
    
#     # Initialize a dictionary to store the results
#     results = {}
    
#     # Loop through each alpha value
#     for alpha in alpha_values:
#         # Compute the hybrid score
#         df = hybrid_scoring(alpha)
        
#         # Store the results
#         results[alpha] = df['hybrid_score'].mean()
    
#     return results

In [18]:
# Score all chunks based on its similarity with the prompts (keyword score and vector score)
# then, move on to adding them with a weighted parameter alpha