# University of Pavia
## Artificial Intelligence BSc
### Information Retrieval and Recommender Systems

### Authors:
 - Michele Ventimiglia
 - Manuel Dellabona

This script is part of the Clinical Trials SE project and is released under the GNU General Public License:
https://www.gnu.org/licenses/gpl-3.0.html#license-text

## Setup

In [1]:
# Run the project's setup check with verbose output enabled; it runs before
# the project modules (`src.*`) are imported further down.
from _setup import check
check(verbose=True)

[94m(i)[0m [SETUP] Checking for virtual environment...
[32mSUCCESS[0m: [SETUP] Virtual environment detected!
[94m(i)[0m [SETUP] Path root added to the environment variables.
[32mSUCCESS[0m: [SETUP] Check completed!


### Libraries

In [2]:
import os
import pickle
import xml.etree.ElementTree as ET
from typing import Union

import pandas as pd
import pyterrier as pt
from tqdm import tqdm
from pyterrier.measures import RR, P, Rprec, R

from src.search.llm import LLM
from src.preprocessing.indexing import Indexer
from src.preprocessing.transform import Transformer

### Paths

In [3]:
# Locations used throughout the notebook. The values are Windows-style
# placeholders ("path\to\...") and are written as raw strings so each
# backslash appears once.
paths = dict(
    DATA=r"path\to\ClinicalTrialsSE\data",
    DATASET=r"path\to\ClinicalTrialsSE\data\test",
    INDEXING_FILES=r"path\to\ClinicalTrialsSE\data\index",
    LLAMA=r"path\to\Llama2\llama-2-7b-chat.Q5_K_M.gguf",
    MISTRAL=r"path\to\Mistral\mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    BERT=r"path\to\ClinicalBERT",
    JDK=r"path\to\Java\jdk-21\bin",
    EVAL=r"path\to\ClinicalTrialsSE\data\eval",
)

In [4]:
# Both evaluation inputs (topics and relevance judgements) sit in the EVAL directory.
eval_dir = paths['EVAL']
queries_path = os.path.join(eval_dir, 'topics2022.xml')
qrles_path = os.path.join(eval_dir, 'clc_qrels2022.txt')

### Classes

In [5]:
# Build the indexer from the configured JDK and index-file directories,
# with verbose output off, then call its load() method.
indexer_options = {
    'jdk_path': paths['JDK'],
    'file_dir': paths['INDEXING_FILES'],
    'verbose': False,
}
indexer = Indexer(**indexer_options)

indexer.load()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



In [6]:
# Transformer instance whose process_query method is applied to query text in
# later cells; created with an empty save path and verbose output off.
transformer = Transformer(save_path="", verbose=False)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\mikiv\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Preprocessing

In [7]:
def parse_xml_queries(
        file_path: Union[str, os.PathLike]
    ) -> pd.DataFrame:
    """
    Parses queries from an XML file.

    Every `topic` element found below the document root becomes one row: its
    `number` attribute is the query id, and its text content (all text pieces
    joined with spaces and stripped) is passed through the module-level
    `transformer.process_query` to produce the query text.

    Args:
        file_path (Union[str, os.PathLike]): Path to the XML file containing queries.

    Returns:
        pandas.DataFrame: DataFrame containing 'qid' and 'query' columns. The
            columns are present even when the file holds no topic. If parsing
            or processing fails, the error is printed and an empty DataFrame
            with the same two columns is returned.
    """

    columns = ['qid', 'query']

    try:
        root = ET.parse(file_path).getroot()

        queries = [
            {
                'qid': topic.get('number'),
                'query': transformer.process_query(' '.join(topic.itertext()).strip()),
            }
            for topic in root.findall('.//topic')
        ]

        # Passing the column list keeps the schema stable when no topic matched;
        # without it an empty list would yield a frame with no columns at all.
        return pd.DataFrame(queries, columns=columns)

    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
        return pd.DataFrame(columns=columns)
    except Exception as e:
        # Deliberate best effort: any other failure (missing file, processing
        # error) is reported and mapped to the same empty result.
        print(f"Unexpected error: {e}")
        return pd.DataFrame(columns=columns)
    
# Parse the topics file once; the resulting frame is reused by the cells below.
queries = parse_xml_queries(file_path=queries_path)

In [8]:
def _count_tokens(text):
    """Return the number of whitespace-separated tokens in `text`."""
    return len(text.split())


# Mean token count over all processed queries; printed truncated to an integer.
query_token_counts = queries['query'].apply(_count_tokens)
average_queries_length = query_token_counts.mean()
print("Average Length:", int(average_queries_length))

Average Length: 54


In [9]:
# Column layout of the relevance-judgement frame built below.
qrel_columns = ['qid', 'docno', 'label']
qrel_rows = []

# Only lines with exactly four whitespace-separated fields are kept; the
# second field is not used. Other lines are skipped silently.
# The encoding is explicit so the file is read the same way on every platform.
with open(qrles_path, 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.split()
        if len(parts) == 4:
            qid, _, docno, label = parts
            qrel_rows.append({'qid': qid, 'docno': docno, 'label': int(label)})

# Passing the column list keeps the three columns present even when no line
# matched, so an empty file does not produce a frame without columns.
qrels = pd.DataFrame(qrel_rows, columns=qrel_columns)

### Retrieval Systems

The output of your experiments should be dataframes with columns:

 - `qid`: the query id (1-50)
 - `docid`: Terrier's internal integer id for each document
 - `docno`: the external (string) unique identifier for each document - Clinical trials ID
 - `score`: the output of your ranking model
 - `rank`: A handy attribute showing the descending order by score
 - `query`: the input query as text


 For each query you should return 1000 documents. Therefore, your result dataframe should have 50,000 rows for the 50 queries in the dataset.


Baselines:

In [10]:
# The two unexpanded retrieval systems exposed by the indexer.
bm25, tf_idf = indexer.bm25, indexer.tf_idf

Expanded Retrieval Systems:

In [11]:
# Shared query-expansion settings, passed as `ndocs` / `nterms` to expand_query.
nd = 10
nt = int(average_queries_length / 2)  # half the average query length, truncated

In [12]:
def _expand_with(baseline, expansion):
    """Call indexer.expand_query for one baseline/expansion pair with the shared nt and nd."""
    return indexer.expand_query(
        baseline=baseline,
        expansion=expansion,
        nterms=nt,
        ndocs=nd,
    )


# TF-IDF baseline combined with each expansion method.
rm3_tf_idf = _expand_with('TF_IDF', 'RM3')
bo1_tf_idf = _expand_with('TF_IDF', 'Bo1')
kl_tf_idf = _expand_with('TF_IDF', 'KL')
ax_tf_idf = _expand_with('TF_IDF', 'Ax')

# BM25 baseline combined with each expansion method.
rm3_bm25 = _expand_with('BM25', 'RM3')
bo1_bm25 = _expand_with('BM25', 'Bo1')
kl_bm25 = _expand_with('BM25', 'KL')
ax_bm25 = _expand_with('BM25', 'Ax')

### LLM PCA Expansions

In [13]:
def process_queries_with_model(
        model_name: str,
        model_path: Union[str, os.PathLike],
        queries: pd.DataFrame,
        indexer: Indexer,
        transformer: Transformer
    ) -> list:
    """
    Expands every query with an LLM and retrieves documents for the expanded text.

    The model at `model_path` is opened once and used for all queries. Each
    query is expanded with `model.expand_query`, passed through
    `transformer.process_query`, and run through `indexer.retrieve` with the
    'TF_IDF' baseline and no further expansion.

    Errors are printed, not raised. A query that fails is skipped. A failure
    outside the per-query handling (for example while opening the model) stops
    processing, and the rows collected so far are returned.

    Args:
        model_name (str): Label used in the progress bar and in error messages.
        model_path (Union[str, os.PathLike]): Path handed to `LLM`.
        queries (pandas.DataFrame): Frame with 'qid' and 'query' columns; it is
            iterated row by row.
        indexer (Indexer): Object providing `retrieve`.
        transformer (Transformer): Object providing `process_query`.

    Returns:
        list: One dict per retrieved document, with the keys 'qid', 'docno',
            'score' and 'rank'.
    """

    retrieved_rows = []

    try:
        with LLM(model_path=model_path, verbose=True) as model:
            progress = tqdm(
                queries.iterrows(),
                total=queries.shape[0],
                desc=f"Expanding Queries with {model_name}"
            )
            for _, row in progress:
                # Read the id before entering the try block so the handler
                # below always names the query that actually failed, never an
                # unbound name or the id left over from a previous iteration.
                qid = row['qid']
                try:
                    expanded_query, _ = model.expand_query(row['query'])
                    expanded_query = transformer.process_query(expanded_query)

                    results = indexer.retrieve(
                        query = expanded_query,
                        baseline = 'TF_IDF',
                        expansion = None
                    )

                    # Walk the three needed columns in lockstep instead of
                    # materialising a Series per row with iterrows().
                    retrieved_rows.extend(
                        {'qid': qid, 'docno': docno, 'score': score, 'rank': rank}
                        for docno, score, rank in zip(
                            results['docno'], results['score'], results['rank']
                        )
                    )

                except Exception as e:
                    # Best effort per query: report and move on to the next one.
                    print(f"Error processing query ID {qid}: {e}")

    except Exception as e:
        print(f"Error with model {model_name}: {e}")

    return retrieved_rows

In [14]:
# Per-model retrieval results, keyed by the model's entry in `paths`.
model_data = {}

# Column layout of the per-model result frames.
result_columns = ['qid', 'docno', 'score', 'rank']

for model_key in ['LLAMA', 'MISTRAL']:
    results_file = os.path.join(paths['EVAL'], f'{model_key.lower()}_qrels_df.pkl')

    if os.path.exists(results_file):
        # Reuse results cached by an earlier run.
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(results_file, 'rb') as file:
            model_data[model_key] = pickle.load(file)
        continue

    model_queries = process_queries_with_model(model_key, paths[model_key], queries, indexer, transformer)
    # Fixing the columns keeps the frame's schema even when nothing was retrieved.
    model_queries_df = pd.DataFrame(model_queries, columns=result_columns)
    model_data[model_key] = model_queries_df

    if model_queries_df.empty:
        # process_queries_with_model reports failures by printing and returning
        # what it has, possibly nothing. Writing an empty frame to disk would
        # make every later run reuse it instead of retrying the model.
        print(f"No results produced for {model_key}; cache file not written.")
        continue

    with open(results_file, 'wb') as file:
        pickle.dump(model_queries_df, file)

## Evaluate

### Evaluation Metrics

In [17]:
def _measures_for(rel):
    """Return the seven evaluation measures configured with the given `rel` value."""
    return [
        RR(rel=rel)@1000,
        P(rel=rel)@1,
        P(rel=rel)@10,
        P(rel=rel)@30,
        Rprec(rel=rel),
        R(rel=rel)@10,
        R(rel=rel)@30,
    ]


# Same measures, first with rel=1 and then with rel=2, in that order.
metrics_eval = _measures_for(1) + _measures_for(2)

### Retrieval & Results

In [18]:

# Each display name is kept next to the system it labels. The LLAMA and
# MISTRAL entries are the result frames stored in `model_data`.
labelled_systems = [
    ("BM25", bm25),
    ("TF-IDF", tf_idf),
    ("RM3 BM25", rm3_bm25),
    ("Bo1 BM25", bo1_bm25),
    ("KL BM25", kl_bm25),
    ("Ax BM25", ax_bm25),
    ("RM3 TF-IDF", rm3_tf_idf),
    ("Bo1 TF-IDF", bo1_tf_idf),
    ("KL TF-IDF", kl_tf_idf),
    ("Ax TF-IDF", ax_tf_idf),
    ("Llama TF-IDF", model_data['LLAMA']),
    ("Mistral TF-IDF", model_data['MISTRAL']),
]

# pt.Experiment takes the systems and their names as two parallel lists.
system_names = [label for label, _ in labelled_systems]
systems = [system for _, system in labelled_systems]

results = pt.Experiment(
    retr_systems=systems,
    topics=queries,
    qrels=qrels,
    eval_metrics=metrics_eval,
    names=system_names,
    baseline=None,
    perquery=False,
    verbose=True,
)

display(results)

pt.Experiment:   0%|          | 0/12 [00:00<?, ?system/s]

pt.Experiment: 100%|██████████| 12/12 [03:05<00:00, 15.48s/system]


Unnamed: 0,name,RR@1000,P@1,P@10,P@30,Rprec,R@10,R@30,RR(rel=2)@1000,P(rel=2)@1,P(rel=2)@10,P(rel=2)@30,Rprec(rel=2),R(rel=2)@10,R(rel=2)@30
0,BM25,0.630032,0.5,0.42,0.326667,0.194513,0.045501,0.09534,0.449254,0.3,0.24,0.186667,0.150428,0.049051,0.101205
1,TF-IDF,0.673661,0.56,0.424,0.326,0.192321,0.044586,0.095882,0.470649,0.32,0.252,0.188,0.14869,0.047933,0.101053
2,RM3 BM25,0.652465,0.54,0.456,0.368,0.241935,0.049629,0.108051,0.466893,0.36,0.272,0.218667,0.180415,0.05585,0.121054
3,Bo1 BM25,0.626433,0.52,0.472,0.389333,0.253583,0.049985,0.113118,0.453888,0.3,0.286,0.228,0.185849,0.059437,0.123288
4,KL BM25,0.63636,0.52,0.478,0.395333,0.253434,0.051385,0.115733,0.467576,0.32,0.288,0.232667,0.187481,0.059698,0.128193
5,Ax BM25,0.634841,0.5,0.422,0.329333,0.194684,0.045787,0.097733,0.456397,0.3,0.242,0.189333,0.150746,0.049421,0.103936
6,RM3 TF-IDF,0.688759,0.6,0.468,0.373333,0.246886,0.049577,0.107231,0.458326,0.34,0.27,0.211333,0.178719,0.055828,0.109924
7,Bo1 TF-IDF,0.676687,0.58,0.488,0.396667,0.253916,0.05235,0.115131,0.498353,0.38,0.314,0.231333,0.191128,0.062604,0.120811
8,KL TF-IDF,0.664428,0.54,0.486,0.397333,0.254091,0.052444,0.116135,0.48765,0.34,0.31,0.232667,0.194068,0.062564,0.12221
9,Ax TF-IDF,0.681439,0.56,0.424,0.326667,0.192321,0.044586,0.096993,0.478427,0.32,0.252,0.188667,0.14869,0.047933,0.102303
