# Analysis (Chapter 1)

### Overview

All datasets can be obtained by running the code below. The data will be stored in the `./analysis_data/` folder. The format is the same as that of the data you used in the previous part of this assignment, so make sure you change the dataset paths in *assignment3.ipynb* to the correct data folder, except for the `common_words` stopwords. Also, set the RESET flag to `True`. Finally, make sure you dig around the dataset you are working with to get familiar with the type of queries and documents, the type of relevance judgements, etc.

### Install

In [None]:
!pip install --upgrade ir_datasets
!pip install ipywidgets

### Import

In [None]:
import ir_datasets
import csv
import os
from tqdm import tqdm
import random

### Function to save data in the .tsv format

In [None]:
def save_tsv(
    folder_name,
    dataset_path,
    concatenate_docs=False,
    doc_text=None,
    extract_partial=False,
    extract=None,
    provide_n_docs=False,
    n_docs=None,
    **kwargs,
):
    """
    Saves queries, documents and qrels in assignment-specific .tsv format.

    Every output file is written only if it does not exist yet, so the
    function can be re-run to fill in whatever is still missing.

    Input:
        - folder_name: name of the folder within the analysis_data folder
        - dataset_path: dataset to be downloaded using ir_datasets package
        - concatenate_docs: if True, the document text is two document
          fields joined by a single space; otherwise field 1 is used
        - doc_text: the two field indices to join when concatenate_docs is True
        - extract_partial: if True, only a random sample of documents is saved
        - extract: number of documents to sample when extract_partial is True
        - provide_n_docs: if True, n_docs is used as the progress-bar total
          instead of counting the documents with an extra pass
        - n_docs: progress-bar total used when provide_n_docs is True
        - **kwargs: accepted for call-site compatibility; not used

    Output (tab-separated files inside analysis_data/<folder_name>):
        - queries.tsv: fields 0 and 1 of every query
        - collection.tsv: field 0 of every document and its text
        - train_pairs_graded.tsv / dev_pairs_graded.tsv / test_pairs_graded.tsv:
          fields 0, 1 and 2 of every qrel, split 80% / 10% / 10% in
          iteration order

    Raises:
        - ValueError: if documents have to be written and concatenate_docs is
          set without two indices in doc_text, or extract_partial is set
          without extract
    """
    folder_path = os.path.join("analysis_data", folder_name)
    os.makedirs(folder_path, exist_ok=True)

    # The handle is needed whenever any file is missing, not only when the
    # folder is new; loading it only on folder creation left `dataset`
    # undefined on every re-run.
    dataset = ir_datasets.load(dataset_path)

    # Queries
    print(f"Extracting queries of {folder_name} ...")
    query_path = os.path.join(folder_path, "queries.tsv")
    if not os.path.exists(query_path):
        # Extra pass over the queries, only to give tqdm a total.
        total_queries = sum(1 for _ in dataset.queries_iter())
        with open(query_path, "w", newline="", encoding="utf-8") as tsv_file:
            tsv_writer = csv.writer(tsv_file, delimiter="\t")
            for query in tqdm(
                dataset.queries_iter(), total=total_queries, desc="Saving queries"
            ):
                tsv_writer.writerow([query[0], query[1]])

    # Docs
    print(f"Extracting documents of {folder_name} ...")
    doc_path = os.path.join(folder_path, "collection.tsv")
    if not os.path.exists(doc_path):
        # Fail before touching the collection rather than midway through it.
        if concatenate_docs and (doc_text is None or len(doc_text) < 2):
            raise ValueError(
                "doc_text must hold two field indices when concatenate_docs is True"
            )
        if extract_partial and extract is None:
            raise ValueError("extract must be set when extract_partial is True")

        if extract_partial:
            # Sampling needs the whole collection in memory.
            # NOTE(review): qrels below are not filtered to the sampled
            # documents -- confirm downstream code tolerates missing doc ids.
            docs = random.sample(list(dataset.docs_iter()), extract)
            total_docs = len(docs)
        else:
            docs = dataset.docs_iter()
            total_docs = (
                n_docs if provide_n_docs else sum(1 for _ in dataset.docs_iter())
            )

        with open(doc_path, "w", newline="", encoding="utf-8") as tsv_file:
            tsv_writer = csv.writer(tsv_file, delimiter="\t")
            # Iterate the selected documents so that extract_partial actually
            # limits what ends up in collection.tsv.
            for doc in tqdm(docs, total=total_docs, desc="Saving docs"):
                if concatenate_docs:
                    text = f"{doc[doc_text[0]]} {doc[doc_text[1]]}"
                else:
                    text = doc[1]
                tsv_writer.writerow([doc[0], text])

    # Qrels
    print(f"Extracting qrels of {folder_name} ...")
    train_file_path = os.path.join(folder_path, "train_pairs_graded.tsv")
    dev_file_path = os.path.join(folder_path, "dev_pairs_graded.tsv")
    test_file_path = os.path.join(folder_path, "test_pairs_graded.tsv")
    split_paths = (train_file_path, dev_file_path, test_file_path)

    # The three files are always written together, so regenerate all of them
    # if any one is missing instead of trusting the train file alone.
    if not all(os.path.exists(path) for path in split_paths):
        total_qrels = sum(1 for _ in dataset.qrels_iter())

        ## Calculate the indices to split the data
        index_80_percent = int(0.8 * total_qrels)
        index_90_percent = int(0.9 * total_qrels)

        with open(
            train_file_path, "w", newline="", encoding="utf-8"
        ) as train_file, open(
            dev_file_path, "w", newline="", encoding="utf-8"
        ) as dev_file, open(
            test_file_path, "w", newline="", encoding="utf-8"
        ) as test_file:

            train_writer = csv.writer(train_file, delimiter="\t")
            dev_writer = csv.writer(dev_file, delimiter="\t")
            test_writer = csv.writer(test_file, delimiter="\t")

            for i, qrel in tqdm(
                enumerate(dataset.qrels_iter()), total=total_qrels, desc="Saving qrels"
            ):
                # First 80% -> train, next 10% -> dev, remainder -> test.
                if i < index_80_percent:
                    writer = train_writer
                elif i < index_90_percent:
                    writer = dev_writer
                else:
                    writer = test_writer
                writer.writerow([qrel[0], qrel[1], qrel[2]])

In [None]:
# Scenario name -> keyword arguments for save_tsv.
# Each row below is (scenario, ir_datasets id, the two document field indices
# that save_tsv joins into the document text).
datasets_to_save = {
    scenario_name: {
        "dataset_path": ir_dataset_id,
        "concatenate_docs": True,
        "doc_text": list(field_indices),
    }
    for scenario_name, ir_dataset_id, field_indices in (
        ("GENOMICS", "medline/2004/trec-genomics-2005", (1, 2)),
        ("ARGS", "argsme/2020-04-01/touche-2021-task-1", (3, 1)),
        ("GAMING", "beir/cqadupstack/gaming", (2, 1)),
        ("NUTRITION", "nfcorpus/train/nontopic", (2, 3)),
        ("CLIMATE", "beir/climate-fever", (2, 1)),
    )
}
# BUSINESS is the only scenario saved without field concatenation.
datasets_to_save["BUSINESS"] = {"dataset_path": "beir/fiqa/train"}

### Did you choose your scenario? Let's get the dataset! 
Please use the Dataset name as specified in the markdown cells above.

In [None]:
# Keep asking until the answer is one of the keys of datasets_to_save.
while True:
    scenario = input("What scenario are you interested in? ")
    if scenario in datasets_to_save:
        break
    print("Please specify a valid scenario.")

# Write the chosen scenario's files using its configured keyword arguments.
print(f"Saving {scenario} ...")
save_tsv(scenario, **datasets_to_save[scenario])

## Comparing Pointwise, Pairwise and Listwise

In the next few cells, we will compare the methods you've implemented. Helper functions are provided for you, which you can use to make some conclusions. You can modify the code as needed!

First, let's have a function that plots the average scores of relevant (levels 3 and 4) and non-relevant (levels 0, 1, and 2) documents over the training epochs for the different loss functions:

In [None]:
# Load metrics and models
import json
import torch
from ltr.model import LTRModel

N_FEATURES = 15


def _restore_model(checkpoint_path):
    """Create an LTRModel(N_FEATURES) and load the state dict stored at checkpoint_path."""
    model = LTRModel(N_FEATURES)
    model.load_state_dict(torch.load(checkpoint_path))
    return model


# One restored model per loss function.
pointwise_temp_model = _restore_model("./outputs/pointwise_model")
pairwise_temp_model = _restore_model("./outputs/pairwise_spedup_model")
listwise_temp_model = _restore_model("./outputs/listwise_model")


# Result file and display label for each method, in plotting order.
methods = [
    {"results_file": "./outputs/pointwise_res.json", "label": "Pointwise"},
    {"results_file": "./outputs/pairwise_spedup_res.json", "label": "Pairwise"},
    {"results_file": "./outputs/listwise_res.json", "label": "Listwise"},
]

# Parallel lists: labels[i], results[i] and q_results[i] describe methods[i].
labels, results, q_results = [], [], []
for method in methods:
    labels.append(method["label"])

    with open(method["results_file"]) as handle:
        payload = json.load(handle)

    # Aggregate test metrics and their per-query breakdown.
    results.append(payload["test_metrics"])
    q_results.append(payload["test_query_level_metrics"])

In the following cells, the `compare_methods` and `plot_distribution` helper functions are imported. You can use them for your analysis and observe how the different loss functions compare to each other.

In [None]:
# `labels` holds one display name per method and `results` the matching
# "test_metrics" entry read from that method's results JSON file.
# NOTE(review): presumably this renders a side-by-side comparison of the
# aggregate test metrics -- see ltr.utils.compare_methods to confirm.
from ltr.utils import compare_methods

compare_methods(labels, results)

In [None]:
# `q_results` holds, per method, the "test_query_level_metrics" entry read
# from that method's results JSON file; `labels` holds the matching names.
# NOTE(review): presumably this plots how the per-query metrics are
# distributed for each method -- see ltr.utils.plot_distribution to confirm.
from ltr.utils import plot_distribution

plot_distribution(labels, q_results)

1. Given the training results and evaluation results, please elaborate on the ranking performance with different loss functions.

2. In this assignment, you extracted N_FEATURES features for each query-document pair. Now, consider adding 1-2 new features, and run the training again. Analyze training performance with the extended feature vectors.

See the Canvas assignment for further details and to submit your results.