In [1]:
import bz2
import json
import os
from datetime import datetime
import argparse

from loguru import logger
from tqdm.auto import tqdm


def load_data_in_batches(dataset_path, batch_size, split=-1):
    """
    Generator function that reads a bz2-compressed JSON-lines file and yields batches of data.
    Each batch is a dictionary of parallel lists holding interaction_ids, queries, search results,
    query times, answers, domains, static_or_dynamic labels and question types.

    Args:
    dataset_path (str): Path to the dataset file.
    batch_size (int): Number of data items in each batch.
    split (int): When not -1, only items whose "split" field equals this value are kept.
        The default (-1) keeps every item and never reads the "split" field.

    Yields:
    dict: A batch of data. The final batch may hold fewer than `batch_size` items.

    Raises:
    FileNotFoundError: If `dataset_path` does not exist (logged, then re-raised).
    IOError: If reading the file fails (logged, then re-raised).
    """

    def initialize_batch():
        """ Helper function to create an empty batch. """
        return {"interaction_id": [], "query": [], "search_results": [], "query_time": [], "answer": [], "domain": [], "static_or_dynamic": [], "question_type": []}

    try:
        # Decode explicitly as UTF-8 so the result does not depend on the machine's locale.
        with bz2.open(dataset_path, "rt", encoding="utf-8") as file:
            batch = initialize_batch()
            for line in file:
                try:
                    item = json.loads(line)
                except json.JSONDecodeError:
                    # loguru exposes `warning`, not `warn`; a bad line is skipped, not fatal.
                    logger.warning("Warning: Failed to decode a line.")
                    continue

                if split != -1 and item["split"] != split:
                    continue

                for key in batch:
                    batch[key].append(item[key])

                if len(batch["query"]) == batch_size:
                    yield batch
                    batch = initialize_batch()
            # Yield any remaining data as the last batch
            if batch["query"]:
                yield batch
    except FileNotFoundError:
        logger.error(f"Error: The file {dataset_path} was not found.")
        raise
    except IOError:
        logger.error(f"Error: An error occurred while reading the file {dataset_path}.")
        raise


def generate_predictions(dataset_path, model, split):
    """
    Processes batches of data from a dataset to generate predictions using a model.

    Args:
    dataset_path (str): Path to the dataset.
    model (object): UserModel that provides `get_batch_size()` and `batch_generate_answer()` interfaces.
    split (int): Dataset split forwarded to `load_data_in_batches` (-1 keeps every item).

    Returns:
    tuple: A tuple containing lists of queries, ground truths, and predictions.
    """
    queries, ground_truths, predictions = [], [], []
    batch_size = model.get_batch_size()

    for batch in tqdm(load_data_in_batches(dataset_path, batch_size, split), desc="Generating predictions"):
        # Take the answers out of the batch before handing it to the model.
        batch_ground_truths = batch.pop("answer")
        batch_predictions = model.batch_generate_answer(batch)

        queries.extend(batch["query"])
        ground_truths.extend(batch_ground_truths)
        predictions.extend(batch_predictions)

    return queries, ground_truths, predictions


if __name__ == "__main__":
    # Script entry point: parse the options, build the selected model, create the
    # output directory and run prediction generation.
    # NOTE(review): inside a Jupyter kernel, sys.argv may carry kernel arguments that
    # parse_args() rejects — confirm how this cell is meant to be invoked.
    parser = argparse.ArgumentParser()

    parser.add_argument("--dataset_path", type=str, default="example_data/dev_data.jsonl.bz2",
                        choices=["example_data/dev_data.jsonl.bz2", # example data
                                 "data/crag_task_1_dev_v4_release.jsonl.bz2", # full data
                                 ])
    parser.add_argument("--split", type=int, default=-1,
                        help="The split of the dataset to use. This is only relevant for the full data: "
                             "0 for public validation set, 1 for public test set")

    parser.add_argument("--model_name", type=str, default="vanilla_baseline",
                        choices=["vanilla_baseline",
                                 "rag_baseline"
                                 # add your model here
                                 ],
                        )

    parser.add_argument("--llm_name", type=str, default="meta-llama/Llama-3.2-3B-Instruct",
                        choices=["meta-llama/Llama-3.2-3B-Instruct",
                                 "google/gemma-2-2b-it",
                                 # can add more llm models here
                                 ])
    parser.add_argument("--is_server", action="store_true", default=False,
                        help="Whether we use vLLM deployed on a server or offline inference.")
    parser.add_argument("--vllm_server", type=str, default="http://localhost:8088/v1",
                        help="URL of the vLLM server if is_server is True. The port number may vary.")

    args = parser.parse_args()
    # Debug output of the parsed --is_server flag.
    print(args.is_server)

    dataset_path = args.dataset_path
    # First path component of the chosen dataset: "example_data" or "data".
    dataset = dataset_path.split("/")[0]
    # --split is only honoured for the full data; the example data always uses -1 (no filtering).
    split = -1
    if dataset == "data":
        split = args.split
        if split == -1:
            raise ValueError("Please provide a valid split value for the full data: "
                             "0 for public validation set, 1 for public test set.")
    # Dataset paths are resolved relative to the parent of the working directory.
    dataset_path = os.path.join("..", dataset_path)

    llm_name = args.llm_name
    # Last path component of the model id, used as the output sub-directory name.
    _llm_name = llm_name.split("/")[-1]
    
    model_name = args.model_name
    # Model modules are imported lazily so only the selected baseline is loaded.
    if model_name == "vanilla_baseline":
        from vanilla_baseline import InstructModel
        model = InstructModel(llm_name=llm_name, is_server=args.is_server, vllm_server=args.vllm_server)
    elif model_name == "rag_baseline":
        from rag_baseline import RAGModel
        model = RAGModel(llm_name=llm_name, is_server=args.is_server, vllm_server=args.vllm_server)
    # elif model_name == "your_model":
    #     add your model here
    else:
        raise ValueError("Model name not recognized.")

    # make output directory
    output_directory = os.path.join("..", "output", dataset, model_name, _llm_name)
    os.makedirs(output_directory, exist_ok=True)

    # Generate predictions
    queries, ground_truths, predictions = generate_predictions(dataset_path, model, split)

    # NOTE(review): saving is currently disabled, so the predictions are not written
    # to output_directory — confirm whether this is intentional.
    # # save predictions
    # json.dump({"queries": queries, "ground_truths": ground_truths, "predictions": predictions},
    #           open(os.path.join(output_directory, "predictions.json"), "w"), indent=4)


In [66]:
import bz2
import json
from bs4 import BeautifulSoup
# Peek at the first record of the full dev set: print its metadata and the plain
# text extracted from each retrieved page.
dataset_path = '/home/jupyter/cs245-project-crag-master/data/crag_task_1_dev_v4_release.jsonl.bz2'
with bz2.open(dataset_path, "rt") as file:
    for line in file:
        item = json.loads(line)
        print(item['question_type'])
        print(item['query_time'])
        print(item['query'])
        # A record with no search results would make the [0] lookup raise IndexError.
        if item['search_results']:
            print(item['search_results'][0].keys())
        for page in item['search_results']:
            print(page['page_url'])
            print(page['page_last_modified'])
            # Name the parser explicitly: otherwise Beautiful Soup picks whichever
            # parser is installed, so results can differ between environments.
            soup = BeautifulSoup(page['page_result'], "html.parser")
            text = soup.get_text(" ", strip=True)  # Use space as a separator, strip whitespaces
            print(text)
        # Only the first record is inspected.
        break

post-processing
03/10/2024, 23:19:21 PT
how many 3-point attempts did steve nash average per game in seasons he made the 50-40-90 club?
dict_keys(['page_name', 'page_url', 'page_snippet', 'page_result', 'page_last_modified'])
https://www.basketball-reference.com/players/n/nashst01.html
 Mon, 11 Mar 2024 06:06:35 GMT
Steve Nash Stats, Height, Weight, Position, Draft Status and more | Basketball-Reference.com Sports Reference ® Baseball Football (college) Basketball (college) Hockey Calcio Blog Stathead ® Immaculate Grid Questions or Comments? Welcome · Your Account Logout Ad-Free Login Create Account MENU Players Teams Seasons Leaders Scores WNBA Draft Stathead Newsletter Full Site Menu Below You are here: BBR Home Page > Players > N > Steve Nash Welcome · Your Account Logout Ad-Free Login Create Account Steve Nash Stephen John Nash ▪ Twitter : SteveNash (MVSteve, Two Time, Nashty) Position: Point Guard


  
  ▪ Shoots: Right 6-3 , 195lb (190cm, 88kg) Born: February 7 , 1974 in Johannes

In [2]:
def analyze_query_types(file_path, eval_type="question_type"):
    """
    Print a per-category table of evaluation metrics read from a results JSON file.

    The file is read as `data[eval_type][eval_type]`, a mapping from category name
    to a dict whose "evaluation_results" entry holds the metrics.

    Args:
    file_path (str): Path to the detailed evaluation results JSON file.
    eval_type (str): Top-level key selecting the breakdown to report. Defaults to
        "question_type" so that single-argument calls keep working.

    Returns:
    None: The top-level keys of the file and the table are printed to stdout.
    """
    import json
    import pandas as pd

    # Keys read from each category's "evaluation_results", in table column order.
    metric_keys = (
        "score", "exact_accuracy", "accuracy", "hallucination", "missing",
        "n_miss", "n_correct", "n_correct_exact", "total",
    )

    # Load JSON file
    with open(file_path, 'r', encoding="utf-8") as file:
        data = json.load(file)
    print(data.keys())

    # Extract one row per category of the requested breakdown
    categories = data[eval_type][eval_type]
    extracted_data = []
    for category, metrics in categories.items():
        eval_results = metrics["evaluation_results"]
        extracted_data.append([category] + [eval_results[key] for key in metric_keys])

    # Create DataFrame
    columns = [
        eval_type, "Score", "Exact Accuracy", "Accuracy", "Hallucination",
        "Missing", "N Miss", "N Correct", "N Correct Exact", "Total"
    ]
    df = pd.DataFrame(extracted_data, columns=columns)

    # Convert certain columns to percentages for better readability
    df[["Exact Accuracy", "Accuracy", "Hallucination", "Missing"]] *= 100

    # Print the table
    print(df.to_string(index=False))


In [17]:
# Specify the file path and call the function
file_path = '/home/jupyter/cs245-project-crag-master/output/data/modified_rag/Llama-3.2-3B-Instruct/detailed_evaluation_results.json'
# Name the breakdown explicitly; the table recorded for this cell is the per-question-type one.
analyze_query_types(file_path, 'question_type')

     Question Type     Score  Exact Accuracy  Accuracy  Hallucination   Missing  N Miss  N Correct  N Correct Exact  Total
            simple -0.041783       11.142061 28.133705      32.311978 39.554318     142        101               40    359
simple_w_condition -0.116505        2.912621 25.242718      36.893204 37.864078      78         52                6    206
        comparison  0.024540        0.613497 24.539877      22.085890 53.374233      87         40                1    163
       aggregation -0.099379        4.347826 23.602484      33.540373 42.857143      69         38                7    161
               set  0.200000        1.600000 49.600000      29.600000 20.800000      26         62                2    125
     false_premise -0.039216        0.000000 13.725490      17.647059 68.627451     105         21                0    153
   post-processing -0.295455        0.000000 13.636364      43.181818 43.181818      19          6                0     44
         multi-h

In [18]:
# Baseline RAG results, reported per question type.
file_path = '/home/jupyter/cs245-project-crag-master/output/data/rag_baseline/Llama-3.2-3B-Instruct/detailed_evaluation_results.json'
analyze_query_types(file_path, 'question_type')

     Question Type     Score  Exact Accuracy  Accuracy  Hallucination   Missing  N Miss  N Correct  N Correct Exact  Total
            simple -0.089136        6.685237 21.448468      30.362117 48.189415     173         77               24    359
simple_w_condition -0.087379        2.912621 23.300971      32.038835 44.660194      92         48                6    206
        comparison -0.085890        1.226994 17.177914      25.766871 57.055215      93         28                2    163
       aggregation -0.037267        3.105590 23.602484      27.329193 49.068323      79         38                5    161
               set  0.016000        0.800000 39.200000      37.600000 23.200000      29         49                1    125
     false_premise -0.013072        0.000000 14.379085      15.686275 69.934641     107         22                0    153
   post-processing  0.000000        2.272727 29.545455      29.545455 40.909091      18         13                1     44
         multi-h

In [22]:
# Specify the file path and call the function
file_path = '/home/jupyter/cs245-project-crag-master/output/data/modified_rag/Llama-3.2-3B-Instruct/detailed_evaluation_results.json'
# Name the breakdown explicitly; the table recorded for this cell is the per-question-type one.
analyze_query_types(file_path, 'question_type')

     Question Type     Score  Exact Accuracy  Accuracy  Hallucination   Missing  N Miss  N Correct  N Correct Exact  Total
            simple -0.100279       10.863510 25.069638      35.097493 39.832869     143         90               39    359
simple_w_condition -0.029126        4.368932 23.300971      26.213592 50.485437     104         48                9    206
        comparison -0.092025        1.226994 13.496933      22.699387 63.803681     104         22                2    163
       aggregation  0.055901        3.105590 27.329193      21.739130 50.931677      82         44                5    161
               set  0.192000        0.000000 44.800000      25.600000 29.600000      37         56                0    125
     false_premise  0.071895        0.000000 13.725490       6.535948 79.738562     122         21                0    153
   post-processing  0.090909        2.272727 22.727273      13.636364 63.636364      28         10                1     44
         multi-h

In [59]:
# Report the evaluation breakdown along each of the three axes.
# NOTE(review): this cell was labelled "GPT-4" but the path points at the
# Llama-3.2-3B-Instruct results — confirm which run it documents.
file_path = '/home/jupyter/cs245-project-crag-master/output/data/modified_rag/Llama-3.2-3B-Instruct/detailed_evaluation_results.json'
for eval_type in ('question_type', 'static_or_dynamic', 'domain'):
    analyze_query_types(file_path, eval_type)

dict_keys(['domain', 'static_or_dynamic', 'question_type'])
     question_type     Score  Exact Accuracy  Accuracy  Hallucination   Missing  N Miss  N Correct  N Correct Exact  Total
            simple -0.061281        1.671309 35.376045      41.504178 23.119777      83        127                6    359
simple_w_condition  0.067961        1.941748 42.233010      35.436893 22.330097      46         87                4    206
        comparison  0.067485        6.134969 38.650307      31.901840 29.447853      48         63               10    163
       aggregation -0.186335        1.863354 29.192547      47.826087 22.981366      37         47                3    161
               set  0.096000        0.800000 49.600000      40.000000 10.400000      13         62                1    125
     false_premise  0.261438        0.000000 49.673203      23.529412 26.797386      41         76                0    153
   post-processing -0.090909        9.090909 36.363636      45.454545 18.181818

In [57]:
# GPT-4: report the evaluation breakdown along each of the three axes.
file_path = '/home/jupyter/cs245-project-crag-master/output/data/modified_rag/gpt-4/detailed_evaluation_results.json'
for eval_type in ('question_type', 'static_or_dynamic', 'domain'):
    analyze_query_types(file_path, eval_type)

dict_keys(['domain', 'static_or_dynamic', 'question_type'])
     question_type     Score  Exact Accuracy  Accuracy  Hallucination   Missing  N Miss  N Correct  N Correct Exact  Total
            simple -0.089136        1.671309 33.983287      42.896936 23.119777      83        122                6    359
simple_w_condition -0.106796        1.941748 33.495146      44.174757 22.330097      46         69                4    206
        comparison -0.006135        6.134969 34.969325      35.582822 29.447853      48         57               10    163
       aggregation -0.161491        1.863354 30.434783      46.583851 22.981366      37         49                3    161
               set -0.032000        0.800000 43.200000      46.400000 10.400000      13         54                1    125
     false_premise -0.718954        0.000000  0.653595      72.549020 26.797386      41          1                0    153
   post-processing -0.272727        9.090909 27.272727      54.545455 18.181818

In [61]:
# Report the evaluation breakdown along each of the three axes.
# NOTE(review): this cell was labelled "GPT-4" but the path points at the
# gpt-4o-mini results — confirm which run it documents.
file_path = '/home/jupyter/cs245-project-crag-master/output/data/modified_rag/gpt-4o-mini/detailed_evaluation_results.json'
for eval_type in ('question_type', 'static_or_dynamic', 'domain'):
    analyze_query_types(file_path, eval_type)

dict_keys(['domain', 'static_or_dynamic', 'question_type'])
     question_type     Score  Exact Accuracy  Accuracy  Hallucination  Missing  N Miss  N Correct  N Correct Exact  Total
            simple -0.281337       16.155989 35.933148      64.066852      0.0       0        129               58    359
simple_w_condition -0.174757        7.281553 41.262136      58.737864      0.0       0         85               15    206
        comparison -0.165644        1.226994 41.717791      58.282209      0.0       0         68                2    163
       aggregation -0.440994        4.968944 27.950311      72.049689      0.0       0         45                8    161
               set  0.104000        0.800000 55.200000      44.800000      0.0       0         69                1    125
     false_premise  0.647059        0.000000 82.352941      17.647059      0.0       0        126                0    153
   post-processing -0.590909        4.545455 20.454545      79.545455      0.0       0

In [8]:
# Report the evaluation breakdown along each of the three axes.
# NOTE(review): this cell was labelled "Llama 3.2 3B" but the path points at the
# gpt-4o-mini results — confirm whether the label or the path is the intended one.
file_path = '/home/jupyter/cs245-project-crag-master/output/data/modified_rag/gpt-4o-mini/detailed_evaluation_results.json'
for eval_type in ('question_type', 'static_or_dynamic', 'domain'):
    analyze_query_types(file_path, eval_type)

dict_keys(['domain', 'static_or_dynamic', 'question_type'])
     question_type     Score  Exact Accuracy  Accuracy  Hallucination   Missing  N Miss  N Correct  N Correct Exact  Total
            simple  0.000000       21.169916 33.983287      33.983287 32.033426     115        122               76    359
simple_w_condition  0.043689       11.650485 34.466019      30.097087 35.436893      73         71               24    206
        comparison  0.196319       12.883436 34.969325      15.337423 49.693252      81         57               21    163
       aggregation -0.018634        8.074534 31.055901      32.919255 36.024845      58         50               13    161
               set  0.176000        4.000000 51.200000      33.600000 15.200000      19         64                5    125
     false_premise -0.006536       28.758170 28.758170      29.411765 41.830065      64         44               44    153
   post-processing -0.022727        6.818182 27.272727      29.545455 43.181818

In [7]:
# GPT-4o with Reranker: report the evaluation breakdown along each of the three axes.
file_path = '/home/jupyter/cs245-project-crag-master/output/data/modified_rag/gpt-4o-min_2/detailed_evaluation_results.json'
for eval_type in ('question_type', 'static_or_dynamic', 'domain'):
    analyze_query_types(file_path, eval_type)

dict_keys(['domain', 'static_or_dynamic', 'question_type'])
     question_type     Score  Exact Accuracy  Accuracy  Hallucination   Missing  N Miss  N Correct  N Correct Exact  Total
            simple  0.050139       17.270195 30.640669      25.626741 43.732591     157        110               62    359
simple_w_condition  0.087379        8.252427 30.097087      21.359223 48.543689     100         62               17    206
        comparison  0.196319        4.907975 30.674847      11.042945 58.282209      95         50                8    163
       aggregation  0.000000        5.590062 24.223602      24.223602 51.552795      83         39                9    161
               set  0.024000        2.400000 40.000000      37.600000 22.400000      28         50                3    125
     false_premise -0.006536       14.379085 15.032680      15.686275 69.281046     106         23               22    153
   post-processing -0.159091        4.545455 15.909091      31.818182 52.272727

In [3]:
# Llama-3.2-3B-Instruct (modified RAG): report the breakdown along each of the three axes.
file_path = '/home/jupyter/cs245-project-crag-master/output/data/modified_rag/Llama-3.2-3B-Instruct/detailed_evaluation_results.json'
for eval_type in ('question_type', 'static_or_dynamic', 'domain'):
    analyze_query_types(file_path, eval_type)

dict_keys(['domain', 'static_or_dynamic', 'question_type'])
     question_type     Score  Exact Accuracy  Accuracy  Hallucination   Missing  N Miss  N Correct  N Correct Exact  Total
            simple -0.041783       15.598886 32.590529      36.768802 30.640669     110        117               56    359
simple_w_condition -0.063107        6.796117 27.184466      33.495146 39.320388      81         56               14    206
        comparison  0.042945        4.907975 23.926380      19.631902 56.441718      92         39                8    163
       aggregation -0.130435        7.453416 26.086957      39.130435 34.782609      56         42               12    161
               set  0.128000        0.800000 48.000000      35.200000 16.800000      21         60                1    125
     false_premise  0.045752        1.960784 18.954248      14.379085 66.666667     102         29                3    153
   post-processing -0.068182        0.000000 25.000000      31.818182 43.181818