## TODO: 
1. Understand how to run the LLMs using Python.
1. Run some prompts using Llama models.
1. After you succeed in running some prompts, write code that collects the responses and relevant data.

In [1]:
import llama_models

# URLs for SQuAD datasets

In [1]:
import requests

# This dataset contains questions based on Wikipedia articles. Each question has a segment of text from the articles as the answer.
urls = {
    "train_v1.1": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json",
    "dev_v1.1": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json",
    "train_v2.0": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json",
    "dev_v2.0": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"
}

# Download and save the datasets as "<name>.json" in the current working directory
for name, url in urls.items():
    # The timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving the error page as if it were the dataset.
    response.raise_for_status()
    with open(f"{name}.json", 'wb') as f:
        f.write(response.content)
    print(f"{name}.json downloaded.")






train_v1.1.json downloaded.
dev_v1.1.json downloaded.
train_v2.0.json downloaded.
dev_v2.0.json downloaded.


# Example for SQuAD

In [2]:
import json


# Load SQuAD v1.1 Training Set
# The encoding is passed explicitly so decoding does not depend on the
# platform's default text encoding (which is not UTF-8 on every OS).
with open('train_v1.1.json', 'r', encoding='utf-8') as f:
    squad_v1_1_train = json.load(f)

# Load SQuAD v2.0 Training Set
with open('train_v2.0.json', 'r', encoding='utf-8') as f:
    squad_v2_0_train = json.load(f)

# Print first data entry (one whole article: its title plus all of its paragraphs)
print("First entry in SQuAD v1.1 Train Set:", squad_v1_1_train['data'][0])
print("First entry in SQuAD v2.0 Train Set:", squad_v2_0_train['data'][0])

# The datasets are in JSON format. The 'data' field contains a list of articles, and each article has a 'title' and:
# 'paragraphs': A list of paragraphs within the article, each holding a 'context' string and a 'qas' list.
# 'qas': A list of questions related to that paragraph; each entry has a 'question', an 'id' and 'answers'.
# 'answers': For v1.1, this is a list of answers ('text' plus 'answer_start'); v2.0 also includes unanswerable questions with an empty answer list.

First entry in SQuAD v1.1 Train Set: {'title': 'University_of_Notre_Dame', 'paragraphs': [{'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'qas': [{'answers': [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}], 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'id': '5733be284776f41900661

# Download the datasets and (try to) print an example from each

In [16]:
import requests
import os
import json
import pandas as pd

# Download URLs, grouped by dataset name. Each inner key names a split or
# variant of that dataset and maps to the URL it is fetched from.
datasets = {
    "SQuAD v1.1": {
        "train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json",
        "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json",
    },
    "SQuAD v2.0": {
        "train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json",
        "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json",
    },
    # JSON Lines files (.jsonl)
    "Natural Questions": {
        "train": "https://storage.googleapis.com/natural-questions-v1.0/nq-train.jsonl",
        "dev": "https://storage.googleapis.com/natural-questions-v1.0/nq-dev.jsonl",
    },
    # .tar.gz archives: you may need to extract them after downloading
    "TriviaQA": {
        "web": "https://nlp.cs.washington.edu/triviaqa/triviaqa-web.tar.gz",
        "wiki": "https://nlp.cs.washington.edu/triviaqa/triviaqa-wiki.tar.gz",
    },
    "QuAC": {
        "train": "https://quac.ai/data/quac/train_v0.2.json",
        "dev": "https://quac.ai/data/quac/val_v0.2.json",
    },
    "HotpotQA": {
        "train": "https://hotpotqa.github.io/hotpot_train_v1.1.json",
        "dev": "https://hotpotqa.github.io/hotpot_dev_distractor_v1.json",
    },
    # CSV format
    "NewsQA": {
        "data": "https://github.com/Maluuba/newsqa/raw/master/newsqa-data/newsqa.csv",
    },
    "DuoRC": {
        "selfrc": "https://duorc.github.io/downloads/selfrc.json",
        "pararc": "https://duorc.github.io/downloads/pararc.json",
    },
}

# Make sure the local folder that will receive the downloads exists
os.makedirs("datasets", exist_ok=True)

# Function to download datasets
def download_file(url, folder, timeout=60):
    """Download ``url`` into ``folder`` and return the path of the saved file.

    The local file name is the last ``/``-separated segment of the URL.

    Args:
        url: Address of the file to fetch.
        folder: Existing directory the file is written into.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The path of the downloaded file (``folder`` joined with the file name).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status. Nothing
            is written in that case, so an error page is never saved under the
            dataset's file name.
        requests.RequestException: For connection problems and timeouts.
    """
    filename = os.path.join(folder, url.split("/")[-1])
    # Stream the body to disk in chunks instead of holding the whole payload in
    # memory through ``response.content``.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    print(f"Downloaded: {filename}")
    return filename

# Function to print the first data entry of JSON, JSONL, and CSV files
def print_first_entry(filename):
    """Print the first record of a downloaded dataset file.

    The format is picked from the file extension:

    * ``.json``: the file is parsed as JSON. A dict with a ``'data'`` key
      prints ``data['data'][0]``, a non-empty top-level list prints its first
      element, and anything else is printed whole.
    * ``.jsonl``: only the first line is read and parsed.
    * ``.csv``: only the first data row is parsed and printed.
    * ``.tar.gz``: a reminder is printed that the archive must be extracted.

    Any other extension prints a short "unsupported" notice. Errors while
    reading or parsing are reported with ``print`` rather than raised, so one
    bad file does not interrupt the caller.

    Args:
        filename: Path of the file to preview.
    """
    try:
        if filename.endswith('.json'):
            # Explicit encoding: do not depend on the platform's default text encoding.
            with open(filename, 'r', encoding='utf-8') as f:
                data = json.load(f)
            print(f"First entry in {filename}:")
            if isinstance(data, dict) and 'data' in data:
                print(data['data'][0])
            elif isinstance(data, list) and data:
                # A top-level list of records: show one record, not the whole dataset.
                print(data[0])
            else:
                print(data)  # Print whole data if the structure is different
        elif filename.endswith('.jsonl'):
            with open(filename, 'r', encoding='utf-8') as f:
                # One line is all that is needed; never load the full file.
                first_line = f.readline()
            if not first_line:
                print(f"{filename} is empty.")
                return
            print(f"First entry in {filename}:")
            print(json.loads(first_line))  # Load the first line as JSON
        elif filename.endswith('.csv'):
            # nrows=1 parses just the first data row instead of the whole file.
            data = pd.read_csv(filename, nrows=1)
            print(f"First entry in {filename}:")
            print(data.iloc[0])  # Print the first row of the DataFrame
        elif filename.endswith('.tar.gz'):
            print(f"Downloaded TAR.GZ file: {filename}. You will need to extract it to access the contents.")
        else:
            print(f"Unsupported file type, no preview available: {filename}")
    except json.JSONDecodeError:
        print(f"Failed to decode JSON from {filename}. It may be empty or incorrectly formatted.")
    except Exception as e:
        print(f"An error occurred while processing {filename}: {e}")

# Download each dataset and preview its first entry
for dataset, urls in datasets.items():
    print(f"\nDownloading {dataset}...")
    for name, url in urls.items():
        try:
            filename = download_file(url, "datasets")
        except requests.RequestException as e:
            # One unreachable or failing URL should not abort the remaining downloads.
            print(f"Failed to download {url}: {e}")
            continue
        print_first_entry(filename)



Downloading SQuAD v1.1...
Downloaded: datasets\train-v1.1.json
First entry in datasets\train-v1.1.json:
{'title': 'University_of_Notre_Dame', 'paragraphs': [{'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'qas': [{'answers': [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}], 'question': 'To whom did the Virgin Mary allege