In [7]:
import requests
import zipfile
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
import subprocess

# Define paths
glove_zip_path = r"D:\BITS\Classes\Sem3\NLP Applications\Assignment\Code\glove.6B.zip"
glove_file_name = "glove.6B.50d.txt"
dataset_path = "train-v2.0.json"
model_dir = "fine_tuned_distilbert"
predictions_path = "predictions.json"
eval_script_path = r"c:\Users\Ganesh AI\Downloads\evaluate-v2.0.py"

# Step 1: Download and Extract GloVe Embeddings
def download_and_extract_glove(zip_path, file_name):
    glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
    if not os.path.exists(zip_path):
        print("Downloading GloVe embeddings...")
        response = requests.get(glove_url)
        with open(zip_path, 'wb') as file:
            file.write(response.content)
        print("Download complete.")
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        if file_name not in zip_ref.namelist():
            raise FileNotFoundError(f"{file_name} not found in the zip file.")
        zip_ref.extract(file_name, os.path.dirname(zip_path))
        print(f"Extracted {file_name} from the zip file.")

# Step 2: Load GloVe Embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Step 3: Load and Limit the Dataset
def load_squad_dataset(file_path, limit=100):
    with open(file_path, 'r') as file:
        squad_data = json.load(file)
    
    contexts, questions, answers = [], [], []
    count = 0
    
    if 'data' not in squad_data:
        raise KeyError("'data' key not found in the JSON object")
    
    # print(squad_data['data'][0])
    for topic in tqdm(squad_data['data'], desc="Processing Data"):
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                if not qa['is_impossible']:  # Include only answerable questions
                    question = qa['question']
                    answer = qa['answers'][0]  # Take the first answer
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return pd.DataFrame({
        "context": contexts,
        "question": questions,
        "answer_text": [ans['text'] for ans in answers],
        "answer_start": [ans['answer_start'] for ans in answers]
    })

# Step 4: Generate Predictions Using the Fine-Tuned Model
def generate_predictions(model_dir, dataset_path, output_path):
    qa_pipeline = pipeline("question-answering", model=model_dir, tokenizer=model_dir)
    squad_data = load_squad_dataset(dataset_path)
    predictions = {}
    for topic in squad_data['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                if not qa['is_impossible']:
                    answer = qa_pipeline(question=question, context=context)
                    predictions[qa['id']] = answer['answer']
    with open(output_path, 'w') as file:
        json.dump(predictions, file)
    print(f"Predictions saved to {output_path}")

# Step 5: Evaluate the Model
def evaluate_model(eval_script_path, dataset_path, predictions_path):
    subprocess.run(["python", eval_script_path, dataset_path, predictions_path])

# Execute the steps
download_and_extract_glove(glove_zip_path, glove_file_name)
glove_embeddings = load_glove_embeddings(os.path.join(os.path.dirname(glove_zip_path), glove_file_name))
print(f"Loaded {len(glove_embeddings)} word vectors from {glove_file_name}.")

# Limit the dataset to 100 entries
df = load_squad_dataset(dataset_path, limit=1)
print(f"Dataset Loaded: {len(df)} examples")

# Generate predictions
generate_predictions(model_dir, dataset_path, predictions_path)

# Evaluate the model
evaluate_model(eval_script_path, dataset_path, predictions_path)

Extracted glove.6B.50d.txt from the zip file.
Loaded 400000 word vectors from glove.6B.50d.txt.


Processing Data: 100%|██████████| 442/442 [00:00<00:00, 8505.14it/s]


Dataset Loaded: 86821 examples


Device set to use cpu
Processing Data: 100%|██████████| 442/442 [00:00<00:00, 6148.92it/s]


KeyError: 'data'

In [2]:
import json
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import requests

# Load the dataset
def load_squad_dataset(file_path):
    with open(file_path, 'r') as file:
        squad_data = json.load(file)
    return squad_data

# Extract questions, contexts, and answers
def extract_features(squad_data):
    # Debugging: Print the keys of the JSON object
    print("Top-level keys in the JSON object:", squad_data.keys())
    
    contexts, questions, answers = [], [], []

    for topic in tqdm(squad_data['data'], desc="Processing Data"):
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                if not qa['is_impossible']:  # Include only answerable questions
                    question = qa['question']
                    answer = qa['answers'][0]  # Take the first answer
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    
    return pd.DataFrame({
        "context": contexts,
        "question": questions,
        "answer_text": [ans['text'] for ans in answers],
        "answer_start": [ans['answer_start'] for ans in answers]
    })

# File path to the SQuAD dataset
dataset_path = "train-v2.0.json"

# Check if the file exists
if not Path(dataset_path).is_file():
    response = requests.get("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
    if response.status_code == 200:
        with open(dataset_path, 'wb') as file:
            file.write(response.content)
    else:
        raise FileNotFoundError(f"Unable to download the file from the URL")

squad_data = load_squad_dataset(dataset_path)

# Extract features
df = extract_features(squad_data)

# Display a sample
print(f"Dataset Loaded: {len(df)} examples")
df.sample(5)

Top-level keys in the JSON object: dict_keys(['version', 'data'])


Processing Data: 100%|██████████| 442/442 [00:00<00:00, 8093.33it/s]

Dataset Loaded: 86821 examples





Unnamed: 0,context,question,answer_text,answer_start
17713,Another theory describes its derivation from a...,WHen did they find the polish kingdom?,about the fifth century.,836
50658,"When John DeStefano, Jr., became mayor of New ...",When was the last time New Haven had a profess...,2009,575
48524,According to the International Organization fo...,How much money did overseas Nigerians send hom...,USD 2.3 billion,158
84486,"In 1797, Patrick Colquhoun was able to persuad...",Where did the West Indies merchants in London ...,docks,165
65100,The Arabic term ijāzat al-tadrīs was awarded t...,What did earning the ijazat al-tadris award st...,licence to teach,150


In [6]:
import json
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import requests

# Load the dataset
def load_squad_dataset(file_path):
    with open(file_path, 'r') as file:
        squad_data = json.load(file)
    return squad_data

# Extract questions, contexts, and answers
def extract_features(squad_data):
    # Debugging: Print the entire JSON object
    print("JSON structure:", json.dumps(squad_data, indent=2)[:1000])  # Print the first 1000 characters for readability
    
    # Check if 'data' key exists
    if 'data' not in squad_data:
        raise KeyError("'data' key not found in the JSON object")
    
    contexts, questions, answers = [], [], []

    for topic in tqdm(squad_data['data'], desc="Processing Data"):
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                if not qa['is_impossible']:  # Include only answerable questions
                    question = qa['question']
                    answer = qa['answers'][0]  # Take the first answer
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    
    return pd.DataFrame({
        "context": contexts,
        "question": questions,
        "answer_text": [ans['text'] for ans in answers],
        "answer_start": [ans['answer_start'] for ans in answers]
    })

# File path to the SQuAD dataset
dataset_path = "train-v2.0.json"

# Check if the file exists
if not Path(dataset_path).is_file():
    response = requests.get("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
    if response.status_code == 200:
        with open(dataset_path, 'wb') as file:
            file.write(response.content)
    else:
        raise FileNotFoundError(f"Unable to download the file from the URL")

squad_data = load_squad_dataset(dataset_path)

# Extract features
df = extract_features(squad_data)

# Display a sample
print(f"Dataset Loaded: {len(df)} examples")
df.sample(5)

JSON structure: {
  "version": "v2.0",
  "data": [
    {
      "title": "Beyonc\u00e9",
      "paragraphs": [
        {
          "qas": [
            {
              "question": "When did Beyonce start becoming popular?",
              "id": "56be85543aeaaa14008c9063",
              "answers": [
                {
                  "text": "in the late 1990s",
                  "answer_start": 269
                }
              ],
              "is_impossible": false
            },
            {
              "question": "What areas did Beyonce compete in when she was growing up?",
              "id": "56be85543aeaaa14008c9065",
              "answers": [
                {
                  "text": "singing and dancing",
                  "answer_start": 207
                }
              ],
              "is_impossible": false
            },
            {
              "question": "When did Beyonce leave Destiny's Child and become a solo singer?",
              "id": "56be85543aeaaa

Processing Data: 100%|██████████| 442/442 [00:00<00:00, 7853.87it/s]

Dataset Loaded: 86821 examples





Unnamed: 0,context,question,answer_text,answer_start
57409,Nasser's regional position changed unexpectedl...,What country experienced a coup in 1962?,North Yemen,135
80413,"Mosaic has a long history, starting in Mesopot...",The Norman kingdomwas in what italian city state?,Sicily,481
41324,"In 2010, 6.9% of the population (1,269,765) co...",What percentage of the population considers th...,"6.9% of the population (1,269,765) considered ...",9
70010,President Richard Nixon declared current speci...,What Congress called for the drafting of the E...,the 93rd United States Congress,101
12788,"Valencian is classified as a Western dialect, ...",What forms are mutually intelligible?,Catalan and Valencian,190


In [9]:
import requests
import zipfile
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
import subprocess

# Define paths
glove_zip_path = r"D:\BITS\Classes\Sem3\NLP Applications\Assignment\Code\glove.6B.zip"
glove_file_name = "glove.6B.50d.txt"
dataset_path = "train-v2.0.json"
model_dir = "fine_tuned_distilbert"
predictions_path = "predictions.json"
eval_script_path = r"c:\Users\Ganesh AI\Downloads\evaluate-v2.0.py"

# Step 1: Download and Extract GloVe Embeddings
def download_and_extract_glove(zip_path, file_name):
    glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
    if not os.path.exists(zip_path):
        print("Downloading GloVe embeddings...")
        response = requests.get(glove_url)
        with open(zip_path, 'wb') as file:
            file.write(response.content)
        print("Download complete.")
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        if file_name not in zip_ref.namelist():
            raise FileNotFoundError(f"{file_name} not found in the zip file.")
        zip_ref.extract(file_name, os.path.dirname(zip_path))
        print(f"Extracted {file_name} from the zip file.")

# Step 2: Load GloVe Embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Step 3: Load and Limit the Dataset
def load_squad_dataset(file_path, limit=100):
    with open(file_path, 'r') as file:
        squad_data = json.load(file)
    
    contexts, questions, answers = [], [], []
    count = 0
    
    if 'data' not in squad_data:
        raise KeyError("'data' key not found in the JSON object")
    
    for topic in tqdm(squad_data['data'], desc="Processing Data"):
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                if not qa['is_impossible']:  # Include only answerable questions
                    question = qa['question']
                    answer = qa['answers'][0]  # Take the first answer
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
                    count += 1
                    if count >= limit:
                        return pd.DataFrame({
                            "context": contexts,
                            "question": questions,
                            "answer_text": [ans['text'] for ans in answers],
                            "answer_start": [ans['answer_start'] for ans in answers]
                        })

    return pd.DataFrame({
        "context": contexts,
        "question": questions,
        "answer_text": [ans['text'] for ans in answers],
        "answer_start": [ans['answer_start'] for ans in answers]
    })

# Step 4: Generate Predictions Using the Fine-Tuned Model
def generate_predictions(model_dir, dataset_path, output_path):
    qa_pipeline = pipeline("question-answering", model=model_dir, tokenizer=model_dir)
    squad_data = load_squad_dataset(dataset_path)
    predictions = {}
    for index, row in squad_data.iterrows():
        context = row['context']
        question = row['question']
        answer = qa_pipeline(question=question, context=context)
        predictions[index] = answer['answer']
    with open(output_path, 'w') as file:
        json.dump(predictions, file)
    print(f"Predictions saved to {output_path}")

# Step 5: Evaluate the Model
def evaluate_model(eval_script_path, dataset_path, predictions_path):
    subprocess.run(["python", eval_script_path, dataset_path, predictions_path])

# Execute the steps
download_and_extract_glove(glove_zip_path, glove_file_name)
glove_embeddings = load_glove_embeddings(os.path.join(os.path.dirname(glove_zip_path), glove_file_name))
print(f"Loaded {len(glove_embeddings)} word vectors from {glove_file_name}.")

# Limit the dataset to 100 entries
df = load_squad_dataset(dataset_path, limit=1000)
print(f"Dataset Loaded: {len(df)} examples")

# Generate predictions
generate_predictions(model_dir, dataset_path, predictions_path)

# Evaluate the model
evaluate_model(eval_script_path, dataset_path, predictions_path)

Extracted glove.6B.50d.txt from the zip file.
Loaded 400000 word vectors from glove.6B.50d.txt.


Processing Data:   0%|          | 1/442 [00:00<00:04, 103.08it/s]


Dataset Loaded: 1000 examples


Device set to use cpu
Processing Data:   0%|          | 0/442 [00:00<?, ?it/s]


Predictions saved to predictions.json
