In [2]:
import pandas as pd
import torch
from transformers import BertForQuestionAnswering,BertTokenizerFast, BertTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

2024-07-18 11:45:09.968637: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-18 11:45:09.968757: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-18 11:45:10.109738: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
train_df = pd.read_csv("/kaggle/input/squad-csv-format/SQuAD_csv.csv")

In [4]:
train_df['answer'] = train_df['text']

In [5]:
train_df = train_df.drop(columns=["Unnamed: 0", "id","text"])

In [6]:
train_df

Unnamed: 0,context,question,answer_start,answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,269,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,207,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,526,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,166,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,276,late 1990s
...,...,...,...,...
86816,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,229,Oregon
86817,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,414,Rangoon
86818,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,476,Minsk
86819,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,199,1975


In [7]:
train_df , valid_df = train_test_split(train_df, test_size=0.4, random_state=42)

In [8]:
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

In [9]:
train_dataset

Dataset({
    features: ['context', 'question', 'answer_start', 'answer', '__index_level_0__'],
    num_rows: 52092
})

In [10]:
# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def preprocess_function(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    start_positions = []
    end_positions = []

    for i, offset in enumerate(inputs["offset_mapping"]):
        if examples["answer"][i] is None:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = examples["answer_start"][i]
        end_char = start_char + len(examples["answer"][i])

        # Find the start token index
        token_start_index = 0
        while token_start_index < len(offset) and offset[token_start_index][0] <= start_char:
            token_start_index += 1
        token_start_index = max(0, token_start_index - 1)

        # Find the end token index
        token_end_index = len(offset) - 1
        while token_end_index >= 0 and offset[token_end_index][1] >= end_char:
            token_end_index -= 1
        token_end_index = min(len(offset) - 1, token_end_index + 1)

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[token_start_index][0] > end_char or offset[token_end_index][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_positions.append(token_start_index)
            end_positions.append(token_end_index)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [12]:
# Apply preprocessing to the datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["context", "question", "answer_start", "answer"])
tokenized_val_dataset = valid_dataset.map(preprocess_function, batched=True, remove_columns=["context", "question", "answer_start", "answer"])


Map:   0%|          | 0/52092 [00:00<?, ? examples/s]

Map:   0%|          | 0/34729 [00:00<?, ? examples/s]

In [13]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
)



In [14]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)


In [15]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.0724,0.074301
2,0.0536,0.068389
3,0.0007,0.107917


TrainOutput(global_step=9768, training_loss=0.07206042095645805, metrics={'train_runtime': 7768.661, 'train_samples_per_second': 20.116, 'train_steps_per_second': 1.257, 'total_flos': 3.062580896675635e+16, 'train_loss': 0.07206042095645805, 'epoch': 3.0})

In [16]:
# Save the model
model.save_pretrained("./question-answering-model")
tokenizer.save_pretrained("./question-answering-model")

('./question-answering-model/tokenizer_config.json',
 './question-answering-model/special_tokens_map.json',
 './question-answering-model/vocab.txt',
 './question-answering-model/added_tokens.json',
 './question-answering-model/tokenizer.json')

In [17]:
from huggingface_hub import login

access_token = "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"

login(token=access_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [18]:
model.push_to_hub("khaledsayed1/Question_answering_bert", use_auth_token="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")
tokenizer.push_to_hub("khaledsayed1/Question_answering_bert", use_auth_token="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")



README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/khaledsayed1/Question_answering_bert/commit/b534f7a3489e54a8b5ebd450af3cb20bc8c4382a', commit_message='Upload tokenizer', commit_description='', oid='b534f7a3489e54a8b5ebd450af3cb20bc8c4382a', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
#import PyPDF2
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering

# Function to extract text from a PDF
#def extract_text_from_pdf(pdf_path):
 #   pdf_reader = PyPDF2.PdfReader(open(pdf_path, "rb"))
  #  text = ""
   # for page_num in range(len(pdf_reader.pages)):
    #    page = pdf_reader.pages[page_num]
     #   text += page.extract_text()
    #return text

# Load the trained model and tokenizer from Hugging Face
model_name = "khaledsayed1/Question_answering_bert"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token="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")
model = AutoModelForQuestionAnswering.from_pretrained(model_name, use_auth_token="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")

# Initialize the question-answering pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


# Path to your PDF file
pdf_path = '''The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower.
'''

# Extract text from the PDF
#context = extract_text_from_pdf(pdf_path)

# Example questions for inference
questions = [
   "Where is the Eiffel Tower located?",
    "Who designed the Eiffel Tower?"
    
]

# Perform inference
for question in questions:
    result = qa_pipeline(question=question, context=pdf_path)
    print(f"Question: {question}")
    print(f"Answer: {result['answer']}\n")
