In [23]:
import os
from transformers import ElectraTokenizer, AutoModelForQuestionAnswering,ElectraForQuestionAnswering,Trainer,TrainingArguments, pipeline, ElectraConfig, ElectraModel,SquadExample
import numpy as np
import torch
import tensorflow as tf
from datasets import load_dataset
import requests
from bs4 import BeautifulSoup

In [8]:
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer and the model.
model_name = "sultan/BioM-ELECTRA-Large-SQuAD2"

### Import and Initialize the tokenizer
Transformer models can’t process raw text directly; the text first needs to be converted into numbers (token IDs) so that the model can make sense of the data. 

In [9]:
# Load the tokenizer that belongs to the checkpoint named by `model_name`.
tokenizer = ElectraTokenizer.from_pretrained(model_name)

### Import the model
We can download the pre-trained model the same way we downloaded the tokenizer in the step above. Here we instantiate `ElectraForQuestionAnswering`: a base transformer module that turns the inputs into high-dimensional vectors (hidden states), topped with a question-answering head that predicts where the answer span starts and ends. 

In [6]:
# Load the checkpoint named by `model_name` as an ELECTRA question-answering model.
model = ElectraForQuestionAnswering.from_pretrained(model_name)

In [11]:
# URL of the webpage to scrape (the Wikipedia article on React)
url = 'https://en.wikipedia.org/wiki/React_(software)'


In [12]:
# Send a GET request to the URL and parse the HTML content.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook from silently parsing an HTTP error
# page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [13]:
# Pull the text out of every <p> element and glue the pieces together with single spaces
text = ' '.join(
    paragraph.get_text()
    for paragraph in soup.find_all('p')
)

In [14]:
# Tokenize the scraped text. Calling the tokenizer object itself is the
# idiomatic spelling of tokenizer.__call__(text) and produces the same encoding.
# NOTE(review): no truncation/max_length is passed here, so the encoding
# presumably covers the whole article — confirm it is only inspected, not fed
# to the model as-is.
tokenized_text = tokenizer(text)
print(tokenized_text)

{'input_ids': [2, 6623, 11, 2082, 3038, 1732, 6623, 17, 27424, 1781, 6623, 1042, 1026, 12, 1744, 42, 2964, 1690, 4164, 16, 4421, 9729, 16, 2188, 13357, 8187, 2734, 1016, 8231, 36, 22, 38, 1725, 9232, 10066, 14051, 2234, 1755, 3958, 17, 2021, 1744, 5909, 1772, 6242, 11, 21428, 6193, 18242, 12, 1690, 42, 4085, 1685, 2641, 2054, 1783, 1690, 16920, 17, 36, 23, 38, 36, 24, 38, 36, 25, 38, 6623, 1883, 1765, 2093, 1732, 42, 5040, 1682, 1680, 2418, 1685, 2742, 16, 11743, 15, 8767, 15, 1781, 25154, 16, 17634, 4639, 1715, 16899, 3223, 7036, 17, 27424, 17, 2250, 15, 6623, 1744, 2298, 12217, 1715, 1680, 10066, 6990, 1690, 20091, 3958, 1701, 1680, 3046, 15, 2528, 12088, 6623, 4639, 5225, 5322, 1680, 2167, 1685, 3827, 13924, 1725, 9911, 1013, 15, 1732, 2300, 1732, 4823, 15421, 16, 3950, 12889, 17, 36, 26, 38, 36, 27, 38, 1680, 2846, 1744, 42, 24913, 21245, 6239, 1685, 2019, 6623, 1725, 1680, 8711, 15, 11808, 1682, 27424, 1041, 1690, 13357, 8187, 2734, 1016, 17, 2234, 1755, 1680, 4445, 13819, 5226, 5

In [60]:
# Build the question-answering pipeline around the model and tokenizer loaded above
nlp = pipeline(
    'question-answering',
    model=model,
    tokenizer=tokenizer,
)


In [50]:
# Hand-built SQuAD-style example used to smoke-test the pipeline.
# SquadExample is already imported in the notebook's top import cell, so it is
# not re-imported here.
example = SquadExample(
    qas_id="Q1",
    question_text="What is the capital of France?",
    context_text="Paris is a beatiful city and also the capital of France.",
    # The gold answer has to be a verbatim span of context_text, and
    # start_position_character is that span's 0-based character offset:
    # "Paris" begins at offset 0 of the context.
    answer_text="Paris",
    start_position_character=0,
    title='q1'
)


In [51]:
# Question/context pair taken from the example, keyed the way the pipeline call expects
input_dict = dict(
    question=example.question_text,
    context=example.context_text,
)

In [55]:
# Run the question-answering pipeline on the example.
# The question and context are passed as keyword arguments, which is the
# documented calling convention for this pipeline (rather than handing it a
# positional dict).
result = nlp(question=input_dict["question"], context=input_dict["context"])




In [56]:
# Show the pipeline's answer: confidence score, character span, and answer text
result

{'score': 0.9944676756858826, 'start': 0, 'end': 5, 'answer': 'Paris'}