In [5]:
import transformers

In [6]:
!pip install transformers datasets



In [7]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [8]:
coqa = pd.read_json('http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json')
coqa.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


In [9]:
del coqa["version"]

In [10]:
#required columns in our dataframe
cols = ["text","question","answer"]
#list of lists to create our dataframe
comp_list = []
for index, row in coqa.iterrows():
    for i in range(len(row["data"]["questions"])):
        temp_list = []
        temp_list.append(row["data"]["story"])
        temp_list.append(row["data"]["questions"][i]["input_text"])
        temp_list.append(row["data"]["answers"][i]["input_text"])
        comp_list.append(temp_list)
new_df = pd.DataFrame(comp_list, columns=cols) 
#saving the dataframe to csv file for further loading
new_df.to_csv("CoQA_data.csv", index=False)


In [11]:
data = pd.read_csv("CoQA_data.csv")
data.head()

Unnamed: 0,text,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


In [12]:
print("Number of question and answers: ", len(data))

Number of question and answers:  108647


In [13]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [14]:
random_num = np.random.randint(0,len(data))
question = data["question"][random_num]
text = data["text"][random_num]

In [15]:
input_ids = tokenizer.encode(question, text)
print("The input has a total of {} tokens.".format(len(input_ids)))

The input has a total of 335 tokens.


In [16]:
tokens = tokenizer.convert_ids_to_tokens(input_ids)
for token, id in zip(tokens, input_ids):
    print('{:8}{:8,}'.format(token,id))

[CLS]        101
who        2,040
is         2,003
cu        12,731
##i        2,072
shang     29,382
##yu      10,513
?          1,029
[SEP]        102
why        2,339
do         2,079
you        2,017
study      2,817
?          1,029
many       2,116
students   2,493
would      2,052
simply     3,432
reply      7,514
:          1,024
"          1,000
to         2,000
get        2,131
good       2,204
result     2,765
in         1,999
the        1,996
college    2,267
entrance   4,211
examination   7,749
.          1,012
"          1,000
for        2,005
several    2,195
years      2,086
,          1,010
many       2,116
have       2,031
spoken     5,287
out        2,041
against    2,114
the        1,996
exams     13,869
.          1,012
some       2,070
say        2,360
they       2,027
have       2,031
turned     2,357
children   2,336
into       2,046
studying   5,702
machines   6,681
.          1,012
others     2,500
think      2,228
that       2,008
one        2,028
exam      1

In [17]:
#first occurence of [SEP] token
sep_idx = input_ids.index(tokenizer.sep_token_id)
print("SEP token index: ", sep_idx)
#number of tokens in segment A (question) - this will be one more than the sep_idx as the index in Python starts from 0
num_seg_a = sep_idx+1
print("Number of tokens in segment A: ", num_seg_a)
#number of tokens in segment B (text)
num_seg_b = len(input_ids) - num_seg_a
print("Number of tokens in segment B: ", num_seg_b)
#creating the segment ids
segment_ids = [0]*num_seg_a + [1]*num_seg_b
#making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

SEP token index:  8
Number of tokens in segment A:  9
Number of tokens in segment B:  326


In [18]:
#token input_ids to represent the input and token segment_ids to differentiate our segments - question and text
output = model(torch.tensor([input_ids]),  token_type_ids=torch.tensor([segment_ids]))

In [19]:
#tokens with highest start and end scores
answer_start = torch.argmax(output.start_logits)
answer_end = torch.argmax(output.end_logits)
if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")
    
print("\nQuestion:\n{}".format(question.capitalize()))
print("\nAnswer:\n{}.".format(answer.capitalize()))


answer = tokens[answer_start]
for i in range(answer_start+1, answer_end+1):
    if tokens[i][0:2] == "##":
        answer += tokens[i][2:]
    else:
        answer += " " + tokens[i]



Question:
Who is cui shangyu?

Answer:
A senior 3 student in sichuan.


In [20]:
answer = tokens[answer_start]
for i in range(answer_start+1, answer_end+1):
    if tokens[i][0:2] == "##":
        answer += tokens[i][2:]
    else:
        answer += " " + tokens[i]

In [24]:
def question_answer(question, text):
    
    #tokenize question and text as a pair
    input_ids = tokenizer.encode(question, text)
    
    #string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    
    #segment IDs
    #first occurence of [SEP] token
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    #number of tokens in segment A (question)
    num_seg_a = sep_idx+1
    #number of tokens in segment B (text)
    num_seg_b = len(input_ids) - num_seg_a
    
    #list of 0s and 1s for segment embeddings
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)
    
    #model output using input_ids and segment_ids
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    
    #reconstructing the answer
    a_start = torch.argmax(output.start_logits)
    a_end = torch.argmax(output.end_logits)
    if a_end >= a_start:
        a = tokens[a_start]
        for i in range(a_start+1, a_end+1):
            if tokens[i][0:2] == "##":
                a += tokens[i][2:]
            else:
                a += " " + tokens[i]
                
    if a.startswith("[CLS]"):
        a = "Unable to find the answer to your question."
    
    responseText = a
    print(responseText)
    #print("\nPredicted answer:\n{}".format(a.capitalize()))
    



In [25]:
#your text passage goes here.
text = "Victoria University (VU or Vic Uni) is a public research university based in Melbourne, Victoria, Australia. It is one of only six dual-sector universities in Australia, providing courses in both higher education and Technical and Further Education (TAFE). 2016 marked VU's centenary as an educational institution and its 25th anniversary as a university. The university has several campuses in Melbourne Central Business District, Melbourne Western Region, and in Sydney, comprising six academic colleges, six research institutes, seven research centres and VU's Victoria Polytechnic (providing vocational education and training). The new VU city Tower will be Melbourne’s tallest vertical campus and is due for completion in 2021. It also offers courses at partner institutions throughout Asia. Victoria University Melbourne is a globally recognised institution, ranking in the top 2% of universities worldwide (2021 Times Higher Education World University Rankings), and 56th globally (2nd in Victoria) in the 2020 Times Higher Education (THE) Young University Rankings.VU was also ranked 11th in the world (1st in Australia) for Peace, Justice and Strong Institutions (Times Higher Education Impact Rankings 2021). In 2018, Victoria University became the first university in Australia to adopt the block model style of teaching for all undergraduate courses. Under VU’s Block Model, students’ study and complete one unit at a time over a four-week period (a block), working collaboratively in smaller classes. Unlike the standard model of tertiary education in Australia, students studying under VU’s Block Model focus on learning one unit every four weeks, rather than juggling multiple units and assessments at the same time within a semester. Since introducing the VU Block Model, pass rates amongst onshore undergraduate students have increased, with over 90% of students passing their enrolled units in 2020. Overall grades for this cohort has also grown, with over 60% receiving distinction levels or higher in the same year. This improvement in academic results is complemented by higher levels of learner engagement amongst its students. The latest Australian Government 2020 Student Experience Survey (SES) has Victoria University now ranked by students as the top University in Victoria, and third overall in Australia, for learner engagement. "
question = "What is VU block Model?"

In [26]:
test = question_answer(question, text)

students ’ study and complete one unit at a time over a four - week period ( a block ) , working collaboratively in smaller classes


In [20]:
!pip install flask



In [21]:
!pip install flask-ngrok



In [42]:
!pip install dialogflow



In [43]:
!pip install google-api-core



In [None]:
import flask
import os
from flask import send_from_directory, request

app = flask.Flask(__name__)

@app.route('/favicon.ico')
def favicon():
    return send_from_directory(os.path.join(app.root_path, 'static'),
                               'favicon.ico', mimetype='image/favicon.png')

@app.route('/')
@app.route('/home')
def home():
    return "Hello World"

@app.route('/webhook', methods=['POST'])
def webhook():
    req = request.get_json(force=True)
    
    #WebhookResponse = ''
    
    query_result = req.get('queryResult')
    
    query_Text = query_result.get('queryText')
    
    question = str(query_Text)
    
    print(query_Text)
    print(query_result)
    
    WebhookResponse = question_answer(question, text)
    

    return {
        'fulfillmentText' : WebhookResponse
        }

if __name__ == "__main__":
    app.run()
  

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


what is vu block model?
{'queryText': 'what is vu block model?', 'parameters': {'geo-city': ''}, 'allRequiredParamsPresent': True, 'fulfillmentMessages': [{'text': {'text': ['']}}], 'outputContexts': [{'name': 'projects/askme-krct/locations/global/agent/sessions/06cbc855-9e18-9728-d440-0cff8fde808b/contexts/__system_counters__', 'parameters': {'no-input': 0.0, 'no-match': 0.0, 'geo-city': '', 'geo-city.original': ''}}], 'intent': {'name': 'projects/askme-krct/locations/global/agent/intents/ea3a9c3f-b869-4b91-9eb7-b94a86a87b1b', 'displayName': 'bert model'}, 'intentDetectionConfidence': 1.0, 'languageCode': 'en'}


127.0.0.1 - - [24/Sep/2021 12:26:39] "[37mPOST /webhook HTTP/1.1[0m" 200 -


students ’ study and complete one unit at a time over a four - week period ( a block ) , working collaboratively in smaller classes
