In [1]:
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [2]:
!kaggle datasets download -d jeromeblanchet/conversational-question-answering-dataset-coqa

Downloading conversational-question-answering-dataset-coqa.zip to /content
  0% 0.00/10.2M [00:00<?, ?B/s]
100% 10.2M/10.2M [00:00<00:00, 190MB/s]


In [3]:
! unzip conversational-question-answering-dataset-coqa.zip

Archive:  conversational-question-answering-dataset-coqa.zip
  inflating: coqa-dev-v1.0.json      
  inflating: coqa-train-v1.0.json    


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

!pip install -q transformers

In [6]:
coqa = pd.read_json('/content/coqa-train-v1.0.json')
coqa.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


In [7]:
coqa['data'][0]

{'source': 'wikipedia',
 'id': '3zotghdk5ibi9cex97fepx7jetpso7',
 'filename': 'Vatican_Library.txt',
 'story': 'The Vatican Apostolic Library (), more commonly called the Vatican Library or simply the Vat, is the library of the Holy See, located in Vatican City. Formally established in 1475, although it is much older, it is one of the oldest libraries in the world and contains one of the most significant collections of historical texts. It has 75,000 codices from throughout history, as well as 1.1 million printed books, which include some 8,500 incunabula. \n\nThe Vatican Library is a research library for history, law, philosophy, science and theology. The Vatican Library is open to anyone who can document their qualifications and research needs. Photocopies for private study of pages from books published between 1801 and 1990 can be requested in person or by mail. \n\nIn March 2014, the Vatican Library began an initial four-year project of digitising its collection of manuscripts, to 

In [8]:
coqa['data'][0].keys()

dict_keys(['source', 'id', 'filename', 'story', 'questions', 'answers', 'name'])

In [9]:
cols = ["text","question","answer"]

comp_list = []
for index, row in coqa.iterrows():
    for i in range(len(row["data"]["questions"])):
        temp_list = []
        temp_list.append(row["data"]["story"])
        temp_list.append(row["data"]["questions"][i]["input_text"])
        temp_list.append(row["data"]["answers"][i]["input_text"])
        comp_list.append(temp_list)

new_df = pd.DataFrame(comp_list, columns=cols) 

In [10]:
new_df.to_csv("CoQA_data.csv", index=False)

In [11]:
data = pd.read_csv("CoQA_data.csv")
data.head()

Unnamed: 0,text,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


In [12]:
print("Number of question and answers: ", len(data))

Number of question and answers:  108647


In [24]:
from transformers import TFBertForQuestionAnswering
from transformers import BertTokenizer

In [38]:
model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [39]:
random_num = np.random.randint(0,len(data))

question = data["question"][random_num]
text = data["text"][random_num]

In [40]:
random_num

29388

In [41]:
input_ids = tokenizer(question, text, return_tensors="tf")
input_ids.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [42]:
input_ids.input_ids[0]

<tf.Tensor: shape=(459,), dtype=int32, numpy=
array([  101,  2054,  2265,  1029,   102,  1006, 13229,  1007,  1011,
        1011, 17709,  5686,  7723,  2003,  2242,  1997,  1037,  2542,
        5722,  2426, 10205,  1997,  4126,  4349,  1012,  1037,  5440,
        1997,  8817,  1997,  8141,  1010,  1037,  5394,  2000,  7644,
        1997,  4898,  1010,  2002,  1005,  1055,  2042,  2170,  1000,
        2637,  1005,  1055,  4602,  4126,  3213,  1012,  1000,  1996,
        6564,  1011,  2095,  2214,  3166,  2038,  2042,  3015,  2190,
       23836,  2075,  2808,  2005,  8442,  2086,  1010,  3262,  2530,
        2015,  1998,  4126,  6002,  1012,  2116,  1997,  2068,  2031,
        2042,  2357,  2046,  2718,  5691,  1010,  2164,  1000,  1017,
        1024,  2184,  2000,  9805,  2863,  1010,  1000,  1000,  2131,
        2460,  2100,  1000,  1998,  1000,  2041,  1997,  4356,  1012,
        1000,  2085,  1010,  7723,  5651,  2000,  2028,  1997,  2010,
        5440,  3494,  1999,  2010, 14751,  2

In [43]:
print("The input has a total of {} tokens.".format(len(input_ids.input_ids[0])))

The input has a total of 459 tokens.


In [44]:
tokens = tokenizer.convert_ids_to_tokens(input_ids.input_ids[0])

for token, id in zip(tokens, input_ids.input_ids[0]):
    print('{:8}{:8,}'.format(token,id))

[CLS]        101
what       2,054
show       2,265
?          1,029
[SEP]        102
(          1,006
cnn       13,229
)          1,007
-          1,011
-          1,011
elm       17,709
##ore      5,686
leonard    7,723
is         2,003
something   2,242
of         1,997
a          1,037
living     2,542
legend     5,722
among      2,426
lovers    10,205
of         1,997
crime      4,126
fiction    4,349
.          1,012
a          1,037
favorite   5,440
of         1,997
millions   8,817
of         1,997
readers    8,141
,          1,010
a          1,037
hero       5,394
to         2,000
scores     7,644
of         1,997
writers    4,898
,          1,010
he         2,002
'          1,005
s          1,055
been       2,042
called     2,170
"          1,000
america    2,637
'          1,005
s          1,055
greatest   4,602
crime      4,126
writer     3,213
.          1,012
"          1,000
the        1,996
86         6,564
-          1,011
year       2,095
old        2,214
author     3,

In [45]:
input_ids.token_type_ids[0]    #Where 0's correpond to question tokens and 1's to context tokens

<tf.Tensor: shape=(459,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     

In [46]:
output = model(input_ids)

In [47]:
print(output.start_logits)
print('\n')
print(output.end_logits)

tf.Tensor(
[[ 0.96938336 -4.945085   -8.391359   -9.309101    0.9693804  -6.896672
  -3.6343331  -8.780928   -7.327328   -7.7984157  -5.2561417  -8.230055
  -5.1379857  -8.726229   -7.2613993  -9.092161   -8.1635895  -7.254375
  -8.012297   -8.77238    -7.26429    -8.662228   -4.251384   -7.3385677
  -9.347054   -7.478221   -7.099795   -8.750254   -7.5264087  -8.873885
  -7.992382   -9.289098   -7.759342   -8.19162    -8.875092   -8.097706
  -9.043478   -7.929871   -9.169423   -6.738127   -8.784195   -8.717615
  -8.417713   -8.254398   -6.8623924  -5.830492   -7.9834275  -8.147529
  -7.2250786  -6.285223   -7.661238   -8.65226    -8.910417   -7.1059246
  -6.679006   -8.167076   -8.700924   -8.458836   -7.205531   -7.9778256
  -8.141532   -7.2472134  -7.2914357  -8.154713   -8.895203   -7.0017657
  -8.390167   -7.712894   -8.546755   -9.008246   -7.634974   -7.0369453
  -8.886316   -8.663387   -6.411747   -7.5073     -8.615711   -7.1074414
  -8.406      -8.355878   -7.916001   -7.918375

In [48]:
#tokens with highest start and end scores
answer_start = tf.argmax(tf.cast(output.start_logits, tf.int32), axis=1)
answer_end = tf.where(tf.equal(output.end_logits, float(tf.reduce_max(output.end_logits[0]))))[:,-1]

In [49]:
print(answer_start, answer_end)

tf.Tensor([205], shape=(1,), dtype=int64) tf.Tensor([205], shape=(1,), dtype=int64)


In [50]:
if answer_end >= answer_start:
    answer = " ".join(tokens[int(answer_start):int(answer_end)+1])
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")
    
print("Text:\n{}".format(text.capitalize()))
print("\nQuestion:\n{}".format(question.capitalize()))
print("\nAnswer:\n{}.".format(answer.capitalize()))

Text:
(cnn) -- elmore leonard is something of a living legend among lovers of crime fiction. a favorite of millions of readers, a hero to scores of writers, he's been called "america's greatest crime writer." the 86-year old author has been writing bestselling books for sixty years, mostly westerns and crime novels. many of them have been turned into hit movies, including "3:10 to yuma," "get shorty" and "out of sight." 

now, leonard returns to one of his favorite characters in his newest book, his 45th novel to be exact, titled simply, "raylan." that would be u.s. marshal raylan givens. the laid back, stetson-wearing lawman first appeared in leonard's novels, "pronto" and "riding the rap" and again in the 2001 short story, "fire in the hole" which became the basis for the hit tv show, "justified," starring timothy olyphant as the title character. the actor and the show are winning over fans, critics and leonard himself. so much so that leonard has returned to writing about "raylan." 

In [51]:
data.loc[random_num]

text        (CNN) -- Elmore Leonard is something of a livi...
question                                           What show?
answer                                             Justified,
Name: 29388, dtype: object

In [52]:
answer = tokens[int(answer_start)]

for i in range(int(answer_start)+1, int(answer_end)+1):
    if tokens[i][0:2] == "##":
        answer += tokens[i][2:]
    else:
        answer += " " + tokens[i]

In [53]:
def question_answer(question, text):
    
    #tokenize question and text in ids as a pair
    input_ids = tokenizer(question, text, return_tensors="tf")
    
    #string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids.input_ids[0])
    
    #model output using input_ids and segment_ids
    output = model(input_ids)
    
    #reconstructing the answer
    answer_start = tf.argmax(tf.cast(output.start_logits, tf.int32), axis=1)
    answer_end = tf.where(tf.equal(output.end_logits, float(tf.reduce_max(output.end_logits[0]))))[:,-1]

    if answer_end >= answer_start:
        answer = tokens[int(answer_start)]
        for i in range(int(answer_start)+1, int(answer_end)+1):
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]
                
    if answer.startswith("[CLS]"):
        answer = "Unable to find the answer to your question."
    
    print("\nAnswer:\n{}".format(answer.capitalize()))

In [54]:
text = """
New York (CNN) -- More than 80 Michael Jackson collectibles -- including the late pop star's famous rhinestone-studded glove from a 1983 performance -- were 
auctioned off Saturday, reaping a total $2 million. Profits from the auction at the Hard Rock Cafe in New York's Times Square crushed pre-sale expectations of 
only $120,000 in sales. The highly prized memorabilia, which included items spanning the many stages of Jackson's career, came from more than 30 fans, 
associates and family members, who contacted Julien's Auctions to sell their gifts and mementos of the singer. Jackson's flashy glove was the big-ticket item 
of the night, fetching $420,000 from a buyer in Hong Kong, China. Jackson wore the glove at a 1983 performance during \"Motown 25,\" an NBC special where he 
debuted his revolutionary moonwalk. Fellow Motown star Walter \"Clyde\" Orange of the Commodores, who also performed in the special 26 years ago, said he 
asked for Jackson's autograph at the time, but Jackson gave him the glove instead. "The legacy that [Jackson] left behind is bigger than life for 
me,\" Orange said. \"I hope that through that glove people can see what he was trying to say in his music and what he said in his music.\" Orange said 
he plans to give a portion of the proceeds to charity. Hoffman Ma, who bought the glove on behalf of Ponte 16 Resort in Macau, paid a 25 percent buyer's 
premium, which was tacked onto all final sales over $50,000. Winners of items less than $50,000 paid a 20 percent premium.
"""

question = "Where was the Auction held?"

In [55]:
question_answer(question, text)


Answer:
The hard rock cafe in new york ' s times square


In [56]:
print("Original answer:\n", data.loc[data["question"] == question]["answer"].values[0])

Original answer:
 Hard Rock Cafe


In [57]:
question="How is called the place where was done the Auction?"
question_answer(question, text)


Answer:
Hard rock cafe


In [58]:
text2 = """
Seth Rollins won the WWE World Heavyweight Championship in the main event at Wrestlemania 31 in the Levi's stadium that attended 
over 100000 people, in a fantastic cashing of The Money In The Bank contract in a heartstopping moment where both fighter were
absolutely hurted. The Undertaker made his return and continued succesfully his streak by defeating Bray Wyatt in a tough match.
John Cena defeated Rusev for the US Championship in a match that became into a battle of nations and 
at moments polemic. One of the dream match all fans loved to see is the battle of two of the greatest teams in the industry: NWO vs DX, 
being represented by Sting and Triple H respectively.
Vince McMahon (WWE's chairman) said: The event was a success and left satisfied the fans because of the surprise factor and the perfect performance
of all superstars. Nevertheless there are plans for tomorrow at monday night RAW where the former champion Brock Lesnar
will have a rematch for the title. """

In [59]:
questions = ["Who won WWE championship?",
             "How many people attented Wrestlemania 31?",
             "Who made his return at the event?",
             "Who represented NWO?",
             "Who represented DX?",
             "Who is WWE's chairman?",
             "What title won John Cena?",
             "When is will be monday night RAW?",
             "Why the event was a success?",
             "What will happen at RAW live event?"]

In [60]:
for quest in questions:
    question_answer(quest, text2)


Answer:
Seth rollins

Answer:
Over 100000

Answer:
The undertaker

Answer:
Sting

Answer:
Triple h

Answer:
Vince mcmahon

Answer:
Us championship

Answer:
Tomorrow

Answer:
The surprise factor and the perfect performance of all superstars

Answer:
Brock lesnar will have a rematch for the title


In [61]:
text = input("Please enter your text: \n")
question = input("\nPlease enter your question: \n")

while True:
    question_answer(question, text)
    
    flag = True
    flag_N = False
    
    while flag:
        response = input("\nDo you want to ask another question based on this text (Y/N)? ")
        if response[0] == "Y":
            question = input("\nPlease enter your question: \n")
            flag = False
        elif response[0] == "N":
            print("\nBye!")
            flag = False
            flag_N = True
            
    if flag_N == True:
        break

Please enter your text: 
hello

Please enter your question: 
hi

Answer:
Hi [sep]

Do you want to ask another question based on this text (Y/N)? y

Do you want to ask another question based on this text (Y/N)? what is

Do you want to ask another question based on this text (Y/N)? Y

Please enter your question: 
what is war?

Answer:
Unable to find the answer to your question.

Do you want to ask another question based on this text (Y/N)? Y

Please enter your question: 
kk

Answer:
Kk [sep]

Do you want to ask another question based on this text (Y/N)? N

Bye!
