In [1]:
import numpy as np
import requests
import html2text
from googlesearch import search
import json
import re
from simpletransformers.question_answering import QuestionAnsweringModel
from IPython.display import display
from IPython.html import widgets
from bs4 import BeautifulSoup
from markdown import markdown



In [2]:
model = QuestionAnsweringModel('distilbert', 'distilbert-base-uncased-distilled-squad')

In [8]:
question_data = {
    'qas': 
    [{'question': 'What color is the sky',
       'id': 0,
        'answers': [{'text': ' ', 'answer_start': 0}],
        'is_impossible': False}],
        'context': 'the sky is blue'
    }

In [10]:
question_data

{'qas': [{'question': 'What color is the sky',
   'id': 0,
   'answers': [{'text': ' ', 'answer_start': 0}],
   'is_impossible': False}],
 'context': 'the sky is blue'}

In [12]:
prediction = model.predict([question_data])

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 1679.06it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 25420.02it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
print(prediction)

([{'id': 0, 'answer': ['the sky is blue', 'blue', 'sky is blue', 'the sky', 'is blue', 'sky', 'the', '', 'the sky is', 'sky is']}], [{'id': 0, 'probability': [0.6718919034393995, 0.2903915782771355, 0.025824106266482086, 0.009136212452780911, 0.0017607881690078252, 0.0003511495227819054, 0.0003260312949665356, 0.0002628223258765999, 5.322314582283977e-05, 2.0456269327398623e-06]}])


In [16]:
input(question_data)

{'qas': [{'question': 'What color is the sky', 'id': 0, 'answers': [{'text': ' ', 'answer_start': 0}], 'is_impossible': False}], 'context': 'the sky is blue'} When did Albert Einstein die?


'When did Albert Einstein die?'

In [17]:
question_data


{'qas': [{'question': 'What color is the sky',
   'id': 0,
   'answers': [{'text': ' ', 'answer_start': 0}],
   'is_impossible': False}],
 'context': 'the sky is blue'}

In [18]:
!export CUDA_VISIBLE_DEVICES=1,2

In [19]:
!echo $CUDA_VISIBLE_DEVICES

1,2


In [22]:
prediction = model.predict([question_data])

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 1136.67it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 24528.09it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
print(prediction)


([{'id': 0, 'answer': ['the sky is blue', 'blue', 'sky is blue', 'the sky', 'is blue', 'sky', 'the', '', 'the sky is', 'sky is']}], [{'id': 0, 'probability': [0.6718919034393995, 0.2903915782771355, 0.025824106266482086, 0.009136212452780911, 0.0017607881690078252, 0.0003511495227819054, 0.0003260312949665356, 0.0002628223258765999, 5.322314582283977e-05, 2.0456269327398623e-06]}])


In [81]:
# Example

# model = create_model()

# print(predict_answer(model, 'what color is the bird?', 'the bird is red.'))

question = 'What color is the sky?'
links = list(search(question, num_results=2))
req = requests.get(links[1])
markdown_string = req.text
context = 'sky is blue'
contexts = []
contexts.append(context)
#pred = predict_answer(model, question, context)
#print(pred)

In [82]:
links
#req.text
#print(html2text.html2text(markdown_string)

['https://www.universetoday.com/74020/what-color-is-the-sky/',
 'https://spaceplace.nasa.gov/blue-sky/en/']

In [84]:
print(html2text.html2text(req.text))

blue-sky

[ ![Link to nasa.gov](/resources/homepage/nasa.png) ](https://www.nasa.gov)

[ ![Link to science.nasa.gov.](/resources/homepage/nasa-science-logo-
horizontal.png) ](https://science.nasa.gov) [ ![Image that reads Space Place
and links to spaceplace.nasa.gov.](/resources/homepage/logo.png) ](/)

[Vea en Español](/sp/blue-sky/)

![Search button.](/resources/homepage/search-button.png)

[![Illustration of Earth that links to the Space Place Earth
menu.](/resources/homepage/nav_earth.png)Earth](/menu/earth) [![Illustration
of the Sun that links to the Space Place Sun
menu.](/resources/homepage/nav_sun.png)Sun](/menu/sun) [![Illustration of
Saturn that links to the Space Place Solar System
menu.](/resources/homepage/nav_solarsystem.png)Solar System](/menu/solar-
system) [![Illustration of a galaxy that links to the Space Place Universe
menu.](/resources/homepage/nav_space.png)Universe](/menu/space)
[![Illustration of a spacecraft that links to the Space Place Science and Tech
menu.

In [69]:
# Source: https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
def markdown_to_text(markdown_string):
    """ Converts a markdown string to plaintext """

    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # remove code snippets
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
    html = re.sub(r'<code>(.*?)</code >', ' ', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.findAll(text=True))

    return text

def format_text(text):
    text = markdown_to_text(text)
    text = text.replace('\n', ' ')
#     text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
#     text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    print(text)

    return text

In [54]:
def predict_answer(model, question, contexts, seq_len=512, debug=False):
    split_context = []
    
    if not isinstance(contexts, list):
        contexts = [contexts]
    
    for context in contexts:
        for i in range(0, len(context), seq_len):
            split_context.append(context[i:i+seq_len])
            
    split_context = contexts
    
    f_data = []
    
    for i, c in enumerate(split_context):
        f_data.append(
            {'qas': 
              [{'question': question,
               'id': i,
               'answers': [{'text': ' ', 'answer_start': 0}],
               'is_impossible': False}],
              'context': c
            })
        
    prediction = model.predict(f_data)
    if debug:
        print(prediction)
    preds = [x['answer'].lower().strip() for x in prediction if x['answer'].strip() != '']
    
    return max(set(preds), key = preds.count)

In [36]:
def query_pages(query, n=5):
    return list(search(query, num=n, stop=n, pause=2))

def query_to_text(query, n=5):
    html_conv = html2text.HTML2Text()
    html_conv.ignore_links = True
    html_conv.escape_all = True
    
    text = []
    for link in query_pages(query, n):
        req = requests.get(link)
        text.append(html_conv.handle(req.text))
        text[-1] = format_text(text[-1])
        
    return text

In [55]:
question = 'What color is blood?'
result = query_to_text(question, n=5)
print(result)

TypeError: search() got an unexpected keyword argument 'num'

In [56]:
def q_to_a(model, question, n=2, debug=False):
    context = query_to_text(question, n=n)
    pred = predict_answer(model, question, context, debug=debug)
    return pred


In [None]:
question = 'What color is blood?'
context = query_to_text(question, n=3)
#pred = predict_answer(model, question, context)
#print(pred)