In [7]:
from SPARQLWrapper import SPARQLWrapper, JSON, XML
import ssl
import pandas as pd
import urllib.request
from PIL import Image
import time
import shutil
import requests
import numpy as np
import random


In [8]:
ssl._create_default_https_context = ssl._create_unverified_context

In [9]:
def get_countries_df():
  """
  This function makes the SPARQL request to DBpedia to get the countries df

  Returns : 
  - df_countries : The df of countries information
  """

  sparql = SPARQLWrapper("http://dbpedia.org/sparql")
  sparql.setQuery("""
      SELECT DISTINCT ?country  ?capital ?country_name ?capital_name ?country_flag ?country_population ?country_abstract
      WHERE {
        ?country a yago:WikicatCountries , dbo:Country.
        ?country dbo:capital ?capital.
        ?country rdfs:label ?country_name.
        ?capital rdfs:label ?capital_name.
        ?country dbo:thumbnail ?country_flag.
        ?country dbo:abstract ?country_abstract.
        ?country dbo:populationTotal ?country_population

        FILTER(lang(?country_name) = 'en')
        FILTER(lang(?capital_name) = 'en')
        FILTER(lang(?country_abstract) = 'en')

      } 
      
      ORDER BY ?country
          
  """)
  sparql.setReturnFormat(JSON)
  results = sparql.query().convert()

  d = {
      "Country_uri" : [],
      "Country_name" : [],
      "Country_flag" : [], 
      "Capital_uri" : [], 
      "Capital_name" : [], 
      "Country_population" : [] ,
      "Country_abstract" : []
      }

  for result in results["results"]["bindings"]:
    d["Country_uri"].append(result["country"]["value"])
    d["Country_name"].append(result["country_name"]["value"])
    d["Country_flag"].append(result["country_flag"]["value"])
    d["Capital_uri"].append(result["capital"]["value"])
    d["Country_population"].append(int(result["country_population"]["value"]))
    d["Capital_name"].append(result["capital_name"]["value"].split(",")[0]) #split with , because of "capital, region" kind of answer

    d["Country_abstract"].append(result["country_abstract"]["value"])

  df_countries = pd.DataFrame(d)

  return df_countries

In [10]:
#Printing random country with capital and flag

def print_random_country_capital_flag(df_countries):
    """
    Given a countries df, prints a random country with its capital and display its flag

    Function arguments : 
    - df_countries : the countries df
    """
    df_countries = get_countries_df()

    row = df_countries.sample(ignore_index=True).iloc[0]
    country_name = row.loc["Country_name"]
    country_capital = row["Capital_name"]
    flag_link = row["Country_flag"]

    print(f"Pays : {country_name}")
    print(f"Capitale : {country_capital}")
    
    urllib.request.urlretrieve(flag_link, "temp_flag.png")
    
    img = Image.open("temp_flag.png")
    img.show()


In [11]:
from urllib.parse import urlsplit, urlunsplit, quote

def iri2uri(iri):
    """
    Convert an IRI to a URI

    Function arguments :
    - iri : iri to convert
    
    Returns :
    - uri : the converted IRI
    """
    uri = ''
    if isinstance(iri, str):
        (scheme, netloc, path, query, fragment) = urlsplit(iri)
        scheme = quote(scheme)
        netloc = netloc.encode('idna').decode('utf-8')
        path = quote(path)
        query = quote(query)
        fragment = quote(fragment)
        uri = urlunsplit((scheme, netloc, path, query, fragment))

    return uri

In [12]:
# Downloading all flags for all countries
def download_flag(df_countries):
    """
    From df_countries download all the flags

    Function arguments :
    - df_countries : the countries df
    """
    done = [] # for coutries that appear twice or more in the dataset (multiple capitals)

    for index, row in df_countries.iterrows():
        time.sleep(0.5)
        country = row["Country_name"]
        if country not in done:
            try :
                url = row["Country_flag"]
                urllib.request.urlretrieve(url, "../country_flags/" + country + "_flag.png")
                done.append(country)
            except:
                print(country)

We can see that we have some problems dealing with Ivory Coast and Spain, until now I have not find a solution so I'll just hardcode those two by downloading manually the flags...

In [13]:
def get_country_list(df_countries):
    """
    From the countries df returns the list of countries
    
    Function arguments : 
    - df_countries : the countries df
    
    Returns :
    - country_list : a list of all country names
    """
    country_list = df_countries["Country_name"].tolist()
    return country_list

In [14]:
def generate_flag_question(country_name,country_list):
    """
    From a country name, generates a multiple answer question with 4 flags and one correct flag to guess

    Function arguments :
    - country_name : str of the name of the country
    - country_list : a list of all country names
    
    Returns :
    - country_flag_filename_list : list of the country_flag filenames
    - d : dict such that dict[flag_filename] = True if the flag is correct and False in the other cases
    """

    correct_flag_filename = "../country_flags/" + country_name + "_flag.png"

 

    country_flag_filename_list = [correct_flag_filename]

    for _ in range(3):
        found = False #In case the random country is already selected
        while not found :
            random_country_name = country_list[np.random.randint(0,len(country_list))]
            random_country_flag_filename = "../country_flags/" + random_country_name + "_flag.png"
            found = random_country_flag_filename not in country_flag_filename_list
        country_flag_filename_list.append(random_country_flag_filename)
    
    random.shuffle(country_flag_filename_list)

    d = {}

    for country_flag_filename in country_flag_filename_list : 
        if country_flag_filename == correct_flag_filename : 
            d[country_flag_filename] = True
        else:
            d[country_flag_filename] = False

    return country_flag_filename_list, d





In [15]:
def evaluate_country_flag_question(country_flag_filename_list, d, user_input):
    """
    Given a flag question, evaluates the user answer

    Function arguments :
    - country_flag_filename_list : list of the country_flag filenames
    - d : dict such that dict[flag_filename] = True if the flag_filename is correct and False else
    - user_input : the str input that the user made to answer the flag question (here we suppose that it is the index of the flag in the country_flag_filename_list)

    Returns : 
    - evaluation : a bool that is True if the answer is accepted and False else
    """

    user_flag_filename_proposal = country_flag_filename_list[int(user_input)]
    evaluation = d[user_flag_filename_proposal]
    return evaluation

In [16]:
#Cell to test the flag question

df_countries = get_countries_df()

country_list = get_country_list(df_countries)
country_name = country_list[np.random.randint(0,len(country_list))]

country_flag_filename_list, d = generate_flag_question(country_name,country_list)

#generating good user_input
good_user_input = ""
for k in range(len(country_flag_filename_list)):
    if d[country_flag_filename_list[k]]:
        good_user_input = str(k)
        break

good_evaluation = evaluate_country_flag_question(country_flag_filename_list, d, good_user_input)

#generating bad_user_input
bad_user_input = ""
for k in range(len(country_flag_filename_list)):
    if not d[country_flag_filename_list[k]]:
        bad_user_input = str(k)
        break

bad_evaluation = evaluate_country_flag_question(country_flag_filename_list, d, bad_user_input)

print(f"For country = {country_name}, the answer_dictionnary was : {d}")

print(f"The evaluation for the good user input which was {good_user_input} is {good_evaluation}")

print(f"The evaluation for the bad user which was {bad_user_input} is {bad_evaluation}")


For country = Saudi Arabia, the answer_dictionnary was : {'../country_flags/Saudi Arabia_flag.png': True, '../country_flags/Algeria_flag.png': False, '../country_flags/Equatorial Guinea_flag.png': False, '../country_flags/Togo_flag.png': False}
The evaluation for the good user input which was 0 is True
The evaluation for the bad user which was 1 is False


In [17]:
def get_country_capital_dict(df_countries):
    """
    From the countries df create a the country to capital dict

    Function arguments : 
    - df_countries : the countries df
    
    Return :
    - country_capital_dict : a dict such that dict[country_name] = country_capital
    """
    keys = get_country_list(df_countries)
    values = df_countries["Capital_name"].tolist()
    country_capital_dict = {}

    for i in range(len(keys)):
        country_capital_dict[keys[i]] = values[i]

    return country_capital_dict
    

In [18]:
def generate_country_capital_question(country_name, country_list, country_capital_dict):
    """
    From a country name, generates a multiple answer question with 4 capital and one correct capital to guess

    Function arguments :
    - country_name : str of the name of the country
    - country_list : a list of all country names
    - country_capital dict : a dict such that dict[country_name] = country_capital

    Returns :
    - list of the country_capital str
    - dict such that dict[country_capital] = True if the capital is correct and False in the other cases
    """

    correct_capital = country_capital_dict[country_name]
    

    country_capital_list = [correct_capital]

    for _ in range(3):
        found = False #In case the random country is already selected
        while not found :
            random_country_name = country_list[np.random.randint(0,len(country_list))]
            random_capital_name = country_capital_dict[random_country_name]
            found = random_capital_name not in country_capital_list
        country_capital_list.append(random_capital_name)

    
    random.shuffle(country_capital_list)

    d = {}
    for country_capital in country_capital_list:
        if country_capital == correct_capital:
            d[country_capital] = True
        else:
            d[country_capital] = False

    return country_capital_list, d

In [19]:
def evaluate_country_capital_question(country_capital_list, d, user_input):
    """
    Given a country capital question, evaluates the user answer

    Function arguments :
    - country_capital_list : list of the country_capital
    - d : dict such that dict[country_capital] = True if the country_capital is correct and False else
    - user_input : the str input that the user made to answer the country_capital question (here we suppose that it is the index of the country_capital in the country_capital_list)

    Returns : 
    - evaluation : a bool that is True if the answer is accepted and False else
    """

    user_country_capital_proposal = country_capital_list[int(user_input)]
    evaluation = d[user_country_capital_proposal]
    return evaluation

In [20]:
#Cell to test the capital question

df_countries = get_countries_df()

country_list = get_country_list(df_countries)
country_name = country_list[np.random.randint(0,len(country_list))]
country_capital_dict = get_country_capital_dict(df_countries)

country_capital_list, d = generate_country_capital_question(country_name, country_list, country_capital_dict)

#generating good user_input
good_user_input = ""
for k in range(len(country_capital_list)):
    if d[country_capital_list[k]]:
        good_user_input = str(k)
        break

good_evaluation = evaluate_country_flag_question(country_capital_list, d, good_user_input)

#generating bad_user_input
bad_user_input = ""
for k in range(len(country_capital_list)):
    if not d[country_capital_list[k]]:
        bad_user_input = str(k)
        break

bad_evaluation = evaluate_country_capital_question(country_capital_list, d, bad_user_input)

print(f"For country = {country_name}, the answer_dictionnary was : {d}")

print(f"The evaluation for the good user input which was {good_user_input} is {good_evaluation}")

print(f"The evaluation for the bad user which was {bad_user_input} is {bad_evaluation}")


For country = Iceland, the answer_dictionnary was : {'Reykjavík': True, 'Dodoma': False, 'Stockholm': False, 'City of San Marino': False}
The evaluation for the good user input which was 0 is True
The evaluation for the bad user which was 1 is False


In [21]:
def get_country_population_dict(df_countries):
    """
    From the cities df create a the country to capital dict

    Function arguments : 
    - df_countries : the countries df

    Returns :
    - country_population_dict : a dict such that dict[country_name] = country_population
    """
    keys = get_country_list(df_countries)
    values = df_countries["Country_population"].tolist()
    country_population_dict = {}

    for i in range(len(keys)):
        country_population_dict[keys[i]] = values[i]

    return country_population_dict
    

In [22]:
def generate_population_question(country_name, error_margin, country_population_dict):
    """
    From a country name, asks the population of the country given an error_margin
    
    Function arguments :
    - country_name : str of the name of the country
    - error_margin : number between 0 and 1 defining if the answer is "close enough" to the real number
    - country_population_dict : a dict such that dict[country_name] = country_population
    
    Returns :
    - interval : the acceptation interval (if the input is in the interval, the answer is considered True)
    """

    country_population = country_population_dict[country_name]
    lower = int((1-error_margin) * country_population)
    higher = int((1+error_margin) * country_population)
    interval = [lower, higher]

    return interval


In [23]:
def evaluate_population_question(interval, user_input):
    """
    Given a population question, evaluates the user answer

    Function arguments :
    - interval : the acceptation interval (if the input is in the interval, the answer is considered True)
    - user_input : the str input that the user made to answer the population question (here we suppose that it is the guessed population of the country)

    Returns : 
    - evaluation : a bool that is True if the answer is accepted and False else
    """
    lower = interval[0]
    higher = interval[1]
    guessed_population = int(user_input)
    evaluation = (guessed_population >= lower) and (guessed_population <= higher)
    
    return evaluation

In [24]:
#Cell to test the population question

df_countries = get_countries_df()

country_list = get_country_list(df_countries)
country_name = country_list[np.random.randint(0,len(country_list))]
error_margin = 0.1
country_population_dict = get_country_population_dict(df_countries)

interval = generate_population_question(country_name, error_margin, country_population_dict)


#generating good user_input

lower = interval[0]
higher = interval[1]

good_user_input = str((lower+higher)//2)

good_evaluation = evaluate_population_question(interval, good_user_input)

#generating bad_user_input
bad_user_input = str(lower-1)

bad_evaluation = evaluate_population_question(interval, bad_user_input)

print(f"For country = {country_name}, the population interval was : {interval}")

print(f"The evaluation for the good user input which was {good_user_input} is {good_evaluation}")

print(f"The evaluation for the bad user which was {bad_user_input} is {bad_evaluation}")


For country = Papua New Guinea, the population interval was : [8041500, 9828500]
The evaluation for the good user input which was 8935000 is True
The evaluation for the bad user which was 8041499 is False


Now trying some question generation from the abstract that was available for each country in DBpedia.

In [160]:
from transformers import pipeline
import pke
import random

In [162]:
# Carefull, the model can require some memory space (1.14GB)

def get_summarizer():
    """
    Get the summarizer
    Returns :
    - summarizer : the summarizer model from transformers
    """
    summarizer = pipeline("summarization")
    return summarizer

In [163]:
def summarize(text,summarizer):
    """
    Summarizes the given text.

    Function arguments :   
    - text : the text to summarize
    - summarizer : the summarizer model

    Returns :
    -summarized_text : the summarized text computed with the given model

    """
    
    max_chunk = 500 #Because of input size limits for transformers

    #Separating sentences
    text = text.replace('.', '.<eos>')
    text = text.replace('?', '?<eos>')
    text = text.replace('!', '!<eos>')

    sentences = text.split('<eos>')

    #Creating chunks of sentences of limited size (with regard to max_chunk)
    current_chunk = 0 
    chunks = []
    for sentence in sentences:
        if len(chunks) == current_chunk + 1: 
            if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                chunks[current_chunk].extend(sentence.split(' '))
            else:
                current_chunk += 1
                chunks.append(sentence.split(' '))
        else:
            chunks.append(sentence.split(' '))

    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = ' '.join(chunks[chunk_id])

    #summarizing the chunks one by one
    summarized_text = summarizer(chunks, max_length=120, min_length=30, do_sample=False)

    summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])

    return summarized_text



In [164]:
summarizer = get_summarizer()
df_countries = get_countries_df()

abstract = df_countries["Country_abstract"][40] #40 for France
summarized_abstract = summarize(abstract, summarizer)
summarized_abstract

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


" France is a transcontinental country spanning Western Europe and overseas regions and territories in the Americas and the Atlantic, Pacific and Indian Oceans . It is a unitary semi-presidential republic with its capital in Paris, the country's largest city and main cultural and commercial centre . France borders Belgium, Luxembourg, Germany, Switzerland, Monaco, Italy, Andorra and Spain in Europe, as well as the Netherlands, Suriname and Brazil . Its eighteen integral regions span a combined area of 643,801 km2 and over 67 million people .  France reached its political and military zenith in the early 19th century under Napoleon Bonaparte . The collapse of the empire initiated a period of relative decline, in which France endured a tumultuous succession of governments until the founding of the French Third Republic during the Franco-Prussian War in 1870 . France is a developed country with the world's seventh-largest economy by nominal GDP and ninth-largest by PPP ."

In [165]:
def extract_keyphrases(text, nb_keyphrases):
    """
    Extracts the keyphrases of the text

    Function arguments :
    - text : the text from which to extract the keyphrases

    Returns :
    keyphrases : a list of tuple of the form (keyphrase, score)
    """

    extractor = pke.unsupervised.TopicRank()

    extractor.load_document(input=text, language='en')

    extractor.candidate_selection()

    extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=nb_keyphrases)

    return keyphrases

In [171]:
nb_keyphrases = 10
summarized_abstract_keyphrases = extract_keyphrases(summarized_abstract, nb_keyphrases)
summarized_abstract_keyphrases

[('france', 0.053973506354601485),
 ('war', 0.032260360042388435),
 ('french', 0.0282115285774075),
 ('mid-15th century', 0.022648176687417428),
 ('powerful', 0.019211290238397894),
 ('world', 0.018312805326517398),
 ('main cultural', 0.01824327381204678),
 ('western europe', 0.017091702750270884),
 ('largest city', 0.016859537935881647),
 ('carolingian empire', 0.01621793516710052)]

In [201]:
from sense2vec import Sense2Vec

from collections import OrderedDict

In [200]:
def generate_distractors_from_word(word, s2v):

    """
    Generates a distractor list from a given word

    Function arguments :
    - word : the word to generate distractors from
    - s2v : the sentence to vector model

    Returns :
    - disctractors_list : a list of distractors of the word
    """

    output = []
    word = word.lower()
    word = word.replace(" ", "_")

    sense = s2v.get_best_sense(word)
    most_similar = s2v.most_similar(sense, n=20)

    # print ("most_similar ",most_similar)

    for each_word in most_similar:
        append_word = each_word[0].split("|")[0].replace("_", " ").lower()
        if append_word.lower() != word:
            output.append(append_word.title())

    disctractors_list = list(OrderedDict.fromkeys(output))
    return disctractors_list

In [205]:
s2v = Sense2Vec().from_disk('s2v_model')
for word, _ in summarized_abstract_keyphrases:
    try :
        disctractors_list = generate_distractors_from_word(word, s2v)
        print(word, disctractors_list)
    except :
        print(word + " is not a good keyphrase")

france ['Britain', 'Germany', 'Belgium', 'Serbia', ' France', 'Europe', 'Poland', 'Britian', 'Spain', 'Austria', 'Other European Countries', 'Algeria', 'Albania', 'Bulgaria', 'Bosnia', 'Hungary', 'Sweden', 'Turkey', 'Western Europe']
war ['Wars', 'Civil War', 'Actual War', 'Total War', 'Global War', 'Waged', 'Bloody War', 'Whole War', 'Massive War', 'Multiple Wars', 'Many Wars', 'Brutal War', 'Entire War', 'Uprising', 'Waging', 'Wage', 'Other War', 'Real War', 'Invasion', 'Other Wars']
french ['German', 'Dutch', 'Spanish', 'Polish', 'Portuguese', 'Turkish', 'Danish', 'English', 'Italian', 'Czech', 'Hungarian', 'British', 'Swedish', 'Russian']
mid-15th century is not a good keyphrase
powerful ['Powerfull', 'Poweful', 'Weaker', 'Strong', 'Overpowered', 'Potential Power', 'Raw Power', 'Most Powerful One', 'Sheer Power', 'Weak', 'Underpowered', 'Powered', 'Fearsome', 'Stronger', 'Puny', 'More Powerfull']
world ['Entire World', 'Whole World', 'World-', ' World', 'World*.', 'Planet Earth', '