In [1]:
import urllib.request as urllib
from bs4 import BeautifulSoup
import numpy as np
np.set_printoptions(threshold=10) #np.inf
from scipy import spatial as cs
import json
from pprint import pprint

## Function definitions

In [2]:
# Read in a webpage
def read_in_page(url):
    #Query the website and return the html to the variable 'page'
    page = urllib.urlopen(url)
    soup = BeautifulSoup(page, "lxml")
    return(soup)

# Get all relevant links in a webpage
def get_links(page):
    links = page.find_all("a", class_="action")
    links_new = []
    for link in links:
        sep = '?'
        rest = link['href'].split(sep, 1)[0]
        links_new.append(rest)
    return(links_new)

# Returns url of UvA search engine with keywords
def create_search_query_url(keywords_list):
    main_url = "http://student.uva.nl/search?q=" + keywords_list[0]
    keywords_list.remove(keywords_list[0])
    
    for keyword in keywords_list:
        main_url = main_url + "+" + keyword
    
    return(main_url)

# Returns a list of keywords when given a question URL.
def get_keywords(question_url):
    # grab keywords with beautiful soup
    question_page = read_in_page(question_url)
    keyword_tag = question_page.findAll("meta", {"name": 'keywords'})
    keywords = keyword_tag[0]['content']
    return(keywords.split(','))

# Create general vector for whole dictionary
def general_vector(dictionary):
    # first make a set to ensure no duplicates are added to the vector
    general_vector = set()
    # self explanatory
    for faculty, studies in data.items():
        for study, az_link in studies.items():
            for article, keyword_lists in az_link.items():
                for keyword_list in keyword_lists:
                    keywords = keyword_list.split(',')
                    for word in keywords:
                        general_vector.add(word)
    return list(general_vector)

# Create vector for an article
def vector(URL_keywords, general_vector):
    # URL_keywords is a list of keywords for a certain URL we want a vector for
    URL_vector = np.zeros(len(general_vector))
    # loops through the (now zero) values of the URL vector
    for i, val in enumerate(URL_vector):
        # loops through the given keywords of this URL
        for keyword in URL_keywords:
            # if an article keyword matches a keyword in our general vector
            if keyword == general_vector[i]:
                # change the value of the URL vector to be 1 in the same place as the general vector
                URL_vector[i] = 1
    return URL_vector

### Test with example query

In [5]:
# Example test
example_query = 'Wat zijn de toelatingseisen van kunstmatige intelligentie?'
example_keywords = ['toelatingseisen', 'kunstmatige', 'intelligentie']
example_url = create_search_query_url(example_keywords)
print("Input query keywords:", example_keywords)
print("Query URL:", example_url, "\n")

example_page = read_in_page(example_url)
example_search_results = get_links(example_page)
example_top_five = example_search_results[:5]
print("Top 5 search results:", example_top_five, "\n")

print("Correct webpage URL:", example_search_results[0])
result_keywords = get_keywords(example_search_results[0])
print("Keywords of correct URL:", result_keywords)

Input query keywords: ['kunstmatige', 'intelligentie']
Query URL: http://student.uva.nl/search?q=toelatingseisen+kunstmatige+intelligentie 

Top 5 search results: ['http://www.uva.nl/programmas/bachelors/kunstmatige-intelligentie/toelating-en-inschrijven/toelatingseisen/toelatingseisen.html', 'http://www.uva.nl/shared-content/studentensites/fnwi/iw-gedeelde-content/nl/az/vakaanmelding/vakaanmelding.html', 'http://www.uva.nl/shared-content/studentensites/fnwi/iw-gedeelde-content/nl/az/bindend-studieadvies-bsa/bindend-studieadvies-bsa.html', 'http://www.student.uva.nl/ki', 'http://www.uva.nl/programmas/bachelors/kunstmatige-intelligentie/toelating-en-inschrijven/toelating-en-inschrijven.html'] 

Correct webpage URL: http://www.uva.nl/programmas/bachelors/kunstmatige-intelligentie/toelating-en-inschrijven/toelatingseisen/toelatingseisen.html
Keywords of correct URL: ['faculteiten', 'natuurwetenschappen', ' wiskunde en informatica']


I wanted to test the whole process with an example query ('Wat zijn de toelatingseisen van kunstmatige intelligentie?'), but as shown above the correct webpage does not have the same keywords as the input query. Specifically, it does not have any keywords that would indicate the content of the article. 

However it is the first search result returned by the UvA search engine. Although, it needs to be said that the keywords needs to exactly match either the words in the url or the title of the article. Because the keyword 'ki' will not return the same results as 'kunstmatige', 'intelligentie', neither does 'ingangseisen' work.
Moreover, we would not be able to calculate the cosine similarity or any kind of score based on matching keywords.

### Import json file and replace dictionary keywords with vectors

In [6]:
with open('uva_json_file', 'r') as f:
    data = json.load(f)

In [7]:
generalvec = general_vector(data)
print(len(generalvec))

3471


In [8]:
%%time
# Replace the keywords of each article with a vector
for faculty, studies in data.items():
    for study, az_links in studies.items():
        for article, keyword_lists in az_links.items():
            for keyword_list in keyword_lists:
                keywords = keyword_list.split(',')
                vec = vector(keywords, generalvec)
            # save vec in list form instead of np array 
            # because arrays don't work with json
            az_links.update({article: vec}) 

CPU times: user 1min 18s, sys: 264 ms, total: 1min 19s
Wall time: 1min 19s


### Test UvA search engine & cosine similarity scores
In this test, we'll be using the UvA search engine. We'll use an example query and determine which words will be the keywords, these keywords will be put in the UvA search engine. A list of search results will be returned, and the cosien similarity scores will be calculated for the top 10 search results.

In [9]:
# Example test
query = 'Wat zijn de toelatingseisen van kunstmatige intelligentie?' # example query
query_keywords = ['toelatingseisen', 'kunstmatige', 'intelligentie'] # keywords from the query
query_url = create_search_query_url(query_keywords) # resulting url of keywords put in search engine

query_page = read_in_page(query_url) # read in query url
search_results = get_links(query_page) # get the links of the search results
top_five = search_results[:5] # get the links of the top 5 search results

keywords = get_keywords(search_results[0]) # get the keywords of a search result

In [10]:
# Define the query and top 5 article vectors
query_vector = vector(query_keywords, generalvec)
article1_vector = vector(search_results[0], generalvec)
article2_vector = vector(search_results[1], generalvec)
article3_vector = vector(search_results[2], generalvec)
article4_vector = vector(search_results[3], generalvec)
article5_vector = vector(search_results[4], generalvec)

print("Query vector:", query_vector)
print("Article 1 vector:", article1_vector)
print("Article 2 vector:", article2_vector)
print("Article 3 vector:", article3_vector)
print("Article 4 vector:", article4_vector)
print("Article 5 vector:", article5_vector)

Query vector: [ 0.  0.  0. ...,  0.  0.  0.]
Article 1 vector: [ 0.  0.  0. ...,  0.  0.  0.]
Article 2 vector: [ 0.  0.  0. ...,  0.  0.  0.]
Article 3 vector: [ 0.  0.  0. ...,  0.  0.  0.]
Article 4 vector: [ 0.  0.  0. ...,  0.  0.  0.]
Article 5 vector: [ 0.  0.  0. ...,  0.  0.  0.]


In [11]:
# Calculate the cosine similarities
cosine_sim = 1 - cs.distance.cosine(article1_vector, query_vector)
print(cosine_sim)

nan


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


As mentioned above the correct webpage does not contain the same keywords as the input query, even though the correct webpage is incorporated in our json file, so those keywords do appear in our general vector. However, the query keywords are matched in the content of the article, just not the embedded keywords. This results in the query vector consisting of only zero's, so it's impossible to calculate the cosine similarity. 

### Test 2

In [16]:
# Example test
query = 'Wat zijn de laptop eisen voor kunstmatige intelligentie?' # example query
query_keywords = ['eisen', 'kunstmatige', 'intelligentie', 'laptop'] # keywords from the query
query_url = create_search_query_url(query_keywords) # resulting url of keywords put in search engine

query_page = read_in_page(query_url) # read in query url
search_results = get_links(query_page) # get the links of the search results

# get the keywords for the top 5 search results
article1_keywords = get_keywords(search_results[0]) 
article2_keywords = get_keywords(search_results[1])
article3_keywords = get_keywords(search_results[2])
article4_keywords = get_keywords(search_results[3])
article5_keywords = get_keywords(search_results[4])

In [17]:
# Define the query and top 5 article vectors
query_vector = vector(query_keywords, generalvec)
article1_vector = vector(article1_keywords, generalvec)
article2_vector = vector(article2_keywords, generalvec)
article3_vector = vector(article3_keywords, generalvec)
article4_vector = vector(article4_keywords, generalvec)
article5_vector = vector(article5_keywords, generalvec)

print("Query vector:", query_vector)
print("Article 1 vector:", article1_vector)
print("Article 2 vector:", article2_vector)
print("Article 3 vector:", article3_vector)
print("Article 4 vector:", article4_vector)
print("Article 5 vector:", article5_vector)

Query vector: [ 0.  0.  0. ...,  0.  0.  0.]
Article 1 vector: [ 0.  0.  0. ...,  0.  0.  0.]
Article 2 vector: [ 0.  0.  0. ...,  0.  0.  0.]
Article 3 vector: [ 0.  0.  0. ...,  0.  0.  0.]
Article 4 vector: [ 0.  0.  0. ...,  0.  0.  0.]
Article 5 vector: [ 0.  0.  0. ...,  0.  0.  0.]


In [18]:
# Calculate the cosine similarities
cosine_sim1 = 1 - cs.distance.cosine(article1_vector, query_vector)
cosine_sim2 = 1 - cs.distance.cosine(article2_vector, query_vector)
cosine_sim3 = 1 - cs.distance.cosine(article3_vector, query_vector)
cosine_sim4 = 1 - cs.distance.cosine(article4_vector, query_vector)
cosine_sim5 = 1 - cs.distance.cosine(article5_vector, query_vector)

print(cosine_sim1)
print(cosine_sim2)
print(cosine_sim3)
print(cosine_sim4)
print(cosine_sim5)

0.333333333333
0.0
0.0
0.0
0.0


The first article has a cosine similarity of 0.333, whereas the rest of the articles has a cosine similarity of 0.0. So the first article has the highest probability of returning the answer. The difference between this test and the first test, is that some of the query keywords do appear in the keyword list of the search result.

In [21]:
print("Query keywords:", query_keywords, "\n")
print("URL of first article:", search_results[0])
print("Keywords of the first article:", article1_keywords)

Query keywords: ['kunstmatige', 'intelligentie', 'laptop'] 

URL of first article: http://student.uva.nl/ki/content/az/laptop-minimumeisen/laptop-minimumeisen.html
Keywords of the first article: ['minimumeisen', 'studenten', 'ict en faciliteiten', 'natuurwetenschappen', ' wiskunde en informatica', 'ict', 'computereisen', 'laptop', 'kunstmatige intelligentie']
