In [19]:
%run Vector.ipynb

In [32]:
np.set_printoptions(threshold=10) #np.inf

## Input 

Dictionary:

    Language (int):
        0 = Nederlands       
        1 = Engels
              
    Type (int):  
        0 = Onbekend      
        1 = Wie        
        2 = Wat        
        3 = Waar        
        4 = Waarom        
        5 = Wanneer        
        6 = Hoe        
        7 = Welke        
        
    Level (str):        
        Studie        
        Faculteit        
        UvA        
        None        
        
    Keywords (set):        
        {<keyword_1>, ..., <keyword_n>}        
        
    Source (str):
        <Oorspronkelijke zin>

We assume that for this specific problem (only searching the tree we have established) we actually only need the keywords provided from the input query. The reason for this is that the keywords of the articles we have are a mix of both Dutch and English and will therefore automatically return the right URL. The type is irrelevant for this problem as we only have access to the keywords of the articles because we have not scraped the entire texts. Therefore, we are unable to parse what kind of question a person is asking, this will be handled in our other solution of using Watson to find the right text. 
The source is also not relevant as we have the keywords necessary.

In [110]:
query = {'Language':0, 
         'Type':0, 
         'Level':'Kunstmatige intelligentie (bachelor)', 
         'Keywords':set(['laptop', 'eisen', 'kunstmatige', 'intelligentie']), 
         'Source':'Wat zijn de laptop eisen voor kunstmatige intelligentie?'}

In [115]:
def analyse_query(query):
    keywords = list(query.get('Keywords'))
    level = query.get('Level')
    queryvec = vector(keywords, generalvec)
    return queryvec, level, keywords

In [114]:
def search_tree(data, queryvec, level):
    topdict = {}
    value = []
    highest = 0
    for faculty, studies in data.items():
        for study, az_links in studies.items():
            #if study == level:
            for article, articlevec in az_links.items():
                cosinesim = 1 - cs.distance.cosine(articlevec, queryvec)
                if cosinesim > highest:
                    #highest = cosinesim
                    topdict.update({article: cosinesim})
    toplinks = sorted(topdict, key=topdict.get, reverse=True)
    return toplinks, topdict

In [120]:
def topresults(toplinks, topdict, level, keywords, n):
    topndicts = []
    for toplink in toplinks[:n]:
        for link, score in topdict.items():
            if toplink == link:
                topndicts.append({'URL':link, 'Score':score, 'Level':level, 'Keywords':keywords})
    return topndicts

In [121]:
def main(query, data, n):
    queryvec, level, keywords = analyse_query(query)
    toplinks, topdict = search_tree(data, queryvec, level)
    topndicts = topresults(toplinks, topdict, level, keywords, n)
    return topndicts

print(main(query, data, 10))

[{'URL': 'http://www.student.uva.nl/sgpl/content/az/vak-en-tentamenaanmelding-sgp/ingangseisen/ingangseisen-kopie.html', 'Score': 0.25, 'Level': 'Kunstmatige intelligentie (bachelor)', 'Keywords': ['kunstmatige', 'laptop', 'eisen', 'intelligentie']}, {'URL': 'http://www.student.uva.nl/inc/content/az/laptop-minimumeisen/laptop-minimumeisen.html', 'Score': 0.25, 'Level': 'Kunstmatige intelligentie (bachelor)', 'Keywords': ['kunstmatige', 'laptop', 'eisen', 'intelligentie']}, {'URL': 'http://www.student.uva.nl/sgpl/content/az/ingangseisen/ingangseisen.html', 'Score': 0.23570226039551578, 'Level': 'Kunstmatige intelligentie (bachelor)', 'Keywords': ['kunstmatige', 'laptop', 'eisen', 'intelligentie']}, {'URL': 'http://www.student.uva.nl/ki/content/az/laptop-minimumeisen/laptop-minimumeisen.html', 'Score': 0.23570226039551578, 'Level': 'Kunstmatige intelligentie (bachelor)', 'Keywords': ['kunstmatige', 'laptop', 'eisen', 'intelligentie']}, {'URL': 'http://www.student.uva.nl/ifk/content/az/la