In [3]:
# Execute the companion notebook in this kernel before anything else runs.
# NOTE(review): the cells below use `np`, `json`, `cs`, `vector`, `generalvec`
# and `data` without defining them — presumably they all come from
# Vector.ipynb; confirm, so that Restart & Run All keeps working.
%run Vector.ipynb

In [4]:
# Keep printed arrays short: per the NumPy docs, arrays with more elements
# than `threshold` are summarised instead of printed in full.
np.set_printoptions(threshold=10)  # alternative noted by the author: np.inf (no summarising)

In [5]:
# Load the keyword tree used by `lvlkey` / `topresults`; those functions walk
# it as faculty -> study -> article link -> keywords.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# non-ASCII keyword text decodes the same way on every platform instead of
# depending on the locale default used by open().
with open('uva_json_file', 'r', encoding='utf-8') as f:
    olddata = json.load(f)

## Input 

Dictionary:

    Language (int):
        0 = Nederlands       
        1 = Engels
              
    Type (int):  
        0 = Onbekend      
        1 = Wie        
        2 = Wat        
        3 = Waar        
        4 = Waarom        
        5 = Wanneer        
        6 = Hoe        
        7 = Welke        
        
    Level (str):        
        Studie        
        Faculteit        
        UvA        
        None        
        
    Keywords (set):        
        {<keyword_1>, ..., <keyword_n>}        
        
    Source (str):
        <Oorspronkelijke zin>

We assume that for this specific problem (searching only the tree we have established) we need only the keywords provided by the input query. The reason is that the keywords of our articles are a mix of Dutch and English, so matching on keywords will automatically return the right URL. The type is irrelevant for this problem: we have not scraped the full article texts and only have access to their keywords, so we are unable to parse what kind of question a person is asking. That will be handled in our other solution, which uses Watson to find the right text.
The source is also not relevant, as we already have the necessary keywords.

In [6]:
# Example input query, following the "Input" dictionary layout described above.
query = {
    'Language': 0,  # 0 = Nederlands
    'Type': 0,      # 0 = Onbekend
    'Level': 'Kunstmatige intelligentie (bachelor)',
    'Keywords': {'laptop', 'eisen', 'kunstmatige', 'intelligentie'},
    'Source': 'Wat zijn de laptop eisen voor kunstmatige intelligentie?',
}

In [7]:
def leveltype(level, data):
    """Classify `level` as a faculty name, a study name, or neither.

    Args:
        level: The value to look up (the query's 'Level' entry).
        data: Nested mapping walked as faculty -> study -> articles.

    Returns:
        'fac' if `level` equals a faculty key, 'stud' if it equals a study
        key, otherwise None. Faculties are visited in mapping order, and for
        each one the faculty name is checked before that faculty's studies.
    """
    for faculty_name, programmes in data.items():
        if faculty_name == level:
            return 'fac'
        if any(programme == level for programme in programmes.keys()):
            return 'stud'
    return None

In [8]:
def lvlkey(link, olddata):
    """Find the study and keywords that belong to an article link.

    Args:
        link: Article key to search for.
        olddata: Nested mapping walked as faculty -> study -> article -> keywords.

    Returns:
        A `(study, keywords)` tuple for the first article key equal to `link`
        (in mapping iteration order), or None when no article matches.
    """
    matches = (
        (study, keywords)
        for studies in olddata.values()
        for study, az_links in studies.items()
        for article, keywords in az_links.items()
        if link == article
    )
    return next(matches, None)

In [9]:
def analyse_query(query, data):
    """Turn a query dictionary into the pieces the tree search needs.

    Args:
        query: Mapping with at least the 'Keywords' and 'Level' entries.
        data: Nested faculty -> study mapping, passed on to `leveltype`.

    Returns:
        A 4-tuple `(queryvec, level, lvltype, keywords)`: the vector built by
        `vector(keywords, generalvec)`, the raw 'Level' value, its
        classification from `leveltype`, and the keywords as a list.
    """
    terms = list(query.get('Keywords'))
    wanted_level = query.get('Level')
    level_kind = leveltype(wanted_level, data)
    # `vector` and `generalvec` are not defined in this notebook's own cells.
    return vector(terms, generalvec), wanted_level, level_kind, terms

In [10]:
def search_tree(data, queryvec, level, leveltype, threshold=0):
    """Score the articles in the tree against a query vector.

    The similarity of an article is `1 - cs.distance.cosine(articlevec, queryvec)`.
    Only articles whose similarity is strictly greater than `threshold` are kept.

    Args:
        data: Nested mapping walked as faculty -> study -> article -> vector.
        queryvec: Vector of the query, compared against every article vector.
        level: Faculty or study name used to restrict the search.
        leveltype: 'fac' to search only the faculty equal to `level`, 'stud'
            to search only studies equal to `level` (in any faculty); any
            other value searches the whole tree. Note that inside this
            function the name shadows the module-level `leveltype` function.
        threshold: Minimum similarity (exclusive) for an article to be
            returned. Defaults to 0, the value that was previously hard-coded.

    Returns:
        `(toplinks, topdict)`: the matching article keys sorted by descending
        similarity, and a dict mapping each of those keys to its similarity.
    """
    topdict = {}
    for faculty, studies in data.items():
        # Faculty-level query: skip every other faculty.
        if leveltype == 'fac' and faculty != level:
            continue
        for study, az_links in studies.items():
            # Study-level query: skip every other study.
            if leveltype == 'stud' and study != level:
                continue
            for article, articlevec in az_links.items():
                # `cs` must expose `distance.cosine` (presumably scipy.spatial — confirm).
                cosinesim = 1 - cs.distance.cosine(articlevec, queryvec)
                if cosinesim > threshold:
                    topdict[article] = cosinesim
    toplinks = sorted(topdict, key=topdict.get, reverse=True)
    return toplinks, topdict

In [11]:
def topresults(olddata, toplinks, topdict, n):
    """Build result records for the `n` best-ranked links.

    Args:
        olddata: Nested faculty -> study -> article -> keywords mapping,
            searched through `lvlkey`.
        toplinks: Article links, best match first.
        topdict: Mapping of article link -> similarity score.
        n: Maximum number of results to return.

    Returns:
        A list of dicts with the keys 'URL', 'Score', 'Level' and 'Keywords',
        in the order of `toplinks`. 'Level' and 'Keywords' are None for a
        link that `lvlkey` cannot find in `olddata`.
    """
    topndicts = []
    for toplink in toplinks[:n]:
        # lvlkey returns None for an unknown link; unpacking that directly
        # raised a TypeError, so fall back to empty metadata instead.
        found = lvlkey(toplink, olddata)
        level, keywords = found if found is not None else (None, None)
        # Direct lookup instead of scanning every entry of topdict per link.
        if toplink in topdict:
            topndicts.append({'URL': toplink,
                              'Score': topdict[toplink],
                              'Level': level,
                              'Keywords': keywords})
    return topndicts

In [12]:
def main(query, data, olddata, n):
    """Run the full lookup for one query and return its best `n` results.

    Args:
        query: Query dictionary, handed to `analyse_query`.
        data: Tree passed to `analyse_query` and `search_tree`.
        olddata: Tree passed to `topresults`.
        n: Maximum number of results.

    Returns:
        The list of result dicts produced by `topresults`.
    """
    # The keyword list returned by analyse_query is not needed here.
    queryvec, level, lvltype, _unused_keywords = analyse_query(query, data)
    ranked_links, scores = search_tree(data, queryvec, level, lvltype)
    return topresults(olddata, ranked_links, scores, n)

# Show the five best matches for the example query.
# NOTE(review): `data` is not assigned in any cell shown here — presumably it
# is created by Vector.ipynb; confirm.
main(query, data, olddata, 5)

[{'Keywords': ['minimumeisen,studenten,ict en faciliteiten,natuurwetenschappen, wiskunde en informatica,ict,computereisen,laptop,kunstmatige intelligentie'],
  'Level': 'Kunstmatige intelligentie (bachelor)',
  'Score': 0.23570226039551578,
  'URL': 'http://www.student.uva.nl/ki/content/az/laptop-minimumeisen/laptop-minimumeisen.html'},
 {'Keywords': ['onderwijsprogramma’s,opleidingsintroductie,natuurwetenschappen, wiskunde en informatica,nieuwe studenten,laptop,kunstmatige intelligentie,informatie,zij-instromer,studieprogramma,inschrijving en toelating,de studiefaciliteiten,studie,begeleiding,instromer,eerstejaars,ki,instroom'],
  'Level': 'Kunstmatige intelligentie (bachelor)',
  'Score': 0.16666666666666663,
  'URL': 'http://www.student.uva.nl/ki/content/az/nieuwe-studenten/nieuwe-studenten.html'}]