In [6]:
# Execute the companion notebook in this kernel before anything else.
# NOTE(review): names used below but not defined in the visible cells
# (np, json, cs, data, vector, generalvec) presumably come from it - confirm.
%run Vector.ipynb

3471


In [7]:
# Keep printed arrays short: arrays with more than 10 elements are summarised.
np.set_printoptions(threshold=10) # use threshold=np.inf instead to print arrays in full

In [8]:
# Load the keyword tree (walked by lvlkey as faculty -> study -> article -> keywords).
# The encoding is given explicitly so that non-ASCII characters in the Dutch
# keywords are decoded the same way on every platform.
with open('uva_json_file', 'r', encoding='utf-8') as f:
    olddata = json.load(f)

## Input 

Dictionary:

    Language (int):
        0 = Nederlands       
        1 = Engels
              
    Type (int):  
        0 = Onbekend      
        1 = Wie        
        2 = Wat        
        3 = Waar        
        4 = Waarom        
        5 = Wanneer        
        6 = Hoe        
        7 = Welke        
        
    Level (str):        
        Studie        
        Faculteit        
        UvA        
        None        
        
    Keywords (set):        
        {<keyword_1>, ..., <keyword_n>}        
        
    Source (str):
        <Oorspronkelijke zin>

We assume that for this specific problem (searching only the tree we have built) we need just the keywords provided by the input query. The keywords of our articles are a mix of both Dutch and English, so a query in either language will automatically return the right URL. The type is irrelevant for this problem: we only have access to the keywords of the articles, because we have not scraped the entire texts. We are therefore unable to parse what kind of question a person is asking; this will be handled in our other solution, which uses Watson to find the right text.
The source is not relevant either, as we already have the necessary keywords.

In [9]:
def leveltype(level, data):
    '''
    Determine what kind of node the given level refers to in the tree.

    data is walked as {faculty: {study: ...}}. Faculties are visited in
    iteration order; for each one its own name is compared first and then
    the names of its studies, before moving on to the next faculty.

    Returns 'fac' when level equals a faculty name, 'stud' when it equals a
    study name, and None when nothing matches.
    '''
    for faculty_name, faculty_studies in data.items():
        if faculty_name == level:
            return 'fac'
        if any(study_name == level for study_name, _ in faculty_studies.items()):
            return 'stud'
    return None

In [10]:
def lvlkey(link, olddata):
    '''
    Look up an article link in the keyword tree.

    olddata is walked as {faculty: {study: {article: keywords}}}. The first
    article equal to link (in iteration order) determines the result.

    Returns a (study, keywords) tuple for that article, or None when the
    link does not occur anywhere in the tree.
    '''
    hits = (
        (study_name, article_keywords)
        for studies in olddata.values()
        for study_name, articles in studies.items()
        for article_link, article_keywords in articles.items()
        if link == article_link
    )
    return next(hits, None)

In [11]:
def analyse_query(query, data):
    '''
    Split the input query dictionary into the pieces the search needs.

    Reads the 'Keywords' and 'Level' entries of query, classifies the level
    with leveltype(level, data) and turns the keywords into a vector with
    vector(keywords, generalvec). vector and generalvec are not defined in
    this cell (presumably they come from Vector.ipynb - confirm).

    Returns (queryvec, level, lvltype, keywords), keywords being a list.
    '''
    query_keywords = list(query.get('Keywords'))
    requested_level = query.get('Level')
    level_kind = leveltype(requested_level, data)
    return vector(query_keywords, generalvec), requested_level, level_kind, query_keywords

In [12]:
def _iter_article_vectors(data, level, leveltype):
    '''Yield (article, articlevec) pairs from the part of the tree selected by level.'''
    for faculty, studies in data.items():
        # restrict to one faculty when the level names a faculty
        if leveltype == 'fac' and faculty != level:
            continue
        for study, az_links in studies.items():
            # restrict to one study when the level names a study
            if leveltype == 'stud' and study != level:
                continue
            for article, articlevec in az_links.items():
                yield article, articlevec


def search_tree(data, queryvec, level, leveltype, threshold=0):
    '''
    Search the selected part of the tree and score every article against
    the query vector using cosine similarity.

    Parameters:
        data:      tree walked as {faculty: {study: {article: articlevec}}}.
        queryvec:  vector of the query, compared with each articlevec.
        level:     faculty or study name that limits the search.
        leveltype: 'fac' searches only the faculty equal to level, 'stud'
                   searches only studies equal to level (in any faculty),
                   anything else searches the entire tree.
        threshold: an article is kept only if its similarity is strictly
                   greater than this value (default 0). A similarity that
                   does not compare greater, NaN included, is dropped.

    Returns (toplinks, topdict): topdict maps article -> similarity and
    toplinks lists the same articles sorted by descending similarity.
    If an article occurs more than once, the last score seen is kept.

    cs must expose distance.cosine (presumably scipy.spatial - confirm).
    '''
    topdict = {}
    for article, articlevec in _iter_article_vectors(data, level, leveltype):
        cosinesim = 1 - cs.distance.cosine(articlevec, queryvec)
        if cosinesim > threshold:
            topdict[article] = cosinesim
    toplinks = sorted(topdict, key=topdict.get, reverse=True)
    return toplinks, topdict

In [13]:
def topresults(olddata, toplinks, topdict, n):
    '''
    Build the result records for the n best links.

    Parameters:
        olddata:  keyword tree passed to lvlkey to find each link's study
                  and keywords.
        toplinks: links ordered from best to worst.
        topdict:  mapping link -> score.
        n:        maximum number of links taken from the front of toplinks.

    Returns a list of dictionaries with the keys 'URL', 'Score', 'Level'
    and 'Keywords', in the order of toplinks. Links without a score in
    topdict are skipped. When lvlkey cannot find a link in olddata, 'Level'
    and 'Keywords' are None instead of the lookup raising a TypeError.
    '''
    topndicts = []
    for toplink in toplinks[:n]:
        if toplink not in topdict:
            # no score known for this link, so there is nothing to report
            continue
        found = lvlkey(toplink, olddata)
        # lvlkey returns None for unknown links; unpacking that would crash
        level, keywords = found if found is not None else (None, None)
        topndicts.append({'URL':toplink, 'Score':topdict[toplink], 'Level':level, 'Keywords':keywords})
    return topndicts

In [14]:
def main(query, data, olddata, n):
    '''
    Entry point that stitches the steps above together.

    Analyses the query with analyse_query, searches data with search_tree
    and returns the list produced by topresults for the n best links.
    '''
    queryvec, level, lvltype, _ = analyse_query(query, data)
    ranked_links, link_scores = search_tree(data, queryvec, level, lvltype)
    return topresults(olddata, ranked_links, link_scores, n)

In [15]:
# Some example queries

# Good working example, no level given
query1 = {
    'Language': 0,
    'Type': 0,
    'Level': '',
    'Keywords': {'laptop', 'eisen', 'kunstmatige', 'intelligentie'},
    'Source': 'Wat zijn de laptop eisen voor kunstmatige intelligentie?',
}

# Good working example, level given
query2 = {
    'Language': 0,
    'Type': 0,
    'Level': 'Kunstmatige intelligentie (bachelor)',
    'Keywords': {'laptop', 'eisen', 'kunstmatige', 'intelligentie'},
    'Source': 'Wat zijn de laptop eisen voor kunstmatige intelligentie?',
}

# Bad working example, no level given
query3 = {
    'Language': 0,
    'Type': 0,
    'Level': '',
    'Keywords': {'toelatingseisen', 'kunstmatige', 'intelligentie'},
    'Source': 'Wat zijn de toelatingseisen voor kunstmatige intelligentie?',
}

# Bad working example, level given
query4 = {
    'Language': 0,
    'Type': 0,
    'Level': 'Kunstmatige intelligentie (bachelor)',
    'Keywords': {'toelatingseisen', 'kunstmatige', 'intelligentie'},
    'Source': 'Wat zijn de toelatingseisen voor kunstmatige intelligentie?',
}

In [16]:
# At most 5 best results for query1 (laptop requirements, Level left empty).
main(query1, data, olddata, 5)

[{'Keywords': ['maatschappij- en gedragswetenschappen,studieprogramma,vakaanmelding,sociale geografie en planologie (bachelor),eisen,voorwaarden,in- en uitschrijven,instroom'],
  'Level': 'Sociale geografie en Planologie (bachelor)',
  'Score': 0.25,
  'URL': 'http://www.student.uva.nl/sgpl/content/az/vak-en-tentamenaanmelding-sgp/ingangseisen/ingangseisen-kopie.html'},
 {'Keywords': ['minimumeisen,studenten,ict en faciliteiten,natuurwetenschappen, wiskunde en informatica,computereisen,laptop,informatica'],
  'Level': 'Informatica (bachelor)',
  'Score': 0.25,
  'URL': 'http://www.student.uva.nl/inc/content/az/laptop-minimumeisen/laptop-minimumeisen.html'},
 {'Keywords': ['maatschappij- en gedragswetenschappen,studieprogramma,inschrijving en toelating,vakaanmelding,sociale geografie en planologie (bachelor),eisen,voorwaarden,in- en uitschrijven,instroom'],
  'Level': 'Sociale geografie en Planologie (bachelor)',
  'Score': 0.23570226039551578,
  'URL': 'http://www.student.uva.nl/sgpl/c

In [17]:
# At most 5 best results for query2 (same keywords as query1, Level set to the KI bachelor).
main(query2, data, olddata, 5)

[{'Keywords': ['minimumeisen,studenten,ict en faciliteiten,natuurwetenschappen, wiskunde en informatica,ict,computereisen,laptop,kunstmatige intelligentie'],
  'Level': 'Kunstmatige intelligentie (bachelor)',
  'Score': 0.23570226039551578,
  'URL': 'http://www.student.uva.nl/ki/content/az/laptop-minimumeisen/laptop-minimumeisen.html'},
 {'Keywords': ['onderwijsprogramma’s,opleidingsintroductie,natuurwetenschappen, wiskunde en informatica,nieuwe studenten,laptop,kunstmatige intelligentie,informatie,zij-instromer,studieprogramma,inschrijving en toelating,de studiefaciliteiten,studie,begeleiding,instromer,eerstejaars,ki,instroom'],
  'Level': 'Kunstmatige intelligentie (bachelor)',
  'Score': 0.16666666666666663,
  'URL': 'http://www.student.uva.nl/ki/content/az/nieuwe-studenten/nieuwe-studenten.html'}]

In [18]:
# At most 5 best results for query3 (admission requirements, Level left empty).
main(query3, data, olddata, 5)

[{'Keywords': ['inschrijving en toelating,rechtsgeleerdheid,rechten,schakelprogramma,bijscholing,onderwijs,toelatingseisen,schakelaar,instromen'],
  'Level': 'Publiekrecht: Militair recht (master)',
  'Score': 0.33333333333333326,
  'URL': 'http://www.student.uva.nl/rechten/content/az/schakelprogramma/schakelprogramma.html'},
 {'Keywords': ['owi,regelingen en inspraak,reglementen,natuurwetenschappen, wiskunde en informatica,regeling,toelatingseisen,examen,medezeggenschap,regels,examenregeling,studieprogramma,vakken, tentamens en cijfers,medezeggenschap en inspraak,onderwijs,regelingen,oer,richtlijnen,inspraak,onderwijsregeling'],
  'Level': 'Psychobiologie (bachelor)',
  'Score': 0.21821789023599236,
  'URL': 'http://www.student.uva.nl/pb/shared/studentensites/fnwi/esc-gedeelde-content/nl/az/onderwijs-en-examenregeling-oer/onderwijs--en-examenregeling-oer.html'},
 {'Keywords': ['owi,regelingen en inspraak,reglementen,natuurwetenschappen, wiskunde en informatica,regeling,toelatingseisen

In [19]:
# At most 5 best results for query4 (same keywords as query3, Level set to the KI bachelor).
main(query4, data, olddata, 5)

[{'Keywords': ['owi,regelingen en inspraak,reglementen,natuurwetenschappen, wiskunde en informatica,regeling,toelatingseisen,examen,medezeggenschap,regels,examenregeling,studieprogramma,vakken, tentamens en cijfers,medezeggenschap en inspraak,onderwijs,regelingen,oer,richtlijnen,inspraak,onderwijsregeling'],
  'Level': 'Kunstmatige intelligentie (bachelor)',
  'Score': 0.21821789023599236,
  'URL': 'http://www.student.uva.nl/ki/shared/studentensites/fnwi/esc-gedeelde-content/nl/az/onderwijs-en-examenregeling-oer/onderwijs--en-examenregeling-oer.html'}]