# Choosing features and metrics for nearest neighbor search

## Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
%matplotlib inline

## Load in the dataset

In [2]:
# Load the article table; people_wiki.csv must be in the working directory.
wiki = pd.read_csv('people_wiki.csv')
# Add an explicit 0-based row-number column; later cells merge on 'id'.
wiki['id'] = range(0, len(wiki))
wiki.head()

Unnamed: 0,URI,name,text,id
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,0
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,1
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,2
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,3
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,4


## Extract word count vectors

In [3]:
def load_sparse_csr(filename):
    """
    Load a SciPy CSR sparse matrix from a NumPy ``.npz`` archive.

    The archive must contain the arrays 'data', 'indices', 'indptr' and
    'shape', i.e. the three CSR component arrays plus the matrix dimensions.

    Parameters
    ----------
    filename : str, path-like or file-like
        Passed straight to ``np.load``.

    Returns
    -------
    scipy.sparse.csr_matrix
        The matrix rebuilt from the stored components.
    """
    # np.load on an .npz archive returns an NpzFile that keeps the underlying
    # file open; the context manager closes it once the arrays have been read.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        shape = tuple(loader['shape'])

    return csr_matrix((data, indices, indptr), shape=shape)

In [4]:
# Sparse word-count matrix; later cells select a row by article id (e.g. word_count[35817]).
word_count = load_sparse_csr('people_wiki_word_count.npz')

In [5]:
import json

# JSON text is UTF-8; naming the encoding keeps the load independent of the
# platform's default locale encoding.
with open('people_wiki_map_index_to_word.json', encoding='utf-8') as json_data:
    data = json.load(json_data)

# `data` is a flat {key: scalar} dict (presumably word -> column index of the
# word-count matrix; confirm against the file). Build a one-column frame with
# the keys as row labels directly, rather than a one-row frame that is as wide
# as the vocabulary and then has to be transposed.
map_index_to_word = pd.Series(data, name='index').to_frame()
# Copy the row labels into a regular column so they survive sorting by 'index'.
map_index_to_word['category'] = map_index_to_word.index
map_index_to_word.head()

Unnamed: 0,index,category
0,540315,0
0,536260,0
0,535641,0
0,83348,0
0,81527,0


## Find nearest neighbors using word count vectors

In [6]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbor search using Euclidean distance on the raw word counts.
model = NearestNeighbors(metric='euclidean', algorithm='brute')
# NOTE(review): fit() presumably returns the estimator itself, making `nbrs` an alias of `model` — confirm.
nbrs = model.fit(word_count)

In [7]:
# Show Barack Obama's row so its id can be read off.
wiki[wiki['name'] == 'Barack Obama']

Unnamed: 0,URI,name,text,id
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,35817


In [8]:
# Look up Obama's id in the table instead of hard-coding 35817, so the query
# stays correct if the dataset's row order ever changes.
obama_id = int(wiki.loc[wiki['name'] == 'Barack Obama', 'id'].iloc[0])
distances, indices = nbrs.kneighbors(word_count[obama_id], n_neighbors=10) # 1st arg: word count vector

In [9]:
# flatten() collapses the arrays returned by kneighbors to 1-D so each becomes one column.
neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indices.flatten()})
neighbors

Unnamed: 0,distance,id
0,0.0,35817
1,33.075671,24478
2,34.394767,28447
3,36.152455,35357
4,36.166283,14754
5,36.331804,13229
6,36.400549,31423
7,36.496575,22745
8,36.633318,36364
9,36.959437,9210


In [10]:
# Attach article names to the neighbor ids and list them from nearest to farthest.
wiki.merge(neighbors, on='id').sort_values('distance')[['id','name','distance']]

Unnamed: 0,id,name,distance
8,35817,Barack Obama,0.0
4,24478,Joe Biden,33.075671
5,28447,George W. Bush,34.394767
7,35357,Lawrence Summers,36.152455
2,14754,Mitt Romney,36.166283
1,13229,Francisco Barrio,36.331804
6,31423,Walter Mondale,36.400549
3,22745,Wynn Normington Hugh-Jones,36.496575
9,36364,Don Bonker,36.633318
0,9210,Andy Anstett,36.959437


## Interpreting the nearest neighbors

In [35]:
def unpack_dict(matrix, map_index_to_word):
    """
    Convert each row of a CSR matrix into a {label: value} dictionary.

    Parameters
    ----------
    matrix : scipy.sparse.csr_matrix
        One row per document; only the stored entries of a row appear in
        that row's dictionary.
    map_index_to_word : pandas.DataFrame
        Needs an 'index' column (column position) and a 'category' column
        (the label for that position).

    Returns
    -------
    list of dict
        One dictionary per matrix row, in row order.
    """
    # Labels ordered by column position, so vocabulary[j] labels column j.
    # This assumes the 'index' values are exactly 0..N-1 with no gaps.
    vocabulary = list(map_index_to_word.sort_values('index')['category'])
    values, columns, row_ptr = matrix.data, matrix.indices, matrix.indptr

    unpacked = []
    for row in range(matrix.shape[0]):
        # CSR layout: the entries of `row` live in [row_ptr[row], row_ptr[row + 1]).
        start, stop = row_ptr[row], row_ptr[row + 1]
        words = [vocabulary[word_id] for word_id in columns[start:stop]]
        counts = values[start:stop].tolist()
        unpacked.append(dict(zip(words, counts)))
    return unpacked

# Store each article's {word: count} dictionary next to its text.
wiki['word_count'] = unpack_dict(word_count, map_index_to_word)
wiki.head()

Unnamed: 0,URI,name,text,id,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,0,"{'against': 2, '19982000': 1, 'australian': 3,..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,1,"{'current': 1, '94': 1, 'worked': 1, 'availabl..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,2,"{'few': 2, 'playing': 1, 'also': 1, 'won': 1, ..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,3,"{'k': 2, 'stanislaw': 2, 'critical': 1, 'phant..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,4,"{'performers': 1, 'was': 4, 'vacation': 1, 'ga..."


In [36]:
def top_words(name):
    """
    Build a word/count table for one person's article, most frequent word first.

    Reads the notebook-level `wiki` frame and uses the first row whose 'name'
    equals `name`. Raises IndexError if no row matches.
    """
    matches = wiki.loc[wiki['name'] == name, 'word_count']
    counts = matches.iloc[0]
    table = pd.DataFrame(list(counts.items()), columns=['word', 'count'])
    return table.sort_values(by='count', ascending=False)

# Most frequent words in Barack Obama's article.
obama_words = top_words('Barack Obama')
obama_words.head(10)

Unnamed: 0,word,count
217,the,40
58,in,30
106,and,21
86,of,18
102,to,14
262,his,11
126,obama,9
237,act,8
38,he,7
118,a,7


In [37]:
def top_words(name):
    """
    Get a table of the most frequent words in the given person's wikipedia page.

    Reads the notebook-level `wiki` frame and uses the first row whose 'name'
    equals `name`.

    Parameters
    ----------
    name : str
        Exact value of the 'name' column to look up.

    Returns
    -------
    pandas.DataFrame
        Columns 'word' and 'count', sorted by 'count' in descending order.

    Raises
    ------
    IndexError
        If no row of `wiki` has the given name.
    """
    row = wiki[wiki['name'] == name]
    # Take the matching row by position. Converting the one-element 'id' Series
    # with int() is deprecated in recent pandas, and using that id as a key into
    # to_dict() only works when the frame's index labels equal the id values.
    word_count_dict = row['word_count'].iloc[0]
    word_count_table = pd.DataFrame(list(word_count_dict.items()), columns=['word','count'])
    return word_count_table.sort_values('count', ascending=False)

# Recompute Obama's table with the redefined top_words; the output should match the previous cell.
obama_words = top_words('Barack Obama')
obama_words.head(10)

Unnamed: 0,word,count
217,the,40
58,in,30
106,and,21
86,of,18
102,to,14
262,his,11
126,obama,9
237,act,8
38,he,7
118,a,7


In [38]:
# Most frequent words in Francisco Barrio's article, for comparison with Obama's.
barrio_words = top_words('Francisco Barrio')
barrio_words.head(10)

Unnamed: 0,word,count
175,the,36
202,of,24
44,and,18
77,in,17
114,he,10
65,to,9
1,chihuahua,7
89,governor,6
57,a,6
181,as,5


In [39]:
# Join the two tables on 'word'; the clashing 'count' columns come out as
# count_x (from obama_words) and count_y (from barrio_words).
combined_words = obama_words.merge(barrio_words, on='word')
combined_words.head(10)

Unnamed: 0,word,count_x,count_y
0,the,40,36
1,in,30,17
2,and,21,18
3,of,18,24
4,to,14,9
5,his,11,5
6,he,7,10
7,a,7,6
8,as,6,5
9,was,5,4


In [41]:
# Replace the automatic count_x / count_y names with the person each count belongs to.
combined_words.columns = ['word', 'Obama', 'Barrio']
combined_words.head(10)

Unnamed: 0,word,Obama,Barrio
0,the,40,36
1,in,30,17
2,and,21,18
3,of,18,24
4,to,14,9
5,his,11,5
6,he,7,10
7,a,7,6
8,as,6,5
9,was,5,4


In [42]:
# sort_values returns a new frame and leaves the original untouched, so the
# result must be assigned back for the ordering to take effect. A stable sort
# (mergesort) keeps tied rows in their current relative order.
combined_words = combined_words.sort_values('Obama', ascending=False, kind='mergesort')
combined_words.head(10)

Unnamed: 0,word,Obama,Barrio
0,the,40,36
1,in,30,17
2,and,21,18
3,of,18,24
4,to,14,9
5,his,11,5
6,he,7,10
7,a,7,6
8,as,6,5
9,was,5,4


### Quiz Question

Among the words that appear in both the Barack Obama and Francisco Barrio articles, take the 5 that appear most frequently in Obama's article. How many articles in the Wikipedia dataset contain all 5 of those words?

In [43]:
# Select the 5 shared words with the highest Obama counts explicitly, rather
# than relying on combined_words already being in sorted order.
common_words = combined_words.nlargest(5, 'Obama')['word'].tolist()
common_words

['the', 'in', 'and', 'of', 'to']

In [52]:
def has_top_words(word_count_vector):
    """
    Tell whether an article contains every word in the notebook-level `common_words`.

    Parameters
    ----------
    word_count_vector : dict
        Mapping of word -> count for one article.

    Returns
    -------
    bool
        True when every entry of `common_words` is a key of `word_count_vector`.
    """
    required_words = set(common_words)
    return required_words.issubset(word_count_vector.keys())

# Flag every article whose word_count dictionary contains all of common_words.
wiki['has_top_words'] = wiki['word_count'].apply(has_top_words)
wiki.head(10)

Unnamed: 0,URI,name,text,id,word_count,has_top_words
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,0,"{'against': 2, '19982000': 1, 'australian': 3,...",True
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,1,"{'current': 1, '94': 1, 'worked': 1, 'availabl...",True
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,2,"{'few': 2, 'playing': 1, 'also': 1, 'won': 1, ...",True
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,3,"{'k': 2, 'stanislaw': 2, 'critical': 1, 'phant...",True
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,4,"{'performers': 1, 'was': 4, 'vacation': 1, 'ga...",False
5,<http://dbpedia.org/resource/Sam_Henderson>,Sam Henderson,sam henderson born october 18 1969 is an ameri...,5,"{'san': 1, 'asses': 1, '1993': 2, 'scene': 1, ...",False
6,<http://dbpedia.org/resource/Aaron_LaCrate>,Aaron LaCrate,aaron lacrate is an american music producer re...,6,"{'create': 1, 'vinyl': 1, 'streamz': 1, 'def':...",True
7,<http://dbpedia.org/resource/Trevor_Ferguson>,Trevor Ferguson,trevor ferguson aka john farrow born 11 novemb...,7,"{'history': 2, 'company': 1, 'ever': 1, 'criti...",True
8,<http://dbpedia.org/resource/Grant_Nelson>,Grant Nelson,grant nelson born 27 april 1971 in london also...,8,"{'delivered': 2, 'hardcoredrum': 1, 'jodeci': ...",True
9,<http://dbpedia.org/resource/Cathy_Caruth>,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes pro...,9,"{'t': 1, 'departments': 1, 'writing': 1, 'crit...",True


**Checkpoint**. Check your `has_top_words` function on two random articles:

In [59]:
# Sanity check on the rows labelled 32 and 33: each function result and unique-word
# count is printed next to the expected value for comparison.
print('Output from your function:', has_top_words(wiki['word_count'][32]))
print('Correct output: True')
print('Length of unique_words:', len(set(x for x in wiki['word_count'][32].keys())))
print('Also check the length of unique_words. It should be 167')

print('Output from your function:', has_top_words(wiki['word_count'][33]))
print('Correct output: False')
print('Length of unique_words:', len(set(x for x in wiki['word_count'][33].keys())))
print('Also check the length of unique_words. It should be 188')

Output from your function: True
Correct output: True
Length of unique_words: 167
Also check the length of unique_words. It should be 167
Output from your function: False
Correct output: False
Length of unique_words: 188
Also check the length of unique_words. It should be 188
