# Choosing features and metrics for nearest neighbor search

## Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
%matplotlib inline

## Load in the dataset

In [2]:
# Read the people_wiki table, then attach a 0-based positional 'id' to every row.
wiki = pd.read_csv('people_wiki.csv')
wiki = wiki.assign(id=range(len(wiki)))
wiki.head()

Unnamed: 0,URI,name,text,id
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,0
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,1
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,2
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,3
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,4


## Extract word count vectors

In [3]:
def load_sparse_csr(filename):
    """Load a SciPy CSR matrix from an ``.npz`` archive.

    The archive must hold four arrays under the keys ``data``, ``indices``,
    ``indptr`` and ``shape`` (the raw CSR components).

    Parameters
    ----------
    filename : anything accepted by ``numpy.load``
        Location of the ``.npz`` archive.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix rebuilt from the stored components.

    Raises
    ------
    KeyError
        If one of the four expected arrays is missing from the archive.
    """
    # np.load on an .npz returns a lazy archive object that keeps the file
    # open; the context manager closes it once the arrays have been read.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # The shape is stored as an array; hand csr_matrix a plain tuple of ints.
        shape = tuple(int(dim) for dim in loader['shape'])

    return csr_matrix((data, indices, indptr), shape=shape)

In [4]:
# Rebuild the sparse word-count matrix from its saved CSR components.
word_count = load_sparse_csr('people_wiki_word_count.npz')

In [5]:
import json

# The JSON file holds a single object. It becomes a frame with one row per key:
# the value goes in the 'index' column and the key itself in 'category'.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding.
with open('people_wiki_map_index_to_word.json', encoding='utf-8') as json_data:
    data = json.load(json_data)

# Build the one-column frame straight from the mapping rather than creating a
# single-row frame with one column per key and transposing it.
map_index_to_word = pd.Series(data, name='index').to_frame()
map_index_to_word['category'] = map_index_to_word.index
map_index_to_word.head()

Unnamed: 0,index,category
0,540315,0
0,536260,0
0,535641,0
0,83348,0
0,81527,0


## Find nearest neighbors using word count vectors

In [6]:
from sklearn.neighbors import NearestNeighbors

# Brute-force neighbour search using Euclidean distance, fitted on the rows of
# the word-count matrix.
model = NearestNeighbors(algorithm='brute', metric='euclidean')
nbrs = model.fit(word_count)

In [7]:
# Show the row whose name is exactly 'Barack Obama' (its 'id' is used below).
wiki.loc[wiki['name'] == 'Barack Obama']

Unnamed: 0,URI,name,text,id
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,35817


In [8]:
# Row 35817 is the id shown for Barack Obama in the lookup above; use its
# word-count vector as the query and ask for the 10 nearest rows.
obama_vector = word_count[35817]
distances, indices = nbrs.kneighbors(obama_vector, n_neighbors=10)

In [9]:
# One row per returned neighbour: its distance to the query and its row id.
neighbors = pd.DataFrame(dict(distance=distances.flatten(), id=indices.flatten()))
neighbors

Unnamed: 0,distance,id
0,0.0,35817
1,33.075671,24478
2,34.394767,28447
3,36.152455,35357
4,36.166283,14754
5,36.331804,13229
6,36.400549,31423
7,36.496575,22745
8,36.633318,36364
9,36.959437,9210


In [10]:
# Attach article names to the neighbour ids and list them from closest to farthest.
neighbors_named = pd.merge(wiki, neighbors, on='id').sort_values(by='distance')
neighbors_named[['id', 'name', 'distance']]

Unnamed: 0,id,name,distance
8,35817,Barack Obama,0.0
4,24478,Joe Biden,33.075671
5,28447,George W. Bush,34.394767
7,35357,Lawrence Summers,36.152455
2,14754,Mitt Romney,36.166283
1,13229,Francisco Barrio,36.331804
6,31423,Walter Mondale,36.400549
3,22745,Wynn Normington Hugh-Jones,36.496575
9,36364,Don Bonker,36.633318
0,9210,Andy Anstett,36.959437


## Interpreting the nearest neighbors

In [11]:
def unpack_dict(matrix, map_index_to_word):
    """Turn each row of a CSR matrix into a ``{label: value}`` dictionary.

    Parameters
    ----------
    matrix : scipy.sparse.csr_matrix
        Only its ``data``, ``indices``, ``indptr`` and ``shape`` are read.
    map_index_to_word : pandas.DataFrame
        Must have 'index' and 'category' columns. After sorting by 'index',
        the 'category' value at position ``j`` labels column ``j`` of
        ``matrix``.

    Returns
    -------
    list of dict
        One dictionary per matrix row, mapping the label of every stored
        column in that row to its stored value (as a plain Python number).
    """
    # Position j in this list is the label used for column j.
    labels = list(map_index_to_word.sort_values('index')['category'])
    values = matrix.data
    columns = matrix.indices
    row_bounds = matrix.indptr

    unpacked = []
    for row in range(matrix.shape[0]):
        # Stored entries of this row live in [start, stop) of data/indices.
        start, stop = row_bounds[row], row_bounds[row + 1]
        row_labels = [labels[col] for col in columns[start:stop]]
        unpacked.append(dict(zip(row_labels, values[start:stop].tolist())))
    return unpacked

# Store each article's {word: count} dictionary next to its text.
word_count_dicts = unpack_dict(word_count, map_index_to_word)
wiki['word_count'] = word_count_dicts
wiki.head()

Unnamed: 0,URI,name,text,id,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,0,"{'has': 1, '19982000': 1, 'the': 27, 'forward'..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,1,"{'shifts': 2, '1973': 1, 'current': 1, 'assess..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,2,"{'has': 2, 'what': 2, 'small': 1, 'vancouver':..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,3,"{'1973': 1, 'american': 1, 'hg': 1, 'andersona..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,4,"{'next': 1, 'new': 1, 'record': 2, 'of': 1, 'k..."


In [35]:
# TODO: add a top_words(name) helper that returns the most frequent words on a
# person's page as a ('word', 'count') table sorted by descending count, then
# compare the tables for 'Barack Obama' and 'Francisco Barrio'.
# An earlier draft built the table with
#   pd.DataFrame.from_dict(row[['word_count']], column=['word', 'count'])
# and ordered it with .sort('count', ascending=False); it was left disabled.

# word_count entry (as a Series) for the row named 'Barack Obama'.
row = wiki.loc[wiki['name'] == 'Barack Obama', 'word_count']

# Inspect the word_count column while the helper is pending.
print(wiki['word_count'])

0        {'has': 1, '19982000': 1, 'the': 27, 'forward'...
1        {'shifts': 2, '1973': 1, 'current': 1, 'assess...
2        {'has': 2, 'what': 2, 'small': 1, 'vancouver':...
3        {'1973': 1, 'american': 1, 'hg': 1, 'andersona...
4        {'next': 1, 'new': 1, 'record': 2, 'of': 1, 'k...
5        {'has': 2, 'strip': 2, 'known': 1, 'new': 4, '...
6        {'sponsored': 1, 'delicious': 1, 'later': 1, '...
7        {'equally': 1, 'praise': 1, 'kinkajou': 1, 'sa...
8        {'1971': 1, 'hardcoredrum': 1, 'known': 2, 'an...
9        {'ways': 1, 'professor': 1, 't': 1, 'trauma': ...
10       {'inquirer': 1, 'until': 1, 'of': 5, 'clowning...
11       {'novelistsshe': 1, 'later': 1, 'saints': 1, '...
12       {'has': 1, 'american': 1, 'current': 1, 'of': ...
13       {'has': 4, 'british': 1, 'the': 16, 'european'...
14       {'several': 1, 'olympic': 1, 'make': 1, 'pursu...
15       {'unconventional': 1, 'genres': 1, 'josh': 1, ...
16       {'has': 3, 'pocket': 1, 'degree': 1, 'oxford':.