# Document retrieval from Wikipedia data

In [1]:
import turicreate

# Load some text data from Wikipedia

In [2]:
# Read the Wikipedia people dataset into an SFrame.
# The path is relative, so the CSV must be reachable from the working directory.
people_csv_path = 'people_wiki.csv'
people = turicreate.SFrame(people_csv_path)

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [3]:
# Display the loaded SFrame as the cell output to inspect its columns
# (URI, name, text) before any derived columns are added.
people

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


# Explore data

In [4]:
# Derive two per-article word representations from the raw text and store
# them as new columns on `people`: raw word counts and TF-IDF weights.
text_analytics = turicreate.text_analytics
article_text = people['text']

people['word_count'] = text_analytics.count_words(article_text)
people['tfidf'] = text_analytics.tf_idf(article_text)

In [5]:
# Select the row(s) whose name is exactly 'Elton John'.
is_elton = people['name'] == 'Elton John'
elton = people[is_elton]

# Unpack each per-word column into a two-column table (word, value) so the
# words can be sorted and inspected.
elton_word_count_table = elton[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)
elton_tfidf_table = elton[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tfidf'],
)

In [6]:
# Rank Elton John's words by raw count, largest first, and display the result.
elton_words_by_count = elton_word_count_table.sort('count', ascending=False)
elton_words_by_count

word,count
the,27.0
in,18.0
and,15.0
of,13.0
a,10.0
has,9.0
john,7.0
he,7.0
on,6.0
award,5.0


In [7]:
# Rank Elton John's words by TF-IDF weight, largest first, and display the result.
elton_words_by_tfidf = elton_tfidf_table.sort('tfidf', ascending=False)
elton_words_by_tfidf

word,tfidf
furnish,18.38947183999428
elton,17.482320270031995
billboard,17.30368095754203
john,13.93931279239831
songwriters,11.25040644703154
tonightcandle,10.986495389225194
overallelton,10.986495389225194
19702000,10.293348208665249
fivedecade,10.293348208665249
aids,10.262846934045534


# Manually evaluate the distance between certain people's articles

In [8]:
# Select the two articles to compare against Elton John's.
victoria = people[people['name'] == 'Victoria Beckham']
paul = people[people['name'] == 'Paul McCartney']

# Take the TF-IDF entry of the first matching row of each selection.
elton_tfidf = elton['tfidf'][0]
victoria_tfidf = victoria['tfidf'][0]
paul_tfidf = paul['tfidf'][0]

# Cosine distance between Elton John's TF-IDF vector and each of the others.
cosine_distance = turicreate.distances.cosine
elton_victoria_distance = cosine_distance(elton_tfidf, victoria_tfidf)
elton_paul_distance = cosine_distance(elton_tfidf, paul_tfidf)

print('Elton to Victoria Distance: ' + str(elton_victoria_distance))
print('Elton to Paul Distance: ' + str(elton_paul_distance))

Elton to Victoria Distance: 0.9567006376655429
Elton to Paul Distance: 0.8250310029221779


# Apply nearest neighbors for retrieval of Wikipedia articles

## Build and query the nearest-neighbors models

In [9]:
# Nearest-neighbors model over the raw word-count column, using cosine
# distance; reference rows are labelled by the article's name.
knn_word_count_model = turicreate.nearest_neighbors.create(
    people,
    label='name',
    features=['word_count'],
    distance='cosine',
)

In [10]:
# Nearest-neighbors model over the TF-IDF column, using cosine distance;
# reference rows are labelled by the article's name.
knn_tfidf_model = turicreate.nearest_neighbors.create(
    people,
    label='name',
    features=['tfidf'],
    distance='cosine',
)

In [11]:
# Articles closest to Elton John's under the word-count model.
# The query article is itself in the reference set, so it is expected as the top hit.
elton_word_count_neighbors = knn_word_count_model.query(elton)
elton_word_count_neighbors

query_label,reference_label,distance,rank
0,Elton John,2.220446049250313e-16,1
0,Cliff Richard,0.1614241525896703,2
0,Sandro Petrone,0.1682254275104111,3
0,Rod Stewart,0.168327165587061,4
0,Malachi O'Doherty,0.177315545978884,5


In [12]:
# Articles closest to Elton John's under the TF-IDF model.
# The query article is itself in the reference set, so it is expected as the top hit.
elton_tfidf_neighbors = knn_tfidf_model.query(elton)
elton_tfidf_neighbors

query_label,reference_label,distance,rank
0,Elton John,-2.220446049250313e-16,1
0,Rod Stewart,0.7172196678927374,2
0,George Michael,0.7476009989692847,3
0,Sting (musician),0.7476719544306141,4
0,Phil Collins,0.7511932487904706,5


In [13]:
# Articles closest to Victoria Beckham's under the word-count model.
# The query article is itself in the reference set, so it is expected as the top hit.
victoria_word_count_neighbors = knn_word_count_model.query(victoria)
victoria_word_count_neighbors

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.220446049250313e-16,1
0,Mary Fitzgerald (artist),0.2073070361150499,2
0,Adrienne Corri,0.2145097827875479,3
0,Beverly Jane Fry,0.2174664687407927,4
0,Raman Mundair,0.2176954749915048,5


In [14]:
# Articles closest to Victoria Beckham's under the TF-IDF model.
# The query article is itself in the reference set, so it is expected as the top hit.
victoria_tfidf_neighbors = knn_tfidf_model.query(victoria)
victoria_tfidf_neighbors

query_label,reference_label,distance,rank
0,Victoria Beckham,1.1102230246251563e-16,1
0,David Beckham,0.5481696102632145,2
0,Stephen Dow Beckham,0.7849867068283364,3
0,Mel B,0.8095855234085036,4
0,Caroline Rush,0.81982642291868,5
