In [1]:
import graphlab

In [2]:
# Cap the number of pylambda worker processes. Fewer workers means less system
# memory in use, which keeps hosted notebooks from crashing.
NUM_PYLAMBDA_WORKERS = 4
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', NUM_PYLAMBDA_WORKERS)





# Load some text data: Wikipedia pages about people

In [3]:
# Read the saved SFrame of Wikipedia articles from a path relative to the notebook.
PEOPLE_WIKI_PATH = 'people_wiki.gl/'
people = graphlab.SFrame(PEOPLE_WIKI_PATH)

# Explore the dataset and check out the text it contains

In [4]:
# Keep only the rows whose 'name' is exactly 'Elton John'.
is_elton_john = people['name'] == 'Elton John'
elton = people[is_elton_john]

In [5]:
# Turn the article text into per-word occurrence counts and store them
# alongside the text in a 'word_count' column.
elton_word_counts = graphlab.text_analytics.count_words(elton['text'])
elton['word_count'] = elton_word_counts

In [6]:
# Show the raw word -> count mapping for the article.
# Parenthesised single-argument form: prints the same text under the Python 2
# print statement and is also valid as the Python 3 print() function.
print(elton['word_count'])

[{'all': 1L, 'six': 1L, 'producer': 1L, 'heavily': 1L, 'over': 2L, 'named': 1L, 'fifty': 1L, 'four': 1L, 'openly': 1L, 'including': 1L, 'highestprofile': 1L, 'years': 1L, 'its': 2L, 'impact': 1L, 'westminster': 1L, '27': 1L, '21': 2L, 'wed': 1L, 'had': 1L, '1947': 1L, 'abbey': 1L, 'winning': 1L, 'late': 1L, 'to': 4L, 'commander': 1L, 'about': 1L, 'born': 1L, '2014': 1L, 'as': 2L, 'has': 9L, '2013': 1L, 'his': 4L, 'march': 1L, 'than': 3L, 'song': 1L, 'songwriter': 2L, 'continues': 1L, 'records': 1L, 'five': 1L, 'occasional': 1L, 'they': 1L, 'inception': 1L, 'world': 1L, 'brit': 1L, 'him': 3L, 'datein': 1L, 'hall': 2L, 'fivedecade': 1L, 'knighthood': 1L, 'bestselling': 2L, 'artist': 1L, 'be': 1L, '1996': 1L, 'list': 1L, 'roll': 2L, 'hercules': 1L, 'announced': 1L, 'rock': 2L, 'become': 1L, 'bernie': 1L, 'outstanding': 1L, 'england': 1L, 'composer': 1L, 'queens': 1L, 'foundation': 2L, 'diana': 1L, 'globe': 1L, 'artists': 2L, 'culture': 1L, 'been': 3L, '49': 1L, 'year': 1L, 'billboard': 4L

# Sort the word counts for the Elton John article

In [7]:
# Unpack the 'word_count' dictionary into a two-column table with one row per
# (word, count) pair.
elton_word_count_column = elton[['word_count']]
elton_word_count_table = elton_word_count_column.stack(
    'word_count',
    new_column_name=['word', 'count'],
)

In [8]:
# Display the table with the most frequent words first. The result is not
# assigned, so elton_word_count_table itself keeps its original order.
elton_word_count_table.sort(
    'count',
    ascending=False,
)

word,count
the,27
in,18
and,15
of,13
a,10
has,9
he,7
john,7
on,6
since,5


# Compute TF-IDF for the corpus

In [9]:
# Count word occurrences for every article in the corpus and keep them in a
# 'word_count' column.
corpus_word_counts = graphlab.text_analytics.count_words(people['text'])
people['word_count'] = corpus_word_counts

In [10]:
# Compute TF-IDF weights from the per-article word counts of the whole corpus.
word_counts_per_article = people['word_count']
tfidf = graphlab.text_analytics.tf_idf(word_counts_per_article)

In [11]:
# Store the TF-IDF result on the corpus as the 'tfidf' column so that row
# selections taken from `people` from here on carry it.
people['tfidf'] = tfidf

In [13]:
# Re-select Elton John's row: the earlier `elton` was taken before `people`
# gained its 'tfidf' column, so it has to be rebuilt to include it.
elton = people[people['name'] == 'Elton John']

# One row per (word, tfidf) pair, shown with the highest-weighted words first.
elton_tfidf_table = elton[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf'])
elton_tfidf_table.sort('tfidf', ascending=False)

word,tfidf
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447
overallelton,10.9864953892
tonightcandle,10.9864953892
19702000,10.2933482087
fivedecade,10.2933482087
aids,10.262846934


# Manually compute distances between a few people

In [14]:
# Keep only the rows whose 'name' is exactly 'Victoria Beckham'.
is_victoria_beckham = people['name'] == 'Victoria Beckham'
beckham = people[is_victoria_beckham]

In [15]:
# Keep only the rows whose 'name' is exactly 'Paul McCartney'.
is_paul_mccartney = people['name'] == 'Paul McCartney'
mccartney = people[is_paul_mccartney]

In [16]:
# Cosine distance between the TF-IDF entries of the first row of each selection.
elton_tfidf_row = elton['tfidf'][0]
beckham_tfidf_row = beckham['tfidf'][0]
graphlab.distances.cosine(elton_tfidf_row, beckham_tfidf_row)

0.9567006376655429

In [17]:
# Cosine distance between the TF-IDF entries of the first row of each selection.
elton_tfidf_entry = elton['tfidf'][0]
mccartney_tfidf_entry = mccartney['tfidf'][0]
graphlab.distances.cosine(elton_tfidf_entry, mccartney_tfidf_entry)

0.8250310029221779

# Building nearest neighbors models with different input features and setting the distance metric

In [20]:
# Build two nearest-neighbour models over the same corpus. Both label results
# by article name and use cosine distance; they differ only in the feature
# column (TF-IDF weights vs. raw word counts).
shared_nn_options = {'label': 'name', 'distance': 'cosine'}

tfidfnn_model = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    **shared_nn_options
)
wcnn_model = graphlab.nearest_neighbors.create(
    people,
    features=['word_count'],
    **shared_nn_options
)

In [21]:
# Look up the articles nearest to Elton John's under the TF-IDF model.
# The query article is itself in the corpus, so it comes back as rank 1 at
# (numerically) zero distance.
tfidfnn_model.query(elton)

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


In [22]:
# Look up the articles nearest to Elton John's under the raw word-count model.
# The query article is itself in the corpus, so it comes back as rank 1 at
# (numerically) zero distance.
wcnn_model.query(elton)

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


In [23]:
# Look up the articles nearest to Victoria Beckham's under the TF-IDF model.
# The query article is itself in the corpus, so it comes back as rank 1 at
# (numerically) zero distance.
tfidfnn_model.query(beckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5


In [24]:
# Look up the articles nearest to Victoria Beckham's under the raw word-count
# model. The query article is itself in the corpus, so it comes back as rank 1
# at (numerically) zero distance.
wcnn_model.query(beckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5
