In [1]:
from __future__ import print_function # to conform python 2.x print to python 3.x
import numpy as np
import turicreate
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances
import time
from copy import copy
import matplotlib.pyplot as plt
%matplotlib inline

'''compute norm of a sparse vector
   Thanks to: Jaiyam Sharma'''
def norm(x):
    sum_sq=x.dot(x.T)
    norm=np.sqrt(sum_sq)
    return(norm)

In [39]:
wiki = turicreate.SFrame('people_wiki.sframe/')

In [40]:
wiki = wiki.add_row_number()

In [41]:
wiki

id,URI,name,text
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
1,<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
2,<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
3,<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
4,<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
5,<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
6,<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
7,<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
8,<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
9,<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


In [42]:
wiki['tf_idf'] = turicreate.text_analytics.tf_idf(wiki['text'])
wiki.head()

id,URI,name,text,tf_idf
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'melbourne': 3.8914310119380633, ..."
1,<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'time': 1.3253342074200498, ..."
2,<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'time': 1.3253342074200498, ..."
3,<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'kurdlawitzpreis': 10.986495389225194, ..."
4,<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'curtis': 5.299520032885375, ..."
5,<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...,"{'journal': 3.025473923341824, ..."
6,<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...,"{'including': 1.2272824458461182, ..."
7,<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...,"{'concordia': 6.250296940830698, ..."
8,<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...,"{'knuckles': 8.042056410058754, ..."
9,<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...,"{'n3': 10.293348208665249, ..."


In [37]:
assert type(wiki['tf_idf'][0]) == dict, \
    'The chosen column must be dict type, representing sparse data.'

In [38]:
wiki = wiki.stack('tf_idf', ['feature', 'value'])

In [20]:
wiki

id,URI,name,text,feature,value
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,melbourne,3.8914310119380633
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,college,1.5613662703175557
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,parade,5.510031837293684
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,education,2.4487155642005685
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,teaches,3.7712554104950966
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,currently,1.637088969126014
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,edflhe,10.986495389225194
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,also,0.4627270916162349
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,coburg,7.851001173296044
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,coached,3.9925624140020046


In [18]:
unique_words = sorted(wiki['feature'].unique())

In [24]:
mapping = {word: i for i, word in enumerate(unique_words)}

In [25]:
mapping

{'0': 0,
 '00': 1,
 '000': 2,
 '0000': 3,
 '00000': 4,
 '00000van': 5,
 '0001': 6,
 '00014338': 7,
 '0001sec': 8,
 '0002': 9,
 '00026': 10,
 '0003': 11,
 '0005': 12,
 '000577': 13,
 '0005sec': 14,
 '0006': 15,
 '0007': 16,
 '0007105916': 17,
 '0007200374': 18,
 '0007207328': 19,
 '0007213506': 20,
 '000721426xhe': 21,
 '0007a': 22,
 '000he': 23,
 '000in': 24,
 '000m': 25,
 '000seelenprojekt': 26,
 '000tnmickushina': 27,
 '001': 28,
 '0017': 29,
 '001cd': 30,
 '001ehebbm': 31,
 '002': 32,
 '0020849605': 33,
 '0024': 34,
 '0026183900': 35,
 '002864574x': 36,
 '0028659287': 37,
 '003': 38,
 '0033': 39,
 '0034': 40,
 '0036': 41,
 '004': 42,
 '0043': 43,
 '0046': 44,
 '004erdemir': 45,
 '005': 46,
 '006': 47,
 '0060222425': 48,
 '0060628227': 49,
 '0060628464': 50,
 '0060669667': 51,
 '006074393x': 52,
 '0064': 53,
 '0066': 54,
 '007': 55,
 '0070710481': 56,
 '0071357440': 57,
 '0071375627': 58,
 '0072131772': 59,
 '0072131896': 60,
 '0072222611': 61,
 '0072225351': 62,
 '0072438886': 63,
 

In [26]:
wiki['feature_id'] = wiki['feature'].apply(lambda x: mapping[x])

In [27]:
wiki

id,URI,name,text,feature,value
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,melbourne,3.8914310119380633
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,college,1.5613662703175557
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,parade,5.510031837293684
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,education,2.4487155642005685
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,teaches,3.7712554104950966
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,currently,1.637088969126014
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,edflhe,10.986495389225194
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,also,0.4627270916162349
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,coburg,7.851001173296044
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,coached,3.9925624140020046

feature_id
323172
131032
373417
174907
483003
147551
174332
55818
129485
129011


In [29]:
np.arange(15, -1, -1)

array([15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0])

In [43]:
def sframe_to_scipy(x, column_name):
    '''
    Convert a dictionary column of an SFrame into a sparse matrix format where
    each (row_id, column_id, value) triple corresponds to the value of
    x[row_id][column_id], where column_id is a key in the dictionary.
       
    Example
    >>> sparse_matrix, map_key_to_index = sframe_to_scipy(sframe, column_name)
    '''
    assert type(x[column_name][0]) == dict, \
        'The chosen column must be dict type, representing sparse data.'
    #assert: debugging function
    # Stack will transform x to have a row for each unique (row, key) pair.
    x = x.stack(column_name, ['feature', 'value'])
    
    # Map feature words to integers 
    unique_words = sorted(x['feature'].unique())
    mapping = {word:i for i, word in enumerate(unique_words)}
    #Add indices for unique words
    x['feature_id'] = x['feature'].apply(lambda x: mapping[x])
    
    # Create numpy arrays that contain the data for the sparse matrix.
    row_id = np.array(x['id'])
    col_id = np.array(x['feature_id'])
    data = np.array(x['value'])
    
    width = x['id'].max() + 1
    height = x['feature_id'].max() + 1
    
    # Create a sparse matrix.
    mat = csr_matrix((data, (row_id, col_id)), shape=(width, height))
    return mat, mapping

In [44]:
%%time
corpus, mapping = sframe_to_scipy(wiki, 'tf_idf')

CPU times: user 2min 58s, sys: 9.47 s, total: 3min 8s
Wall time: 50.3 s


In [95]:
corpus

<59071x547979 sparse matrix of type '<class 'numpy.float64'>'
	with 10379283 stored elements in Compressed Sparse Row format>

In [45]:
np.random.seed(0)

In [46]:
dim = corpus.shape[1]
dim

547979

In [47]:
def generate_random_vectors(dim, n_vectors):
    return np.random.randn(dim, n_vectors)

In [52]:
n_vectors = 16

In [53]:
random_vectors = generate_random_vectors(dim, n_vectors)

In [54]:
bin_indices_bits = corpus.dot(random_vectors) >= 0

In [55]:
bin_indices_bits

array([[ True, False,  True, ...,  True,  True, False],
       [ True,  True, False, ..., False,  True,  True],
       [False, False, False, ..., False,  True,  True],
       ...,
       [False,  True,  True, ..., False,  True, False],
       [ True, False, False, ..., False,  True, False],
       [ True,  True, False, ...,  True,  True,  True]])

In [56]:
power_of_two = 1 << np.arange(n_vectors - 1, -1, step = -1)

In [57]:
power_of_two

array([32768, 16384,  8192,  4096,  2048,  1024,   512,   256,   128,
          64,    32,    16,     8,     4,     2,     1])

In [60]:
bin_indices = bin_indices_bits.dot(power_of_two)

In [61]:
bin_indices

array([48334, 54075,  2819, ..., 29650, 39114, 54031])

In [75]:
from collections import defaultdict
table = defaultdict(list)

In [76]:
table

defaultdict(list, {})

In [77]:
for idx, bin_index in enumerate(bin_indices):
    table[bin_index].append(idx)

In [78]:
table

defaultdict(list,
            {48334: [0, 2925],
             54075: [1, 2728, 9505, 16101, 32569, 44000],
             2819: [2, 30735, 31501, 36257, 38846, 55532],
             1047: [3, 34328],
             36210: [4],
             24265: [5],
             33590: [6, 54314],
             27311: [7, 31859, 37909, 55122, 55599],
             57357: [8],
             59311: [9, 24751],
             33478: [10, 47491],
             41339: [11, 28374, 47866],
             56508: [12],
             6660: [13],
             13884: [14, 48997],
             3438: [15, 3022, 8220, 14553],
             26267: [16],
             30365: [17, 47784],
             59385: [18],
             28310: [19, 20421, 37439],
             37885: [20, 48705],
             10522: [21, 22847, 44433],
             27319: [22, 24752, 25139, 33975],
             44835: [23],
             38353: [24],
             25708: [25],
             1344: [26, 42262],
             28757: [27, 9856, 20593],
             272

In [79]:
model = {'data': corpus,
         'table': table,
         'random_vectors': random_vectors,
         'bin_indices': bin_indices,
         'bin_indices_bits': bin_indices_bits}

In [80]:
model

{'data': <59071x547979 sparse matrix of type '<class 'numpy.float64'>'
 	with 10379283 stored elements in Compressed Sparse Row format>,
 'table': defaultdict(list,
             {48334: [0, 2925],
              54075: [1, 2728, 9505, 16101, 32569, 44000],
              2819: [2, 30735, 31501, 36257, 38846, 55532],
              1047: [3, 34328],
              36210: [4],
              24265: [5],
              33590: [6, 54314],
              27311: [7, 31859, 37909, 55122, 55599],
              57357: [8],
              59311: [9, 24751],
              33478: [10, 47491],
              41339: [11, 28374, 47866],
              56508: [12],
              6660: [13],
              13884: [14, 48997],
              3438: [15, 3022, 8220, 14553],
              26267: [16],
              30365: [17, 47784],
              59385: [18],
              28310: [19, 20421, 37439],
              37885: [20, 48705],
              10522: [21, 22847, 44433],
              27319: [22, 24752, 25139, 339

In [81]:
obama_id = wiki[wiki['name'] == 'Barack Obama']['id'][0]
biden_id = wiki[wiki['name'] == 'Joe Biden']['id'][0]
print(obama_id)
print(biden_id)

35817
24478


In [83]:
bits1 = model['bin_indices_bits'][obama_id]
bits2 = model['bin_indices_bits'][biden_id]

In [84]:
print(bits1)
print(bits2)

[ True False False  True  True False False  True  True  True  True False
  True  True  True  True]
[ True  True False  True  True False False  True  True  True  True False
  True False  True  True]


In [85]:
print(np.sum(bits1 == bits2))

14


# Get similarity items

In [89]:
obama_tf_idf = corpus
biden_tf_idf = corpus[24478,:]

In [90]:
print(obama_tf_idf)[35817]

  (0, 710)	2.3157231098806563
  (0, 11157)	2.6032908378122737
  (0, 16910)	6.509158574746988
  (0, 18150)	1.8763068991994527
  (0, 18679)	1.9280249665871378
  (0, 19185)	1.8753125887822302
  (0, 19721)	1.8013702663900752
  (0, 20925)	1.6425861253275964
  (0, 21560)	1.520737905384506
  (0, 22260)	1.4879730697555795
  (0, 23058)	1.5093391374786154
  (0, 23813)	1.5644364836042695
  (0, 25336)	1.7023470901042916
  (0, 26675)	1.9545642372230505
  (0, 27237)	2.2073995783446634
  (0, 28377)	2.797250863489293
  (0, 32041)	4.3717697890214335
  (0, 34717)	9.887883100557085
  (0, 41740)	0.022476737890332586
  (0, 45157)	4.137429106591736
  (0, 48424)	4.70049729471633
  (0, 48464)	10.986495389225194
  (0, 49145)	4.015921958283749
  (0, 49249)	2.138848033513307
  (0, 55456)	2.5088749729287803
  :	:
  (59070, 466704)	1.7090572737165175
  (59070, 466865)	5.4811638532928315
  (59070, 466930)	3.648800659754012
  (59070, 467661)	5.723805200320308
  (59070, 468302)	4.763919121153825
  (59070, 468636)	4.3

TypeError: 'NoneType' object is not subscriptable

In [93]:
obama_id = wiki[wiki['name'] == 'Barack Obama']['id']

In [94]:
obama_id

dtype: int
Rows: ?
[35817, ... ]