In [329]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

import scipy.sparse as sparse

%matplotlib inline

from implicit.als import AlternatingLeastSquares

In [330]:
bookmark = pd.read_table('bookmark/bookmarks.dat', sep="\t+", usecols=['id', 'md5', 'url', 'md5Principal', 'urlPrincipal'])

  """Entry point for launching an IPython kernel.


In [331]:
bookmark.rename(index=str, columns={"id": "bookmarkID"}, inplace=True)
bookmark.head()

Unnamed: 0,bookmarkID,md5,url,md5Principal,urlPrincipal
0,1,ab4954b633ddaf5b5bba6e9b71aa6b70,http://www.ifla.org/,7f431306c428457bc4e12b15634484f,www.ifla.org
1,2,2221e9cd106d269dd34682666f576fa3,http://archive.ifla.org/VII/s14/nd1/gcdp-e.pdf,1ef8cfcfe968101fa9b4e301847503d4,archive.ifla.org
2,7,c97c571dadaddbbb493126a0d4d01ba3,http://www.edselect.com/,792fd7eb20143386d0c4eb193c6124d,www.edselect.com
3,8,25bfe8dca0ef263ec9c341b9f16c38b5,http://www.collectionscanada.gc.ca/cool/index-...,6fce4f6391516f0732531d9cfacda5b7,www.collectionscanada.gc.ca
4,9,c97284629e17b8e2861afaacd59918bc,http://www.kidsreads.com/,5854ce8404857a45373eea01a3d98000,www.kidsreads.com


In [332]:
user_cont = pd.read_table('bookmark/user_contacts.dat')
user_cont.head()

Unnamed: 0,userID,contactID,date_day,date_month,date_year,date_hour,date_minute,date_second
0,8,28371,4,10,2010,2,14,19
1,8,40306,3,10,2010,12,33,9
2,8,85279,2,10,2010,3,10,48
3,8,91764,4,10,2010,22,43,47
4,8,97723,12,10,2010,3,28,33


In [333]:
user_tags = pd.read_table('bookmark/user_taggedbookmarks.dat')
user_tags.head()

Unnamed: 0,userID,bookmarkID,tagID,day,month,year,hour,minute,second
0,8,1,1,8,11,2010,23,29,22
1,8,2,1,8,11,2010,23,25,59
2,8,7,1,8,11,2010,18,55,1
3,8,7,6,8,11,2010,18,55,1
4,8,7,7,8,11,2010,18,55,1


In [335]:
joined = pd.merge(user_tags, bookmark, on='bookmarkID', how='left')

In [336]:
user_url = joined.groupby(['userID','urlPrincipal']).count().reset_index()
user_url.head()

Unnamed: 0,userID,urlPrincipal,bookmarkID,tagID,day,month,year,hour,minute,second,md5,url,md5Principal
0,8,addons.mozilla.org,1,1,1,1,1,1,1,1,1,1,1
1,8,archive.ifla.org,1,1,1,1,1,1,1,1,1,1,1
2,8,augusthouse.com,2,2,2,2,2,2,2,2,2,2,2
3,8,bctf.ca,1,1,1,1,1,1,1,1,1,1,1
4,8,blog.cathyjonelson.com,1,1,1,1,1,1,1,1,1,1,1


In [337]:
user_url.rename(index=str, columns={"bookmarkID": "urlCount"}, inplace=True)
user_url.drop(['tagID', 'day', 'month','year','hour','minute','second','md5','url','md5Principal'], axis=1, inplace=True)
user_url.head()

Unnamed: 0,userID,urlPrincipal,urlCount
0,8,addons.mozilla.org,1
1,8,archive.ifla.org,1
2,8,augusthouse.com,2
3,8,bctf.ca,1
4,8,blog.cathyjonelson.com,1


In [338]:
item_lookup = user_url[['urlPrincipal']].drop_duplicates()
url_id_name = {}

for index, row in item_lookup.iterrows():
    url_id_name[index] = row.urlPrincipal

In [339]:
urls = pd.DataFrame.from_dict(url_id_name, orient='index')

In [340]:
urls = urls.reset_index()
urls.rename(index=str, columns={"index": "urlID", 0:"urlPrincipal"}, inplace=True)
urls.head()

Unnamed: 0,urlID,urlPrincipal
0,0,addons.mozilla.org
1,1,archive.ifla.org
2,2,augusthouse.com
3,3,bctf.ca
4,4,blog.cathyjonelson.com


In [341]:
urls.drop('urlID', axis=1, inplace=True)
urls['urlID'] = urls.index

In [342]:
user_url = pd.merge(user_url, urls, on='urlPrincipal', how='left')
#user_url.rename(index=str, columns={"urlID_x": "urlID"}, inplace=True)
#user_url.drop('urlID_y', axis=1,inplace=True)
user_url.head()

Unnamed: 0,userID,urlPrincipal,urlCount,urlID
0,8,addons.mozilla.org,1,0
1,8,archive.ifla.org,1,1
2,8,augusthouse.com,2,2
3,8,bctf.ca,1,3
4,8,blog.cathyjonelson.com,1,4


In [343]:
item_lookup1 = user_url[['userID']].drop_duplicates()
user_id_name = {}

for index, row in item_lookup1.iterrows():
    user_id_name[index] = row.userID

In [344]:
users = pd.DataFrame.from_dict(user_id_name, orient='index')

In [345]:
users = users.reset_index()
users.sort_values(by=['index']) 
users.head()

Unnamed: 0,index,0
0,0,8
1,59,32
2,64,57
3,88,147
4,161,233


In [346]:
users.drop('index',axis=1, inplace=True)
users.rename(index=str, columns={"level_0": "userSK", 0:"userID"}, inplace=True)
users.head()

Unnamed: 0,userID
0,8
1,32
2,57
3,147
4,233


In [347]:
users['userSK'] = users.index

In [348]:
#user_url.drop(['urlPrincipal'], axis=1, inplace=True)
#user_url.head()

In [349]:
user_url = pd.merge(user_url, users, on='userID', how='left')
user_url.head()

Unnamed: 0,userID,urlPrincipal,urlCount,urlID,userSK
0,8,addons.mozilla.org,1,0,0
1,8,archive.ifla.org,1,1,0
2,8,augusthouse.com,2,2,0
3,8,bctf.ca,1,3,0
4,8,blog.cathyjonelson.com,1,4,0


In [350]:
users1 = list(np.sort(user_url.userSK.astype(int).unique()))
urls1  = list(np.sort(user_url.urlID.astype(int).unique()))
counts = list(user_url.urlCount)

In [351]:
rows = user_url.userSK.astype(int)
cols = user_url.urlID.astype(int)

In [352]:
len(urls1)

38569

In [353]:
len(users1)

1867

In [354]:
max(cols)

38568

In [355]:
min(rows)

0

In [356]:
len(count)

93149

In [357]:
sparse.csr_matrix((len(urls1), len(users1)))

<38569x1867 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [358]:
data_sparse = sparse.csr_matrix((count, (cols,rows)), shape=(len(urls1), len(users1)))

In [359]:
model = AlternatingLeastSquares(factors=50)
model.fit(data_sparse)

100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:02<00:00,  5.64it/s]


In [360]:
userid = 0

user_items = data_sparse.T.tocsr()
recommendations = model.recommend(userid, user_items)

In [361]:
recommendations

[(87, 0.4839357),
 (1177, 0.39915353),
 (12383, 0.36327255),
 (12308, 0.3565163),
 (2016, 0.3488353),
 (1302, 0.34628427),
 (2013, 0.33882904),
 (1154, 0.32753092),
 (2411, 0.32710063),
 (6822, 0.31697372)]

In [362]:
users.head()

Unnamed: 0,userID,userSK
0,8,0
1,32,1
2,57,2
3,147,3
4,233,4


In [363]:
urls.head()

Unnamed: 0,urlPrincipal,urlID
0,addons.mozilla.org,0
1,archive.ifla.org,1
2,augusthouse.com,2
3,bctf.ca,3
4,blog.cathyjonelson.com,4


In [364]:
cols = urls.columns.tolist()
cols = cols[-1:] + cols[:-1]
urls = urls[cols]
urls.head()

Unnamed: 0,urlID,urlPrincipal
0,0,addons.mozilla.org
1,1,archive.ifla.org
2,2,augusthouse.com
3,3,bctf.ca
4,4,blog.cathyjonelson.com


In [387]:
url_id_name = urls['urlPrincipal'].to_dict()

In [391]:
url_id_name['87']

'www.youtube.com'

In [389]:
for r in recommendations:
    print(str(r[0]))

87
1177
12383
12308
2016
1302
2013
1154
2411
6822


In [392]:
for r in recommendations:
    print(url_id_name[str(r[0])])

www.youtube.com
www.readwritethink.org
www.sldirectory.com
voicethread.com
nlvm.usu.edu
docs.google.com
illuminations.nctm.org
www.eschoolnews.com
www.voki.com
www.teachertube.com


In [39]:
for r in recommendations:
    print(artist_id_name[str(r[0])])

kittie
the dresden dolls
emilie autumn
flyleaf
epica
after forever
in this moment
otep
the gossip
tarja turunen


In [40]:
itemid = 107209
related = model.similar_items(itemid)

In [41]:
related

[(107209, 0.12407882),
 (157178, 0.11105041),
 (171169, 0.11082364),
 (261221, 0.1099599),
 (187011, 0.10985715),
 (127227, 0.10922149),
 (48904, 0.10792142),
 (218091, 0.10681016),
 (260378, 0.10674973),
 (155893, 0.10651716)]

In [42]:
for a in related:
    print(artist_id_name[str(a[0])])

eminem
kanye west
linkin park
the prodigy
michael jackson
gorillaz
black eyed peas
rage against the machine
the offspring
justin timberlake


In [37]:
artist_id_name['234786']

'sick on the bus'