In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

import scipy.sparse as sparse

%matplotlib inline

from implicit.als import AlternatingLeastSquares

In [2]:
# Read the tab-delimited contact log into a DataFrame.
raw_data = pd.read_csv('user_contacts.csv', delimiter='\t')

In [3]:
# Peek at the first five rows of the raw log.
raw_data.head(n=5)

Unnamed: 0,userID,contactID,date_day,date_month,date_year,date_hour,date_minute,date_second
0,8,28371,4,10,2010,2,14,19
1,8,40306,3,10,2010,12,33,9
2,8,85279,2,10,2010,3,10,48
3,8,91764,4,10,2010,22,43,47
4,8,97723,12,10,2010,3,28,33


In [4]:
# Collapse the log to one row per (userID, contactID) pair. After this step
# `date_day` no longer holds a day: it holds the number of non-null `date_day`
# entries that were recorded for the pair.
pair_keys = ['userID', 'contactID']
raw_data = raw_data.groupby(pair_keys)['date_day'].count().reset_index()
raw_data.head()

Unnamed: 0,userID,contactID,date_day
0,8,28371,1
1,8,40306,1
2,8,85279,1
3,8,91764,1
4,8,97723,1


In [5]:
# Interaction table: one row per (userID, contactID) pair, with the per-pair
# count from `date_day` exposed under the clearer name `contact`.
data = raw_data[['userID', 'contactID', 'date_day']].rename(columns={'date_day': 'contact'})

# Keep only pairs with a non-zero count. The explicit .copy() makes `data` an
# independent frame rather than a filtered slice, so adding columns to it in
# later cells cannot trigger pandas' SettingWithCopyWarning or write into an
# ambiguous view.
data = data.loc[data.contact != 0].copy()
data.head()

Unnamed: 0,userID,contactID,contact
0,8,28371,1
1,8,40306,1
2,8,85279,1
3,8,91764,1
4,8,97723,1


In [6]:
# Replace the original identifiers with dense integer category codes
# (one code per distinct value), stored in new columns next to the originals.
for raw_col, code_col in (('userID', 'user_id'), ('contactID', 'contact_id')):
    data[code_col] = data[raw_col].astype("category").cat.codes
data.head()

Unnamed: 0,userID,contactID,contact,user_id,contact_id
0,8,28371,1,0,632
1,8,40306,1,0,814
2,8,85279,1,0,1529
3,8,91764,1,0,1614
4,8,97723,1,0,1705


In [7]:
# Distinct (original contactID, integer code) pairs. The code column is kept
# as a string in this lookup table.
item_lookup = (
    data[['contactID', 'contact_id']]
    .drop_duplicates()
    .astype({'contact_id': str})
)

In [8]:
# Show the first five code-to-contactID pairs.
item_lookup.head(n=5)

Unnamed: 0,contactID,contact_id
0,28371,632
1,40306,814
2,85279,1529
3,91764,1614
4,97723,1705


In [9]:
# Map each contact code (as a string key) back to its original contactID.
# The dict is built directly from the two lookup columns in one vectorised
# step; no row-wise iterrows() loop or progress bar is needed for this.
contactID_name = dict(zip(item_lookup['contact_id'], item_lookup['contactID']))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
# The original identifiers are no longer needed once the integer codes exist;
# keep only the count and the two code columns.
data = data.drop(columns=['userID', 'contactID'])
data.head()

Unnamed: 0,contact,user_id,contact_id
0,1,0,632
1,1,0,814
2,1,0,1529
3,1,0,1614
4,1,0,1705


In [11]:
# Sorted distinct codes for users and contacts, plus the per-row contact counts
# that become the values of the sparse matrix.
user_id = sorted(data['user_id'].unique())
contact_id = sorted(data['contact_id'].unique())
contact = data['contact'].tolist()

In [12]:
# Integer coordinates for the sparse matrix: `rows` carries the user codes,
# `cols` carries the contact codes.
rows = data['user_id'].astype(int)
cols = data['contact_id'].astype(int)

In [13]:
# Contact-by-user matrix: row index = contact code, column index = user code,
# value = contact count for that pair.
n_contacts, n_users = len(contact_id), len(user_id)
data_sparse = sparse.csr_matrix(
    (contact, (cols, rows)),
    shape=(n_contacts, n_users),
)

In [14]:
# Fit an implicit-feedback ALS model with 50 latent factors on the
# contact-by-user matrix built above.
# NOTE(review): which orientation fit() expects (item-by-user vs user-by-item)
# depends on the installed `implicit` version — confirm before upgrading.
# NOTE(review): no random seed is set here, so the learned factors (and the
# recommendations below) may differ between runs — confirm whether the
# installed version supports seeding.
model = AlternatingLeastSquares(factors=50)
model.fit(data_sparse)

100%|███████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:00<00:00, 156.35it/s]


In [15]:
# Transpose the contact-by-user matrix into a user-by-contact CSR matrix,
# which is what is passed to recommend() together with the user code.
user_items = data_sparse.transpose().tocsr()

# Internal user code (not the original userID) to produce recommendations for.
userid = 0
recommendations = model.recommend(userid, user_items)

In [16]:
# Display the raw result of model.recommend() for the selected user code.
recommendations

[(1658, 0.61004156),
 (341, 0.6100413),
 (1688, 0.52943075),
 (567, 0.51631254),
 (1249, 0.47728875),
 (0, 0.39548635),
 (1565, 0.0007290095),
 (1474, 0.00061716326),
 (275, 0.0005584806),
 (1042, 0.00045377656)]

In [17]:
# Translate each recommended contact code back to its original contactID.
# The lookup dict is keyed by the code as a string, hence the str() conversion.
for contact_code, score in recommendations:
    print(contactID_name[str(contact_code)])

94895
13102
96666
24770
67926
8
88649
81597
10949
54255
