In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import implicit

## Load data and count interactions

In [2]:
# Read the semicolon-separated interaction log and keep only the user and
# project identifier columns.
df = pd.read_csv('userLog_201801_201802_for_participants.csv', sep=';')[['userCode', 'project_id']]
df.head()

Unnamed: 0,userCode,project_id
0,7717bdc2-ea3e-e8ad-5d6b-178bd71c38b2,7956
1,7717bdc2-ea3e-e8ad-5d6b-178bd71c38b2,7956
2,cb5b4b68-cc01-6db6-f54b-4a0f881301c5,5067
3,5f74cef2-0d1e-b619-3564-0955a14e0985,6654
4,dba8f279-844e-eef6-73ac-22bd7d1353cc,6474


In [3]:
# Tag every log row with a constant flag column set to 1.
df = df.assign(interacted=1)
df.head()

Unnamed: 0,userCode,project_id,interacted
0,7717bdc2-ea3e-e8ad-5d6b-178bd71c38b2,7956,1
1,7717bdc2-ea3e-e8ad-5d6b-178bd71c38b2,7956,1
2,cb5b4b68-cc01-6db6-f54b-4a0f881301c5,5067,1
3,5f74cef2-0d1e-b619-3564-0955a14e0985,6654,1
4,dba8f279-844e-eef6-73ac-22bd7d1353cc,6474,1


In [4]:
# Number of log rows per (user, project) pair; reset_index() turns the
# grouped keys back into ordinary columns and leaves the count in column 0.
data = (
    df.groupby(['userCode', 'project_id'])
      .size()
      .reset_index()
)
data.head(n=10)

Unnamed: 0,userCode,project_id,0
0,00005aba-5ebc-0821-f5a9-bacca40be125,5342,1
1,0000bae7-6233-d7cc-2a6d-48aa70fe8ad4,5678,1
2,0000c576-e929-19eb-615a-349ec3b4709b,6461,1
3,0000d196-6385-80b8-661d-b7427042daa3,9040,1
4,0000e1e2-f595-0ae7-860f-fcc07dcb116e,6709,1
5,0000e1e2-f595-0ae7-860f-fcc07dcb116e,6712,1
6,0000fa46-1f0b-9504-b568-43479d17620e,4184,1
7,0000fa46-1f0b-9504-b568-43479d17620e,4703,2
8,0000fa46-1f0b-9504-b568-43479d17620e,6577,1
9,0000fa46-1f0b-9504-b568-43479d17620e,8829,2


In [5]:
# Give the unnamed count column (labelled 0 after reset_index) a real name.
data = data.rename(columns={0: 'interact'})
data.head(n=10)

Unnamed: 0,userCode,project_id,interact
0,00005aba-5ebc-0821-f5a9-bacca40be125,5342,1
1,0000bae7-6233-d7cc-2a6d-48aa70fe8ad4,5678,1
2,0000c576-e929-19eb-615a-349ec3b4709b,6461,1
3,0000d196-6385-80b8-661d-b7427042daa3,9040,1
4,0000e1e2-f595-0ae7-860f-fcc07dcb116e,6709,1
5,0000e1e2-f595-0ae7-860f-fcc07dcb116e,6712,1
6,0000fa46-1f0b-9504-b568-43479d17620e,4184,1
7,0000fa46-1f0b-9504-b568-43479d17620e,4703,2
8,0000fa46-1f0b-9504-b568-43479d17620e,6577,1
9,0000fa46-1f0b-9504-b568-43479d17620e,8829,2


## Encode userCode and project_id as integer codes (user_code, project_code) for use in sparse.csr_matrix()

In [6]:
# Drop incomplete rows, then turn each identifier column into a pandas
# categorical and store its integer category code in a companion column.
# The codes are what the sparse matrices are indexed by.
data = data.dropna()
for id_column, code_column in (('userCode', 'user_code'),
                               ('project_id', 'project_code')):
    data[id_column] = data[id_column].astype("category")
    data[code_column] = data[id_column].cat.codes
data.head()

Unnamed: 0,userCode,project_id,interact,user_code,project_code
0,00005aba-5ebc-0821-f5a9-bacca40be125,5342,1,0,2130
1,0000bae7-6233-d7cc-2a6d-48aa70fe8ad4,5678,1,1,2383
2,0000c576-e929-19eb-615a-349ec3b4709b,6461,1,2,2873
3,0000d196-6385-80b8-661d-b7427042daa3,9040,1,3,4995
4,0000e1e2-f595-0ae7-860f-fcc07dcb116e,6709,1,4,3079


In [7]:
# Two views of the same interaction counts, built from (value, (row, col))
# triplets: one indexed user-by-project, the other project-by-user.
sparse_user_item = sparse.csr_matrix(
    (data['interact'].astype(float),
     (data['user_code'], data['project_code']))
)
sparse_item_user = sparse.csr_matrix(
    (data['interact'].astype(float),
     (data['project_code'], data['user_code']))
)

## Modeling with the implicit library

In [8]:
# Alternating Least Squares model from the implicit library. No arguments
# are passed, so the library's default hyperparameters are used.
model = implicit.als.AlternatingLeastSquares()



In [9]:
# Scale the raw interaction counts by a constant factor before training;
# the result is stored as double precision.
alpha_val = 15
data_conf = (alpha_val * sparse_item_user).astype('double')

In [10]:
# Train the model on the scaled project-by-user matrix built above.
model.fit(data_conf)

100%|██████████| 15.0/15 [00:50<00:00,  3.74s/it]


## Load test data and merge it with the encoded data to look up user_code and project_code

In [11]:
# Users for whom recommendations must be produced (comma-separated file).
testfile = pd.read_csv('testing_users.csv')
testfile.head()

Unnamed: 0,userCode
0,003366c6-0cd9-48e7-d134-2051d7360c2d
1,003b2f18-60b4-37be-c63d-0caead1afa97
2,003d571e-9de7-6de3-2cb6-2ae190cc7beb
3,00426ede-6813-949b-de2e-6d8b228dff1d
4,0043a9c7-cace-1aa1-3188-4b173bf28991


In [12]:
# Left-join the test users onto the encoded interaction table so every test
# user row carries its user_code (and the projects it interacted with).
# A test user with no row in `data` would get NaN in the joined columns.
encode_test = testfile.merge(data, how='left', on='userCode')
# Parenthesised form works as a statement on Python 2 and as a function call
# on Python 3; the bare `print x` statement is a SyntaxError on Python 3.
print(encode_test.shape)
encode_test.head(10)

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

(19272, 5)


Exception ValueError: "Buffer dtype mismatch, expected 'Python object' but got 'long'" in 'pandas._libs.lib.is_bool_array' ignored


Unnamed: 0,userCode,project_id,interact,user_code,project_code
0,003366c6-0cd9-48e7-d134-2051d7360c2d,6007,2,337,2570
1,003b2f18-60b4-37be-c63d-0caead1afa97,5980,2,408,2552
2,003d571e-9de7-6de3-2cb6-2ae190cc7beb,6652,3,419,3031
3,003d571e-9de7-6de3-2cb6-2ae190cc7beb,6824,1,419,3171
4,00426ede-6813-949b-de2e-6d8b228dff1d,8227,1,459,4281
5,0043a9c7-cace-1aa1-3188-4b173bf28991,8345,1,468,4383
6,005063ec-6147-54c6-e31e-64fb2f127618,1632,1,576,145
7,0075bd3a-e711-0958-8666-e40403ba5249,4179,1,850,1384
8,008296bd-7a40-3b84-1937-a477c52c2efa,7817,1,944,3955
9,00ba2a68-3d1f-f4f7-888d-57688cf2a887,3738,1,1330,1082


In [13]:
# Reduce the joined table to one row per test user: the original userCode
# together with the integer user_code used to index the model.
test = encode_test[['userCode', 'user_code']].drop_duplicates()
# Parenthesised form is valid on both Python 2 and Python 3.
print(test.shape)
test.head(5)

(7408, 2)


Unnamed: 0,userCode,user_code
0,003366c6-0cd9-48e7-d134-2051d7360c2d,337
1,003b2f18-60b4-37be-c63d-0caead1afa97,408
2,003d571e-9de7-6de3-2cb6-2ae190cc7beb,419
4,00426ede-6813-949b-de2e-6d8b228dff1d,459
5,0043a9c7-cace-1aa1-3188-4b173bf28991,468


## Prediction

In [14]:
# Write one line per test user to predict.csv in the form
#   <userCode>,<project_id> <project_id> ... (up to 7 ids, space-separated)

# Lookup from the integer project_code back to the original project_id.
# Built once here instead of scanning the whole `data` frame for every
# recommended item inside the loop.
code_to_project = dict(zip(data['project_code'], data['project_id']))

with open('predict.csv', 'w') as out_file:
    # Iterate the two columns directly; the userCode of the current row is
    # already at hand, so no second lookup in `test` is needed.
    for user_id, user_code in zip(test['userCode'], test['user_code']):
        # NOTE(review): assumes every test user has a valid integer
        # user_code; a user absent from the training log would carry NaN
        # after the left merge - confirm how such users should be handled.
        recommended = model.recommend(user_code, sparse_user_item, N=7)

        # Each recommendation is unpacked as (project_code, score); only the
        # code is needed to recover the project id.
        project_ids = [str(code_to_project[idx]) for idx, _score in recommended]

        out_file.write(user_id + ',' + ' '.join(project_ids) + '\n')