In [1]:
from Utils.Toolkit import get_data
import numpy as np
import scipy.sparse as sps
import pandas as pd

urm_train = get_data()['train']
urm_test = get_data()['test']
icm_subclass = get_data()['ICM_subclass'].tocsr()

In [2]:
urm_train

<30911x18495 sparse matrix of type '<class 'numpy.float64'>'
	with 371381 stored elements in Compressed Sparse Row format>

In [3]:
urm_test

<30911x18495 sparse matrix of type '<class 'numpy.float64'>'
	with 27255 stored elements in Compressed Sparse Row format>

In [4]:
icm_subclass

<18495x2011 sparse matrix of type '<class 'numpy.float64'>'
	with 18495 stored elements in Compressed Sparse Row format>

In [5]:
cold_users = np.ediff1d(urm_train.indptr) < 1
cold_users_index = np.arange(urm_train.shape[0])[cold_users]
cold_users_index

array([    3,    13,    20, ..., 30902, 30903, 30907])

In [6]:
tot_cold_users = len(cold_users_index)/urm_train.shape[0]*100
print("Total percentage of cold users is {:.2f}%".format(tot_cold_users))

Total percentage of cold users is 20.94%


In [7]:
target = get_data()['target_users']
counter = 0

for cold_index in cold_users_index:
    if cold_index in target:
        counter += 1

print(counter)
print("Total percentage of cold users in target is {:.2f}%".format(counter/len(target)*100))

5552
Total percentage of cold users in target is 19.75%


In [8]:
max_value = max(np.ediff1d(urm_train.indptr))
user_mask = np.where(np.ediff1d(urm_train.indptr) == max_value)
user_mask

(array([63]),)

In [9]:
import collections

user_id = 63

s_pos = urm_train.indptr[user_id]
e_pos = urm_train.indptr[user_id + 1]

user_profile = urm_train.indices[s_pos:e_pos]
num_of_interactions = len(user_profile)

features = []
for item_id in user_profile:
    s_pos_i = icm_subclass.indptr[item_id]
    e_pos_i = icm_subclass.indptr[item_id + 1]
    
    feature = icm_subclass.indices[s_pos_i:e_pos_i]
    
    features.append(feature)

features = np.squeeze(np.asarray(features))

print("Items    -> {}".format(num_of_interactions))
print("Items    -> {}".format(user_profile[:10]))
print("Features -> {}".format(features[:10]))
print("Occurr   -> {}".format(collections.Counter(features)))

Items    -> 676
Items    -> [ 66  70 114 144 146 187 189 192 194 197]
Features -> [  87   87 1460 1075 1075 1718 1718 1574 1574 1574]
Occurr   -> Counter({1718: 17, 1581: 14, 985: 12, 1202: 11, 122: 10, 294: 9, 1728: 9, 1919: 8, 496: 8, 287: 8, 270: 7, 689: 7, 364: 7, 1692: 7, 721: 7, 1574: 6, 1944: 6, 1098: 6, 1650: 6, 163: 6, 1990: 6, 1786: 6, 1075: 5, 119: 5, 1268: 5, 531: 5, 138: 5, 1766: 5, 582: 5, 1395: 4, 773: 4, 513: 4, 1155: 4, 1798: 4, 779: 4, 1068: 4, 1499: 4, 1977: 4, 1803: 4, 1607: 4, 385: 3, 1179: 3, 509: 3, 258: 3, 744: 3, 70: 3, 1201: 3, 1700: 3, 314: 3, 1078: 3, 474: 3, 602: 3, 1536: 3, 1994: 3, 113: 3, 867: 3, 1895: 3, 1495: 3, 1156: 3, 1288: 3, 1150: 3, 497: 3, 1338: 3, 1847: 3, 1853: 3, 2002: 3, 404: 3, 176: 3, 788: 3, 469: 3, 87: 2, 836: 2, 1484: 2, 1938: 2, 54: 2, 290: 2, 634: 2, 326: 2, 105: 2, 1523: 2, 1214: 2, 624: 2, 1632: 2, 1436: 2, 17: 2, 194: 2, 142: 2, 1311: 2, 1843: 2, 1004: 2, 1301: 2, 1033: 2, 1054: 2, 204: 2, 1665: 2, 1924: 2, 1058: 2, 1181: 2, 1178: 

In [17]:
from Utils.Toolkit import generate_SM_user_feature_matrix

ufm = generate_SM_user_feature_matrix(urm_train, icm_subclass)

Evaluating SM_user_feature_matrix: 100%|██████████| 30911/30911 [00:10<00:00, 2985.67it/s]


Generated UFM with shape (30911, 2011)


In [18]:
order = np.argsort(ufm[user_id].data)[::-1]
print("Top 5 features\t {}".format(ufm[user_id].indices[order][:10]))
print("With score    \t {}".format(ufm[user_id].data[order][:10]))

Top 5 features	 [1718  513  867  509 1728 1764  117 1449 1692  163]
With score    	 [9 6 4 4 4 4 3 3 3 3]


In [19]:
urm = get_data()['URM_all'].tocsr()
icm = get_data()['ICM_subclass'].tocsr()

users = []
features = []
confidence = []

# Per ogni user
for user_id in range(urm.shape[0]):
    # Prendi le features delle item con cui ha interagito e sommale
    arr = icm[urm[user_id].indices,:].sum(axis=0)
    # Converti in array
    arr = np.asarray(arr, dtype=int)[0]
    # Crea maschera dei valori diversi da 0
    conf = arr[arr != 0]
    # 
    feat = np.arange(icm.shape[1])[arr != 0]
    user = np.zeros(len(conf), dtype=int) + user_id
    
    users.extend(user)
    features.extend(feat)
    confidence.extend(conf)

In [20]:
d = {
    'user_id':users,
    'feature_id':features,
    'confidence':confidence
}

p = pd.DataFrame(d)
#p.to_csv('./data/ufm_all.csv', index = None)
p

Unnamed: 0,user_id,feature_id,confidence
0,0,509,1
1,0,662,1
2,0,1155,1
3,0,1453,1
4,0,1544,1
...,...,...,...
333164,30910,1938,2
333165,30910,1944,3
333166,30910,1987,2
333167,30910,1990,1


In [21]:
ufm_all = sps.csr_matrix((confidence,(users, features)))

from tqdm import tqdm

user = []
conf = []
items = []
features = []

urm = urm.tocsr()

for user_id in tqdm(range(ufm_all.shape[0])):
    for item_index in urm[user_id].indices:
        item_feature = icm[item_index].indices[0]
        confidence = ufm_all[user_id, item_feature]
        
        user.append(user_id)
        conf.append(confidence)
        items.append(item_index)

100%|██████████| 30911/30911 [01:04<00:00, 478.96it/s]

398636 398636 398636





In [22]:
data = {
    'user_id' : user,
    'feature_id' : items,
    'confidence' : conf
}

df = pd.DataFrame(data)
df

Unnamed: 0,user_id,feature_id,confidence
0,0,3568,1
1,0,3827,1
2,0,4844,1
3,0,5734,1
4,0,6518,1
...,...,...,...
398631,30910,18176,9
398632,30910,18185,3
398633,30910,18248,1
398634,30910,18349,3


In [None]:
#df.to_csv('./data/urm_ufm_mapped.csv', index=None)

In [23]:
urm_ufm = sps.csr_matrix((conf, (user, items)))
urm_ufm

<30911x18495 sparse matrix of type '<class 'numpy.int64'>'
	with 398636 stored elements in Compressed Sparse Row format>