In [2]:
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz
from implicit.als import AlternatingLeastSquares
from scipy.sparse import load_npz
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import json



In [3]:
# Load the raw transactions. The path is relative to the project root, like the
# other dataset paths used in this notebook, so the notebook is not tied to one
# machine's home directory.
transaction = pd.read_csv("dataset/datas/transactions_train.csv")


In [4]:
# Preview the first rows to check which columns were loaded.
transaction.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [5]:
def get_interaction_matrix(transaction):
	"""Build a customer x article interaction-count matrix from transaction rows.

	Parameters
	----------
	transaction : pandas.DataFrame
		Must contain the columns ``customer_id`` and ``article_id``; every row
		counts as one interaction between that customer and that article.

	Returns
	-------
	dict
		``"interaction_matrix"``: ``scipy.sparse.csr_matrix`` whose entry
		``[u, i]`` is the number of rows pairing customer ``u`` with article ``i``.
		``"user_id_map"``: dict mapping matrix row index -> original ``customer_id``.
		``"item_id_map"``: dict mapping matrix column index -> original ``article_id``.
	"""
	# Step 1: one row per (customer, article) pair with its number of occurrences.
	interaction_counts = (
		transaction.groupby(['customer_id', 'article_id'])
		.size()
		.reset_index(name='interaction_count')
	)

	# Step 2: encode ids as contiguous integers. sort=True assigns indices in
	# sorted order of the distinct ids, so the numbering only depends on the
	# set of ids, not on row order.
	user_idx, user_labels = pd.factorize(interaction_counts['customer_id'], sort=True)
	item_idx, item_labels = pd.factorize(interaction_counts['article_id'], sort=True)

	# Step 3: create the sparse matrix. The shape is passed explicitly so the
	# dimensions always agree with the two id maps returned below.
	interaction_matrix = csr_matrix(
		(interaction_counts['interaction_count'].to_numpy(), (user_idx, item_idx)),
		shape=(len(user_labels), len(item_labels)),
	)

	# The distinct labels already are the index -> id mapping; tolist() yields
	# plain Python scalars, which keeps the maps JSON-serialisable.
	user_id_map = dict(enumerate(user_labels.tolist()))
	item_id_map = dict(enumerate(item_labels.tolist()))

	return {
		"interaction_matrix": interaction_matrix,
		"user_id_map": user_id_map,
		"item_id_map": item_id_map,
	}

def save_interaction_matrix(filepath, interaction_matrix):
	"""Write ``interaction_matrix`` to ``filepath`` using ``scipy.sparse.save_npz``."""
	save_npz(file=filepath, matrix=interaction_matrix)

def json_save(filepath, id_map):
	"""Dump ``id_map`` to ``filepath`` as JSON, with every key converted to ``str``.

	The values are written unchanged, so they must be JSON-serialisable.
	"""
	serialisable = {}
	for key, value in id_map.items():
		serialisable[str(key)] = value

	with open(filepath, "w") as handle:
		json.dump(serialisable, handle)

In [6]:
def leave_one_out_split(df):
	"""Hold out each customer's last transaction as a test row.

	Rows are sorted by ``customer_id`` and then ``t_dat``; within each customer
	the final row of that ordering goes to the test set and every other row
	goes to the training set.

	Parameters
	----------
	df : pandas.DataFrame
		Must contain the columns ``customer_id`` and ``t_dat``.

	Returns
	-------
	(train_rows, test_rows) : tuple of pandas.DataFrame
		Both keep the sorted order and the original index labels. A customer
		with a single row appears only in ``test_rows``. Rows whose
		``customer_id`` is missing are never selected as test rows.
	"""
	ordered = df.sort_values(by=['customer_id', 't_dat'])
	customer = ordered['customer_id']

	# Last occurrence of each customer in the sorted frame. The mask is applied
	# by position, so the split stays correct even when index labels repeat
	# (dropping by label would remove every row sharing a held-out label).
	is_test = (~customer.duplicated(keep='last') & customer.notna()).to_numpy()

	test_rows = ordered[is_test]
	train_rows = ordered[~is_test]
	return train_rows, test_rows

In [7]:
# Hold out each customer's last transaction (ordered by t_dat) as the test set; all other rows are training data.
train_rows, test_rows = leave_one_out_split(transaction)

In [8]:
# Preview the training rows; the split returns them sorted by customer_id, then t_dat.
train_rows.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
4212358,2018-12-27,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,625548001,0.044051,1
4212359,2018-12-27,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,176209023,0.035576,1
4212360,2018-12-27,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,627759010,0.030492,1
9663224,2019-05-02,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,697138006,0.010153,2
10754876,2019-05-25,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006,0.050831,2


In [9]:
# Build the sparse interaction matrix and the index -> id maps from the training rows only.
train_interaction_matrix = get_interaction_matrix(train_rows)

In [10]:
# Unpack the three artefacts returned by get_interaction_matrix.
interaction_matrix, user_id_map, item_id_map = (
    train_interaction_matrix[key]
    for key in ("interaction_matrix", "user_id_map", "item_id_map")
)

In [11]:
# Sanity-check the type of each artefact, in the same order as before.
for artefact in (interaction_matrix, user_id_map, item_id_map):
    print(type(artefact))

<class 'scipy.sparse._csr.csr_matrix'>
<class 'dict'>
<class 'dict'>


##### Checking whether the sparsity has changed

In [12]:
# Sparsity = 1 - (stored entries / total customer x article cells).
num_users, num_items = interaction_matrix.shape
num_actual_interactions = interaction_matrix.nnz
num_possible_interactions = num_users * num_items
fill_ratio = num_actual_interactions / num_possible_interactions
sparsity = 1 - fill_ratio

print(f"Sparsity: {sparsity:.4f} ({sparsity*100:.2f}%)")

Sparsity: 0.9998 (99.98%)


In [13]:
# Persist the training matrix, then both index -> id maps, to the shared data directory.
save_interaction_matrix('dataset/collaborative-filtering-data/interaction_matrix_loo.npz', interaction_matrix)
for map_path, mapping in (
    ('dataset/collaborative-filtering-data/user_id_map_loo.json', user_id_map),
    ('dataset/collaborative-filtering-data/item_id_map_loo.json', item_id_map),
):
    json_save(map_path, mapping)

#### Now let's train our model on the generated interaction matrix and see how it performs

In [14]:
# Reload the saved artefacts. json_save wrote the integer indices as string
# keys, so they are converted back to int here.
interaction_matrix = load_npz("dataset/collaborative-filtering-data/interaction_matrix_loo.npz")

with open("dataset/collaborative-filtering-data/user_id_map_loo.json") as f:
	user_id_map = dict((int(key), value) for key, value in json.load(f).items())

with open("dataset/collaborative-filtering-data/item_id_map_loo.json") as f:
	item_id_map = dict((int(key), value) for key, value in json.load(f).items())

In [15]:
# Seed the random initialisation of the factors so that re-running this cell
# starts from the same state and the hit rate computed further down can be
# compared between runs.
model = AlternatingLeastSquares(factors=64, regularization=0.05, alpha=2.0, random_state=42)
model.fit(interaction_matrix)

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

In [16]:
# Ask for 10 recommendations for the customer at matrix row 0,
# with filter_already_liked_items enabled.
userid = 0
print(userid)
ids, scores = model.recommend(
    userid,
    interaction_matrix[userid],
    N=10,
    filter_already_liked_items=True,
)
print('ids:', ids)
print('scores:', scores)

0
ids: [15911 15927 15912 16965 58312 16966 42415  6609 18498 66840]
scores: [0.4132275  0.31898493 0.26439756 0.18125413 0.16386145 0.15594429
 0.15402187 0.13101044 0.11816773 0.09811558]


In [17]:
def get_reindex_id(id_map, original_id):
	"""Return the first key of ``id_map`` whose value equals ``original_id``.

	Parameters
	----------
	id_map : dict
		Mapping of index -> original id.
	original_id
		The original id to look up.

	Raises
	------
	IndexError
		If no value in ``id_map`` equals ``original_id``.

	Note: this is a linear scan of ``id_map``; for many lookups against the
	same map, invert the map once and use a dict lookup instead.
	"""
	# Stop at the first match instead of collecting every match first.
	for index_id, mapped_id in id_map.items():
		if mapped_id == original_id:
			return index_id
	raise IndexError(f"{original_id!r} not found in id_map")


In [18]:
def hit_rate_at_k(model, train_matrix, test_rows, user_id_map, item_id_map, K=10):
	"""Fraction of test rows whose held-out article is among the top-K recommendations.

	Parameters
	----------
	model
		Object exposing ``recommend(uid, user_row, N=..., filter_already_liked_items=...)``
		and returning ``(item_indices, scores)``.
	train_matrix
		Indexable by user index; ``train_matrix[uid]`` is passed to ``model.recommend``.
	test_rows : pandas.DataFrame
		Must contain the columns ``customer_id`` and ``article_id``.
	user_id_map, item_id_map : dict
		Mappings of index -> original id.
	K : int
		Number of recommendations requested per test row.

	Returns
	-------
	float
		``hits / evaluated rows``; ``0.0`` when no row could be evaluated.

	Rows whose customer or article has no index in the maps are logged and
	skipped. Errors raised by ``model.recommend`` or by indexing
	``train_matrix`` are not swallowed.

	NOTE(review): recommendations are requested with
	``filter_already_liked_items=True``; if the held-out article also appears
	in the user's training row it presumably can never be recommended and is
	counted as a miss — confirm this is intended.
	"""
	# Invert both maps once so each row costs two dict lookups instead of two
	# full scans. setdefault keeps the first index if an original id repeats.
	user_index = {}
	for index_id, original_id in user_id_map.items():
		user_index.setdefault(original_id, index_id)
	item_index = {}
	for index_id, original_id in item_id_map.items():
		item_index.setdefault(original_id, index_id)

	hits = 0
	total = 0
	total_len = len(test_rows)

	# Iterate the two needed columns directly rather than building a Series per row.
	id_pairs = zip(test_rows["customer_id"], test_rows["article_id"])
	for idx, (customer_id, article_id) in enumerate(id_pairs):
		uid = user_index.get(customer_id)
		iid = item_index.get(article_id)

		if uid is None or iid is None:
			print("LOG: customer_id or item_id not in train ds, skipping this row..")
		else:
			recommended, _scores = model.recommend(uid, train_matrix[uid], N=K, filter_already_liked_items=True)
			if iid in list(recommended):
				hits += 1
			total += 1

		if idx % 100 == 0:
			print(f'LOG: {idx} / {total_len}')

	# Nothing evaluated (empty or fully unseen test set): avoid dividing by zero.
	if total == 0:
		return 0.0
	return hits / total

In [29]:

def get_reindex_ids_from_df(df, column_name, id_map):
    """Translate the original ids in ``df[column_name]`` into their indices.

    ``id_map`` maps index -> original id. Returns ``(index_ids, not_found)``:
    the indices of the ids that are present in ``id_map`` (in column order)
    and the original ids that are not.
    """
    # Inverted map: original id -> index.
    lookup = dict((original_id, index_id) for index_id, original_id in id_map.items())

    found, missing = [], []
    for original_id in df[column_name]:
        try:
            found.append(lookup[original_id])
        except KeyError:
            missing.append(original_id)

    return found, missing

def drop_missing_ids(df, column_name, id_map):
    """Return a copy of ``df`` restricted to rows whose ``column_name`` value is a value of ``id_map``.

    ``id_map`` maps index -> original id; ``df`` itself is not modified.
    """
    # Distinct original ids known to the map.
    known_ids = list(dict.fromkeys(id_map.values()))
    is_known = df[column_name].isin(known_ids)
    return df.loc[is_known].copy()

In [34]:
# Keep only test rows whose customer and article both appear in the training id maps.
test_whitout_unseen = (
    test_rows
    .pipe(drop_missing_ids, "customer_id", user_id_map)
    .pipe(drop_missing_ids, "article_id", item_id_map)
)

In [35]:
# Number of held-out rows before filtering.
len(test_rows)

1362281

In [36]:
# Number of held-out rows left after dropping customers/articles missing from the training maps.
len(test_whitout_unseen)

1230430

In [40]:
# Evaluate on the first 5000 filtered test rows rather than the full filtered test set.
micro_test_whitout_unseen = test_whitout_unseen.iloc[:5000]

In [41]:
# Hit rate on the 5000-row sample, using hit_rate_at_k's default K=10.
score = hit_rate_at_k(model, interaction_matrix, micro_test_whitout_unseen, user_id_map, item_id_map)

LOG: 0 / 5000
LOG: 100 / 5000
LOG: 200 / 5000
LOG: 300 / 5000
LOG: 400 / 5000
LOG: 500 / 5000
LOG: 600 / 5000
LOG: 700 / 5000
LOG: 800 / 5000
LOG: 900 / 5000
LOG: 1000 / 5000
LOG: 1100 / 5000
LOG: 1200 / 5000
LOG: 1300 / 5000
LOG: 1400 / 5000
LOG: 1500 / 5000
LOG: 1600 / 5000
LOG: 1700 / 5000
LOG: 1800 / 5000
LOG: 1900 / 5000
LOG: 2000 / 5000
LOG: 2100 / 5000
LOG: 2200 / 5000
LOG: 2300 / 5000
LOG: 2400 / 5000
LOG: 2500 / 5000
LOG: 2600 / 5000
LOG: 2700 / 5000
LOG: 2800 / 5000
LOG: 2900 / 5000
LOG: 3000 / 5000
LOG: 3100 / 5000
LOG: 3200 / 5000
LOG: 3300 / 5000
LOG: 3400 / 5000
LOG: 3500 / 5000
LOG: 3600 / 5000
LOG: 3700 / 5000
LOG: 3800 / 5000
LOG: 3900 / 5000
LOG: 4000 / 5000
LOG: 4100 / 5000
LOG: 4200 / 5000
LOG: 4300 / 5000
LOG: 4400 / 5000
LOG: 4500 / 5000
LOG: 4600 / 5000
LOG: 4700 / 5000
LOG: 4800 / 5000
LOG: 4900 / 5000


In [42]:
# Fraction of evaluated test rows whose held-out article was among the recommendations.
print(score)

0.0384
