Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
%%time

transactions = pd.read_parquet('../input/warmup/transactions_train.parquet')
customers = pd.read_parquet('../input/parquet-new-assignement/new_customer_one.parquet')
articles = pd.read_parquet('../input/parquet-new-assignement/new_articles_one.parquet')

# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: user 1.37 s, sys: 1.32 s, total: 2.69 s
Wall time: 5.51 s


In [5]:
#adding of the column week to the dataframe transactions
transactions.t_dat = pd.to_datetime(transactions.t_dat, format='%Y-%m-%d')
transactions['week'] = 104 - (transactions.t_dat.max() - transactions.t_dat).dt.days // 7

In [6]:
#definition of the week for test
test_week = transactions.week.max() + 1 
#transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [7]:
'''deciding to work only with data of the months of September since probably articles
bought in this period could be the same and then the reccomendation could be more 
related also to the period in which we need to give them'''

#filtering only the weeks of the month of semptember for the years 2018-2020
filtered_transactions = transactions[(transactions['t_dat'].dt.year == 2020)]

#groupby of candidates based on the customer_id and extraction of weeks
c2weeks = filtered_transactions.groupby('customer_id')['week'].unique()

#creation of a dictionary for shifted weeks
c2weeks2shifted_weeks = {}
candidates_last_purchase_2020 = filtered_transactions.copy()

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

#update of the dataframe candidates_last_purchase_2020
weeks = []
for i, (c_id, week) in enumerate(zip(filtered_transactions['customer_id'], filtered_transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase_2020.week = weeks

### candidates

In [8]:
#creation of the mean price of purchases for each candidate only in the filtered transactions
mean_price_purchases = filtered_transactions.groupby('customer_id')['price'].mean().rename('mean price purchases').reset_index()
mean_price_purchases

Unnamed: 0,customer_id,mean price purchases
0,4245900472157,0.021169
1,28847241659200,0.031091
2,41046458195168,0.016932
3,41318098387474,0.020322
4,46878247658203,0.016932
...,...,...
862719,18446624797007271432,0.013119
862720,18446630855572834764,0.045181
862721,18446662237889060501,0.057186
862722,18446705133201055310,0.050831


In [9]:
#identification of the gender group that is the most present in a specific geographic area
most_present_gender = customers.groupby(['customer_id', 'postal_code', 'numerical_gender']).size().reset_index(name='count')
idx = most_present_gender.groupby(['customer_id', 'postal_code'])['count'].transform(max) == most_present_gender['count']
most_present_gender = most_present_gender[idx][['customer_id', 'postal_code', 'numerical_gender']]
most_present_gender = most_present_gender.rename(columns = {'numerical_gender': 'most_present_gender_in_area'})
most_present_gender.drop(columns = 'postal_code', inplace = True)
most_present_gender

Unnamed: 0,customer_id,most_present_gender_in_area
0,4245900472157,2
1,23962613628581,0
2,25398598941468,2
3,28847241659200,2
4,41046458195168,2
...,...,...
1371975,18446630855572834764,0
1371976,18446662237889060501,2
1371977,18446705133201055310,0
1371978,18446723086055369602,2


In [10]:
#identification of the medium age for geographic area
median_age = customers.groupby(['customer_id', 'postal_code'])['age'].median().reset_index(name = 'median_age_area')
median_age.drop(columns = 'postal_code', inplace = True)
median_age

Unnamed: 0,customer_id,median_age_area
0,4245900472157,21.0
1,23962613628581,34.0
2,25398598941468,21.0
3,28847241659200,21.0
4,41046458195168,18.0
...,...,...
1371975,18446630855572834764,33.0
1371976,18446662237889060501,75.0
1371977,18446705133201055310,60.0
1371978,18446723086055369602,33.0


In [11]:
candidates_definitive = pd.merge(candidates_last_purchase_2020, mean_price_purchases, on = 'customer_id') #merge of values of mean fo the purchases
candidates_definitive = pd.merge(candidates_definitive, most_present_gender, on = 'customer_id') #merge of values of the dominant gender in teh geographic area of the customer
candidates_definitive = pd.merge(candidates_definitive, median_age, on = 'customer_id') #merge of values of the dominant gender in teh geographic area of the customer

candidates_definitive.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'week', 'mean price purchases', 'most_present_gender_in_area',
       'median_age_area'],
      dtype='object')

In [12]:
candidates_definitive = candidates_definitive.drop_duplicates(subset = 'customer_id')

In [13]:
#dataset about candidates in the analyzed period
candidates_definitive.reset_index(drop = True, inplace = True)
candidates_definitive

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,mean price purchases,most_present_gender_in_area,median_age_area
0,2020-01-01,4195624216542755,799417004,0.030492,2,68,0.023881,2,56.0
1,2020-01-01,10296145678877316,682771005,0.015237,2,68,0.019812,2,21.0
2,2020-01-01,18673653377351162,775077001,0.015237,2,69,0.028413,2,28.0
3,2020-01-01,25713010999983855,687948001,0.042356,2,71,0.026250,2,22.0
4,2020-01-01,29659536698466345,777148005,0.050831,2,69,0.052631,0,60.0
...,...,...,...,...,...,...,...,...,...
862719,2020-09-22,18409112236320621348,860833002,0.025407,2,105,0.029644,2,21.0
862720,2020-09-22,18417769707947924979,729860001,0.022017,2,105,0.019475,2,40.0
862721,2020-09-22,18418054986721795659,873279003,0.042356,2,105,0.042356,0,28.0
862722,2020-09-22,18421175435799911749,863583001,0.033881,2,105,0.033881,2,28.0


In [14]:
#test set about transactions
test_set_transactions = filtered_transactions.drop_duplicates('customer_id')
test_set_transactions.reset_index(drop = True, inplace = True)
test_set_transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
0,2020-01-01,4195624216542755,799417004,0.030492,2,67
1,2020-01-01,10296145678877316,682771005,0.015237,2,67
2,2020-01-01,18673653377351162,775077001,0.015237,2,67
3,2020-01-01,25713010999983855,687948001,0.042356,2,67
4,2020-01-01,29659536698466345,777148005,0.050831,2,67
...,...,...,...,...,...,...
862719,2020-09-22,18409112236320621348,860833002,0.025407,2,104
862720,2020-09-22,18417769707947924979,729860001,0.022017,2,104
862721,2020-09-22,18418054986721795659,873279003,0.042356,2,104
862722,2020-09-22,18421175435799911749,863583001,0.033881,2,104


In [15]:
#bestsellers for 2020
mean_price = filtered_transactions \
    .groupby(['week', 'article_id'])['price'].mean()

sales = filtered_transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(50).rename('bestseller_rank').astype('int8')

bestsellers_2020 = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()

# Combining transactions and candidates / negative examples

##### Images to create negative candidates

In [16]:
#print(bestsellers_2020[bestsellers_2020['bestseller_rank'] == 1]) #first bestseller for each considered week

In [17]:
top_5_bestsellers_article_ref = bestsellers_2020['article_id'][bestsellers_2020['bestseller_rank'].between(1, 5)]
top_5_bestsellers_article_ref = top_5_bestsellers_article_ref.reset_index(drop = True)
#top_5_bestsellers_article_ref = pd.DataFrame(top_5_bestsellers_article_ref)
top_5_bestsellers_article_ref

0      736870005
1      746775001
2      720125001
3      831211001
4      706016001
         ...    
186    924243001
187    924243002
188    918522001
189    923758001
190    866731001
Name: article_id, Length: 191, dtype: int64

In [18]:
top_5_bestsellers_article_ref = article_id_int_to_str(top_5_bestsellers_article_ref)
top_5_bestsellers_article_ref

0      0736870005
1      0746775001
2      0720125001
3      0831211001
4      0706016001
          ...    
186    0924243001
187    0924243002
188    0918522001
189    0923758001
190    0866731001
Name: article_id, Length: 191, dtype: object

In [19]:
import os 

directory = '../input/h-and-m-personalized-fashion-recommendations/images'
found_images_5bestsellers = []

for root, dirs, files in os.walk(directory):
    for file in files:
        article_id = file.split('.')[0].strip().lower()
        if article_id in [x.strip().lower() for x in top_5_bestsellers_article_ref]:
            found_images_5bestsellers.append(article_id)

In [20]:
bestsellers_2020_str = article_id_int_to_str(bestsellers_2020['article_id'])
bestsellers_2020_str

0       0736870005
1       0746775001
2       0720125001
3       0831211001
4       0706016001
           ...    
1895    0911699002
1896    0923340001
1897    0904571001
1898    0896169002
1899    0881942001
Name: article_id, Length: 1900, dtype: object

In [21]:
matching_article_ids_2020 = []

for root, dirs, files in os.walk(directory):
    for file in files:
        article_id = file.split('.')[0].strip().lower()
        if article_id in [x.strip().lower() for x in bestsellers_2020_str]:
            matching_article_ids_2020.append(article_id)

In [22]:
'''directory = '/kaggle/input/h-and-m-personalized-fashion-recommendations/images'

image_count = 0

for root, dirs, files in os.walk(directory):
    for file in files:
        # Check if the file is an image (you can customize the extension list)
        if file.lower().endswith(('.jpg')):
            image_count += 1

print(f"Total number of images in {directory} and its subdirectories: {image_count}")'''

'directory = \'/kaggle/input/h-and-m-personalized-fashion-recommendations/images\'\n\nimage_count = 0\n\nfor root, dirs, files in os.walk(directory):\n    for file in files:\n        # Check if the file is an image (you can customize the extension list)\n        if file.lower().endswith((\'.jpg\')):\n            image_count += 1\n\nprint(f"Total number of images in {directory} and its subdirectories: {image_count}")'

In [23]:
#unique elements of bestsellers_2020_str and top_5_bestsellers_article_ref
unique_article_count_bestsellers_2020 = len(set(bestsellers_2020_str))
unique_article_count_5_bestsellers = len(set(top_5_bestsellers_article_ref))

print(f"Total number of unique articles in bestsellers_2020_str: {unique_article_count_bestsellers_2020}")
print(f"Total number of unique articles in top_5_bestsellers_article_ref: {unique_article_count_5_bestsellers}")

Total number of unique articles in bestsellers_2020_str: 792
Total number of unique articles in top_5_bestsellers_article_ref: 111


In [24]:
#remove of duplicate elements from the arrays
found_images_5bestsellers = set(found_images_5bestsellers)
matching_article_ids_2020 = set(matching_article_ids_2020)

In [25]:
#keeping of only the elements inside my dataset bestsellers_2020 with an image that is similar to the other images of the bestseller articles

import numpy as np
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image
from sklearn.metrics.pairwise import cosine_similarity

model = VGG16(weights = 'imagenet', include_top = False)

def extract_features(image_path):
    img = image.load_img(image_path, target_size = (224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis = 0)
    img_array = preprocess_input(img_array)
    features = model.predict(img_array)
    return features.flatten()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [26]:
#code to delete the images with low similarity 
'''#find the images into the subfolders
def find_images(image_filename, base_folder='/kaggle/input/h-and-m-personalized-fashion-recommendations/images'):
    for root, dirs, files in os.walk(base_folder):
        if image_filename in files:
            return os.path.join(root, image_filename)
    return None

#remove images with low similarity
def remove_similar_images(df, threshold = 0.5):
    for image_filename_i in matching_article_ids_2020:
        for image_filename_j in matching_article_ids_2020:
            if image_filename_i != image_filename_j:
                img1_path = find_images(image_filename_i)
                img2_path = find_images(image_filename_j)

                if img1_path is not None and img2_path is not None:
                    features1 = extract_features(img1_path)
                    features2 = extract_features(img2_path)

                    similarity = cosine_similarity([features1], [features2])[0][0]

                    if similarity < threshold:
                        print(f"Removing images: {image_filename_i} and {image_filename_j}")
                        # Assuming 'Image' is the column containing image filenames
                        indices_to_remove = df[df['article_id'].isin([image_filename_i, image_filename_j])].index
                        df.drop(indices_to_remove, inplace=True)
    return df

bestsellers_2020_def = remove_similar_images(bestsellers_2020['article_id'])'''

'#find the images into the subfolders\ndef find_images(image_filename, base_folder=\'/kaggle/input/h-and-m-personalized-fashion-recommendations/images\'):\n    for root, dirs, files in os.walk(base_folder):\n        if image_filename in files:\n            return os.path.join(root, image_filename)\n    return None\n\n#remove images with low similarity\ndef remove_similar_images(df, threshold = 0.5):\n    for image_filename_i in matching_article_ids_2020:\n        for image_filename_j in matching_article_ids_2020:\n            if image_filename_i != image_filename_j:\n                img1_path = find_images(image_filename_i)\n                img2_path = find_images(image_filename_j)\n\n                if img1_path is not None and img2_path is not None:\n                    features1 = extract_features(img1_path)\n                    features2 = extract_features(img2_path)\n\n                    similarity = cosine_similarity([features1], [features2])[0][0]\n\n                    if simi

In [28]:
#code to add the column similarity_value to my dataframe

# Find images in subfolders
'''def find_images(image_filename, base_folder='/kaggle/input/h-and-m-personalized-fashion-recommendations/images'):
    for root, dirs, files in os.walk(base_folder):
        if image_filename in files:
            return os.path.join(root, image_filename)
    return None

def calculate_similarity(df):
    similarity_values = [] 

    for image_filename_i in matching_article_ids_2020:
        for image_filename_j in matching_article_ids_2020:
            if image_filename_i != image_filename_j:
                img1_path = find_images(image_filename_i)
                img2_path = find_images(image_filename_j)

                if img1_path is not None and img2_path is not None:
                    features1 = extract_features(img1_path)
                    features2 = extract_features(img2_path)

                    similarity = cosine_similarity([features1], [features2])[0][0]

                    similarity_values.append(similarity)

    df['similarity_value'] = similarity_values
    return df

bestsellers_2020_def = calculate_similarity(bestsellers_2020)'''

KeyboardInterrupt: 

In [None]:
#bestsellers_2020_def.columns

##### Creation of the dataset to use

In [None]:
filtered_transactions.loc[:, 'purchased'] = 1

In [None]:
data = pd.concat([filtered_transactions, candidates_last_purchase_2020])
data = pd.merge(data, bestsellers_2020[['article_id', 'bestseller_rank']], on = 'article_id', how = 'left')
data.fillna(0, inplace = True)

In [None]:
#adding candidates info
data = pd.merge(data, candidates_definitive[['customer_id', 'mean price purchases', 'most_present_gender_in_area', 'median_age_area']], on = 'customer_id', how = 'left')

In [None]:
#adding initial datasets to my new dataset
data = pd.merge(data, articles, on = 'article_id', how = 'left')
data = pd.merge(data, customers, on = 'customer_id', how = 'left')

In [None]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [None]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [None]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [None]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code', 'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'bestseller_rank', 'mean price purchases', 'most_present_gender_in_area', 'median_age_area']

In [None]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

# Model training

In [None]:
from lightgbm.sklearn import LGBMRanker

In [None]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [None]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group = train_baskets,
)

In [None]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

# Calculate predictions

In [None]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_2020_all = \
    bestsellers_2020[bestsellers_2020.week == bestsellers_2020.week.max()]['article_id'].tolist()

# Create submission

In [None]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [None]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_2020_all
    preds.append(pred[:50])

In [None]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [None]:
sub_name = 'submission'
sub.to_csv(f'{sub_name}.csv', index=False)

# Evaluation of the model V1

In [None]:
test.columns

In [None]:
clothes_purchased = []
filtered_transactions = filtered_transactions.reset_index(drop = True)

for index, purchase in enumerate(filtered_transactions['purchased']):
    if purchase == 1:
        clothes_purchased.append(filtered_transactions.at[index, 'article_id'])

In [None]:
#value of k to calculate precision, recall
k_value = [10, 20, 50, 100]

In [None]:
#precision at K 
def precision_at_k(clothes_purchased, preds, k):
    predicted_at_k = preds[:k]
    relevant_at_k = clothes_purchased and predicted_at_k
    precision = len(relevant_at_k) / k
    return precision


#precision = precision_at_k(clothes_purchased, preds, k_value)
#print(f'Precision at {k_value}: {precision}')

In [None]:
#recall at k
def recall_at_k(clothes_purchased, preds, k):
    predicted_at_k = preds[:k]
    relevant_at_k = clothes_purchased and predicted_at_k
    recall_at_k = len(relevant_at_k) / len(clothes_purchased)
    return recall_at_k

#recall = recall_at_k(clothes_purchased, preds, k_value)
#print(f'Recall at {k_value}: {recall}')

In [None]:
precisions = []
recalls = []

for k in k_value:
    precision = precision_at_k(clothes_purchased, preds, k)
    recall = recall_at_k(clothes_purchased, preds, k)
    precisions.append(precision)
    recalls.append(recall)
    print(f'Precision at {k}: {precision:.4f}, Recall at {k}: {recall:.10f}')

In [None]:
#graph about recall

plt.plot(k_value, recalls, label = 'Recall')
plt.xlabel('K')
plt.ylabel('Value')
plt.title('Recall at K')
plt.legend()
plt.show()

In [None]:
#graph about precision

plt.plot(k_value, precisions, label = 'Precision')
plt.xlabel('K')
plt.ylabel('Value')
plt.title('Precision at K')
plt.legend()
plt.show()

# Evaluation of the model V2

In [None]:
!pip install recmetrics

In [None]:
#reference: https://github.com/statisticianinstilettos/recmetrics

from recmetrics import metrics

#precision
recommender_precision = metrics.recommender_precision(predicted = preds, actual = test_X)
print('Value of precision:', recommender_precision)

#recall
recommender_recall = metrics.recommender_recall(predicted = preds, actual = test_X)
print('Value of recall:', recommender_recall)

#MAP --> not working: isn't finding the function
#recommender_mapk = metrics.mapk(predicted = preds, actual = test_X, k = 10)
#print('Value of MAP@K:', recommender_mapk)

#MARK
reccomender_mark = metrics.mark(predicted = preds, actual = test_X, k = 100)
print('Value of MAR@K:', reccomender_mark)

#personalization --> reccomendation similarity across users: high = personalization are different and good; low = personalization are similar and not good
personalization_score = metrics.personalization(predicted = preds[:20000])
print('Value of personalization:', personalization_score)

# Evaluation of the model V3 --> not working

In [None]:
#!pip install recpack

In [None]:
'''%cd '../input/recpack/recpack-master'
!python setup.py install'''

In [None]:
'''from recpack.metrics.precision import precision_k
import numpy as np

y_true_array = test_X.to_numpy()

preds_array = np.array(preds)

nonzero_users = list(set(y_true_array.nonzero()[0]))

nonzero_users_indices = np.array(nonzero_users)

y_true_filtered = y_true_array[nonzero_users_indices, :]

if np.any(nonzero_users_indices >= preds_array.shape[0]):
    print("Error: Some indices are out of bounds.")
else:
    preds_filtered = np.array([preds_array[user_idx] for user_idx in nonzero_users_indices])
    precision = precision_k(y_true=y_true_filtered, y_pred=preds_filtered, k=10)
    print(precision)'''