# Recommender library competition 
## By Amélie Madrona & Linne Verhoeven
Link to the [kaggle competition](https://www.kaggle.com/competitions/library-recommender-competition/overview)

Goal:
* Train recommender system on training interactions data
* Generate recommendations for each user ID in the dataset. 
* Provide the top 10 recommendations of your model for each user
* Make sure that your submission file has the same format as the `sample_submission.csv` file in the Data tab (i.e. the recommended item IDs for each user are separated by a single space)

In [5]:
import importlib
import re

import numpy as np
import pandas as pd


In [6]:
# Read the training interactions and report their (rows, columns) shape.
interactions = pd.read_csv('data/interactions_train.csv')
print(interactions.shape)
# Parse column `t` (given in seconds, unit='s') into pandas datetimes.
interactions = interactions.assign(t=pd.to_datetime(interactions['t'], unit='s'))
interactions.head()

(87047, 3)


Unnamed: 0,u,i,t
0,4456,8581,2023-06-23 17:24:46
1,142,1964,2023-03-23 15:30:06
2,362,3705,2024-02-02 11:00:59
3,1809,11317,2023-01-12 14:19:22
4,4384,1323,2023-04-13 16:09:22


In [7]:
# Load the item metadata table and report its (rows, columns) shape.
items = pd.read_csv('data/items.csv')
print(items.shape)
# Preview the first rows of the metadata.
items.head()

(15291, 6)


Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3
4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4


In [8]:
# Load the sample submission to inspect the expected output format.
sample_submission = pd.read_csv('data/sample_submission.csv')
print(sample_submission.shape)
# Show the first row's recommendation entry and the Python type it is read as.
print(sample_submission['recommendation'].iloc[0])
print(type(sample_submission['recommendation'].iloc[0]))

sample_submission.head()


(7838, 2)
3758 11248 9088 9895 5101 6074 9295 14050 10961 8240
<class 'str'>


Unnamed: 0,user_id,recommendation
0,0,3758 11248 9088 9895 5101 6074 9295 14050 1096...
1,1,3263 726 1589 14911 6432 10897 6484 7961 8249 ...
2,2,13508 9848 12244 2742 11120 2893 2461 5439 116...
3,3,2821 10734 6357 5934 2085 12608 12539 10551 10...
4,4,12425 219 11602 1487 14178 489 13888 2110 4413...


In [9]:
# Distinct users come from the interactions, distinct items from the metadata table.
n_users = interactions['u'].nunique()
n_items = items['i'].nunique()
# The items are library books (Title / Author / ISBN columns), so label them as books.
print(f'Number of users = {n_users}, \nNumber of books = {n_items} \nNumber of interactions = {len(interactions)}')
# So the sample submission is the top x items for each user
# And we have info for all the books in items

Number of users = 7838, 
Number of movies = 15291 
Number of interactions = 87047


In [10]:
# TODO: EDA on the interactions data and the items metadata. 

In [11]:
# Order the interactions by user and timestamp, attach each timestamp's dense
# percentage rank within its user (pandas `rank(pct=True, method='dense')`),
# and finally renumber the rows from 0.
interactions = (
    interactions
    .sort_values(["u", "t"])
    .assign(pct_rank=lambda df: df.groupby("u")["t"].rank(pct=True, method='dense'))
    .reset_index(drop=True)
)
interactions.head(10)

Unnamed: 0,u,i,t,pct_rank
0,0,0,2023-03-30 15:44:30,0.04
1,0,1,2023-04-06 12:13:54,0.08
2,0,2,2023-04-06 17:15:08,0.12
3,0,3,2023-05-10 10:35:45,0.16
4,0,3,2023-05-10 10:35:50,0.2
5,0,4,2023-06-12 11:20:35,0.24
6,0,5,2023-06-17 14:59:04,0.28
7,0,6,2023-06-17 14:59:24,0.32
8,0,7,2023-06-17 14:59:31,0.36
9,0,8,2023-06-20 11:21:46,0.4


In [12]:
# Open question: how should cross validation be applied here?
# For now, split on pct_rank as in the lab (rank below 0.8 -> train, otherwise test);
# it is unclear whether this is what is meant by cross validation.
train_data = interactions.loc[interactions["pct_rank"].lt(0.8)]
test_data = interactions.loc[interactions["pct_rank"].ge(0.8)]

In [13]:
# Project-local helper module; it must be importable from the notebook's working
# directory (the recorded run of this cell failed with ModuleNotFoundError).
import utils

# Re-read the raw interactions and parse `t` (seconds) into datetimes.
# NOTE(review): this rebinds `interactions`, so the `pct_rank` column added in the
# earlier cell is not present on the new frame.
interactions = pd.read_csv('data/interactions_train.csv')
interactions['t'] = pd.to_datetime(interactions['t'], unit='s')

# Re-read the item metadata and recompute the distinct user / item counts.
items = pd.read_csv('data/items.csv')
n_users = interactions.u.nunique()
n_items = items.i.nunique()


# Display the reloaded interactions frame.
interactions


ModuleNotFoundError: No module named 'utils'

In [14]:
# Build the interaction matrix with the project helper.
# NOTE(review): a later cell prints its shape as (n_users, n_items); the exact
# cell values depend on utils.create_data_matrix, which is not shown here.
data_matrix = utils.create_data_matrix(interactions, n_users, n_items)
data_matrix


NameError: name 'utils' is not defined

In [15]:
# Show the first row's recommendation string from the saved user-based submission.
# NOTE(review): 'user_based.csv' is only written by the user-based prediction cell
# further down, so on a fresh top-to-bottom run this read fails with FileNotFoundError.
pd.read_csv('user_based.csv').iloc[0]['recommendation']

FileNotFoundError: [Errno 2] No such file or directory: 'user_based.csv'

In [None]:
# Toy example: a 3x3 matrix of zeros with the single entry at row 2, column 1 set to 1.
data = np.zeros(shape=(3, 3))
data[2][1] = 1
data

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.]])

In [None]:
# Imports for the collaborative-filtering part; `utils` is a project-local module.
import pandas as pd
import numpy as np
import utils
import importlib
from sklearn.metrics.pairwise import cosine_similarity
# Reload `utils` so that edits made to the module are picked up without restarting the kernel.
importlib.reload(utils)

# Read the raw interactions and parse `t` (seconds) into datetimes.
interactions = pd.read_csv('data/interactions_train.csv')
interactions['t'] = pd.to_datetime(interactions['t'], unit='s')

# Read the item metadata and count distinct users / items.
items = pd.read_csv('data/items.csv')
n_users = interactions.u.nunique()
n_items = items.i.nunique()


# Build the interaction matrix with the project helper and print its shape.
# NOTE(review): the printed shape is (n_users, n_items); how cells are filled is
# defined in utils.create_data_matrix, which is not shown here.
data_matrix = utils.create_data_matrix(interactions, n_users, n_items)
print(data_matrix.shape)

# Sanity check: top-left 10x10 corner of the matrix and the two counts.
print(data_matrix[:10, :10])
print(n_users, n_items)


(7838, 15291)
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
7838 15291


In [None]:
# First 10 interactions of user 0, ordered by item id.
interactions.loc[interactions['u'].eq(0)].sort_values('i').head(10)

Unnamed: 0,u,i,t
21035,0,0,2023-03-30 15:44:30
28842,0,1,2023-04-06 12:13:54
3958,0,2,2023-04-06 17:15:08
6371,0,3,2023-05-10 10:35:50
29592,0,3,2023-05-10 10:35:45
41220,0,4,2023-06-12 11:20:35
12217,0,5,2023-06-17 14:59:04
19703,0,6,2023-06-17 14:59:24
64522,0,7,2023-06-17 14:59:31
29380,0,8,2023-06-20 11:21:46


In [None]:
# Visualize a subset of the training and testing matrices
#utils.plot_interaction_heatmap(data_matrix, 'User-Item Interaction Matrix')

# Cosine similarity between users: pairwise similarity between the rows of
# data_matrix, giving a square matrix with one row and one column per row of the input.
user_similarity = cosine_similarity(data_matrix)
print(user_similarity)

def user_based_predict(interactions, similarity, epsilon=1e-9):
    """
    Score every user-item pair from the interactions of similar users.

    Each user's scores are the similarity-weighted sum of all users'
    interaction rows, divided by the sum of the absolute similarities of
    that user (plus epsilon).

    Parameters:
        interactions (numpy array): The user-item interaction matrix.
        similarity (numpy array): The user-user similarity matrix.
        epsilon (float): Small constant added to the denominator to avoid division by zero.
    Returns:
        numpy array: The predicted interaction scores for each user-item pair.
    """
    # Numerator: similarity-weighted sum of the interaction rows.
    weighted_scores = similarity.dot(interactions)
    # Denominator: one normaliser per user, kept as a column so it broadcasts over items.
    normaliser = np.abs(similarity).sum(axis=1, keepdims=True) + epsilon
    return weighted_scores / normaliser

# Score every (user, item) pair with the user-based model.
user_prediction = user_based_predict(data_matrix, user_similarity)

# Extract the 10 recommended items for all users.
# argsort is ascending, so the 10 best-scored items are the last 10 columns with
# the best one last; reverse them so each row lists its top item first, as a
# ranked top-10 list should.
top_10_recs = user_prediction.argsort(axis=1)[:, -10:][:, ::-1]

# One space-separated string of item indices per user (submission format).
top_10_recs_str = [' '.join(map(str, recs)) for recs in top_10_recs]
print(top_10_recs_str[0])

# Row position is used as the user id, i.e. user ids are taken to be 0..n_users-1.
pd.DataFrame(range(n_users), columns=['user_id']).assign(recommendation=top_10_recs_str).to_csv('user_based.csv', index=False)

In [None]:
# Add an empty placeholder column for cover-image URLs and keep, for each item,
# the first ISBN of 'ISBN Valid' (the text before the first ';').
items = items.assign(
    ImageLinks="",
    ISBN_1=items['ISBN Valid'].str.split(';').str.get(0),
)
items

Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i,ImageLinks,ISBN_1
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0,,9782871303336
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1,,9782278058327
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2,,2343190194
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3,,9782365350020
4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4,,9782702180815
...,...,...,...,...,...,...,...,...
15286,Le vagabond de Tokyo /,"Fukutani, Takashi, 1952-2000",9782353480111; 235348011X; 9782353480241; 2353...,Le Lézard noir,Mangas,15286,,9782353480111
15287,God of high school : le match contre les dieux /,"Park, Yong-Je",9782382880203; 2382880201; 9782382880210; 2382...,Kbooks,,15287,,9782382880203
15288,Blue Lock /,"Kaneshiro, Muneyuki",9782811650254; 2811650253; 9782811661274; 2811...,Pika,Compétitions; Football; Entraînement (sports);...,15288,,9782811650254
15289,Red eyes sword : akame ga kill ! Zero /,Takahiro,9782368522134; 2368522131; 9782368522141; 2368...,Kurokawa,Bandes dessinées; Mangas,15289,,9782368522134


In [None]:
# Add Open Library cover links to the items that do not have one yet.
# Only rows with an actual ISBN get a link: formatting a missing ISBN would
# produce a bogus ".../isbn/nan-L.jpg" URL. The assignment is done with a boolean
# mask so it does not depend on the frame having a default 0..n-1 index.
needs_link = items['ImageLinks'].eq("") & items['ISBN_1'].notna()
items.loc[needs_link, 'ImageLinks'] = (
    "https://covers.openlibrary.org/b/isbn/"
    + items.loc[needs_link, 'ISBN_1'].str.strip()
    + "-L.jpg"
)

In [16]:
# Items whose 'ISBN Valid' field is missing
nan_books = items.loc[items['ISBN Valid'].isnull()]
nan_books

Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i
261,Controverses ; et Suasoires /,"Seneca, Lucius Annaeus, l'Ancien",,Garnier frères,,261
264,Controverses ; et Suasoires /,"Seneca, Lucius Annaeus, l'Ancien",,Garnier,,264
269,"Oeuvres / Poésies de jeunesse, poésies diverse...","Baudelaire, Charles",,puis Gallimard; La Pléiade,,269
367,L'hallucination /,"Quercy, Pierre",,F Alcan,,367
393,Les droits réels dans le Code civil suisse /,"Wieland, Karl, 1864-1936",,M Giard et E Brière,droits réels--* droit civil--Suisse; Sachenrec...,393
...,...,...,...,...,...,...
15195,Code pénal suisse : [du 21 décembre 1937 (état...,,,diff OFCL; Chancellerie fédérale,Strafrecht; Strafgesetzbuch; Droit pénal; Schw...,15195
15203,La Commedia dell'Arte : storia e testo /,,,Sansoni Antiquariato,commedia dell'arte--[anthologie],15203
15232,"Payerne vracs : [ville, rues, archives, campag...",,,Editions du Caïon rodze,,15232
15245,Polnoe sobranie sochineniĭ i pisem : v tridt︠s...,"Chekhov, Anton Pavlovich",,Nauka,,15245


In [None]:
# Number of distinct books without an ISBN that appear in the interactions dataset
len(interactions.loc[interactions['i'].isin(nan_books['i']), 'i'].unique())

719

In [None]:
def get_clean_author(author_series):
    """
    Normalize raw author strings into plain names.

    Each value is cleaned as follows: text in parentheses is removed, the string
    is split on commas, parts containing a digit (e.g. "1947-") are dropped, and
    the remaining parts are joined two by two with a space, keeping their original
    order ("Cicurel, Francine, 1947-" -> "Cicurel Francine"). When only a single
    part remains (e.g. "Takahiro"), it is returned as is instead of being discarded.

    Parameters:
        author_series (pandas Series): Raw author strings; missing values are allowed.
    Returns:
        pandas Series: The cleaned author strings, with None where the input is
        missing or no usable name part remains.
    """
    # Matches a parenthesised group together with any whitespace right before it.
    parenthetical = re.compile(r'\s*\([^)]*\)')

    def clean_single_author(author):
        if pd.isna(author):
            return None
        # Step 1: Remove content in parentheses
        without_parens = parenthetical.sub('', author)
        # Step 2: Split by comma and clean whitespace
        parts = [part.strip() for part in without_parens.split(',')]
        # Step 3: Filter out parts with digits (dates such as "1947-" or "1952-2000")
        parts = [part for part in parts if not any(char.isdigit() for char in part)]
        # A single remaining part is a one-word name: keep it rather than
        # losing the author because there is nothing to pair it with.
        if len(parts) == 1:
            return parts[0] or None
        # Step 4: Join consecutive pairs with a space, in their original order;
        # an unpaired trailing part is ignored.
        grouped = [f"{first} {second}" for first, second in zip(parts[0::2], parts[1::2])]
        return ', '.join(grouped) if grouped else None

    return author_series.apply(clean_single_author)

# Store the cleaned author names in a new `author_clean` column of the items table.
items['author_clean'] = get_clean_author(items['Author'])
