# Recommender library competition 
## By Amélie Madrona & Linne Verhoeven
Link to the [kaggle competition](https://www.kaggle.com/competitions/library-recommender-competition/overview)

Goal:
* Train recommender system on training interactions data
* Generate recommendations for each user ID in the dataset. 
* Provide the top 10 recommendations of your model for each user
* Make sure that your submission file has the same format as the sample_submission.csv file in the Data tab (i.e. separated by a space)

In [7]:
import pandas as pd
import numpy as np
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/Users/amelie/Documents/GitHub/ML/utils.py'>

In [8]:
interactions = pd.read_csv('data/interactions_train.csv')
print(interactions.shape)
interactions['t'] = pd.to_datetime(interactions['t'], unit='s')
interactions.head()

(87047, 3)


Unnamed: 0,u,i,t
0,4456,8581,2023-06-23 17:24:46
1,142,1964,2023-03-23 15:30:06
2,362,3705,2024-02-02 11:00:59
3,1809,11317,2023-01-12 14:19:22
4,4384,1323,2023-04-13 16:09:22


In [114]:
items = pd.read_csv('data/items.csv')
print(items.shape)
items.head()

(15291, 6)


Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3
4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4


In [10]:
sample_submission = pd.read_csv('data/sample_submission.csv')
print(sample_submission.shape)
print(sample_submission.iloc[0]['recommendation'])
print(type(sample_submission.iloc[0]['recommendation']))

sample_submission.head()


(7838, 2)
3758 11248 9088 9895 5101 6074 9295 14050 10961 8240
<class 'str'>


Unnamed: 0,user_id,recommendation
0,0,3758 11248 9088 9895 5101 6074 9295 14050 1096...
1,1,3263 726 1589 14911 6432 10897 6484 7961 8249 ...
2,2,13508 9848 12244 2742 11120 2893 2461 5439 116...
3,3,2821 10734 6357 5934 2085 12608 12539 10551 10...
4,4,12425 219 11602 1487 14178 489 13888 2110 4413...


In [11]:
n_users = interactions.u.nunique()
n_items = items.i.nunique()
print(f'Number of users = {n_users}, \nNumber of movies = {n_items} \nNumber of interactions = {len(interactions)}')
# So the sample submission is the top x items for each user
# And we have info for all the books in items

Number of users = 7838, 
Number of movies = 15291 
Number of interactions = 87047


In [67]:
# TODO: EDA on the interactions data and the items metadata. 

In [68]:
# Let's first sort the interactions by user and time stamp
interactions = interactions.sort_values(["u", "t"])
# Next we can use the percentage rank from pandas to get a proportional ranking of the timestamps for each user.
interactions["pct_rank"] = interactions.groupby("u")["t"].rank(pct=True, method='dense')
interactions.reset_index(inplace=True, drop=True)
interactions.head(10)

Unnamed: 0,u,i,t,pct_rank
0,0,0,2023-03-30 15:44:30,0.04
1,0,1,2023-04-06 12:13:54,0.08
2,0,2,2023-04-06 17:15:08,0.12
3,0,3,2023-05-10 10:35:45,0.16
4,0,3,2023-05-10 10:35:50,0.2
5,0,4,2023-06-12 11:20:35,0.24
6,0,5,2023-06-17 14:59:04,0.28
7,0,6,2023-06-17 14:59:24,0.32
8,0,7,2023-06-17 14:59:31,0.36
9,0,8,2023-06-20 11:21:46,0.4


In [69]:
# Not quite sure how we're gonna use cross validation here
# Could split the data into train and test based on the pct_rank like in the lab, but not sure that this is what is meant by cross validation
train_data = interactions[interactions["pct_rank"] < 0.8]
test_data = interactions[interactions["pct_rank"] >= 0.8]

In [None]:
import utils

interactions = pd.read_csv('data/interactions_train.csv')
interactions['t'] = pd.to_datetime(interactions['t'], unit='s')

items = pd.read_csv('data/items.csv')
n_users = interactions.u.nunique()
n_items = items.i.nunique()


interactions


Unnamed: 0,u,i,t,i_consecutive
0,4456,8581,2023-06-23 17:24:46,8509
1,142,1964,2023-03-23 15:30:06,1944
2,362,3705,2024-02-02 11:00:59,3677
3,1809,11317,2023-01-12 14:19:22,11216
4,4384,1323,2023-04-13 16:09:22,1306
...,...,...,...,...
87042,924,8171,2023-11-06 15:15:47,8103
87043,1106,9009,2023-11-13 10:48:13,8931
87044,5207,13400,2023-05-09 10:02:52,13282
87045,698,5779,2023-06-13 14:37:48,5735


In [74]:
data_matrix = utils.create_data_matrix(interactions, n_users, n_items)
data_matrix


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
pd.read_csv('user_based.csv').iloc[0]['recommendation']

'9 5 8 11 14 15 23 12 4 13'

In [16]:
data = np.zeros((3, 3))
data[2, 1] = 1
data

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.]])

In [21]:
import pandas as pd
import numpy as np
import utils
import importlib
from sklearn.metrics.pairwise import cosine_similarity
importlib.reload(utils)

interactions = pd.read_csv('data/interactions_train.csv')
interactions['t'] = pd.to_datetime(interactions['t'], unit='s')

items = pd.read_csv('data/items.csv')
n_users = interactions.u.nunique()
n_items = items.i.nunique()


data_matrix = utils.create_data_matrix(interactions, n_users, n_items)
print(data_matrix.shape)

print(data_matrix[:10, :10])
print(n_users, n_items)


(7838, 15291)
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
7838 15291


In [20]:
interactions[interactions['u'] == 0].sort_values(by='i').head(10)

Unnamed: 0,u,i,t
21035,0,0,2023-03-30 15:44:30
28842,0,1,2023-04-06 12:13:54
3958,0,2,2023-04-06 17:15:08
6371,0,3,2023-05-10 10:35:50
29592,0,3,2023-05-10 10:35:45
41220,0,4,2023-06-12 11:20:35
12217,0,5,2023-06-17 14:59:04
19703,0,6,2023-06-17 14:59:24
64522,0,7,2023-06-17 14:59:31
29380,0,8,2023-06-20 11:21:46


In [None]:
# Visualize a subset of the training and testing matrices
#utils.plot_interaction_heatmap(data_matrix, 'User-Item Interaction Matrix')

# Cosine similarity between users
user_similarity = cosine_similarity(data_matrix)
print(user_similarity)

def user_based_predict(interactions, similarity, epsilon=1e-9):
    """
    Predicts user-item interactions based on user-user similarity.
    Parameters:
        interactions (numpy array): The user-item interaction matrix.
        similarity (numpy array): The user-user similarity matrix.
        epsilon (float): Small constant added to the denominator to avoid division by zero.
    Returns:
        numpy array: The predicted interaction scores for each user-item pair.
    """
    # Calculate the weighted sum of interactions based on user similarity
    pred = similarity.dot(interactions) / (np.abs(similarity).sum(axis=1)[:, np.newaxis] + epsilon)
    return pred

# Calculate the user-based predictions for positive interactions
user_prediction = user_based_predict(data_matrix, user_similarity)

# Extract the 10 recommended items for all users, and separate them by a space
top_10_recs = user_prediction.argsort(axis=1)[:, -10:]

top_10_recs_str = [' '.join(map(str, recs)) for recs in top_10_recs]
print(top_10_recs_str[0])

pd.DataFrame(range(n_users), columns=['user_id']).assign(recommendation=top_10_recs_str).to_csv('user_based.csv', index=False)

In [115]:
items['ImageLinks'] = ""
items['ISBN_1'] = items['ISBN Valid'].str.split(';').str[0]
items

Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i,ImageLinks,ISBN_1
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0,,9782871303336
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1,,9782278058327
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2,,2343190194
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3,,9782365350020
4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4,,9782702180815
...,...,...,...,...,...,...,...,...
15286,Le vagabond de Tokyo /,"Fukutani, Takashi, 1952-2000",9782353480111; 235348011X; 9782353480241; 2353...,Le Lézard noir,Mangas,15286,,9782353480111
15287,God of high school : le match contre les dieux /,"Park, Yong-Je",9782382880203; 2382880201; 9782382880210; 2382...,Kbooks,,15287,,9782382880203
15288,Blue Lock /,"Kaneshiro, Muneyuki",9782811650254; 2811650253; 9782811661274; 2811...,Pika,Compétitions; Football; Entraînement (sports);...,15288,,9782811650254
15289,Red eyes sword : akame ga kill ! Zero /,Takahiro,9782368522134; 2368522131; 9782368522141; 2368...,Kurokawa,Bandes dessinées; Mangas,15289,,9782368522134


In [None]:
# add image links to items - final YES
for i in range(len(items)):
    if items.iloc[i]['ImageLinks'] == "":
        isbn = items.iloc[i]['ISBN_1']
        items.at[i, 'ImageLinks'] = f"https://covers.openlibrary.org/b/isbn/{isbn}-L.jpg"

In [None]:
# Books with no ISBN
nan_books = items[items['ISBN Valid'].isna()]
nan_books

In [None]:
# Total number of books with no ISBN in the interactions dataser
len(interactions[interactions['i'].isin(nan_books['i'])]['i'].unique())

719

In [65]:
import requests
import time

def find_isbn_from_google_books(title, author=None, publisher=None):
    """Search Google Books API and return first matching ISBN."""
    query = f'intitle:{title}'
    if author and pd.notna(author):
        query += f'+inauthor:{author}'
    if publisher and pd.notna(publisher):
        query += f'+inpublisher:{publisher}'

    params = {
        'q': query,
        'maxResults': 1,
        'printType': 'books',
        'projection': 'lite'
    }

    try:
        response = requests.get('https://www.googleapis.com/books/v1/volumes', params=params)
        data = response.json()
        if 'items' in data:
            industry_ids = data['items'][0]['volumeInfo'].get('industryIdentifiers', [])
            for id_info in industry_ids:
                if id_info['type'] == 'ISBN_13':
                    return id_info['identifier']
                elif id_info['type'] == 'ISBN_10':
                    return id_info['identifier']
    except Exception as e:
        print("Error:", e)

    return 'pouet'  # fallback if nothing found

for i, row in items[items['ISBN Valid'].isna()].iterrows():
    isbn = find_isbn_from_google_books(row['Title'], row['Author'], row['Publisher'])
    if isbn:
        items.at[i, 'ISBN Valid'] = isbn
        print(f"Updated index {i} with ISBN: {isbn}")
    time.sleep(0.2)  # polite delay to avoid rate limiting

Updated index 261 with ISBN: pouet
Updated index 264 with ISBN: pouet
Updated index 269 with ISBN: pouet
Updated index 367 with ISBN: pouet
Updated index 393 with ISBN: pouet
Updated index 397 with ISBN: pouet
Updated index 398 with ISBN: pouet
Updated index 401 with ISBN: pouet
Updated index 613 with ISBN: pouet
Updated index 633 with ISBN: pouet
Updated index 646 with ISBN: pouet
Updated index 647 with ISBN: pouet
Updated index 669 with ISBN: pouet


KeyboardInterrupt: 

In [85]:
find_isbn_from_google_books("Harry Potter and the Philosopher's Stone")

'pouet'

In [None]:
def get_clean_author(author_series):
    def clean_single_author(author):
        if pd.isna(author):
            return None
        # Step 1: Remove content in parentheses
        author = re.sub(r'\s*\([^)]*\)', '', author)
        # Step 2: Split by comma and clean whitespace
        parts = [part.strip() for part in author.split(',')]
        # Step 3: Filter out parts with digits
        parts = [part for part in parts if not any(char.isdigit() for char in part)]
        # Step 4: Group every two elements into "First Last"
        grouped = [f"{parts[i]} {parts[i+1]}" for i in range(0, len(parts)-1, 2)]
        return ', '.join(grouped) if grouped else None

    return author_series.apply(clean_single_author)

items['author_clean'] = get_clean_author(items['Author'])


In [None]:
items['author_clean'].iloc[:10]

0    Jean Pierre, Paul Jacques
1             Cicurel Francine
2                         None
3                Mazas Sylvain
4              Lemaitre Pierre
5             Lussier Francine
6             Bouchut Fabienne
7                 Robbes Bruno
8                         None
9              Auffret Anthony
Name: author_clean, dtype: object

In [137]:
for i in get_clean_author(items['Author']).head(40):
    print(i)

Jean Pierre, Paul Jacques
Cicurel Francine
None
Mazas Sylvain
Lemaitre Pierre
Lussier Francine
Bouchut Fabienne
Robbes Bruno
None
Auffret Anthony
Kock Marie
Béal Yves
Larmer John
Bordalo Isabelle
Staquet Christian
Tilman Francis
Williams Nicola
Vanderhaeghe Katherine
None
None
Catling Christopher
Durpaire François
Gluck Marion
Winckler Martin
Kellerhals Jean
Fenaert Mélanie
None
Larcenet Manu
Darcq Sophie
Tenaillon Nicolas
None
Johnson Wendy
Bourdieu Pierre
None
Maroh Julie
None
None
Bocquet José-Louis
Boniface Pascal
Sohn Lili


In [119]:

author_parts = items.iloc[i]['Author'].split(',')
# Option 1: Keep all parts that don't contain numbers and separate them with a comma
cleaned_author = ' '.join([part.strip() for part in author_parts if not any(char.isdigit() for char in part)])
# Option 2: For multiple authors, format as "lastname firstname, secondlastname secondfirstname"
#cleaned_author = ', '.join([' '.join(reversed(part.strip().split())) for part in author_parts if not any(char.isdigit() for char in part)])
print(author_parts,cleaned_author)

['Paka (1985-....)'] 


In [147]:
for i in range(20,40):
    print(items.iloc[i]['Author'])    
    

Catling, Christopher
Durpaire, François
Gluck, Marion
Winckler, Martin
Kellerhals, Jean, 1941-
Fenaert, Mélanie
nan
Larcenet, Manu
Darcq, Sophie, 1976-.....
Tenaillon, Nicolas
Laitinen, Niina 19..-....
Johnson, Wendy
Bourdieu, Pierre
Matz
Maroh, Julie
Nay, Olivier 1968-....
Paka (1985-....)
Bocquet, José-Louis
Boniface, Pascal
Sohn, Lili


In [None]:
if pd.notna(items.iloc[i]['Author']):
        author_parts = items.iloc[i]['Author'].split(',')
        cleaned_author = author_parts[0].split('-')[0].strip()
        items.at[i, 'Author'] = cleaned_author
        print(f"Updated Author at index {i}: {items.iloc[i]['Author']}")


Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i,ImageLinks,ISBN_1
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0,https://covers.openlibrary.org/b/isbn/97828713...,9782871303336
1,Les interactions dans l'enseignement des langu...,Cicurel,9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1,https://covers.openlibrary.org/b/isbn/97822780...,9782278058327
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2,https://covers.openlibrary.org/b/isbn/23431901...,2343190194
3,Ce livre devrait me permettre de résoudre le c...,Mazas,9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3,https://covers.openlibrary.org/b/isbn/97823653...,9782365350020
4,Les années glorieuses : roman,Lemaitre,9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4,https://covers.openlibrary.org/b/isbn/97827021...,9782702180815
...,...,...,...,...,...,...,...,...
15286,Le vagabond de Tokyo,"Fukutani, Takashi, 1952-2000",9782353480111; 235348011X; 9782353480241; 2353...,Le Lézard noir,Mangas,15286,https://covers.openlibrary.org/b/isbn/97823534...,9782353480111
15287,God of high school : le match contre les dieux,"Park, Yong-Je",9782382880203; 2382880201; 9782382880210; 2382...,Kbooks,,15287,https://covers.openlibrary.org/b/isbn/97823828...,9782382880203
15288,Blue Lock,"Kaneshiro, Muneyuki",9782811650254; 2811650253; 9782811661274; 2811...,Pika,Compétitions; Football; Entraînement (sports);...,15288,https://covers.openlibrary.org/b/isbn/97828116...,9782811650254
15289,Red eyes sword : akame ga kill ! Zero,Takahiro,9782368522134; 2368522131; 9782368522141; 2368...,Kurokawa,Bandes dessinées; Mangas,15289,https://covers.openlibrary.org/b/isbn/97823685...,9782368522134


In [89]:
items

Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i,ImageLinks,ISBN_1
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0,https://covers.openlibrary.org/b/isbn/97828713...,9782871303336
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1,https://covers.openlibrary.org/b/isbn/97822780...,9782278058327
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2,https://covers.openlibrary.org/b/isbn/23431901...,2343190194
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3,https://covers.openlibrary.org/b/isbn/97823653...,9782365350020
4,Les années glorieuses : roman,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4,https://covers.openlibrary.org/b/isbn/97827021...,9782702180815
...,...,...,...,...,...,...,...,...
15286,Le vagabond de Tokyo /,"Fukutani, Takashi, 1952-2000",9782353480111; 235348011X; 9782353480241; 2353...,Le Lézard noir,Mangas,15286,https://covers.openlibrary.org/b/isbn/97823534...,9782353480111
15287,God of high school : le match contre les dieux /,"Park, Yong-Je",9782382880203; 2382880201; 9782382880210; 2382...,Kbooks,,15287,https://covers.openlibrary.org/b/isbn/97823828...,9782382880203
15288,Blue Lock /,"Kaneshiro, Muneyuki",9782811650254; 2811650253; 9782811661274; 2811...,Pika,Compétitions; Football; Entraînement (sports);...,15288,https://covers.openlibrary.org/b/isbn/97828116...,9782811650254
15289,Red eyes sword : akame ga kill ! Zero /,Takahiro,9782368522134; 2368522131; 9782368522141; 2368...,Kurokawa,Bandes dessinées; Mangas,15289,https://covers.openlibrary.org/b/isbn/97823685...,9782368522134
