<a href="https://colab.research.google.com/github/ML-brooowss/ML/blob/main/V2_Linne_w_KNN_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.neighbors import NearestNeighbors
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split

# Recommender Systems

## Task 1: Exploring the Dataset with Implicit Feedback




In [2]:
# Download the interaction log and the item metadata table from GitHub
interactions = pd.read_csv("https://raw.githubusercontent.com/linneverh/MachineLearning/main/interactions_train.csv")
items = pd.read_csv("https://raw.githubusercontent.com/linneverh/MachineLearning/main/items.csv")

# Preview the first rows of both frames
display(interactions.head(), items.head())

Unnamed: 0,u,i,t
0,4456,8581,1687541000.0
1,142,1964,1679585000.0
2,362,3705,1706872000.0
3,1809,11317,1673533000.0
4,4384,1323,1681402000.0


Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3
4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4


In [3]:
# Replace the terse CSV headers with descriptive names; both frames share the
# 'book_id' key afterwards
interactions = interactions.rename(
    {'u': 'user_id', 'i': 'book_id', 't': 'timestamp'}, axis="columns"
)
items = items.rename({'i': 'book_id'}, axis="columns")

# Preview the first rows of both renamed frames
display(interactions.head(), items.head())


Unnamed: 0,user_id,book_id,timestamp
0,4456,8581,1687541000.0
1,142,1964,1679585000.0
2,362,3705,1706872000.0
3,1809,11317,1673533000.0
4,4384,1323,1681402000.0


Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,book_id
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3
4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4



## Task 2: Split the Data into Training and Test Sets

In [4]:
# Order the log by user first and, within each user, by interaction time
interactions = interactions.sort_values(by=["user_id", "timestamp"], ascending=True)
interactions.head(10)

Unnamed: 0,user_id,book_id,timestamp
21035,0,0,1680191000.0
28842,0,1,1680783000.0
3958,0,2,1680801000.0
29592,0,3,1683715000.0
6371,0,3,1683715000.0
41220,0,4,1686569000.0
12217,0,5,1687014000.0
19703,0,6,1687014000.0
64522,0,7,1687014000.0
29380,0,8,1687260000.0


In [5]:
# Per-user dense percentile rank of each timestamp (1.0 = the user's latest
# timestamp), then a fresh 0..n-1 index to replace the one shuffled by sorting
interactions = (
    interactions
    .assign(
        pct_rank=lambda frame: frame.groupby("user_id")["timestamp"].rank(pct=True, method='dense')
    )
    .reset_index(drop=True)
)
interactions.head(10)

Unnamed: 0,user_id,book_id,timestamp,pct_rank
0,0,0,1680191000.0,0.04
1,0,1,1680783000.0,0.08
2,0,2,1680801000.0,0.12
3,0,3,1683715000.0,0.16
4,0,3,1683715000.0,0.2
5,0,4,1686569000.0,0.24
6,0,5,1687014000.0,0.28
7,0,6,1687014000.0,0.32
8,0,7,1687014000.0,0.36
9,0,8,1687260000.0,0.4


In [6]:
# Rows ranked below 0.8 within their user go to training; rows ranked 0.8 or
# above are held out for testing
train_data = interactions.loc[interactions["pct_rank"].lt(0.8)]
test_data = interactions.loc[interactions["pct_rank"].ge(0.8)]

In [7]:
# Report how many interaction rows landed in each split
print("Training set size:", len(train_data))
print("Testing set size:", len(test_data))

Training set size: 65419
Testing set size: 21628


## Task 3: Creating User-Item Matrices for Implicit Feedback


In [14]:
# Matrix dimensions: distinct users seen in the interaction log, and distinct
# books listed in the items table
n_users, n_items = interactions["user_id"].nunique(), items["book_id"].nunique()

In [15]:
# n_items counts distinct book_id values, so the label says "books"
print('number of users =', n_users, '| number of books =', n_items)

number of users = 7838 | number of movies = 15291


#### Step 1: Define the Function to Create the Data Matrix


In [16]:
# Define a function to create the data matrix
def create_data_matrix(data, n_users, n_items):
    """
    Build a dense binary user-item interaction matrix.

    Args:
        data (pandas.DataFrame): Interactions with "user_id" and "book_id"
            columns; their values are used directly as row and column positions.
        n_users (int): Number of rows of the matrix.
        n_items (int): Number of columns of the matrix.

    Returns:
        numpy.ndarray: Float array of shape (n_users, n_items) holding 1 for
        every (user, book) pair present in `data` and 0 everywhere else.
    """
    user_rows = data["user_id"].values
    book_cols = data["book_id"].values

    interaction_matrix = np.zeros(shape=(n_users, n_items))
    # Repeated (user, book) pairs simply write the same 1 again
    interaction_matrix[user_rows, book_cols] = 1
    return interaction_matrix

#### Step 2: Create the Training and Testing Matrices

In [17]:
# Turn each split into a dense binary user x book matrix
train_data_matrix, test_data_matrix = (
    create_data_matrix(split, n_users, n_items) for split in (train_data, test_data)
)

# Print each matrix followed by the sum of its entries (= number of ones)
print('train_data_matrix', train_data_matrix, sep='\n')
print("number of non-zero values: ", train_data_matrix.sum())
print('test_data_matrix', test_data_matrix, sep='\n')
print("number of non-zero values: ", test_data_matrix.sum())


train_data_matrix
[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
number of non-zero values:  49689.0
test_data_matrix
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
number of non-zero values:  19409.0


In [18]:
# Report the (rows, columns) shape of both interaction matrices

print("Train data matrix dimensions:", np.shape(train_data_matrix))
print("Test data matrix dimensions:", np.shape(test_data_matrix))

Train data matrix dimensions: (7838, 15291)
Test data matrix dimensions: (7838, 15291)


## Step 3: KNN model


### Basic definitions

In [26]:
# Recommendation frame generation
def create_recommendation_table(user_predictions, top_n=10, separator=" "):
    """
    Creates a table of top-N recommendations for each user.

    Args:
        user_predictions (numpy.ndarray): Rows = users, columns = items. Predicted scores.
        top_n (int): Number of top recommendations per user. 0 yields an empty
            recommendation string for every user.
        separator (str): Delimiter to join recommended book IDs.

    Returns:
        pandas.DataFrame: Columns = ['user_id', 'recommendation'], one row per
        user, items listed from highest to lowest score. The columns are present
        even when there are no users.

    Raises:
        ValueError: If top_n is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    recommendations = []
    num_users = user_predictions.shape[0]

    for user_id in range(num_users):
        if top_n == 0:
            # A slice of [-0:] would select every item, so handle zero explicitly
            top_items = []
        else:
            # argsort is ascending: take the last top_n entries and flip them
            top_items = np.argsort(user_predictions[user_id, :])[-top_n:][::-1]
        recommendations.append({
            'user_id': user_id,
            'recommendation': separator.join(map(str, top_items))
        })

    # Passing columns explicitly keeps the schema stable for an empty table
    return pd.DataFrame(recommendations, columns=['user_id', 'recommendation'])

In [27]:
# Def for the precision_recall_at_k function
def precision_recall_at_k(prediction, ground_truth, k=10):
    """
    Calculates Precision@K and Recall@K for top-K recommendations.
    Parameters:
        prediction (numpy array): The predicted interaction matrix with scores
            (rows = users, columns = items).
        ground_truth (numpy array): The ground truth interaction matrix (binary),
            same layout as `prediction`.
        k (int): Number of top recommendations to consider. Must be positive.
    Returns:
        precision_at_k (float): The average precision@K over all users.
        recall_at_k (float): The average recall@K over all users. Users without
            any relevant item contribute 0 to the recall average.
        Both are 0.0 when there are no users.
    Raises:
        ValueError: If k is not positive.
    """
    if k <= 0:
        # [-0:] would select every item and the precision would divide by zero
        raise ValueError("k must be a positive integer")

    num_users = prediction.shape[0]
    if num_users == 0:
        return 0.0, 0.0

    precision_sum, recall_sum = 0.0, 0.0

    for user in range(num_users):
        # Indices of the k highest-scored items (argsort is ascending)
        top_k_items = np.argsort(prediction[user, :])[-k:]

        # Hits: recommended items that are marked 1 in the ground truth
        relevant_items_in_top_k = np.count_nonzero(ground_truth[user, top_k_items] == 1)

        # Total number of relevant items for the user
        total_relevant_items = ground_truth[user, :].sum()

        precision_sum += relevant_items_in_top_k / k
        if total_relevant_items > 0:
            recall_sum += relevant_items_in_top_k / total_relevant_items

    # Average over all users, including those without relevant items
    return precision_sum / num_users, recall_sum / num_users

In [28]:
# Create random splits def.
def random_split_per_user(interactions_df, test_size=0.2, random_state=None):
    """
    Randomly split every user's interactions into a train and a test part.

    Args:
        interactions_df (pandas.DataFrame): Interactions with a 'user_id' column.
        test_size (float or int): Forwarded to sklearn's train_test_split for
            each user.
        random_state (int, numpy.random.RandomState or None): Seed or generator
            for reproducible splits. None (default) keeps the previous behaviour
            of drawing from NumPy's global random state.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: (train, test) frames with the
        same columns as the input. Users with a single interaction are placed
        entirely in train, because one row cannot be split.
    """
    # Share one generator across users so a fixed seed does not reuse the same
    # permutation for every user; None is passed through to keep global-RNG use.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    train_list = []
    test_list = []
    for _, user_df in interactions_df.groupby('user_id'):
        if len(user_df) < 2:
            # train_test_split raises on a single sample; keep the row for training
            train_list.append(user_df)
            continue
        train_df, test_df = train_test_split(user_df, test_size=test_size, random_state=rng)
        train_list.append(train_df)
        test_list.append(test_df)

    # pd.concat([]) raises, so fall back to an empty frame with the input schema
    empty = interactions_df.iloc[0:0]
    train = pd.concat(train_list) if train_list else empty.copy()
    test = pd.concat(test_list) if test_list else empty.copy()
    return train, test

### Item-Based Collaborative Filtering with KNN

In [34]:
# Predict interactions using item-based KNN with sparse NearestNeighbors
def item_based_knn_predict_sparse(train_matrix, k, metric='cosine'):
    """
    Predict user-item scores with item-based k-nearest-neighbour filtering.

    For every item, the k most similar other items are found with sklearn's
    brute-force NearestNeighbors; a user's score for the item is the
    similarity-weighted average of that user's interactions with those
    neighbours.

    Args:
        train_matrix (numpy.ndarray): Interaction matrix, rows = users,
            columns = items.
        k (int): Number of neighbour items per item. Clamped to n_items - 1.
        metric (str): Distance metric for NearestNeighbors. Similarity is taken
            as 1 - distance, which is the cosine similarity for the default
            'cosine' metric. NOTE(review): for other metrics 1 - distance is not
            guaranteed to be a meaningful weight.

    Returns:
        numpy.ndarray: Predicted scores with shape (n_users, n_items). All zeros
        when no neighbour can be used (k < 1, fewer than two items, or no users).
    """
    n_users, n_items = train_matrix.shape
    predictions = np.zeros((n_users, n_items))

    n_neighbors = min(k, n_items - 1)
    if n_neighbors < 1 or n_users == 0:
        return predictions

    knn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, algorithm='brute')
    knn.fit(train_matrix.T)  # Transpose to shape (n_items, n_users)

    # Querying the fitted data itself (no argument) makes sklearn drop each item
    # from its own neighbour list. Dropping "the first hit" instead is wrong for
    # duplicate or all-zero columns, where the first hit may be another item.
    distances, indices = knn.kneighbors()
    sim_weights = 1 - distances  # shape (n_items, n_neighbors)
    sim_weights_sum = sim_weights.sum(axis=1) + 1e-9  # epsilon avoids 0/0

    for item_idx in range(n_items):
        # (n_users, n_neighbors) @ (n_neighbors,) scores all users at once
        neighbour_ratings = train_matrix[:, indices[item_idx]]
        predictions[:, item_idx] = neighbour_ratings @ sim_weights[item_idx] / sim_weights_sum[item_idx]

    return predictions

# Cross-validation function
def evaluate_knn_item_based(k, seed):
    """
    Score item-based KNN on one seeded random per-user split.

    Args:
        k (int): Number of neighbour items used by the predictor.
        seed (int): Seed that makes the random train/test split reproducible.

    Returns:
        float: Precision@10 of the predictions against the held-out interactions.
    """
    # random_split_per_user draws from NumPy's global random state, so seed it
    # for the duration of the split and restore the caller's state afterwards.
    saved_state = np.random.get_state()
    np.random.seed(seed)
    try:
        train_df, test_df = random_split_per_user(interactions, test_size=0.2)
    finally:
        np.random.set_state(saved_state)

    train_matrix = create_data_matrix(train_df, n_users, n_items)
    test_matrix = create_data_matrix(test_df, n_users, n_items)

    preds = item_based_knn_predict_sparse(train_matrix, k)
    precision, _ = precision_recall_at_k(preds, test_matrix, k=10)
    return precision

# Define hyperparameters
k_values = list(range(10, 100, 10))  # Try k = 10 to 90
seeds = list(range(3))  # 3 repeated random splits (same as the user-based run)

# Submit every (k, seed) pair as one batch so all workers stay busy; parallelising
# over seeds alone leaves most cores idle. Each job builds its own dense matrices,
# so lower n_jobs if memory is tight.
param_grid = [(k, seed) for k in k_values for seed in seeds]
scores = Parallel(n_jobs=-1)(
    delayed(evaluate_knn_item_based)(k, seed) for k, seed in param_grid
)

# param_grid is ordered k-major, so each k owns a contiguous run of len(seeds) scores
n_seeds = len(seeds)
precision_results = {
    k: scores[pos * n_seeds:(pos + 1) * n_seeds]
    for pos, k in enumerate(k_values)
}

# Aggregate results
avg_precision = [np.mean(precision_results[k]) for k in k_values]
std_precision = [np.std(precision_results[k]) for k in k_values]

# Best k
best_index = np.argmax(avg_precision)
best_k = k_values[best_index]
best_score = avg_precision[best_index]

# Plot results
fig, ax = plt.subplots(figsize=(10, 5))
ax.errorbar(k_values, avg_precision, yerr=std_precision, fmt='-o', capsize=5, label='Mean ± Std Dev')
ax.axvline(best_k, linestyle='--', color='red', label=f'Best k = {best_k}')
ax.set_title(f'Item-Based KNN (Sparse) — Precision@10 vs k\n(Averaged over {len(seeds)} random splits)')
ax.set_xlabel('k (Number of Nearest Items)')
ax.set_ylabel('Precision@10')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

print(f"Best k = {best_k} with avg Precision@10 = {best_score:.4f}")


KeyboardInterrupt: 

## Task 4: User-to-User Collaborative Filtering with Implicit Feedback

In this task, we’ll create a recommender system using **User-to-User Collaborative Filtering**. This approach translates to “Users who are similar to you also liked…”. We’ll predict the likelihood of a user interacting with an item based on the behavior of similar users.

In [None]:
# Predict interactions using user-based KNN with NearestNeighbors
def user_based_knn_predict_sparse(train_matrix, k, metric='cosine'):
    """
    Predict user-item scores with user-based k-nearest-neighbour filtering.

    For every user, the k most similar other users are found with sklearn's
    brute-force NearestNeighbors; the user's score for an item is the
    similarity-weighted average of those neighbours' interactions with it.

    Args:
        train_matrix (numpy.ndarray): Interaction matrix, rows = users,
            columns = items.
        k (int): Number of neighbour users per user. Clamped to n_users - 1.
        metric (str): Distance metric for NearestNeighbors. Similarity is taken
            as 1 - distance, which is the cosine similarity for the default
            'cosine' metric. NOTE(review): for other metrics 1 - distance is not
            guaranteed to be a meaningful weight.

    Returns:
        numpy.ndarray: Predicted scores with shape (n_users, n_items). All zeros
        when no neighbour can be used (k < 1, fewer than two users, or no items).
    """
    n_users, n_items = train_matrix.shape
    predictions = np.zeros((n_users, n_items))

    n_neighbors = min(k, n_users - 1)
    if n_neighbors < 1 or n_items == 0:
        return predictions

    knn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, algorithm='brute')
    knn.fit(train_matrix)  # Each user is a row

    # Querying the fitted data itself (no argument) makes sklearn drop each user
    # from their own neighbour list. Dropping "the first hit" instead is wrong for
    # users with identical or empty histories, where the first hit may be someone else.
    distances, indices = knn.kneighbors()
    sim_weights = 1 - distances  # shape (n_users, n_neighbors)
    sim_weights_sum = sim_weights.sum(axis=1) + 1e-9  # epsilon avoids 0/0

    for user_idx in range(n_users):
        # (n_neighbors,) @ (n_neighbors, n_items) scores all items at once
        neighbour_rows = train_matrix[indices[user_idx]]
        predictions[user_idx] = sim_weights[user_idx] @ neighbour_rows / sim_weights_sum[user_idx]

    return predictions

# Cross-validation function for user-based CF
def evaluate_knn_user_based(k, seed):
    """
    Score user-based KNN on one seeded random per-user split.

    Args:
        k (int): Number of neighbour users used by the predictor.
        seed (int): Seed that makes the random train/test split reproducible.

    Returns:
        float: Precision@10 of the predictions against the held-out interactions.
    """
    # random_split_per_user draws from NumPy's global random state, so seed it
    # for the duration of the split and restore the caller's state afterwards.
    saved_state = np.random.get_state()
    np.random.seed(seed)
    try:
        train_df, test_df = random_split_per_user(interactions, test_size=0.2)
    finally:
        np.random.set_state(saved_state)

    train_matrix = create_data_matrix(train_df, n_users, n_items)
    test_matrix = create_data_matrix(test_df, n_users, n_items)

    preds = user_based_knn_predict_sparse(train_matrix, k)
    precision, _ = precision_recall_at_k(preds, test_matrix, k=10)
    return precision

# Hyperparameter values and seeds
k_values = list(range(10, 100, 10))  # Try k = 10 to 90
seeds = list(range(3))  # 3 repeated random splits

# Submit every (k, seed) pair as one batch so all workers stay busy; parallelising
# over seeds alone caps the run at len(seeds) concurrent jobs. Each job builds its
# own dense matrices, so lower n_jobs if memory is tight.
param_grid = [(k, seed) for k in k_values for seed in seeds]
scores = Parallel(n_jobs=-1)(
    delayed(evaluate_knn_user_based)(k, seed) for k, seed in param_grid
)

# param_grid is ordered k-major, so each k owns a contiguous run of len(seeds) scores
n_seeds = len(seeds)
precision_results = {
    k: scores[pos * n_seeds:(pos + 1) * n_seeds]
    for pos, k in enumerate(k_values)
}

# Compute averages and std deviations
avg_precision = [np.mean(precision_results[k]) for k in k_values]
std_precision = [np.std(precision_results[k]) for k in k_values]

# Determine best k
best_index = np.argmax(avg_precision)
best_k = k_values[best_index]
best_score = avg_precision[best_index]

# Plot results
fig, ax = plt.subplots(figsize=(10, 5))
ax.errorbar(k_values, avg_precision, yerr=std_precision, fmt='-o', capsize=5, label='Mean ± Std Dev')
ax.axvline(best_k, linestyle='--', color='red', label=f'Best k = {best_k}')
ax.set_title(f'User-Based KNN — Precision@10 vs k\n(Averaged over {len(seeds)} random splits)')
ax.set_xlabel('k (Number of Nearest Users)')
ax.set_ylabel('Precision@10')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

print(f"Best k = {best_k} with avg Precision@10 = {best_score:.4f}")
