<a href="https://colab.research.google.com/github/ML-brooowss/ML/blob/main/gridsearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#Library
# !pip install sentence-transformers
# !pip install joblib
# !pip install collections
from collections import defaultdict
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import os


# Recommender Systems
| Recommender Type     | Similarity Between | Based On           | Example Statement                                      |
|----------------------|--------------------|--------------------|--------------------------------------------------------|
| CF – Item-Item       | Items              | User behavior      | “You liked A, others who liked A also liked B”         |
| CF – User-User       | Users              | User behavior      | “People like you liked B, so you might too”            |
| Content-Based        | Items              | Item text/content  | “These books are similar in description/topic”         |
| Hybrid               | Items              | Content + Behavior | “You liked A; B is similar and liked by others too”    |


## Task 1: Exploring



#### Step 1:Get the data

In [2]:
# Load the datasets

interactions = pd.read_csv('https://raw.githubusercontent.com/linneverh/MachineLearning/main/interactions_train.csv')

#FOR: Google enhanced & ISBN enhanced - author_date_title_subjects priority
items1 = pd.read_csv("https://media.githubusercontent.com/media/ML-brooowss/ML/refs/heads/main/final_items/author_date_title_subjects/embeddings_part1.csv")
items2 = pd.read_csv("https://media.githubusercontent.com/media/ML-brooowss/ML/refs/heads/main/final_items/author_date_title_subjects/embeddings_part2.csv")
items = pd.concat([items1, items2])

#rename columns
interactions = interactions.rename(columns={'u': 'user_id', 'i': 'book_id', 't': 'timestamp'})
items=items.rename(columns={'i':'book_id'})

# Display the first rows of the updated interactions DataFrame
display(interactions.head())
display(items.head())

# Display the first rows of each dataset
display(interactions.head())
display(items.head())

Unnamed: 0,user_id,book_id,timestamp
0,4456,8581,1687541000.0
1,142,1964,1679585000.0
2,362,3705,1706872000.0
3,1809,11317,1673533000.0
4,4384,1323,1681402000.0


Unnamed: 0.1,Unnamed: 0,CanonicalLink,Description,ISBN,ImageLink,Language,PublishedDate,Publisher,Subjects,Title,...,book_id,title_clean,title_description,date_title_description,author_title_description,author_date_title_description,author_date_title,author_date_title_subjects,author_title_subjects,embedding
0,723,https://books.google.com/books/about/Classific...,,9782871303336,https://images.isbndb.com/covers/2000472348298...,fr,2012.0,Ed du CEFAL,Classification décimale universelle; Indexatio...,Classification décimale universelle : édition ...,...,0,Classification décimale universelle : édition ...,Classification décimale universelle : édition ...,2012 Classification décimale universelle : édi...,UDC Consortium (The Hague) Classification déci...,UDC Consortium (The Hague) 2012 Classification...,UDC Consortium (The Hague) 2012 Classification...,UDC Consortium (The Hague) 2012 Classification...,UDC Consortium (The Hague) Classification déc...,"[-0.004826885, -0.0587869, -0.06438997, -0.007..."
1,724,https://books.google.com/books/about/Les_inter...,C'est dans l'interaction en classe que s'actua...,9782278058327,https://images.isbndb.com/covers/2384333482926...,fr,2011.0,Didier,didactique--langue étrangère - enseignement; d...,Les interactions dans l'enseignement des langu...,...,1,Les interactions dans l'enseignement des langu...,Les interactions dans l'enseignement des langu...,2011 Les interactions dans l'enseignement des ...,"Cicurel, Francine, Les interactions dans l'ens...","Cicurel, Francine, 2011 Les interactions dans ...","Cicurel, Francine, 2011 Les interactions dans ...","Cicurel, Francine, 2011 Les interactions dans ...","Cicurel, Francine, Les interactions dans l'en...","[0.0041115503, -0.012976925, 0.0044452655, 0.0..."
2,725,https://books.google.com/books/about/Histoire_...,Depuis la parution en 1918 de l'ouvrage fondat...,2343190194,http://books.google.com/books/content?id=Q2PMD...,fr,2020.0,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,Histoire de vie et recherche biographique : pe...,...,2,Histoire de vie et recherche biographique : pe...,Histoire de vie et recherche biographique : pe...,2020 Histoire de vie et recherche biographique...,"Aneta Slowik, Hervé Breton, Gaston Pineau Hist...","Aneta Slowik, Hervé Breton, Gaston Pineau 2020...","Aneta Slowik, Hervé Breton, Gaston Pineau 2020...","Aneta Slowik, Hervé Breton, Gaston Pineau 2020...","Aneta Slowik, Hervé Breton, Gaston Pineau His...","[0.027354596, -0.025706276, -0.051459163, 0.00..."
3,726,https://books.google.com/books/about/Ce_livre_...,Juin 1940. Les Allemands entrent dans Paris.Pa...,9782365350020,https://images.isbndb.com/covers/1994518348298...,fr,2012.0,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,Ce livre devrait me permettre de résoudre le c...,...,3,Ce livre devrait me permettre de résoudre le c...,Ce livre devrait me permettre de résoudre le c...,2012 Ce livre devrait me permettre de résoudre...,"Mazas, Sylvain, Ce livre devrait me permettre ...","Mazas, Sylvain, 2012 Ce livre devrait me perme...","Mazas, Sylvain, 2012 Ce livre devrait me perme...","Mazas, Sylvain, 2012 Ce livre devrait me perme...","Mazas, Sylvain, Ce livre devrait me permettre...","[0.036929574, -0.0399203, -0.033997424, -0.006..."
4,727,https://books.google.com/books/about/Le_grand_...,"Trois histoires d'amour, un lanceur d'alerte, ...",9782702180815,http://books.google.com/books/content?id=f5u3z...,fr,1984.0,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,Les années glorieuses : roman /,...,4,Les années glorieuses : roman,Les années glorieuses : roman Trois histoires ...,1984 Les années glorieuses : roman Trois histo...,"Lemaitre, Pierre, Les années glorieuses : roma...","Lemaitre, Pierre, 1984 Les années glorieuses :...","Lemaitre, Pierre, 1984 Les années glorieuses :...","Lemaitre, Pierre, 1984 Les années glorieuses :...","Lemaitre, Pierre, Les années glorieuses : rom...","[0.05324783, -0.026807835, -0.009055429, 0.005..."


Unnamed: 0,user_id,book_id,timestamp
0,4456,8581,1687541000.0
1,142,1964,1679585000.0
2,362,3705,1706872000.0
3,1809,11317,1673533000.0
4,4384,1323,1681402000.0


Unnamed: 0.1,Unnamed: 0,CanonicalLink,Description,ISBN,ImageLink,Language,PublishedDate,Publisher,Subjects,Title,...,book_id,title_clean,title_description,date_title_description,author_title_description,author_date_title_description,author_date_title,author_date_title_subjects,author_title_subjects,embedding
0,723,https://books.google.com/books/about/Classific...,,9782871303336,https://images.isbndb.com/covers/2000472348298...,fr,2012.0,Ed du CEFAL,Classification décimale universelle; Indexatio...,Classification décimale universelle : édition ...,...,0,Classification décimale universelle : édition ...,Classification décimale universelle : édition ...,2012 Classification décimale universelle : édi...,UDC Consortium (The Hague) Classification déci...,UDC Consortium (The Hague) 2012 Classification...,UDC Consortium (The Hague) 2012 Classification...,UDC Consortium (The Hague) 2012 Classification...,UDC Consortium (The Hague) Classification déc...,"[-0.004826885, -0.0587869, -0.06438997, -0.007..."
1,724,https://books.google.com/books/about/Les_inter...,C'est dans l'interaction en classe que s'actua...,9782278058327,https://images.isbndb.com/covers/2384333482926...,fr,2011.0,Didier,didactique--langue étrangère - enseignement; d...,Les interactions dans l'enseignement des langu...,...,1,Les interactions dans l'enseignement des langu...,Les interactions dans l'enseignement des langu...,2011 Les interactions dans l'enseignement des ...,"Cicurel, Francine, Les interactions dans l'ens...","Cicurel, Francine, 2011 Les interactions dans ...","Cicurel, Francine, 2011 Les interactions dans ...","Cicurel, Francine, 2011 Les interactions dans ...","Cicurel, Francine, Les interactions dans l'en...","[0.0041115503, -0.012976925, 0.0044452655, 0.0..."
2,725,https://books.google.com/books/about/Histoire_...,Depuis la parution en 1918 de l'ouvrage fondat...,2343190194,http://books.google.com/books/content?id=Q2PMD...,fr,2020.0,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,Histoire de vie et recherche biographique : pe...,...,2,Histoire de vie et recherche biographique : pe...,Histoire de vie et recherche biographique : pe...,2020 Histoire de vie et recherche biographique...,"Aneta Slowik, Hervé Breton, Gaston Pineau Hist...","Aneta Slowik, Hervé Breton, Gaston Pineau 2020...","Aneta Slowik, Hervé Breton, Gaston Pineau 2020...","Aneta Slowik, Hervé Breton, Gaston Pineau 2020...","Aneta Slowik, Hervé Breton, Gaston Pineau His...","[0.027354596, -0.025706276, -0.051459163, 0.00..."
3,726,https://books.google.com/books/about/Ce_livre_...,Juin 1940. Les Allemands entrent dans Paris.Pa...,9782365350020,https://images.isbndb.com/covers/1994518348298...,fr,2012.0,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,Ce livre devrait me permettre de résoudre le c...,...,3,Ce livre devrait me permettre de résoudre le c...,Ce livre devrait me permettre de résoudre le c...,2012 Ce livre devrait me permettre de résoudre...,"Mazas, Sylvain, Ce livre devrait me permettre ...","Mazas, Sylvain, 2012 Ce livre devrait me perme...","Mazas, Sylvain, 2012 Ce livre devrait me perme...","Mazas, Sylvain, 2012 Ce livre devrait me perme...","Mazas, Sylvain, Ce livre devrait me permettre...","[0.036929574, -0.0399203, -0.033997424, -0.006..."
4,727,https://books.google.com/books/about/Le_grand_...,"Trois histoires d'amour, un lanceur d'alerte, ...",9782702180815,http://books.google.com/books/content?id=f5u3z...,fr,1984.0,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,Les années glorieuses : roman /,...,4,Les années glorieuses : roman,Les années glorieuses : roman Trois histoires ...,1984 Les années glorieuses : roman Trois histo...,"Lemaitre, Pierre, Les années glorieuses : roma...","Lemaitre, Pierre, 1984 Les années glorieuses :...","Lemaitre, Pierre, 1984 Les années glorieuses :...","Lemaitre, Pierre, 1984 Les années glorieuses :...","Lemaitre, Pierre, Les années glorieuses : rom...","[0.05324783, -0.026807835, -0.009055429, 0.005..."



#### Step 2: Check the Number of interactions, users and movies

In [4]:
n_users = interactions.user_id.nunique()
n_items = items.book_id.nunique()
print(f'Number of users = {n_users}, \n Number of books = {n_items} \n Number of interactions = {len(interactions)}')


Number of users = 7838, 
 Number of books = 15291 
 Number of interactions = 87047


In [5]:
n_items = items.book_id.nunique()
print(f'Number of books = {n_items}')

Number of books = 15291



#### Step 3: Split the Data into Training and Test Sets

In [6]:
# let's first sort the interactions by user and time stamp
interactions = interactions.sort_values(["user_id", "timestamp"])
interactions.head(100)

Unnamed: 0,user_id,book_id,timestamp
21035,0,0,1.680191e+09
28842,0,1,1.680783e+09
3958,0,2,1.680801e+09
29592,0,3,1.683715e+09
6371,0,3,1.683715e+09
...,...,...,...
20068,2,53,1.694861e+09
12721,2,53,1.695226e+09
86745,2,53,1.695226e+09
19329,2,53,1.695226e+09


In [7]:
interactions["pct_rank"] = interactions.groupby("user_id")["timestamp"].rank(pct=True, method='dense')
interactions.reset_index(inplace=True, drop=True)
interactions.head(10)

Unnamed: 0,user_id,book_id,timestamp,pct_rank
0,0,0,1680191000.0,0.04
1,0,1,1680783000.0,0.08
2,0,2,1680801000.0,0.12
3,0,3,1683715000.0,0.16
4,0,3,1683715000.0,0.2
5,0,4,1686569000.0,0.24
6,0,5,1687014000.0,0.28
7,0,6,1687014000.0,0.32
8,0,7,1687014000.0,0.36
9,0,8,1687260000.0,0.4


Now all remains to do is to pick the first 80% of the interactions of each user in the training set and the rest in the test set. We can do so using the `pct_rank` column.

In [8]:
train_data = interactions[interactions["pct_rank"] < 0.8]
test_data = interactions[interactions["pct_rank"] >= 0.8]

In [9]:
print("Training set size:", train_data.shape[0])
print("Testing set size:", test_data.shape[0])

Training set size: 65419
Testing set size: 21628


## Task 2: Creating User-Item Matrices for Implicit Feedback


In [10]:
print('number of users =', n_users, '| number of movies =', n_items)

number of users = 7838 | number of movies = 15291


#### Step 1: Define the Function to Create the Data Matrix


In [11]:
# Define a function to create the data matrix
def create_data_matrix(data, n_users, n_items):
    """
    This function returns a numpy matrix with shape (n_users, n_items).
    Each entry is a binary value indicating positive interaction.
    """
    data_matrix = np.zeros((n_users, n_items))
    data_matrix[data["user_id"].values, data["book_id"].values] = 1
    return data_matrix

#### Step 2: Create the Training and Testing Matrices

Now we can use the function to create matrices for both the training and testing data. Each cell in the matrix will show a 1 if there was a positive interaction in the training or testing data, and a 0 otherwise.

In [12]:
entire_data=create_data_matrix(interactions, n_users, n_items)

In [None]:
# Create the training and testing matrices
train_data_matrix = create_data_matrix(train_data, n_users, n_items)
test_data_matrix = create_data_matrix(test_data, n_users, n_items)

# Display the matrices to understand their structure
print('train_data_matrix')
print(train_data_matrix)
print("number of non-zero values: ", np.sum(train_data_matrix))
print('test_data_matrix')
print(test_data_matrix)
print("number of non-zero values: ", np.sum(test_data_matrix))


train_data_matrix
[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
number of non-zero values:  49689.0
test_data_matrix
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
number of non-zero values:  19409.0


In [None]:
#give the dimensions of matrices
print("Train data matrix dimensions:", train_data_matrix.shape)
print("Test data matrix dimensions:", test_data_matrix.shape)

Train data matrix dimensions: (7838, 15291)
Test data matrix dimensions: (7838, 15291)


#### Basic Definitions

In [13]:
# Recommendation frame generation
def create_recommendation_table(user_predictions, top_n=10, separator=" "):
    """
    Creates a table of top-N recommendations for each user.

    Args:
        user_predictions (numpy.ndarray): Rows = users, columns = items. Predicted scores.
        top_n (int): Number of top recommendations per user.
        separator (str): Delimiter to join recommended book IDs.

    Returns:
        pandas.DataFrame: Columns = ['user_id', 'recommendation'].
    """
    recommendations = []
    num_users = user_predictions.shape[0]

    for user_id in range(num_users):
        top_items = np.argsort(user_predictions[user_id, :])[-top_n:][::-1]
        recommendations.append({
            'user_id': user_id,
            'recommendation': separator.join(map(str, top_items))
        })

    return pd.DataFrame(recommendations)

In [14]:
# Def for the precision_recall_at_k function
def precision_recall_at_k(prediction, ground_truth, k=10):
    """
    Calculates Precision@K and Recall@K for top-K recommendations.
    Parameters:
        prediction (numpy array): The predicted interaction matrix with scores.
        ground_truth (numpy array): The ground truth interaction matrix (binary).
        k (int): Number of top recommendations to consider.
    Returns:
        precision_at_k (float): The average precision@K over all users.
        recall_at_k (float): The average recall@K over all users.
    """
    num_users = prediction.shape[0]
    precision_at_k, recall_at_k = 0, 0

    for user in range(num_users):
        # TODO: Get the indices of the top-K items for the user based on predicted scores
        top_k_items = np.argsort(prediction[user, :])[-k:]

        # TODO: Calculate the number of relevant items in the top-K items for the user
        relevant_items_in_top_k = np.isin(top_k_items, np.where(ground_truth[user, :] == 1)[0]).sum()

        # TODO: Calculate the total number of relevant items for the user
        total_relevant_items = ground_truth[user, :].sum()

        # Precision@K and Recall@K for this user
        precision_at_k += relevant_items_in_top_k / k
        recall_at_k += relevant_items_in_top_k / total_relevant_items if total_relevant_items > 0 else 0

    # Average Precision@K and Recall@K over all users
    precision_at_k /= num_users
    recall_at_k /= num_users

    return precision_at_k, recall_at_k

In [15]:
# Create random splits def.
def random_split_per_user(interactions_df, test_size=0.2):
    train_list = []
    test_list = []
    for user_id, user_df in interactions_df.groupby('user_id'):
        train_df, test_df = train_test_split(user_df, test_size=test_size)
        train_list.append(train_df)
        test_list.append(test_df)
    return pd.concat(train_list), pd.concat(test_list)

In [16]:
# Define the function to predict interactions based on item similarity
def item_based_predict(interactions, similarity, epsilon=1e-9):
    """
    Predicts user-item interactions based on item-item similarity.
    Parameters:
        interactions (numpy array): The user-item interaction matrix.
        similarity (numpy array): The item-item similarity matrix.
        epsilon (float): Small constant added to the denominator to avoid division by zero.
    Returns:
        numpy array: The predicted interaction scores for each user-item pair.
    """
    # np.dot does the matrix multiplication. Here we are calculating the
    # weighted sum of interactions based on item similarity
    pred = similarity.dot(interactions.T) / (similarity.sum(axis=1)[:, np.newaxis] + epsilon)
    return pred.T  # Transpose to get users as rows and items as columns

#### TF-IDF
w. ['Publisher', 'Subjects', 'google_api_title', 'author_clean', 'ISBN']<br>
Mean Precision@10 = 0.0149 <br>
Mean Recall@10    = 0.091

w.

In [17]:
#TF-IDF

# STEP 1: Build and clean the combined text feature
text_fields = ['Publisher', 'Subjects', 'google_api_title', 'author_clean']
items['combined_text'] = items[text_fields].fillna('').agg(' '.join, axis=1)

# # STEP 2: Align items with those used in the train_data_matrix (e.g., by book_id)
# # to ensure the order of books in the TF-IDF matrix exactly matches the item columns in the collaborative filtering matrix, so similarity scores align correctly with item IDs.
items_ordered = items.set_index('book_id').loc[range(entire_data.shape[1])]

# # STEP 3: Compute TF-IDF matrix and cosine similarity
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf.fit_transform(items_ordered['combined_text'])

# # Cosine similarity between item vectors
tfidf_sim = cosine_similarity(tfidf_matrix)

#### Google API similarity <BR>
Item-based EMBED Precision@K: 0.04866037254401807 <BR>
Item-based EMBED Recall@K: 0.2707247031495884

In [18]:
# Select only the item IDs in the training data matrix
train_item_ids = range(entire_data.shape[1])

# Ensure correct item order by aligning to the item indices used in the train matrix
items_ordered = items.set_index('book_id').loc[train_item_ids]

# Parse the embedding strings into numpy arrays
items_ordered['embedding'] = items_ordered['embedding'].apply(lambda x: np.fromstring(x.strip('[]'), sep=','))

# Drop rows with missing or malformed embeddings (if any)
valid_items = items_ordered[items_ordered['embedding'].notna()].reset_index(drop=True)

# Stack embeddings into a matrix
embedding_matrix = np.vstack(valid_items['embedding'].values)

# Compute cosine similarity
embedding_sim = cosine_similarity(embedding_matrix)

#### BERT Similarity
Mean Precision@10 = 0.0272 <br>
Mean Recall@10    = 0.1760

In [2]:
# STEP 1: Combine text features
text_fields = ['Publisher', 'Subjects', 'google_api_title', 'author_clean']
items['combined_text'] = items[text_fields].fillna('').agg(' '.join, axis=1)

# STEP 2: Align with train_data_matrix
items_ordered = items.set_index('book_id').loc[range(entire_data.shape[1])]

# STEP 3: Load BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# STEP 4: Encode book texts into embeddings
bert_embeddings = model.encode(items_ordered['combined_text'].tolist(), show_progress_bar=True)

# STEP 5: Compute cosine similarity
bert_sim = cosine_similarity(bert_embeddings)

NameError: name 'items' is not defined

#### CF Item-based
Mean Precision@10 = 0.0585 <br>
Mean Recall@10    = 0.2823

In [20]:
# Compute the item-item similarity matrix
item_similarity = cosine_similarity(entire_data.T)
print("Item-Item Similarity Matrix:")
print(item_similarity)
print(item_similarity.shape)

Item-Item Similarity Matrix:
[[1.         0.40824829 0.33333333 ... 0.         0.         0.        ]
 [0.40824829 1.         0.40824829 ... 0.         0.         0.        ]
 [0.33333333 0.40824829 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
(15291, 15291)


#### Hybrid Model testing

between:
1.   item_similarity x  embedding_sim x  bert_sim
2.   item_similarity x  embedding_sim x  tfidf_sim

<table border="1" style="border-collapse: collapse; text-align: left;">
  <caption style="font-size: 18px; font-weight: bold; padding: 10px;"> </caption>
  <thead>
    <tr>
      <th>Blending Stage</th>
      <th>Use When...</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Prediction-level</td>
      <td>Models use different mechanics (CF, BERT, popularity)</td>
    </tr>
    <tr>
      <td>Similarity-level</td>
      <td>All models are similarity-based and aligned (e.g., TF-IDF + BERT)</td>
    </tr>
  </tbody>
</table>


--> Use similarity-level & perform gridsearch



In [None]:
# # grid search for bert_sim
# # Define feasible combinations of weights (a: bert_sim, b: item_similarity, c: embedding_sim)
# feasible_combinations = [
#     (0.1, 0.6, 0.3),
#     (0.2, 0.6, 0.2),
#     (0.25, 0.5, 0.25),
#     (0.3, 0.6, 0.1),
#     (0.2, 0.7, 0.1),
#     (0.1, 0.7, 0.2),
#     (0.15, 0.7, 0.15),
#     (0.1, 0.8, 0.1),
#     (0.05, 0.85, 0.1)
# ]

# results_file = "hybrid_grid_results_progress.csv"

# # Load previous progress if it exists
# if os.path.exists(results_file):
#     results_df = pd.read_csv(results_file)
#     done_combinations = set(tuple(row[:3]) for row in results_df.values)
# else:
#     results_df = pd.DataFrame(columns=['a', 'b', 'c', 'precision@10'])
#     done_combinations = set()

# # Run evaluation
# for a, b, c in feasible_combinations:
#     if (a, b, c) in done_combinations:
#         print(f"Skipping already evaluated combination: a={a}, b={b}, c={c}")
#         continue

#     print(f"Evaluating: a={a}, b={b}, c={c}")

#     # Build the hybrid similarity matrix
#     hybrid_sim = a * bert_sim + b * item_similarity + c * embedding_sim

#     # Predict using item-based CF
#     prediction_matrix = item_based_predict(train_data_matrix, hybrid_sim)

#     # Evaluate
#     p_at_k, _ = precision_recall_at_k(prediction_matrix, test_data_matrix, k=10)

#     # Store result
#     new_row = pd.DataFrame([{'a': a, 'b': b, 'c': c, 'precision@10': p_at_k}])
#     results_df = pd.concat([results_df, new_row], ignore_index=True)
#     results_df.to_csv(results_file, index=False)

#     print(f"Saved result for a={a}, b={b}, c={c} to {results_file}")

# # Sort and show final results
# results_df_bert = results_df.sort_values(by='precision@10', ascending=False)
# results_df_bert.to_csv("hybrid_grid_results_final.csv", index=False)
# print("Final results saved to hybrid_grid_results_final.csv")


NameError: name 'os' is not defined

In [None]:
# # grid search for tfidf_sim
# # Define feasible combinations of weights (a: tfidf_sim, b: item_similarity, c: embedding_sim)
# feasible_combinations = [
#     (0.1, 0.6, 0.3),
#     (0.2, 0.6, 0.2),
#     (0.25, 0.5, 0.25),
#     (0.3, 0.6, 0.1),
#     (0.2, 0.7, 0.1),
#     (0.1, 0.7, 0.2),
#     (0.15, 0.7, 0.15),
#     (0.1, 0.8, 0.1),
#     (0.05, 0.85, 0.1)
# ]

# results_file = "hybrid_grid_results_progress.csv"

# # Load previous progress if it exists
# if os.path.exists(results_file):
#     results_df = pd.read_csv(results_file)
#     done_combinations = set(tuple(row[:3]) for row in results_df.values)
# else:
#     results_df = pd.DataFrame(columns=['a', 'b', 'c', 'precision@10'])
#     done_combinations = set()

# # Run evaluation
# for a, b, c in feasible_combinations:
#     if (a, b, c) in done_combinations:
#         print(f"Skipping already evaluated combination: a={a}, b={b}, c={c}")
#         continue

#     print(f"Evaluating: a={a}, b={b}, c={c}")

#     # Build the hybrid similarity matrix
#     hybrid_sim = a * tfidf_sim + b * item_similarity + c * embedding_sim

#     # Predict using item-based CF
#     prediction_matrix = item_based_predict(train_data_matrix, hybrid_sim)

#     # Evaluate
#     p_at_k, _ = precision_recall_at_k(prediction_matrix, test_data_matrix, k=10)

#     # Store result
#     new_row = pd.DataFrame([{'a': a, 'b': b, 'c': c, 'precision@10': p_at_k}])
#     results_df = pd.concat([results_df, new_row], ignore_index=True)
#     results_df.to_csv(results_file, index=False)

#     print(f"Saved result for a={a}, b={b}, c={c} to {results_file}")

# # Sort and show final results
# results_df_tfidf = results_df.sort_values(by='precision@10', ascending=False)
# results_df_tfidf.to_csv("hybrid_grid_results_final.csv", index=False)
# print("Final results saved to hybrid_grid_results_final.csv")

In [None]:
# grid search for all 4
# Define feasible combinations of weights (a: tfidf_sim, b: item_similarity, c: embedding_sim)
feasible_combinations = [
   (0.2, 0.6, 0.1, 0.1)
(0.15, 0.6, 0.15, 0.1)
(0.25, 0.5, 0.15, 0.1)
(0.3, 0.5, 0.1, 0.1)
(0.2, 0.5, 0.2, 0.1)
(0.1, 0.6, 0.2, 0.1)
(0.15, 0.55, 0.2, 0.1)
(0.05, 0.7, 0.15, 0.1)
(0.1, 0.5, 0.3, 0.1)
]

results_file = "hybrid4_grid_results_progress.csv"

# Load previous progress if it exists
if os.path.exists(results_file):
    results_df = pd.read_csv(results_file)
    done_combinations = set(tuple(row[:3]) for row in results_df.values)
else:
    results_df = pd.DataFrame(columns=['a', 'b', 'c', 'd' 'precision@10'])
    done_combinations = set()

# Run evaluation
for a, b, c, d in feasible_combinations:
    if (a, b, c, d) in done_combinations:
        print(f"Skipping already evaluated combination: a={a}, b={b}, c={c}, d={d}")
        continue

    print(f"Evaluating: a={a}, b={b}, c={c}, d={d}")

    # Build the hybrid similarity matrix
    hybrid_sim = a * tfidf_sim + b * item_similarity + c * embedding_sim +d* bert_sim

    # Predict using item-based CF
    prediction_matrix = item_based_predict(train_data_matrix, hybrid_sim)

    # Evaluate
    p_at_k, _ = precision_recall_at_k(prediction_matrix, test_data_matrix, k=10)

    # Store result
    new_row = pd.DataFrame([{'a': a, 'b': b, 'c': c, 'd':d, 'precision@10': p_at_k}])
    results_df = pd.concat([results_df, new_row], ignore_index=True)
    results_df.to_csv(results_file, index=False)

    print(f"Saved result for a={a}, b={b}, c={c}, d={d} to {results_file}")

# Sort and show final results
results_df_tfidf = results_df.sort_values(by='precision@10', ascending=False)
results_df_tfidf.to_csv("hybrid4_grid_results_final.csv", index=False)
print("Final results saved to hybrid4_grid_results_final.csv")

In [21]:
# #Upload gridsearch results
# grid_bert=pd.read_csv("https://media.githubusercontent.com/media/ML-brooowss/ML/refs/heads/main/hybrid_grid_results_final_bert.csv")
# grid_tfidf=pd.read_csv("https://media.githubusercontent.com/media/ML-brooowss/ML/refs/heads/main/hybrid_grid_results_final_tfidf.csv")
# print(grid_bert)
# print(grid_tfidf)

      a     b     c  precision@10
0  0.25  0.50  0.25      0.062656
1  0.20  0.60  0.20      0.061840
2  0.10  0.60  0.30      0.061776
3  0.30  0.60  0.10      0.061687
4  0.20  0.70  0.10      0.061355
5  0.15  0.70  0.15      0.061240
6  0.10  0.70  0.20      0.060921
7  0.10  0.80  0.10      0.060589
8  0.05  0.85  0.10      0.060117
      a     b     c  precision@10
0  0.25  0.50  0.25      0.062631
1  0.30  0.60  0.10      0.062325
2  0.20  0.60  0.20      0.062018
3  0.10  0.60  0.30      0.061980
4  0.15  0.70  0.15      0.061738
5  0.20  0.70  0.10      0.061725
6  0.10  0.70  0.20      0.061444
7  0.10  0.80  0.10      0.060959
8  0.05  0.85  0.10      0.060615


In [1]:
#  hybrid_sim = 0.2 * bert_sim + 0.5 * item_similarity + 0.2 * embedding_sim

NameError: name 'bert_sim' is not defined

In [23]:
# # Calculate the item-based predictions for positive interactions
# hybrid_prediction4 = item_based_predict(entire_data, hybrid_sim)
# print("Predicted Interaction Matrix:")
# print(hybrid_prediction)
# print(hybrid_prediction.shape)

Predicted Interaction Matrix:
[[0.00290516 0.00304285 0.0026322  ... 0.00163776 0.00155606 0.00172682]
 [0.00072044 0.00070297 0.00071525 ... 0.00074196 0.0008213  0.00070783]
 [0.00339528 0.00311079 0.00300202 ... 0.00395767 0.00455984 0.00304226]
 ...
 [0.00018084 0.0001624  0.00016105 ... 0.00023027 0.00025359 0.00017887]
 [0.00012423 0.00011479 0.00011343 ... 0.00014965 0.00018694 0.00011532]
 [0.00018606 0.00018718 0.00018463 ... 0.00020436 0.00023141 0.00019829]]
(7838, 15291)


In [25]:
# # Create df
# hybrid_df = create_recommendation_table(hybrid_prediction, top_n=10, separator=" ")

# # Save and display
# hybrid_df.to_csv('hybrid_recommendations.csv', index=False)

# print("\nItem-based Recommendations:")
# display(hybrid_df)


Item-based Recommendations:


Unnamed: 0,user_id,recommendation
0,0,11 14 20 18 1 12 24 15 22 0
1,1,31 35 33 30 36 29 37 34 38 32
2,2,80 76 94 92 93 54 78 44 79 81
3,3,132 157 123 151 172 168 161 166 155 156
4,4,192 200 194 204 203 191 199 202 206 197
...,...,...
7833,7833,975 7322 7760 7312 7306 5119 10460 7307 7308 7323
7834,7834,15276 13891 7128 1367 13952 101 10651 7116 305...
7835,7835,3055 4820 6791 3057 7116 7125 8370 3053 7112 1...
7836,7836,14550 3471 15184 14619 14552 14081 14166 14555...
