In [1]:
import ast
import faiss
import numpy as np
import pandas as pd

In [2]:
# Load the precomputed embeddings table (columns: unique_id, input_string, embedding).
df = pd.read_json('../data/embeddings_data.json')

In [3]:
# Sanity-check the row count, column names and dtypes before building the index.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14740 entries, 0 to 14739
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   unique_id     14740 non-null  object
 1   input_string  14740 non-null  object
 2   embedding     14740 non-null  object
dtypes: object(3)
memory usage: 460.6+ KB


In [4]:
# Stack the per-row embedding lists into one 2-D matrix; FAISS works on float32.
vectors = np.stack(df['embedding'].to_numpy()).astype(np.float32)

In [5]:
# Embedding dimensionality (length of one row); used to size the FAISS L2 index.
d = vectors.shape[1]
d

768

In [6]:
# Flat (exhaustive-search) FAISS index over d-dimensional vectors, compared by L2 distance.
index = faiss.IndexFlatL2(d)

In [7]:
# Store every embedding row in the index.
# NOTE(review): later cells treat FAISS result ids as row positions, which presumes
# ids are assigned sequentially in insertion order — confirm against the FAISS docs.
index.add(vectors)

In [8]:
# Sanity check: the index should now hold one vector per row of `df`.
print(f"Number of vectors in the index: {index.ntotal}")

Number of vectors in the index: 14740


In [9]:
# Keep the unique ids in row order so that position i in this list gives the
# unique_id of row i (intended for mapping FAISS result positions back to ids).
id_mapping = df['unique_id'].to_list()

In [11]:
import faiss
import numpy as np
import pandas as pd

In [2]:
# Load the persisted FAISS index from disk.
# NOTE(review): this file is produced by the faiss.write_index(...) cell at the very
# end of this notebook, so on a fresh checkout the index must be built and written
# before this cell can run.
index = faiss.read_index('../data/faiss_index.index')

In [3]:
# Anime metadata table; rows are later looked up by label via .loc using FAISS ids.
# NOTE(review): presumably its row labels line up with the index positions — TODO confirm.
anime_dataframe = pd.read_csv('../data/test_data_pre_processed.csv')

In [7]:
# Same embeddings file that was loaded as `df` earlier; used here to fetch the
# embedding of a query item by row label.
index_df = pd.read_json('../data/embeddings_data.json')

In [21]:
def get_3_new_recommendations(query_vector, already_recommended, index, num_recommendations=5):
    """
    Return the nearest neighbours of a query vector, skipping excluded items.

    Searches the FAISS index for the closest vectors to ``query_vector`` and
    collects, in order of increasing distance, the hits that are not listed in
    ``already_recommended``.

    Args:
        query_vector (array-like): The query embedding. Any array-like with
            ``d`` elements; it is converted to a float32 array of shape (1, d).
        already_recommended (list): Indices of items that must not be returned
            (e.g. items shown before, or the query item itself).
        index (faiss.Index): Index to search; must expose ``search(x, k)``.
        num_recommendations (int, optional): Maximum number of new
            recommendations to return. Defaults to 5.

    Returns:
        recommendations (list): Indices of the new recommended items, nearest
            first. May be shorter than ``num_recommendations`` when the index
            does not contain enough non-excluded vectors.
        rec_distances (list): Distances to the recommended items, aligned with
            ``recommendations``.
    """
    recommendations = []
    rec_distances = []

    # Nothing requested: avoid searching (and avoid never hitting the
    # "collected enough" stop condition below).
    if num_recommendations <= 0:
        return recommendations, rec_distances

    # FAISS expects a contiguous float32 matrix of shape (n_queries, d).
    query_vector = np.ascontiguousarray(query_vector, dtype=np.float32).reshape(1, -1)

    # Set gives O(1) membership checks; it also tracks ids we have already emitted.
    excluded = set(already_recommended)

    # Over-fetch so that enough candidates remain after filtering.
    k = num_recommendations + len(already_recommended) + 5

    distances, faiss_indices = index.search(query_vector, k)

    for idx, dist in zip(faiss_indices[0], distances[0]):
        # FAISS pads the result with -1 labels when fewer than k neighbours
        # exist; those are not real items and must not be recommended.
        if idx < 0 or idx in excluded:
            continue

        excluded.add(idx)
        recommendations.append(idx)
        rec_distances.append(dist)

        # Stop when we've collected the desired number of recommendations
        if len(recommendations) == num_recommendations:
            break

    return recommendations, rec_distances


In [22]:
def anime(recommendations):
    """Print a zero-based numbered list of English titles for the given row labels.

    Each entry of ``recommendations`` is looked up by label in the module-level
    ``anime_dataframe`` and its ``name_english`` value is printed.
    """
    for position, row_label in enumerate(recommendations):
        title = anime_dataframe.loc[row_label].name_english
        print(f"{position}. -> {title}")

In [25]:
# Demo: find titles similar to one seed anime, excluding items already shown.
anime_index = 11
# The seed itself is excluded too, otherwise it would come back as its own nearest neighbour.
already_recommended = [0, 6, anime_index]
query_vector = np.array(index_df.loc[anime_index, 'embedding'])

recommendations, distances = get_3_new_recommendations(
    query_vector,
    already_recommended,
    index,
)
print("Recommendations:", recommendations)
print("Distances:", distances)

print(f"Anime similar to : {anime_dataframe.loc[anime_index].name_english}")
anime(recommendations)

Recommendations: [617, 403, 29, 1849, 526]
Distances: [23.983263, 24.013151, 25.307737, 26.214542, 26.5361]
Anime similar to : Kaguya-sama: Love is War - The First Kiss That Never Ends
0. -> Tsukigakirei
1. -> Honey and Clover II
2. -> The Dangers in My Heart Season 2
3. -> To LOVE Ru Darkness 2
4. -> Nodame Cantabile: Paris-hen


In [70]:
# Persist the index to disk.
# NOTE(review): an earlier cell reads this same path with faiss.read_index, so in
# top-to-bottom order this write comes after the read that depends on it; whichever
# `index` object is bound in the kernel at this point is what gets written.
faiss.write_index(index, "../data/faiss_index.index")