## Project - Anime Recommendation System

In [None]:
# Imports
import os 
import openai
import pandas as pd
import numpy as np
from dotenv import load_dotenv, find_dotenv

In [None]:
# Auth
# Load variables from the nearest .env file into the process environment;
# override=True lets them replace values that are already set.
load_dotenv(find_dotenv(), override=True)
# Also set the key on the openai module itself.
openai.api_key = os.getenv('OPENAI_API_KEY')
from openai import OpenAI
# Client object used by the embedding helper further down. It is built without
# arguments, so it presumably picks the key up from the environment loaded
# above - confirm against the SDK documentation.
client = OpenAI()

In [None]:
# Loading the Dataset into Pandas DataFrame

# Read the catalogue, discard rows with any missing value, order the rest by
# synopsis text (descending) and keep the first 2000 rows.
df = (
    pd.read_csv('./anime.csv')
    .dropna()
    .sort_values(by='synopsis', ascending=False)
    .head(2000)
)
df

In [None]:
# Embedding Cost Calculator
import tiktoken

# List price of text-embedding-3-small expressed in USD per 1,000 tokens
# ($0.02 per 1M tokens at the time of writing). The previous hard-coded rate,
# 0.0004 per 1,000 tokens, belonged to an older model and overstated the cost.
# Re-check OpenAI's pricing page before relying on this figure.
USD_PER_1K_TOKENS = 0.00002

enc = tiktoken.encoding_for_model('text-embedding-3-small')
synopsis = list(df['synopsis'])
# Count tokens with the same encoding the embedding model uses.
total_tokens = sum(len(enc.encode(item)) for item in synopsis)
print(f'Total Tokens:  {total_tokens}')
cost = total_tokens * (USD_PER_1K_TOKENS / 1000)
print(f'Estimated cost in USD:  {cost:.10f}')

In [None]:
# Calculate the embeddings and cache them locally

def get_embeddings_batch(texts, model="text-embedding-3-small", batch_size=2048):
    """Return one embedding vector per input text, in input order.

    The texts are sent to the embeddings endpoint in consecutive requests of
    at most ``batch_size`` items, so inputs longer than the per-request array
    limit of the API no longer fail. With the default size, any input of up
    to 2048 texts is still sent as a single request.

    Args:
        texts: A single string or an iterable of strings to embed.
        model: Name of the embedding model to use.
        batch_size: Maximum number of texts per request. 2048 is the
            documented upper bound for an input array.

    Returns:
        A list of embedding vectors (lists of floats). An empty input yields
        an empty list without contacting the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string is one text, not a sequence of characters.
    texts = [texts] if isinstance(texts, str) else list(texts)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            input=texts[start:start + batch_size],
            model=model,
        )
        # Every result records the position of its input inside the request;
        # ordering by it keeps the output aligned with ``texts``.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)
    return embeddings


def get_embeddings_and_save(embedding_cache_file):
    """Embed every synopsis of the global ``df`` and write the frame to CSV.

    The vectors are stored in a new ``embedding`` column that is added to
    ``df`` in place; the whole frame is then saved to ``embedding_cache_file``
    without the index column.
    """
    synopsis_column = df["synopsis"].astype(str)
    df["embedding"] = get_embeddings_batch(synopsis_column.tolist())
    df.to_csv(embedding_cache_file, index=False)

In [None]:
# Build the cache: embed the synopses through the embeddings API and write the
# result to the CSV file named below. Every run issues a new API request, so
# re-run this cell only when the cache has to be rebuilt.
embedding_cache_file = "anime_embeddings.csv"
get_embeddings_and_save(embedding_cache_file)

In [None]:
# Load the Embeddings
import ast

embedding_cache_file = 'anime_embeddings.csv'
df_embeddings = pd.read_csv(embedding_cache_file)
# The CSV holds each vector as the text of a Python list. ast.literal_eval
# parses that literal without executing code, whereas eval would run anything
# that ends up in the file.
df_embeddings['embedding'] = df_embeddings['embedding'].apply(
    lambda text: np.array(ast.literal_eval(text))
)

df_embeddings

In [None]:
# Get Recommendation from Title

def cosine_similarity(a, b):
    """Return the cosine of the angle between the vectors ``a`` and ``b``."""
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product


def get_recommendation_from_title(df_embeddings, title, k=5):
    """Return the ``k`` titles whose embeddings are most similar to ``title``.

    Similarity is the cosine between the embedding of ``title`` and the
    embedding of every other row, computed for all rows in one NumPy
    operation.

    Args:
        df_embeddings: DataFrame with a ``title`` column and an ``embedding``
            column. Each embedding may be an array, a list, or the text of a
            Python list (as read back from a CSV file).
        title: Title to look up; it must match a ``title`` value exactly.
        k: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, similarity)`` tuples ordered from most to least
        similar, excluding every row that carries ``title`` itself. If the
        title is not present, a message is printed and ``None`` is returned.

    Raises:
        ValueError, SyntaxError: If a text embedding is not a valid literal.
    """
    import ast  # local import keeps this notebook cell self-contained

    def _to_vector(value):
        # Text embeddings are parsed with literal_eval, which only accepts
        # literals and never executes code (unlike eval).
        if isinstance(value, str):
            value = ast.literal_eval(value)
        return np.asarray(value, dtype=float)

    titles = df_embeddings["title"].to_numpy()
    is_target = (df_embeddings["title"] == title).to_numpy(
        dtype=bool, na_value=False
    )

    # Check that the title exists.
    if not is_target.any():
        print("Título não encontrado.")
        return None

    # One row per anime; the first row matching the title is the reference.
    vectors = np.vstack([_to_vector(value) for value in df_embeddings["embedding"]])
    target = vectors[np.argmax(is_target)]

    dots = vectors @ target
    norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(target)
    # A zero-length vector has no direction; score it 0.0 instead of NaN so
    # the ranking below stays well defined.
    sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Highest similarity first; the stable sort keeps the original row order
    # among equal scores. Rows carrying the requested title are dropped.
    order = np.argsort(-sims, kind="stable")
    ranked = [(titles[i], sims[i]) for i in order if not is_target[i]]

    return ranked[:k]


In [None]:
# Ask for up to 10 recommendations for the title 'Boku'. The lookup is an exact
# match on the "title" column; when no row has that exact title the function
# prints its not-found message and returns None.
get_recommendation_from_title(df_embeddings, 'Boku', 10)