# Instalação de bibliotecas

In [1]:
%pip install pandas
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Importação de bibliotecas

In [2]:
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from joblib import dump

# Dataframe

In [3]:
# Carrega os dados do arquivo CSV em um DataFrame Pandas
df = pd.read_csv('./data/processed/base-de-dados-final.csv')

# Cria uma matriz de avaliações onde cada linha representa um avaliador, 
# cada coluna representa um projeto e os valores representam as avaliações dadas pelos avaliadores aos projetos.
# Qualquer célula sem avaliação (NaN) é preenchida com 0.
matriz_avaliacoes = df.pivot(index='avaliador_id', columns='projeto_id', values='avaliacao').fillna(0)

matriz_avaliacoes

projeto_id,1,2,3,4,5,6,7,8,9,10,...,1415,1416,1417,1418,1419,1420,1421,1422,1423,1424
avaliador_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1422,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Treinamento do modelo

In [4]:
# Define o modelo de recomendação usando NMF
modelo = NMF(n_components=10, init='random', random_state=42)

# Treina o modelo NMF usando as avaliações dos usuários nos projetos
# W contém as características latentes dos usuários
# H contém as características latentes dos projetos
W = modelo.fit_transform(matriz_avaliacoes)
H = modelo.components_

# Exporta o modelo treinado para uso futuro
# O arquivo é salvo no diretório './models' com o nome 'modelo-de-recomendacao-por-filtragem-colaborativa-por-usuario.pkl'
dump((modelo, W, H, matriz_avaliacoes), './models/modelo-de-recomendacao-por-filtragem-colaborativa-por-usuario2.pkl')



['./models/modelo-de-recomendacao-por-filtragem-colaborativa-por-usuario2.pkl']

# Recomendação (Função de Inferência)

In [5]:
# Função para recomendar projetos para um usuário
def recomendar(usuario_id):
    # Índice do usuário na matriz
    idx_usuario = matriz_avaliacoes.index.get_loc(usuario_id)
    
    # Pontuações previstas para o usuário
    pontuacoes = W[idx_usuario, :].dot(H)
    
    # Projetos já avaliados pelo usuário
    projetos_avaliados = matriz_avaliacoes.columns[matriz_avaliacoes.iloc[idx_usuario, :] > 0].tolist()

    # Filtra as pontuações para excluir projetos já avaliados pelo usuário
    # Cria uma lista de dicionários contendo o ID do projeto e sua pontuação
    pontuacoes = [{'id': matriz_avaliacoes.columns[index], 'pontuacao': pontuacao} for index, pontuacao in enumerate(pontuacoes) if projeto_id not in projetos_avaliados]
    
    # Ordena as recomendações por pontuação, do maior para o menor
    pontuacoes = sorted(pontuacoes, key=lambda x: x['pontuacao'], reverse=True)
    
    return pontuacoes

# Exemplo de recomendação para o usuário 400
recomendar(10000)

KeyError: 10000

# Avaliação

In [None]:
# Obtém as previsões do modelo para todas as avaliações e arredonda os valores
previsoes = pd.DataFrame(W.dot(H), 
                         columns=matriz_avaliacoes.columns, 
                         index=matriz_avaliacoes.index)

# Calcula o RMSE e o MAE (Root Mean Square Error e Mean Absolute Error) para avaliar o desempenho do modelo
# Primeiro, converte as matrizes reais e previstas em arrays numpy para cálculo
avaliacoes_reais = matriz_avaliacoes.values
avaliacoes_previstas = previsoes.values

# Remove os valores nulos nas previsões (caso haja)
# Isso é necessário para garantir que apenas as avaliações reais e previstas correspondentes sejam usadas nos cálculos das métricas
avaliacoes_reais = avaliacoes_reais[~np.isnan(avaliacoes_previstas)]
avaliacoes_previstas = avaliacoes_previstas[~np.isnan(avaliacoes_previstas)]

# Calcula o RMSE e o MAE usando as métricas do Scikit-Learn
rmse = mean_squared_error(avaliacoes_reais, avaliacoes_previstas, squared=False)
mae = mean_absolute_error(avaliacoes_reais, avaliacoes_previstas)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')

RMSE: 0.2174727555834446
MAE: 0.02410406059010038


## Pipeline de Re-treinamento

In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine
from sklearn.decomposition import NMF
from joblib import dump
from datetime import datetime

def retreinar_modelo():
    load_dotenv()

    # Definir a string de conexão
    DATABASE_URI = os.getenv('DB_URL').replace('postgres', 'postgresql')

    engine = create_engine(DATABASE_URI)

    # Consultas para carregar os dados
    query_ratings = 'SELECT * FROM Interacao WHERE avaliacao IS NOT NULL'

    # Carregar dados em DataFrames
    df_ratings = pd.read_sql(query_ratings, engine)

    # Criar a matriz de avaliações dos usuários nos projetos
    matriz_avaliacoes = df_ratings.pivot(index='ceo_id', columns='projeto_id', values='avaliacao').fillna(0)

    if matriz_avaliacoes.shape[0] == 0 or matriz_avaliacoes.shape[1] == 0:
        return None
    
    # Define o modelo de recomendação usando NMF
    modelo = NMF(n_components=10, init='random', random_state=42)

    # Treina o modelo NMF usando as avaliações dos usuários nos projetos
    W = modelo.fit_transform(matriz_avaliacoes)
    H = modelo.components_

    # Adiciona timestamp ao nome do arquivo
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    modelo_path = f'./models/modelo-de-recomendacao-por-filtragem-colaborativa-{timestamp}.pkl'

    # Exporta o modelo treinado para uso futuro
    dump((modelo, W, H, matriz_avaliacoes), modelo_path)

    return modelo_path

# Chamando a função de retreinamento
retreinar_modelo()


In [None]:
import os
from joblib import load

def get_latest_model_path(models_dir='./models'):
    # Lista todos os arquivos no diretório de modelos
    model_files = [f for f in os.listdir(models_dir) if f.startswith('modelo-de-recomendacao-por-filtragem-colaborativa')]
    
    # Ordena os arquivos por timestamp
    model_files.sort(reverse=True)
    
    # Retorna o caminho do arquivo mais recente
    latest_model_path = os.path.join(models_dir, model_files[0])
    
    return latest_model_path

def gerar_recomendacoes(usuario_id):
    # Carregar o modelo mais recente
    latest_model_path = get_latest_model_path()
    
    _, W, H, matriz_avaliacoes = load(latest_model_path)

    # Índice do usuário na matriz
    idx_usuario = matriz_avaliacoes.index.get_loc(usuario_id)
    
    # Pontuações previstas para o usuário
    pontuacoes = W[idx_usuario, :].dot(H)
    
    # Projetos já avaliados pelo usuário
    projetos_avaliados = matriz_avaliacoes.columns[matriz_avaliacoes.iloc[idx_usuario, :] > 0].tolist()

    # Filtra as pontuações para excluir projetos já avaliados pelo usuário
    # Cria uma lista de dicionários contendo o ID do projeto e sua pontuação
    pontuacoes = [{'id': matriz_avaliacoes.columns[index], 'pontuacao': pontuacao} for index, pontuacao in enumerate(pontuacoes) if matriz_avaliacoes.columns[index] not in projetos_avaliados]
    
    # Ordena as recomendações por pontuação, do maior para o menor
    pontuacoes = sorted(pontuacoes, key=lambda x: x['pontuacao'], reverse=True)
    
    return pontuacoes

# Exemplo de recomendação para o usuário 1
recomendacoes = gerar_recomendacoes(1)
print(recomendacoes)

[{'id': 703, 'pontuacao': 0.4151836282310964}, {'id': 615, 'pontuacao': 0.2800197443190649}, {'id': 588, 'pontuacao': 0.2750839825743286}, {'id': 1277, 'pontuacao': 0.19363687829882187}, {'id': 1313, 'pontuacao': 0.1912388414739678}, {'id': 253, 'pontuacao': 0.18715196010310672}, {'id': 367, 'pontuacao': 0.1800239704141252}, {'id': 520, 'pontuacao': 0.16818726785491386}, {'id': 231, 'pontuacao': 0.16688052878171813}, {'id': 838, 'pontuacao': 0.1635126835784698}, {'id': 1098, 'pontuacao': 0.15429208052570445}, {'id': 880, 'pontuacao': 0.1466558943317783}, {'id': 464, 'pontuacao': 0.13093910200042697}, {'id': 306, 'pontuacao': 0.11525639600273026}, {'id': 945, 'pontuacao': 0.11466459127937631}, {'id': 1272, 'pontuacao': 0.1144721005337908}, {'id': 1355, 'pontuacao': 0.11219309203187224}, {'id': 853, 'pontuacao': 0.1105859298144189}, {'id': 1142, 'pontuacao': 0.10931141742814983}, {'id': 701, 'pontuacao': 0.10927261776405163}, {'id': 742, 'pontuacao': 0.10156399582743031}, {'id': 925, 'po