In [2]:
import numpy as np
import polars as pl 
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
RUTA_CLIENTE_PRODUCTO_RATING = r"99. NBO\data\interim\cliente_producto_ratings.parquet"
RUTA_CLUSTER_PRODUCTO_RATING = r"99. NBO\data\interim\cluster_producto_rating.parquet"
RUTA_INFO_CLIENTE = r"99. NBO\data\interim\informacion_cliente.parquet"
RUTA_RATINGS = r"99. NBO\data\interim\cliente_rating.parquet"

In [90]:
rating_cliente_df = pl.scan_parquet(RUTA_RATINGS).collect().to_pandas()
info_cliente = pl.scan_parquet(RUTA_INFO_CLIENTE).collect()
cliente_producto_ratings = pl.scan_parquet(RUTA_CLIENTE_PRODUCTO_RATING).collect().fill_null(0)
cluster_rating = pl.scan_parquet(RUTA_CLUSTER_PRODUCTO_RATING).collect().fill_null(0)
matrix_client_product = pl.scan_parquet(RUTA_CLIENTE_PRODUCTO_RATING).collect().fill_null(0).to_numpy()[:,1:]
matrix_cluster_product = pl.scan_parquet(RUTA_CLUSTER_PRODUCTO_RATING).collect().fill_null(0).to_numpy()[:,3:]
LISTA_PRODUCTOS = pl.scan_parquet(RUTA_CLIENTE_PRODUCTO_RATING).drop('ID_CLIENTE').collect().columns

In [91]:
info_cliente = (
    info_cliente
    .with_columns(pl.concat_list(['SECTOR_ECONOMICO','REGION','CLUSTER']).list.join('-').alias('CLUSTER'))
    .drop(['SECTOR_ECONOMICO','REGION','PRODUCTO'])
    .unique()
    .to_pandas()
)
info_cliente.head()

Unnamed: 0,ID_CLIENTE,CLUSTER
0,553052,OTRO-Rural-Poco Transaccional
1,563528,COMBUSTIBLES PARA VEHICULO_--Rural-Lovers Activos
2,374712,TRANSPORTE TERRESTRE_-¯-Rural-Pequeños
3,286481,COMERCIO INTERNO (GUATEMALA)_-¯-Rural-Pequeños
4,561260,COMERCIO INTERNO (GUATEMALA)_-¯-Rural-Ahorrado...


In [92]:
cluster_rating = (
    cluster_rating
    .with_columns(pl.concat_list(['SECTOR_ECONOMICO','REGION','CLUSTER']).list.join('-').alias('CLUSTER'))
    .drop(['SECTOR_ECONOMICO','REGION'])
    .to_pandas()
)
cluster_rating.head()

Unnamed: 0,CLUSTER,Depósitos Monetarios Dólares,Disvisas,Monetarios Plus Quetzales,Planes de Pago,Depósitos Monetarios Quetzales,Visa Banco Internacional,InterDía Dólares,Ahorro Corriente Quetzales,InterDía Quetzales,...,Financiamiento de Importación,Cartas de Crédito de Importación,Inversión Creciente Dólares,Bonos Hipotecarios,Cuenta Monedero Quetzales,Cartas de Crédito Stand By,Cuenta InterNómina Quetzales,Cobranza,Cartas de Crédito de Exportación,FHA
0,OTRO-Rural-Poco Transaccional,3.585106,4.540107,3.396088,0.0,3.991667,1.0,3.230769,3.75,2.62069,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,COMERCIO DE IMPORTACION_-¯-Rural-Poco Transacc...,3.317073,4.83125,3.576471,0.0,4.340426,0.0,4.333333,4.0,3.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,INMOBILIARIA_--Rural-Poco Transaccional,3.5,4.531746,3.758865,0.0,4.237288,0.0,3.0,3.615385,3.888889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,INMOBILIARIA_--Rural-Potenciales,3.571429,4.125,3.513514,5.243902,0.0,1.0,0.0,5.0,5.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OTRO-Rural-Ahorrador Potencial,4.911392,4.705645,4.939024,0.0,5.107843,0.0,4.777778,4.333333,4.954545,...,0.0,0.0,3.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Obtenemos customer-product rating matrix

In [12]:
# Customer-Product Rating Matrix
# - Rows representing products.
# - Columns representing customers.
# - Values representing the ratings customers gave to products.
matrix_client_product.T

array([[4, 4, 0, ..., 0, 0, 0],
       [0, 6, 0, ..., 0, 0, 0],
       [0, 4, 2, ..., 3, 6, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

In [14]:
# Obtenemos similaridad de coseno entre productos
# similarity between product pairs
product_similarity_client = cosine_similarity(matrix_client_product.T)

In [16]:
product_similarity_df = pd.DataFrame(
    product_similarity_client,
    index=LISTA_PRODUCTOS,
    columns=LISTA_PRODUCTOS
)

In [19]:
product_similarity_df.head()

Unnamed: 0,Depósitos Monetarios Dólares,Disvisas,Monetarios Plus Quetzales,Planes de Pago,Depósitos Monetarios Quetzales,Visa Banco Internacional,InterDía Dólares,Ahorro Corriente Quetzales,InterDía Quetzales,InterSorteo Quetzales,...,Financiamiento de Importación,Cartas de Crédito de Importación,Inversión Creciente Dólares,Bonos Hipotecarios,Cuenta Monedero Quetzales,Cartas de Crédito Stand By,Cuenta InterNómina Quetzales,Cobranza,Cartas de Crédito de Exportación,FHA
Depósitos Monetarios Dólares,1.0,0.431267,0.295151,0.126139,0.225415,0.00922,0.076016,0.053219,0.077953,0.008477,...,0.045533,-0.02507,0.055791,0.0,0.0,-0.021711,0.0,-0.017727,-0.037605,0.0
Disvisas,0.431267,1.0,0.442658,0.237068,0.286797,0.127462,0.116068,0.102158,0.137147,0.064095,...,0.037857,-0.004078,0.036571,0.0,-0.023545,-0.042381,0.026462,-0.017302,-0.004078,0.011719
Monetarios Plus Quetzales,0.295151,0.442658,1.0,0.247968,0.050482,0.052806,0.071092,0.050762,0.152584,0.029327,...,0.01692,0.0,0.018146,0.0,-0.020502,-0.029044,0.0,-0.025109,-0.017755,0.002501
Planes de Pago,0.126139,0.237068,0.247968,1.0,0.105105,0.222566,0.02214,0.048001,0.103181,0.152361,...,0.016278,0.0,0.0,0.0,0.0,-0.015524,0.04689,-0.01426,-0.03361,0.020452
Depósitos Monetarios Quetzales,0.225415,0.286797,0.050482,0.105105,1.0,0.044941,0.059555,0.103053,0.079159,0.022759,...,0.038579,-0.030185,0.020567,0.0,0.0,-0.017427,0.000816,-0.007115,0.0,0.010204


# Obtenemos cluster-product rating matrix

In [65]:
# Cluster-Product Rating Matrix
# - Rows representing products.
# - Columns representing cluster.
# - Values representing the ratings customers gave to products.
matrix_cluster_product.T

array([[3.5851063829787235, 3.317073170731707, 3.5, ..., 0.0, 0.0, 0.0],
       [4.540106951871658, 4.83125, 4.531746031746032, ..., 0.0, 0.0,
        0.0],
       [3.3960880195599024, 3.5764705882352943, 3.7588652482269502, ...,
        6.0, 2.0, 1.0],
       ...,
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

# Recomendador

## Predicted ratings

- item-based collaborative filtering
- 

Obtenemos top-N productos similares a cada producto

In [20]:
def get_similar_products(product_id, product_similarity_df, top_n=3):
    # Sort products by similarity score in descending order
    similar_products = product_similarity_df[product_id].sort_values(ascending=False)
    # Exclude the product itself and select the top-N similar products
    return similar_products.drop(product_id).head(top_n)

In [40]:
get_similar_products('Visa Banco Internacional',product_similarity_df,5).index

Index(['InterPréstamo', 'Planes de Pago', 'InterSorteo Quetzales', 'InterCasa',
       'Cuenta InterNómina Quetzales'],
      dtype='object')

### Predicted ratings

In [41]:
(rating_cliente_df.head(1)[get_similar_products('Visa Banco Internacional',product_similarity_df,5).index].values)[0]

array([0, 0, 0, 0, 0])

In [56]:
def rating_new_product(df, product_similarity_df, producto, id_cliente):

    top_similar_products = get_similar_products(producto,product_similarity_df,5).index
    top_similarity = get_similar_products(producto,product_similarity_df,5).values

    # Avoid division by zero
    if np.sum(top_similarity) == 0:
        return 0

    rating = df.loc[df.ID_CLIENTE==id_cliente][top_similar_products].values[0]

    return np.dot(rating,top_similarity) / np.sum(top_similarity)


In [57]:
rating_new_product(rating_cliente_df, product_similarity_df, 'Disvisas', '564160')

np.float64(2.572506506871034)

### Cluster based recommendation

In [96]:
info_cliente.loc[info_cliente.ID_CLIENTE=='553052'].CLUSTER.values[0]

'OTRO-Rural-Poco Transaccional'

In [115]:
cluster_rating.loc[cluster_rating.CLUSTER=='OTRO-Rural-Poco Transaccional']['Disvisas'].values[0]

np.float64(4.540106951871658)

In [113]:
'458171' not in list(info_cliente.ID_CLIENTE.values)

False

In [147]:
def get_df_cluster_rating(id_cliente:str, producto:str, df_info_cliente, df_cluster_rating):

    if id_cliente not in list(df_info_cliente.ID_CLIENTE.values):
        return 0  # No cluster information
    
    cluster = df_info_cliente.loc[df_info_cliente.ID_CLIENTE==id_cliente].CLUSTER.values[0]

    rating = df_cluster_rating.loc[df_cluster_rating.CLUSTER==cluster][producto].values[0]

    return (rating)
    

In [148]:
(get_df_cluster_rating('458171','Disvisas', info_cliente, cluster_rating))

np.float64(5.333333333333333)

### Final rating

In [None]:
rating_new_product()

In [166]:
def compute_final_rating(customer_id, product_id, product_similarity_df, rating_cliente_df, df_info_cliente, df_cluster_rating):
    """
    Compute final rating by combining collaborative filtering and cluster-based recommendations.
    """
    pred_rating = rating_new_product(rating_cliente_df, product_similarity_df, product_id, customer_id)
    cluster_rating = get_df_cluster_rating(customer_id, product_id, df_info_cliente, df_cluster_rating)

    # Dynamic weighting
    num_rated_products = sum(rating_cliente_df.loc[rating_cliente_df.ID_CLIENTE==customer_id].values[0][1:] != 0)
    alpha = min(0.3 + (num_rated_products / 100), 1)  # Adjust alpha based on customer data
    beta = 1 - alpha  # Ensure weights sum to 1

    return alpha * pred_rating + beta * cluster_rating

In [178]:
compute_final_rating(
    '563528',
    'Depósitos Monetarios Dólares',
    product_similarity_df,
    rating_cliente_df,
    info_cliente,
    cluster_rating
)

np.float64(4.307928648095074)