filtrado colaborativo: knn, svd
filtrado basado en contenido:

In [74]:
import pandas as pd

products = pd.read_parquet("../data/processed/products.parquet")
countries = pd.read_parquet("../data/processed/countries.parquet")
sales_orders = pd.read_parquet("../data/processed/sales_orders.parquet")
order_lines = pd.read_parquet("../data/processed/order_lines.parquet")
CRM = pd.read_parquet("../data/processed/CRM.parquet")

print("\n📌 Primeros registros de cada dataset:")
print("\n Products")
print(products.head())
print("\n Countries")
print(countries.head())
print("\n Sales orders")
print(sales_orders.head())
print("\n Order lines")
print(order_lines.head())
print("\n CRM")
print(CRM.head())



📌 Primeros registros de cada dataset:

 Products
   id                                    name     category
0   5                           communication          all
1   7                       standard delivery   deliveries
2   6                                expenses     expenses
3  12  audifonos argom bluetooth arg-hs-2552b  electronics
4   1                                   meals          all

 Countries
   id code            name
0   3   af     afghanistan
1   6   al         albania
2  62   dz         algeria
3  11   as  american samoa
4   1   ad         andorra

 Sales orders
  order_id   partner_invoice    partner_shipping          date_order state  \
0   s00051    lilianna perez  harvard university 2025-01-29 00:23:15  sale   
1   s00050     milena garcia           microsoft 2025-01-28 21:39:34  sale   
2   s00049     jordana alphy  harvard university 2025-01-28 21:39:00  sale   
3   s00048  humberto marcebo           microsoft 2025-01-28 21:38:35  sale   
4   s00047  humbe

In [75]:
CRM = CRM.rename(columns={"contact_name":"partner_invoice" })
products = products.rename(columns={"name":"product_name" })
order_data = pd.merge(order_lines, sales_orders, on="order_id", how="left")
order_data = pd.merge(order_data, CRM, on="partner_invoice", how="left")
order_data = pd.merge(order_data, products, on="product_name", how="left")

print(order_data.head())

                                   id_x order_id  \
0  f12dbcd1-e2d0-47a0-b320-a7f61be2fc9d   s00051   
1  abeb05a7-f00a-4da3-b473-62f042b24d0f   s00050   
2  947cf4ca-f16a-4580-a33f-a64b6272329b   s00050   
3  4d0ba4b1-35ad-4155-83fb-19ad272e624e   s00049   
4  6a5685ca-4aa7-45e4-a7c0-14ba69a65421   s00049   

                                 product_name  quantity  unit_price  subtotal  \
0  celular samsung a35 256gb negro 356ezkggto       1.0     26995.0   26995.0   
1                   sandwichera nikkei 264124       1.0       995.0     995.0   
2                    pan sobao buenhorno 10/1       4.0        60.0     240.0   
3  celular samsung a35 256gb negro 356ezkggto       1.0     26995.0   26995.0   
4         galletas dulces oreo regular 12 und       1.0       198.0     198.0   

  partner_invoice    partner_shipping          date_order state  ...  \
0  lilianna perez  harvard university 2025-01-29 00:23:15  sale  ...   
1   milena garcia           microsoft 2025-01-28 21:39:3

In [76]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

knn_features = order_data[["partner_invoice", "product_name", "category", "quantity", "unit_price"]]

knn_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ["partner_invoice", "product_name", "category"]),
        ('num', StandardScaler(), ['quantity', "unit_price"])
    ]
)

knn_processed_features = knn_preprocessor.fit_transform(knn_features)

In [77]:
from sklearn.neighbors import NearestNeighbors

knn_model = NearestNeighbors(n_neighbors=5, metric="cosine")

knn_model.fit(knn_processed_features)

In [78]:
knn_user_index = order_data[order_data["partner_invoice"] == 'lilianna perez'].index[0]

distances, indices = knn_model.kneighbors(knn_processed_features[knn_user_index].reshape(1, -1))

knn_fav_products = order_data.iloc[indices[0]]['product_name']
print("recommended product is: ")
print(knn_fav_products)
purchased_products = order_data[order_data['partner_invoice'] == 'lilianna perez']['product_name']
knn_recommended_not_purchase_products = knn_fav_products[~knn_fav_products.isin(purchased_products)]

print("recommended not purchase product is: ")
print(knn_recommended_not_purchase_products)




recommended product is: 
0     celular samsung a35 256gb negro 356ezkggto
46    celular samsung a35 256gb negro 356ezkggto
34        laptop lenovo 14p 8gb 256gb 82yt00q3us
3     celular samsung a35 256gb negro 356ezkggto
49    celular samsung a35 256gb negro 356ezkggto
Name: product_name, dtype: object
recommended not purchase product is: 
Series([], Name: product_name, dtype: object)


In [79]:
user_product_matrix = order_data.pivot_table(index='partner_invoice', columns='product_name', values='quantity', aggfunc='sum', fill_value=0)
print(user_product_matrix)

product_name        audifonos argom bluetooth arg-hs-2552b  \
partner_invoice                                              
brandon freeman                                        0.0   
harvard university                                     0.0   
humberto marcebo                                      27.0   
jordana alphy                                          2.0   
joshua barom                                           9.0   
kendry murkan                                          5.0   
lilianna perez                                         1.0   
milena garcia                                          0.0   
mylan ramos                                            1.0   

product_name        batidora cuisinart pedestal sm-50bc  \
partner_invoice                                           
brandon freeman                                     0.0   
harvard university                                  0.0   
humberto marcebo                                    5.0   
jordana alphy         

In [80]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=2)
svd_user_product_matrix = svd.fit_transform(user_product_matrix)

U = svd_user_product_matrix
Sigma = svd.singular_values_
Vt = svd.components_

print("\nUsuarios latentes (U):")
print(U)
print("\n Valores singulares (Sigma):")
print(Sigma)
print("\nProductos latentes (Vt):")
print(Vt)


Usuarios latentes (U):
[[  6.23879225   6.07556473]
 [  3.61165136  -2.2427328 ]
 [ 10.74695942   5.56861686]
 [ 19.68956459  35.62013742]
 [  6.27594442   8.02458736]
 [ 47.95691576 -15.62916459]
 [ 19.92948148  -6.1332592 ]
 [  5.30445486   5.49215253]
 [  5.68695842   0.22323927]]

 Valores singulares (Sigma):
[57.89696764 41.45167369]

Productos latentes (Vt):
[[ 0.19433774  0.03692121  0.42167373  0.35349358  0.72211478  0.18447797
   0.24807899  0.08867194  0.18224786  0.00107744]
 [ 0.12207737  0.0552428  -0.14679362  0.38450042 -0.44828551  0.23691953
   0.71964991  0.1780189  -0.07731246 -0.00130525]]


In [81]:
import numpy as np

Sigma_matrix = np.diag(Sigma)

predictions = np.dot(np.dot(U, Sigma_matrix), Vt)

predicted_ratings_df = pd.DataFrame(predictions, columns=user_product_matrix.columns, index=user_product_matrix.index)
print(predicted_ratings_df)




product_name        audifonos argom bluetooth arg-hs-2552b  \
partner_invoice                                              
brandon freeman                                 100.940432   
harvard university                               29.287807   
humberto marcebo                                149.099058   
jordana alphy                                   401.787420   
joshua barom                                    111.221113   
kendry murkan                                   460.501964   
lilianna perez                                  193.201675   
milena garcia                                    87.475425   
mylan ramos                                      65.116848   

product_name        batidora cuisinart pedestal sm-50bc  \
partner_invoice                                           
brandon freeman                               27.248682   
harvard university                             2.584712   
humberto marcebo                              35.724595   
jordana alphy         

In [82]:
user_name = 'lilianna perez'  
user_index = user_product_matrix.index.get_loc(user_name)
user_ratings = predicted_ratings_df.loc[user_name]  

unrated_products = user_product_matrix.loc[user_name] == 0  

svd_not_unrated_recommended_products = user_ratings[unrated_products]

print("Top recommended products:")
print(svd_not_unrated_recommended_products.head())

svd_fav_products = user_ratings.sort_values(ascending=False)  

print("\n fav products:")
print(svd_fav_products.head())



Top recommended products:
product_name
galletas dulces oreo regular 12 und    310.127856
licuadora oster cromada 3v blst4655    152.628145
refresco coca cola clasica 2.5 lt       57.056271
standard delivery                        1.575053
Name: lilianna perez, dtype: float64

 fav products:
product_name
laptop lenovo 14p 8gb 256gb 82yt00q3us        947.186223
celular samsung a35 256gb negro 356ezkggto    523.870896
galletas dulces oreo regular 12 und           310.127856
sandwichera nikkei 264124                     229.943335
audifonos argom bluetooth arg-hs-2552b        193.201675
Name: lilianna perez, dtype: float64


In [83]:
import pickle

with open("../models/collaborative_filtering/svd_model.pkl", "wb") as f:
    pickle.dump((U, Sigma, Vt), f)

with open("../models/collaborative_filtering/knn_model.pkl", "wb") as f:
    pickle.dump(knn_model, f)