### Instalaciones necesarias

In [6]:
# !pip install tensorflow
# !pip install -q tensorflow-recommenders
# !pip install -q --upgrade tensorflow-datasets

### Importación de librerías

In [7]:
# Importación de Tensorflow 
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_recommenders as tfrs

In [8]:
# Importación de pandas para el manejo de dataframes, además de expresiones regulares (re)
import pandas as pd
import re

### Variables generales

Se establecen para facilitar el cambio de variables recurrentes y/o cambiantes de forma sencilla

In [12]:
pathToDF = "../../../Inputs/Creados - Proyecto/"
fileToDF = "dfVentasDefinitivo.csv"

byColumn = "product_id"

### Importación de datos

In [13]:
df = pd.read_csv(f"{pathToDF}{fileToDF}")

In [14]:
#Sólo ejecutar una vez, elimina primera columna si la exportación de dicho CSV no se realizó con el parámetro "index=False"
df.drop(columns=df.columns[0], axis=1, inplace=True)

In [16]:
df.head(2)

Unnamed: 0,item_id,num_order,created_at,product_id,qty_ordered,base_cost,price,discount_percent,customer_id,Zipcode,...,hour,week,day,margin_total,price_total,name,marca_value,analytic_category,nombre_corto,num_compras
0,000010d95384a6ba3d57dd870e7b337c,65717498f0771a49497d80f11160093c,2017-09-22 15:46:37,5645,1,7.4441,10.38,7.0,da5b59745c6a4699dee7684eba901bba,28028,...,15,38,Friday,2.9359,10.38,Weleda Hombre Crema Hidratante 30 ml,weleda,cosmética y belleza,Weleda Hombre Crema Hidratante 30,1
1,00001a8fb0bd42b1e16ba731e30cc490,09b538e85ce396ecbb70695f91007830,2018-09-12 21:27:08,28743,2,35.3166,43.08,7.0,531a918355010bacbe506243a5f05c30,12194,...,21,89,Wednesday,15.5268,86.16,Gynea Gestagyn Men 60 Capsulas,gynea,vida íntima,Gynea Gestagyn Men 60 Capsulas,1


### Depuración de datos

Aplicación de expresión regular que elimina cantidades de los nombres originales de los productos, con el fin de mejorar la legilibilidad y posterior reducción.

In [17]:
df["name"] = df["name"].apply(lambda x: re.sub("\d+\s*\S*\w+\s*\S*\w", "", x))

**Creación de dataset de ventas**

In [18]:
df2 = df[["customer_id", byColumn]]

In [19]:
df3 = df2.drop_duplicates(subset=['customer_id'])
df3.reset_index(inplace=True)
df3.drop(df3.columns[[0]], axis=1, inplace=True)
df3.reset_index(inplace=True)
df3 = df3[["index", "customer_id"]]

df4 = df2
df4 = pd.merge(df3, df4, how="inner", on=["customer_id"])
df4 =df4[["index",byColumn]]
df4['index'] = df4['index'].astype(str)
df4.rename(columns = {'index':'customer_id'}, inplace = True)
df4[byColumn] = df4[byColumn].astype("str")

print(df4.shape)
print(df4.dtypes)
df4.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.drop(df3.columns[[0]], axis=1, inplace=True)


(810167, 2)
customer_id    object
product_id     object
dtype: object


Unnamed: 0,customer_id,product_id
0,0,5645
1,0,36943
2,0,5645
3,0,8635
4,0,4629


In [24]:
# Conversión de dataframe de pandas para uso en Tensorflow
sales = tf.data.Dataset.from_tensor_slices(dict(df4))

Comprobación de conversión

In [25]:
for row in sales.take(5):
  print(row)

{'customer_id': <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, 'product_id': <tf.Tensor: shape=(), dtype=string, numpy=b'5645'>}
{'customer_id': <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, 'product_id': <tf.Tensor: shape=(), dtype=string, numpy=b'36943'>}
{'customer_id': <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, 'product_id': <tf.Tensor: shape=(), dtype=string, numpy=b'5645'>}
{'customer_id': <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, 'product_id': <tf.Tensor: shape=(), dtype=string, numpy=b'8635'>}
{'customer_id': <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, 'product_id': <tf.Tensor: shape=(), dtype=string, numpy=b'4629'>}


**Creación de dataset de productos**

In [26]:
dfProduct = df2[byColumn].unique().tolist()
dfProduct = pd.DataFrame(dfProduct, columns=[byColumn])
dfProduct[byColumn] = dfProduct[byColumn].astype("str")
print(dfProduct.shape)
dfProduct.head(5)

(19787, 1)


Unnamed: 0,product_id
0,5645
1,28743
2,68986
3,9692
4,81921


In [27]:
# Conversión de dataframe de pandas para uso en Tensorflow
products = tf.data.Dataset.from_tensor_slices(dict(dfProduct))

Comprobación de conversión

In [28]:
for row in products.take(3):
  print(row)

{'product_id': <tf.Tensor: shape=(), dtype=string, numpy=b'5645'>}
{'product_id': <tf.Tensor: shape=(), dtype=string, numpy=b'28743'>}
{'product_id': <tf.Tensor: shape=(), dtype=string, numpy=b'68986'>}


### Aplicación de TensorFlow

Se define el alcance de las variables

In [30]:
sales = sales.map(lambda x: {
    byColumn: x[byColumn],
    "customer_id": x["customer_id"],
})
products = products.map(lambda x: x[byColumn])

In [31]:
sales

<MapDataset element_spec={'product_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'customer_id': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [33]:
sales_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
sales_vocabulary.adapt(sales.map(lambda x: x["customer_id"]))

products_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
products_vocabulary.adapt(products)

In [62]:
class RecomendatorModel(tfrs.Model):

  def __init__(
      self,
      sales_model: tf.keras.Model,
      products_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    super().__init__()

    self.sales_model = sales_model
    self.products_model = products_model

    self.task = task


  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

    sales_embeddings = self.sales_model(features["customer_id"])
    products_embeddings = self.products_model(features[byColumn])

    return self.task(sales_embeddings, products_embeddings)

In [60]:
# Define capas modelos de ventas y productos
sales_model = tf.keras.Sequential([
    sales_vocabulary,
    tf.keras.layers.Embedding(sales_vocabulary.vocabulary_size(), 64)
])

products_model = tf.keras.Sequential([
    products_vocabulary,
    tf.keras.layers.Embedding(products_vocabulary.vocabulary_size(), 64)
])

# Define los objetivos
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    products.batch(128).map(products_model)
  )
)


In [63]:
# Creación del modelo
model = RecomendatorModel(sales_model, products_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Entrenamiento para tres 'epochs'
model.fit(ratings.batch(4096), epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4ce374a920>

In [67]:
# Uso de fuerza bruta
index = tfrs.layers.factorized_top_k.BruteForce(model.sales_model)
index.index_from_dataset(
    products.batch(100).map(lambda title: (title, model.products_model(title))))

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f4ce374bd30>

**Guardado de modelo**

In [68]:
#model.save_weights("checkpoint")
#model.save('my_model.tf')
#model.summary()

**Chequeo preliminar de resultados**

In [69]:
numCustomer = "40"
_, titles = index(np.array([numCustomer]))
print(f"Top 3 recommendations for User{numCustomer}: {titles[0, :5]}")

Top 3 recommendations for User40: [b'8362' b'62785' b'10906' b'88497' b'9537']


In [70]:
numCustomer = "1"
_, titles = index(np.array([numCustomer]))
print(f"Top 3 recommendations for User{numCustomer}: {titles[0, :5]}")

Top 3 recommendations for User1: [b'81348' b'56981' b'9972' b'91125' b'94992']


### Depuración de resultados para dataframe final

In [71]:
df5=df4
print(df5.shape)
df5.head()

(810167, 2)


Unnamed: 0,customer_id,product_id
0,0,5645
1,0,36943
2,0,5645
3,0,8635
4,0,4629


In [72]:
df5['recomendation1'] = np.nan
df5['recomendation2'] = np.nan
df5['recomendation3'] = np.nan
df5.head()

Unnamed: 0,customer_id,product_id,recomendation1,recomendation2,recomendation3
0,0,5645,,,
1,0,36943,,,
2,0,5645,,,
3,0,8635,,,
4,0,4629,,,


A continuación se trasladan las recomendaciones del modelo a un dataframe de Pandas

In [None]:
for element in range(df5.shape[0]):
    _, titles = index(np.array([str(element)]))
    result = [element2.decode('utf-8') for element2 in titles.numpy()[0]][:3]
    df5.iloc[element, df5.columns.get_loc('recomendation1')] = result[0]
    df5.iloc[element, df5.columns.get_loc('recomendation2')] = result[1]
    df5.iloc[element, df5.columns.get_loc('recomendation3')] = result[2]
    if element%10000==0:
        print(element)
df5.head()

En este punto, se prepara el dataframe de Pandas obtenido para su uso en el Recomendador

In [75]:
df6 = df5

In [76]:
df7 = pd.merge(df6, df2, right_index=True, left_index=True)
df7=df7[["customer_id_y", f"{byColumn}_y", "recomendation1", "recomendation2", "recomendation3"]]
df7.rename(columns = {'customer_id_y':'customer_id',"name_y":"name"}, inplace = True)
df7.head()

Unnamed: 0,customer_id,product_id_y,recomendation1,recomendation2,recomendation3
0,da5b59745c6a4699dee7684eba901bba,5645,36943,5645,72324
1,531a918355010bacbe506243a5f05c30,28743,81348,56981,9972
2,14e6f6400d1c114d509844be3687cb19,68986,72920,95729,9070
3,872bd419dfb24caf4f996a2cd2b8a9b4,9692,10504,72240,12493
4,8a1b78fb0503a964a7fb19135d429b78,81921,81921,62707,17647


In [80]:
df8=df7
df8 = df8.groupby(by="customer_id", dropna=False).first().reset_index()

print(df8.shape)
df8.tail()

(113522, 5)


Unnamed: 0,customer_id,product_id_y,recomendation1,recomendation2,recomendation3
113517,fffc9e0a62f07e67ff85803a8b5f30cf,12137,2605,3714,12576
113518,fffe0497986df50816e428af728f8900,76271,10463,24945,25566
113519,fffed4187f3b5f17cb58536f7fac8dee,9964,10692,23410,53173
113520,ffff748a7ac35759d9fef57a34fd4a21,8153,27765,134,9744
113521,ffffb88e89a23a34d3d98282bad3889a,33997,10692,23410,53173


### Exportación de resultados

In [82]:
df8.to_csv(f"recomendations({byColumn})_V7.csv", index=False)