## Item Embeddings

We train an autoencoder on the item attributes and use its bottleneck activations as item embeddings, which gives us an easy way to find the most similar items in the dataset.

In [2]:
import pandas as pd
import numpy as np
import category_encoders as ce
import joblib
import tensorflow as tf
import datetime as dt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer
import os

In [3]:
def num_downcast(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downcasts all numeric columns to the lowest int/float type.
    This should help with RAM and speed up certain processes.

    Columns whose minimum is non-negative are downcast to unsigned ints;
    columns that contain negative values are downcast to signed ints.
    Columns that cannot be represented as integers (fractional values or
    NaN) are left untouched by the integer pass and are then downcast to
    the smallest float type.

    The DataFrame is modified in place and is also returned.

    :param df: the DataFrame to be downcasted
    :return: the same DataFrame with numeric columns downcasted
    """

    # Integer pass: unsigned when nothing is negative, signed otherwise.
    # An empty or all-NaN column has a NaN minimum, which fails the
    # comparison and therefore takes the signed branch.
    for col in df.select_dtypes('number'):
        if df[col].min() >= 0:
            df[col] = pd.to_numeric(df[col], downcast='unsigned')
        else:
            df[col] = pd.to_numeric(df[col], downcast='integer')

    # Float pass for whatever is still a float. The builtin `float` is what
    # the removed `np.float` alias used to point to.
    for col in df.select_dtypes(float):
        df[col] = pd.to_numeric(df[col], downcast='float')

    return df


def binary_encoding(df: pd.DataFrame,
                    col: str) -> pd.DataFrame:
    """
    Uses library `category_encoders` to encode a nominal categorical feature
    as a set of columns holding 0s and 1s.

    The number of columns needed is the smallest n such that
    feature_cardinality <= 2^n, i.e. math.ceil(log_2(cardinality)).
    For example, a feature with 256 unique values / classes needs
    log_2(256) = 8 columns.

    :param df: the DataFrame holding the feature; it is not modified
    :param col: name of the column to encode; its values are cast to str first
    :return: a DataFrame with the encoded columns, named `<col>_<i>`
    """
    # ensure type is str/obj so the encoder treats the column as categorical
    X = df[[col]].astype(str)

    # BinaryEncoder is an unsupervised encoder, so no target is built for it
    encodings = ce.BinaryEncoder().fit_transform(X)

    # Some category_encoders releases emit a leading `<col>_0` column that is
    # entirely zero. Drop it only when it exists and really is empty, so that
    # releases without that padding column neither fail nor lose a bit.
    empty_col = '{}_0'.format(col)
    if empty_col in encodings.columns and not encodings[empty_col].any():
        encodings = encodings.drop(empty_col, axis=1)
    return encodings

In [5]:
# Load the item table from the compressed JSON-lines file (one record per line)
items = pd.read_json(path_or_buf='item_data.jl.gz', lines=True)
items.head(5)

Unnamed: 0,item_id,title,domain_id,product_id,price,category_id,condition
0,111260,Casa Sola En Venta Con Gran Patio Solo Pago De...,MLM-INDIVIDUAL_HOUSES_FOR_SALE,,1150000.0,MLM170527,new
1,871377,Resident Evil Origins Collection Nintendo Swit...,MLM-VIDEO_GAMES,15270800.0,1392.83,MLM151595,new
2,490232,Falda De Imitación Piel Negra,MLM-SKIRTS,,350.0,MLM7697,new
3,1150706,Powercolor Red Devil Radeon Rx 580 8gb Gddr5,MLM-GRAPHICS_CARDS,,3200.0,MLM9761,used
4,934912,Laptop Hp Nx6320 Core Duo Con Puerto Db9 Windo...,MLM-NOTEBOOKS,,1599.0,MLM1652,used


In [6]:
# Some rows have no domain_id. Give them the placeholder label 'servicio'
# so every row has a value before the categorical columns are encoded.
items['domain_id'] = items['domain_id'].fillna('servicio')
# Encoding columns are added to a copy, not to `items` itself
items_enc = items.copy()

## Item Embeddings: encoding the categorical features

In [7]:
# Integer-encode the two categorical columns. The raw strings are kept under
# `original_<name>` and the integer codes take over the original column name.
# The encoder is refitted per feature, so after the loop `cat_le` holds the
# classes of the last feature (category_id) only.
cat_le = LabelEncoder()
for cat_feat in ['domain_id','category_id']:
    old_name = f'original_{cat_feat}'
    # Rename only once: on a re-run the column called `cat_feat` already holds
    # the codes, and renaming it again would create a second `original_*`
    # column and hand a two-column frame to the encoder.
    if old_name not in items_enc.columns:
        items_enc.rename({cat_feat:old_name}, axis=1, inplace=True)
    items_enc[cat_feat] = cat_le.fit_transform(items_enc[old_name])

# Lookup tables between the raw labels and their integer codes, both directions
category_to_code_map = dict(zip(items_enc.original_category_id.tolist(), items_enc.category_id.tolist()))
code_to_category_map = {v: k for k, v in category_to_code_map.items()}

domain_to_code_map = dict(zip(items_enc.original_domain_id.tolist(), items_enc.domain_id.tolist()))
code_to_domain_map = {v: k for k, v in domain_to_code_map.items()}

# Binary (bit-column) encodings of the integer codes
domain_binary_encoding = binary_encoding(items_enc, 'domain_id')
category_binary_encoding = binary_encoding(items_enc, 'category_id')

# Side-by-side concat builds a new frame, so `items_enc` itself never gains
# the bit columns and this cell can be executed again safely
encoded_df = num_downcast(pd.concat([items_enc, domain_binary_encoding,category_binary_encoding], axis=1))

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [8]:
# int8 flag: 1 when the condition is 'new', 0 for anything else (missing included)
encoded_df['new'] = encoded_df['condition'].eq('new').astype(np.int8)

## Normalizing price

In [9]:
# Features that are mapped onto a normal distribution
gauss_feats = [
    'price',
]
# Missing prices are treated as 0 before the transform
encoded_df['price'] = encoded_df['price'].fillna(0)

# Fitted transformer per feature, kept so the mapping can be inverted or
# applied to other data later
transformer_dict = {}
for feat in gauss_feats:
    new_col = f'{feat}_transformed'
    # random_state is fixed because the transformer subsamples rows when it
    # estimates the quantiles; without it every run gives a different mapping
    qt_transform_full = QuantileTransformer(output_distribution='normal', random_state=0)
    # fit_transform returns an (n, 1) array; flatten it into a plain column
    encoded_df[new_col] = qt_transform_full.fit_transform(
        encoded_df[feat].to_numpy().reshape(-1, 1)
    ).ravel()
    transformer_dict[feat] = qt_transform_full

In [10]:
# Swap the current index for a fresh 0..n-1 range (the old one is discarded),
# then preview the first rows transposed: one output row per column
encoded_df.index = pd.RangeIndex(len(encoded_df))
encoded_df.head().transpose()

Unnamed: 0,0,1,2,3,4
item_id,111260,871377,490232,1150706,934912
title,Casa Sola En Venta Con Gran Patio Solo Pago De...,Resident Evil Origins Collection Nintendo Swit...,Falda De Imitación Piel Negra,Powercolor Red Devil Radeon Rx 580 8gb Gddr5,Laptop Hp Nx6320 Core Duo Con Puerto Db9 Windo...
original_domain_id,MLM-INDIVIDUAL_HOUSES_FOR_SALE,MLM-VIDEO_GAMES,MLM-SKIRTS,MLM-GRAPHICS_CARDS,MLM-NOTEBOOKS
product_id,,1.52708e+07,,,
price,1.15e+06,1392.83,350,3200,1599
original_category_id,MLM170527,MLM151595,MLM7697,MLM9761,MLM1652
condition,new,new,new,used,used
domain_id,6105,7752,7273,5890,6705
category_id,7187,6749,11280,11486,7028
domain_id_1,0,0,0,0,0


In [11]:
# Keep the first row for every item_id; the shape is printed before and after
# so any removed duplicates show up as a row-count difference
print(encoded_df.shape)
encoded_df.drop_duplicates(subset=['item_id'], keep='first', inplace=True)
print(encoded_df.shape)

(2102277, 38)
(2102277, 38)


## Autoencoder network

In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras import layers, losses
import tensorflow as tf

In [13]:
# Column groups that feed the autoencoder, selected by column name
transformed_features = [c for c in encoded_df.columns if '_transformed' in c]
# Only the bit columns (`domain_id_<n>` / `category_id_<n>`) are wanted here.
# A substring test on 'domain_' / 'category_' would also match the integer
# label columns `domain_id` / `category_id`, whose arbitrary codes would then
# be fed to the network as if they were magnitudes.
domain_features = [c for c in encoded_df.columns if c.startswith('domain_id_')]
category_features = [c for c in encoded_df.columns if c.startswith('category_id_')]

In [14]:
# The `new` flag followed by the domain and category columns
binary_features_2 = ['new', *domain_features, *category_features]
# Model input layout: transformed numeric columns first, then the rest
training_features = [*transformed_features, *binary_features_2]

In [15]:
#alt method
# Width of the autoencoder bottleneck, i.e. the number of values in each
# item's embedding
latent_dim = 6

class Autoencoder(Model):
    """
    Dense autoencoder that squeezes `input_dim` features through a
    `latent_dim`-wide bottleneck and reconstructs them.

    `encoder` maps an input to its latent vector, `decoder` maps a latent
    vector back to `input_dim` values; calling the model runs both in turn.

    :param latent_dim: number of units in the bottleneck layer
    :param input_dim: number of units in the wide layers and in the output
    """

    def __init__(self, latent_dim, input_dim):
        super().__init__()
        self.latent_dim = latent_dim
        self.input_dim = input_dim

        # Encoder: noise -> full-width dense layer with an L1 penalty on its
        # activations -> noise -> bottleneck
        encoder_layers = [
            layers.Flatten(),
            layers.GaussianNoise(.01),
            layers.Dense(input_dim, activity_regularizer=tf.keras.regularizers.l1(10e-5)),
            layers.LeakyReLU(),
            layers.GaussianNoise(.01),
            layers.Dense(latent_dim),
            layers.LeakyReLU(),
        ]
        self.encoder = tf.keras.Sequential(encoder_layers)

        # Decoder: one hidden full-width layer, then a linear output layer
        decoder_layers = [
            layers.Dense(input_dim),
            layers.LeakyReLU(),
            layers.Dense(input_dim, activation='linear'),
        ]
        self.decoder = tf.keras.Sequential(decoder_layers)

    def call(self, x):
        """Encode `x` and return its reconstruction."""
        return self.decoder(self.encoder(x))

    
# NOTE(review): this instance is not referenced again in the visible cells;
# training uses `autoencoder_full` - confirm whether it is still needed
autoencoder = Autoencoder(latent_dim=latent_dim, input_dim=len(training_features))

In [16]:
# Cast every model input column to float64, one column at a time
for feature_name in training_features:
    encoded_df[feature_name] = encoded_df[feature_name].astype('float64')

In [17]:
# Seed TensorFlow before the model is built so weight initialisation,
# shuffling and the noise layers start from the same state on every run
# (as far as TensorFlow can guarantee that on the hardware in use)
tf.random.set_seed(0)

# Build the training matrix once: it serves as both the input and the
# reconstruction target. float32 matches the cast used when the encoder is
# applied afterwards and halves the memory of the float64 frame values.
train_matrix = encoded_df[training_features].to_numpy().astype(np.float32)

autoencoder_full = Autoencoder(latent_dim, len(training_features))
# NOTE(review): run_eagerly=True disables graph compilation and slows training
# down; it is kept because it may be a deliberate workaround - confirm before
# removing it.
autoencoder_full.compile(optimizer='adam', loss=losses.MeanAbsoluteError(),run_eagerly=True)
# Keep the History object so the per-epoch loss can be inspected later
history = autoencoder_full.fit(
    train_matrix, train_matrix,
    epochs=30,
    shuffle=True,
    batch_size=256,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fad8d461be0>

In [19]:
# Run all rows through the trained encoder only (the decoder is not used) and
# bring the resulting latent vectors back as a NumPy array
p = autoencoder_full.encoder(
    encoded_df[training_features].to_numpy(dtype=np.float32)
).numpy()

In [20]:
# One row per item, indexed by item_id, with one string-named column per
# latent dimension. The column labels follow the embedding's actual width
# instead of a hard-coded 6, so a different latent_dim still works.
# NOTE(review): this rebinds `items`, which earlier held the raw item table.
items = pd.DataFrame(p, index = encoded_df.item_id, columns = [str(x) for x in range(p.shape[1])])
items.to_parquet('items_embeddings.parquet')