# Customer shopping

## Librerías

In [1]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import warnings
import tensorflow as tf

from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

In [2]:
warnings.filterwarnings('ignore')

In [3]:
%matplotlib inline

plt.style.use('ggplot')

## Carga de datos

In [4]:
# Build the CSV location from its components and load it into a DataFrame.
# NOTE(review): this path is relative to the working directory ('data/...'), while
# artefacts later in the notebook are written under '../output/...'; confirm which
# working directory the notebook is expected to run from.
path = os.path.join('data', 'customers_model.csv')
customer_data = pd.read_csv(path)

In [5]:
customer_data.head()

Unnamed: 0,age,category,price,gender_Male,payment_method_Credit Card,payment_method_Debit Card,shopping_mall_Emaar Square Mall,shopping_mall_Forum Istanbul,shopping_mall_Istinye Park,shopping_mall_Kanyon,shopping_mall_Mall of Istanbul,shopping_mall_Metrocity,shopping_mall_Metropol AVM,shopping_mall_Viaport Outlet,shopping_mall_Zorlu Center
0,19,Shoes,3000.85,0,0,1,0,0,0,0,0,1,0,0,0
1,23,Shoes,3000.85,1,0,0,0,0,0,1,0,0,0,0,0
2,69,Shoes,3000.85,1,0,0,0,1,0,0,0,0,0,0,0
3,67,Shoes,3000.85,0,0,1,0,0,0,0,0,0,0,0,1
4,42,Shoes,3000.85,1,1,0,1,0,0,0,0,0,0,0,0


## Preparación

In [6]:
# Target is the product category; every remaining column is a feature.
y = customer_data['category']
x = customer_data.drop(columns='category')

### Escalamiento

In [7]:
# Rescale every feature with MinMaxScaler and keep the original column names.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split; confirm this is acceptable or fit it on the training portion only.
min_max = MinMaxScaler()
x_scale = pd.DataFrame(min_max.fit_transform(x), columns=x.columns)

### División de datos

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_scale, y, test_size=0.2, shuffle=True, random_state=0
)

In [9]:
# Report the shapes of both partitions, one heading per partition.
shape_report = (
    ("Tamaño del conjunto de entrenamiento:", (("x_train:", x_train), ("y_train:", y_train))),
    ("Tamaño del conjunto de prueba:", (("x_test:", x_test), ("y_test:", y_test))),
)
for heading, entries in shape_report:
    print(heading)
    for label, data in entries:
        print(label, data.shape)

Tamaño del conjunto de entrenamiento:
x_train: (75568, 14)
y_train: (75568,)
Tamaño del conjunto de prueba:
x_test: (18893, 14)
y_test: (18893,)


## **Modelos de clasificación**

### Naive Bayes

In [10]:
# Three Naive Bayes variants, compared on the same scaled features.
modelGNB = GaussianNB()
# binarize=0.5 makes the Bernoulli model threshold each feature at 0.5.
modelBNB = BernoulliNB(binarize=0.5)
modelMNB = MultinomialNB()

In [11]:
modelGNB.fit(x_train, y_train)
modelBNB.fit(x_train, y_train)
modelMNB.fit(x_train, y_train)

In [12]:
y_modelGNB = modelGNB.predict(x_test)
y_modelBNB = modelBNB.predict(x_test)
y_modelMNB = modelMNB.predict(x_test)

In [13]:
# Test-set accuracy of each Naive Bayes variant.
for label, predictions in (
    ('modelGNB:', y_modelGNB),
    ('modelBNB:', y_modelBNB),
    ('modelMNB:', y_modelMNB),
):
    print(label, metrics.accuracy_score(y_test, predictions))

modelGNB: 0.7753665378711693
modelBNB: 0.4273011168157519
modelMNB: 0.36420896628380883


#### Guardar modelos

In [14]:
def save_model(model):
    """Pickle ``model`` to ``../output/modelos/<ClassName>.pkl``.

    The file name is the model's class name, so saving two objects of the
    same class overwrites the earlier file. The class name is printed as a
    progress message.

    Args:
        model: Any picklable object (in this notebook, a fitted estimator).

    Raises:
        OSError: If the output directory cannot be created or written to.
    """
    name = type(model).__name__
    print(name)
    out_dir = os.path.join('..', 'output', 'modelos')
    # Create the target folder on first use so a fresh checkout does not fail
    # with FileNotFoundError when the first model is saved.
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, f'{name}.pkl')
    with open(path, 'wb') as file:
        pickle.dump(model, file)

In [15]:
save_model(modelGNB)
save_model(modelBNB)
save_model(modelMNB)

GaussianNB
BernoulliNB
MultinomialNB


**GridSearchCV** es una utilidad de scikit-learn para la búsqueda exhaustiva de hiperparámetros con validación cruzada. Evalúa cada combinación de valores definida en la cuadrícula de parámetros y conserva la combinación que obtiene el mejor puntaje promedio entre los pliegues.

### **Random Forest Classifier**

In [16]:
# Seed the forest so the grid search and the selected model are reproducible across runs.
rfc = RandomForestClassifier(random_state=0)

In [17]:
# Search space: 4 forest sizes x 10 depths (5..14) x 9 feature counts (5..13) = 360 candidates.
forest_params = [
    {
        'n_estimators': [10, 20, 50, 100],
        'max_depth': np.arange(5, 15),
        'max_features': np.arange(5, 14),
    }
]

In [18]:
# 5-fold grid search scored by accuracy. n_jobs=-1 spreads the fits over all CPU
# cores, matching the AdaBoost search later in this notebook.
clf = GridSearchCV(rfc, forest_params, cv=5, scoring='accuracy', n_jobs=-1)

In [19]:
# Run the search, then report the winning combination and its mean cross-validated accuracy.
clf.fit(x_train, y_train)

print('Los mejores hiperparámetros: ', clf.best_params_, sep='')
print('El mejor puntaje: ', clf.best_score_, sep='')

Los mejores hiperparámetros: {'max_depth': 9, 'max_features': 12, 'n_estimators': 20}
El mejor puntaje: 1.0


In [20]:
# Take the best estimator found by the search and score it on the held-out test set.
best_rfc = clf.best_estimator_
y_pred_rf = best_rfc.predict(x_test)
accuracy_rf = metrics.accuracy_score(y_test, y_pred_rf)

print(f"Exactitud del modelo: {accuracy_rf * 100:.2f}%")

Exactitud del modelo: 100.00%


**Test con menos árboles, buscando encontrar igual exactitud con menor complejidad**

In [21]:
# Seed the forest so the reduced-grid search is reproducible across runs.
rfc_ = RandomForestClassifier(random_state=0)

In [22]:
# Reduced search space: only the two smallest forest sizes, same depth and feature ranges.
forest_params_ = [
    {
        'n_estimators': [10, 20],
        'max_depth': np.arange(5, 15),
        'max_features': np.arange(5, 14),
    }
]

In [23]:
# 5-fold grid search scored by accuracy. n_jobs=-1 spreads the fits over all CPU
# cores, matching the AdaBoost search later in this notebook.
clf_ = GridSearchCV(rfc_, forest_params_, cv=5, scoring='accuracy', n_jobs=-1)

In [None]:
# Run the reduced search, then report the winning combination and its mean cross-validated accuracy.
clf_.fit(x_train, y_train)

print('Los mejores hiperparámetros: ', clf_.best_params_, sep='')
print('El mejor puntaje: ', clf_.best_score_, sep='')

In [None]:
# Score the best forest from the reduced grid on the test set.
# This rebinds best_rfc and y_pred_rf, replacing the results of the first search.
best_rfc = clf_.best_estimator_
y_pred_rf = best_rfc.predict(x_test)
accuracy_rf_ = metrics.accuracy_score(y_test, y_pred_rf)

print(f"Exactitud del modelo: {accuracy_rf_ * 100:.2f}%")

* Importancia de las características

In [None]:
# Pair each feature name with the importance the selected forest assigned to it.
importancia = pd.DataFrame({
    'predictor': x.columns,
    'importancia': best_rfc.feature_importances_
})

In [None]:
# A cell only renders its last expression, so the sorted importances are shown
# explicitly; otherwise the result is computed and silently discarded.
# `display` is provided by the IPython kernel.
display(importancia.sort_values('importancia', ascending=False))

# Per-candidate cross-validation summary, best mean test score first.
# The pattern keeps the individual `param_*` columns plus the mean/std test
# scores; matching `param_` (not `param*`) leaves out the redundant `params`
# dict column without a separate drop.
# NOTE(review): this reads `clf` (the first, larger search) while `best_rfc`
# now comes from `clf_`; confirm which search this table should describe.
out_rfc = pd.DataFrame(clf.cv_results_)
out_rfc = (
    out_rfc
    .filter(regex='(param_|mean_t|std_t)')
    .sort_values('mean_test_score', ascending=False)
)
out_rfc

* Validación cruzada, para comprobar que el modelo no haga sobreajuste

In [None]:
# Score the selected forest with 5-fold cross-validation on the training data.
scores_rfc = cross_val_score(best_rfc, x_train, y_train, cv=5)

for fold, score in enumerate(scores_rfc, start=1):
    print(f'cross_val_score --> fold {fold}: {score}')

print(f'mean acc: {scores_rfc.mean()}')

#### Guardar mejor modelo

In [None]:
save_model(best_rfc)

### **DecisionTreeClassifier**

In [None]:
tree_clas = DecisionTreeClassifier(random_state=1024)

In [None]:
# Hyper-parameter grid for the decision tree.
# 'auto' is intentionally absent from max_features: for DecisionTreeClassifier it
# was an alias of 'sqrt' (so it only duplicated a third of the search) and it was
# removed in scikit-learn 1.3, where every candidate using it fails to fit.
param_grid = {
    'max_features': ['sqrt', 'log2'],
    'ccp_alpha': [0.1, 0.01, 0.001],
    'max_depth': np.arange(3, 10),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
}

In [None]:
# 5-fold grid search for the decision tree. Scoring is stated explicitly (accuracy
# is already the classifier default) and the fits run on all CPU cores, consistent
# with the other searches in this notebook.
dec_tree = GridSearchCV(
    estimator=tree_clas,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

In [None]:
dec_tree.fit(x_train, y_train)

print('Los mejores hiperparámetros: ' + str(dec_tree.best_params_))
print('El mejor puntaje: ' + str(dec_tree.best_score_))

In [None]:
best_tree = dec_tree.best_estimator_
y_pred_tree = best_tree.predict(x_test)

accuracy_tree = metrics.accuracy_score(y_test, y_pred_tree)

print("Exactitud del modelo: {:.2f}%".format(accuracy_tree * 100))

* Validación cruzada

In [None]:
scores_tree = cross_val_score(best_tree, x_train, y_train, cv=5)

for i, j in enumerate(scores_tree):
    print(f'cross_val_score --> fold {i+1}: {j}')

print(f'mean acc: {scores_tree.mean()}')

In [None]:
fig, ax = plt.subplots(figsize=(28, 8))

print(f"Profundidad del árbol: {best_tree.get_depth()}")
print(f"Número de nodos terminales: {best_tree.get_n_leaves()}")

# class_names must be a sequence with one label per class, in the order of
# best_tree.classes_. A bare string such as 'category' is either indexed
# character by character (nodes labelled 'c', 'a', 't', ...) or rejected by
# parameter validation, depending on the scikit-learn version.
# feature_names is passed as a plain list for the same validation reason.
tree_plot = plot_tree(
    decision_tree=best_tree,
    feature_names=list(x.columns),
    class_names=[str(label) for label in best_tree.classes_],
    filled=True,
    impurity=True,
    fontsize=7,
    ax=ax)

path = os.path.join('..', 'output', 'arbol_decision.png')
# Make sure the output folder exists before writing the figure.
os.makedirs(os.path.dirname(path), exist_ok=True)
plt.savefig(path)

plt.show()

#### Guardar modelo

In [None]:
save_model(best_tree)

### AdaBoost

In [None]:
# Seed the booster so the grid search and the selected model are reproducible across runs.
ada = AdaBoostClassifier(random_state=0)

In [None]:
param_grid = {'n_estimators': range(1, 100, 5),
             'learning_rate':[.001,0.01,.1]
             }

In [None]:
# Grid search for AdaBoost: 10-fold CV scored by accuracy, run on all CPU cores,
# refitting the best candidate and also recording training scores.
grid = GridSearchCV(
    estimator=ada,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=10,
    refit=True,
    verbose=0,
    return_train_score=True,
)

In [None]:
grid.fit(x_train, y_train)

In [None]:
print('Los mejores hiperparámetros: ' + str(grid.best_params_))
print('El mejor puntaje: ' + str(grid.best_score_))

In [None]:
# Take the best AdaBoost estimator and score it on the held-out test set.
ada_model = grid.best_estimator_
y_pred_ada = ada_model.predict(x_test)
accuracy_ada = metrics.accuracy_score(y_test, y_pred_ada)

print(f"Exactitud del modelo: {accuracy_ada * 100:.2f}%")

In [None]:
save_model(ada_model)

### Redes neuronales

In [None]:
def one_hot_encode_object_array(arr, classes=None):
    """One-hot encode an array of labels (for example, strings).

    Args:
        arr: Array-like of labels to encode.
        classes: Optional array-like holding the label vocabulary. When given,
            column ``k`` of the result corresponds to the ``k``-th sorted
            unique value of ``classes``, so different splits share one column
            layout. When omitted, the vocabulary is taken from ``arr`` itself.

    Returns:
        numpy.ndarray: One row per element of ``arr`` and one column per class.

    Raises:
        ValueError: If ``classes`` is given and ``arr`` contains a label that
            is not part of it.
    """
    arr = np.asarray(arr)
    if classes is None:
        uniques, ids = np.unique(arr, return_inverse=True)
    else:
        uniques = np.unique(classes)
        unknown = np.setdiff1d(arr, uniques)
        if unknown.size:
            raise ValueError(f"labels not present in classes: {unknown.tolist()}")
        # `uniques` is sorted, so searchsorted yields each label's column index.
        ids = np.searchsorted(uniques, arr)
    return keras.utils.to_categorical(ids, len(uniques))


# Encode the test labels against the training vocabulary so every column stands
# for the same class in both splits, even if a class were missing from the test split.
train_labels = one_hot_encode_object_array(y_train)
test_labels = one_hot_encode_object_array(y_test, classes=y_train)

In [None]:
# Fully connected classifier: four hidden ReLU layers (256-128-64-32), each followed
# by batch normalisation, with 50% dropout after the first three.
n_features = x_train.shape[1]
# One output unit per one-hot column, so the layer follows the number of
# categories in the data instead of a hard-coded 6.
n_classes = train_labels.shape[1]

neural_network = Sequential()

neural_network.add(Dense(256, input_shape=(n_features,), activation='relu'))
neural_network.add(BatchNormalization())
neural_network.add(Dropout(0.5))

neural_network.add(Dense(128, activation='relu'))
neural_network.add(BatchNormalization())
neural_network.add(Dropout(0.5))

neural_network.add(Dense(64, activation='relu'))
neural_network.add(BatchNormalization())
neural_network.add(Dropout(0.5))

neural_network.add(Dense(32, activation='relu'))
neural_network.add(BatchNormalization())

neural_network.add(Dense(n_classes, activation='softmax'))

In [None]:
neural_network.compile(
    optimizer='adam',
    loss='categorical_crossentropy', 
    metrics=['accuracy']
)

In [None]:
neural_network.summary()

In [None]:
neural_network.fit(x_train, train_labels, epochs=100, batch_size=32)

In [None]:
loss, accuracy = neural_network.evaluate(x_test, test_labels)

print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [None]:
y_nn = neural_network.predict(x_test)

In [None]:
predicted_classes = y_nn.argmax(axis=1)

In [None]:
# Map predicted column indices back to category labels. The sorted label
# vocabulary is computed once and indexed in bulk, instead of calling
# np.unique(y_train) again for every single prediction.
# The variable name is kept unchanged because the results cell reads it.
class_labels = np.unique(y_train)
y_nn_predcit = class_labels[predicted_classes].tolist()

In [None]:
# Persist the trained network with the same pickle-based helper used for the other models.
# NOTE(review): confirm the installed Keras version supports pickling Sequential
# models; if not, the model's own save method is the safer route.
save_model(neural_network)

## Resultados

In [None]:
# Test-set predictions of every model, keyed by the name shown in the summary table.
predictions_by_model = {
    'GaussianNB': y_modelGNB,
    'MultinomialNB': y_modelMNB,
    'BernoulliNB': y_modelBNB,
    'DecisionTreeClassifier': y_pred_tree,
    'RandomForestClassifier': y_pred_rf,
    'NeuralNetwork': y_nn_predcit,
    'AdaBoostClassifier': y_pred_ada,
}

modelos = list(predictions_by_model)
sc = [
    metrics.accuracy_score(y_test, predicted)
    for predicted in predictions_by_model.values()
]

dic = {'Modelo': modelos, 'Scores': sc}

# Rank the models from most to least accurate.
resultados = pd.DataFrame(dic).sort_values(by='Scores', ascending=False)

resultados