# Customer shopping

## Librerías

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

%matplotlib inline
plt.style.use('ggplot')

## Cargas de datos

In [2]:
path = os.path.join('data', 'customers_model.csv')
customer_data = pd.read_csv(path)

In [3]:
customer_data

Unnamed: 0,age,category,price,gender_Male,payment_method_Credit Card,payment_method_Debit Card,shopping_mall_Emaar Square Mall,shopping_mall_Forum Istanbul,shopping_mall_Istinye Park,shopping_mall_Kanyon,shopping_mall_Mall of Istanbul,shopping_mall_Metrocity,shopping_mall_Metropol AVM,shopping_mall_Viaport Outlet,shopping_mall_Zorlu Center
0,19,Shoes,3000.85,0,0,1,0,0,0,0,0,1,0,0,0
1,23,Shoes,3000.85,1,0,0,0,0,0,1,0,0,0,0,0
2,69,Shoes,3000.85,1,0,0,0,1,0,0,0,0,0,0,0
3,67,Shoes,3000.85,0,0,1,0,0,0,0,0,0,0,0,1
4,42,Shoes,3000.85,1,1,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94456,54,Food & Beverage,5.23,0,1,0,0,0,0,0,0,0,1,0,0
94457,67,Food & Beverage,5.23,1,0,0,0,0,0,1,0,0,0,0,0
94458,26,Food & Beverage,5.23,0,1,0,0,0,0,0,1,0,0,0,0
94459,19,Food & Beverage,5.23,1,0,1,0,1,0,0,0,0,0,0,0


## Preparación

In [4]:
x = customer_data.drop('category', axis=1)
y = customer_data['category']

### Escalamiento

In [5]:
min_max = MinMaxScaler()

x_scale = min_max.fit_transform(x)

x_scale = pd.DataFrame(x_scale, columns=x.columns)

### División de datos

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x_scale, y, test_size = 0.2, shuffle = True, random_state = 0)

In [7]:
print("Tamaño del conjunto de entrenamiento:")
print("x_train:", x_train.shape)
print("y_train:", y_train.shape)

print("Tamaño del conjunto de prueba:")
print("x_test:", x_test.shape)
print("y_test:", y_test.shape)

Tamaño del conjunto de entrenamiento:
x_train: (75568, 14)
y_train: (75568,)
Tamaño del conjunto de prueba:
x_test: (18893, 14)
y_test: (18893,)


## **Modelos de clasificación**

**GridSearchCV** es una técnica de validación cruzada incluida en el paquete de scikit learn. Lo que hace es ejecutarse a través de los diferentes parámetros que se introducen en la cuadrícula de parámetros y extraer los mejores valores y combinaciones de parámetros.

### **Random Forest Classifier**

In [41]:
# Building Random Forest Classifier
rfc = RandomForestClassifier()
# Initializing GridSearchCV() object and fitting it with hyperparameters
forest_params = [{'n_estimators': [10,20,50,100],
                  'max_depth': np.arange(5, 15), 
                  'max_features': np.arange(5,14)}]
clf = GridSearchCV(rfc, forest_params, cv = 5, scoring='accuracy')
clf.fit(x_train, y_train)
# Getting the Best Hyperparameters
print('Best hyperparameters are: '+str(clf.best_params_))
print('Best score is: ' +str(clf.best_score_))
#  Getting the Best model
best_rfc = clf.best_estimator_
y_pred_rf = best_rfc.predict(x_test)
accuracy_rf = metrics.accuracy_score(y_test, y_pred_rf)
print("Exactitud del modelo: {:.2f}%".format(accuracy_rf * 100))

Best hyperparameters are: {'max_depth': 9, 'max_features': 11, 'n_estimators': 100}
Best score is: 1.0
Exactitud del modelo: 100.00%


**Test con menos árboles, buscando encontrar igual exactitud con menor complejidad**

In [44]:
# Building Random Forest Classifier
rfc = RandomForestClassifier()
# Initializing GridSearchCV() object and fitting it with hyperparameters
forest_params = [{'n_estimators': [10,20],
                  'max_depth': np.arange(5,15), 
                  'max_features': np.arange(5,14)}]
clf = GridSearchCV(rfc, forest_params, cv = 5, scoring='accuracy')
clf.fit(x_train, y_train)
# Getting the Best Hyperparameters
print('Best hyperparameters are: '+str(clf.best_params_))
print('Best score is: ' +str(clf.best_score_))
#  Getting the Best model
best_rfc = clf.best_estimator_
y_pred_rf = best_rfc.predict(x_test)
accuracy_rf = metrics.accuracy_score(y_test, y_pred_rf)
print("Exactitud del modelo: {:.2f}%".format(accuracy_rf * 100))

Best hyperparameters are: {'max_depth': 9, 'max_features': 12, 'n_estimators': 10}
Best score is: 1.0
Exactitud del modelo: 100.00%


In [45]:
# Feauture importances
importancia = pd.DataFrame({
    'predictor': x.columns,
    'importancia': best_rfc.feature_importances_
})
importancia.sort_values('importancia', ascending=False)
# save results
out_rfc = pd.DataFrame(clf.cv_results_)
out_rfc = out_rfc.filter(regex = '(param*|mean_t|std_t)').drop(columns = 'params').sort_values('mean_test_score', ascending=False)
out_rfc
# Cross validation
scores_rfc = cross_val_score(best_rfc, x_train, y_train, cv=5)
for i, j in enumerate(scores_rfc):
    print(f'cross_val_score --> fold {i+1}: {j}')
print(f'mean acc: {scores_rfc.mean()}')

cross_val_score --> fold 1: 0.9993383617837767
cross_val_score --> fold 2: 1.0
cross_val_score --> fold 3: 0.9863702527457986
cross_val_score --> fold 4: 1.0
cross_val_score --> fold 5: 1.0
mean acc: 0.997141722905915


### **DecisionTreeClassifier**

In [49]:
# Building Decision Tree Classifier
tree_clas = DecisionTreeClassifier(random_state=1024)
# Initializing GridSearchCV() object and fitting it with hyperparameters
param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              'ccp_alpha': [0.1, .01, .001],
              'max_depth' :  np.arange(3, 10),
              'criterion' :['gini', 'entropy'],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10],
             }
dec_tree  = GridSearchCV(estimator=tree_clas, param_grid=param_grid, cv=5)
dec_tree.fit(x_train, y_train)
# Getting the Best Hyperparameters
print('Best hyperparameters are: '+str(dec_tree.best_params_))
print('Best score is: ' +str(dec_tree.best_score_))
# Getting the Best model
best_tree = dec_tree.best_estimator_
y_pred_tree = best_tree.predict(x_test)
accuracy_tree = metrics.accuracy_score(y_test, y_pred_tree)
print("Exactitud del modelo: {:.2f}%".format(accuracy_tree * 100))

630 fits failed out of a total of 1890.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
630 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ASUS\Anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ASUS\Anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\ASUS\Anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\ASUS\Anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

Best hyperparameters are: {'criterion': 'gini', 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5}
Best score is: 0.5754816840478365
Exactitud del modelo: 60.08%


In [11]:
# Cross validation
scores_tree = cross_val_score(best_tree, x_train, y_train, cv=5)
for i, j in enumerate(scores_tree):
    print(f'cross_val_score --> fold {i+1}: {j}')
print(f'mean acc: {scores_tree.mean()}')

cross_val_score --> fold 1: 0.5484319174275506
cross_val_score --> fold 2: 0.5601429138547043
cross_val_score --> fold 3: 0.5451237263464338
cross_val_score --> fold 4: 0.580692119367432
cross_val_score --> fold 5: 0.596440150863495
mean acc: 0.5661661655719231


### Naive Bayes

In [12]:
modelGNB = GaussianNB()
modelBNB = BernoulliNB(binarize=0.5)
modelMNB = MultinomialNB()

In [13]:
modelGNB.fit(x_train, y_train)
modelBNB.fit(x_train, y_train)
modelMNB.fit(x_train, y_train)

In [14]:
y_modelGNB = modelGNB.predict(x_test)
y_modelBNB = modelBNB.predict(x_test)
y_modelMNB = modelMNB.predict(x_test)

In [15]:
print('modelGNB:', metrics.accuracy_score(y_test, y_modelGNB))
print('modelBNB:', metrics.accuracy_score(y_test, y_modelBNB))
print('modelMNB:', metrics.accuracy_score(y_test, y_modelMNB))

modelGNB: 0.7753665378711693
modelBNB: 0.4273011168157519
modelMNB: 0.36420896628380883


### DecisionTreeClassifier

In [16]:
param_grid = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [17]:
dt = DecisionTreeClassifier(random_state=1234)

In [18]:
grid_search_tree = GridSearchCV(
    estimator=dt, 
    param_grid=param_grid, 
    cv=5, 
    scoring='accuracy')

In [19]:
grid_search_tree.fit(x_train, y_train)

In [20]:
params_dt = grid_search_tree.best_params_
dt_model = grid_search_tree.best_estimator_

In [21]:

y_pred_dt = dt_model.predict(x_test)

In [22]:
accuracy = metrics.accuracy_score(y_test, y_pred_dt)
print("Exactitud del modelo: {:.2f}%".format(accuracy * 100))

Exactitud del modelo: 100.00%


* cross validation

In [23]:
scores_tree = cross_val_score(dt_model, x_train, y_train, cv=5)

In [24]:
for i, j in enumerate(scores_tree):
    print(f'fold {i+1}: {j}')

fold 1: 1.0
fold 2: 1.0
fold 3: 1.0
fold 4: 1.0
fold 5: 1.0


In [25]:
print(f'mean acc: {scores_tree.mean()}')

mean acc: 1.0


### Random Forest

In [26]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10]}

In [27]:
rf = RandomForestClassifier(random_state=1234)

In [28]:
grid_search = GridSearchCV(
    estimator=rf, 
    param_grid=param_grid, 
    cv=5, 
    scoring='accuracy')

In [29]:
grid_search.fit(x_train, y_train)

In [30]:
params = grid_search.best_params_
rf_model = grid_search.best_estimator_

In [31]:
importancia = pd.DataFrame({
    'predictor': x.columns,
    'importancia': rf_model.feature_importances_
})
importancia.sort_values('importancia', ascending=False)

Unnamed: 0,predictor,importancia
1,price,0.94131
0,age,0.047898
2,gender_Male,0.002481
3,payment_method_Credit Card,0.001787
4,payment_method_Debit Card,0.001427
8,shopping_mall_Kanyon,0.00065
9,shopping_mall_Mall of Istanbul,0.000633
11,shopping_mall_Metropol AVM,0.000599
7,shopping_mall_Istinye Park,0.000582
10,shopping_mall_Metrocity,0.00057


In [32]:
resultados = pd.DataFrame(grid_search.cv_results_)
resultados = resultados.filter(regex = '(param*|mean_t|std_t)')
resultados = resultados.drop(columns = 'params')
resultados = resultados.sort_values('mean_test_score', ascending=False)
resultados

Unnamed: 0,param_max_depth,param_n_estimators,mean_test_score,std_test_score
1,,100,0.97884,0.001654
2,,200,0.978284,0.001343
0,,50,0.977239,0.001576
6,10.0,50,0.827004,0.007427
7,10.0,100,0.822293,0.004125
8,10.0,200,0.8222,0.002527
5,5.0,200,0.725095,0.014337
3,5.0,50,0.72409,0.01356
4,5.0,100,0.72319,0.010869


In [33]:
print(f'Los mejores parametros para el RandomForest son: {params}')

Los mejores parametros para el RandomForest son: {'max_depth': None, 'n_estimators': 100}


In [34]:
y_pred_rf = rf_model.predict(x_test)

In [35]:
accuracy = metrics.accuracy_score(y_test, y_pred_rf)
print("Exactitud del modelo: {:.2f}%".format(accuracy * 100))

Exactitud del modelo: 98.18%


* Cross validation

In [36]:
scores = cross_val_score(rf_model, x_train, y_train, cv=5)


In [37]:
for i, j in enumerate(scores):
    print(f'fold {i+1}: {j}')

fold 1: 0.9814741299457457
fold 2: 0.9783644303294958
fold 3: 0.9778351197565172
fold 4: 0.9798186991331966
fold 5: 0.976708793753722


In [38]:
print(f'mean acc: {scores.mean()}')

mean acc: 0.9788402345837355


### AdaBoost

### Gradient Boosting

### Redes neuronales

## Resultados

In [39]:
modelos = [
    'GaussianNB',
    'MultinomialNB',
    'BernoulliNB',
    'DecisionTreeClassifier',
    'RandomForestClassifier']

sc = [
    metrics.accuracy_score(y_test, y_modelGNB),
    metrics.accuracy_score(y_test, y_modelMNB),
    metrics.accuracy_score(y_test, y_modelBNB),
    metrics.accuracy_score(y_test, y_pred_dt),
    metrics.accuracy_score(y_test, y_pred_rf)]

dic = {'Modelo': modelos, 'Scores': sc}

resultados = pd.DataFrame(dic)
resultados = resultados.sort_values(by='Scores', ascending=False)

resultados

Unnamed: 0,Modelo,Scores
3,DecisionTreeClassifier,1.0
4,RandomForestClassifier,0.981792
0,GaussianNB,0.775367
2,BernoulliNB,0.427301
1,MultinomialNB,0.364209
