# Customer shopping

## Librerías

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to figures drawn from here on.
plt.style.use('ggplot')

## Carga de datos

In [3]:
# Build the CSV location relative to the working directory and load it into a DataFrame.
path = os.path.join('data', 'customers_model.csv')
customer_data = pd.read_csv(path)

In [4]:
# Display the loaded frame; the notebook renders a truncated preview of its rows.
customer_data

Unnamed: 0,age,category,price,gender_Male,payment_method_Credit Card,payment_method_Debit Card,shopping_mall_Emaar Square Mall,shopping_mall_Forum Istanbul,shopping_mall_Istinye Park,shopping_mall_Kanyon,shopping_mall_Mall of Istanbul,shopping_mall_Metrocity,shopping_mall_Metropol AVM,shopping_mall_Viaport Outlet,shopping_mall_Zorlu Center
0,19,Shoes,3000.85,0,0,1,0,0,0,0,0,1,0,0,0
1,23,Shoes,3000.85,1,0,0,0,0,0,1,0,0,0,0,0
2,69,Shoes,3000.85,1,0,0,0,1,0,0,0,0,0,0,0
3,67,Shoes,3000.85,0,0,1,0,0,0,0,0,0,0,0,1
4,42,Shoes,3000.85,1,1,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94456,54,Food & Beverage,5.23,0,1,0,0,0,0,0,0,0,1,0,0
94457,67,Food & Beverage,5.23,1,0,0,0,0,0,1,0,0,0,0,0
94458,26,Food & Beverage,5.23,0,1,0,0,0,0,0,1,0,0,0,0
94459,19,Food & Beverage,5.23,1,0,1,0,1,0,0,0,0,0,0,0


## Preparación

In [5]:
# Target: the `category` column.
y = customer_data['category']
# Features: every remaining column.
# NOTE(review): `price` stays among the features; the random-forest importances
# recorded later in this notebook put it at ~0.94, so confirm it does not
# simply encode the category.
x = customer_data.drop(columns='category')

### Escalamiento

In [6]:
# Rescale every feature column with a min-max scaler and keep the original
# column labels on the result.
# NOTE(review): the scaler is fitted on every row of `x`, so `x_scale` carries
# min/max statistics from rows that may later be held out for testing.
min_max = MinMaxScaler()
x_scale = pd.DataFrame(min_max.fit_transform(x), columns=x.columns)

### División de datos

In [7]:
# Split the raw (unscaled) features so that held-out rows never influence the
# scaler; `x_scale` is not used here because it was fitted on every row.
# The fixed random_state keeps the partition reproducible.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=0)

# Learn each column's min/max from the training rows only, then apply that
# same transformation to both splits. Row labels are preserved so the feature
# frames stay aligned with y_train / y_test.
split_scaler = MinMaxScaler().fit(x_train_raw)
x_train = pd.DataFrame(split_scaler.transform(x_train_raw),
                       columns=x_train_raw.columns, index=x_train_raw.index)
x_test = pd.DataFrame(split_scaler.transform(x_test_raw),
                      columns=x_test_raw.columns, index=x_test_raw.index)

In [9]:
# Report the shape of each split: one heading per split, then its features and target.
resumen = (
    ("Tamaño del conjunto de entrenamiento:", (("x_train:", x_train), ("y_train:", y_train))),
    ("Tamaño del conjunto de prueba:", (("x_test:", x_test), ("y_test:", y_test))),
)
for encabezado, conjuntos in resumen:
    print(encabezado)
    for etiqueta, datos in conjuntos:
        print(etiqueta, datos.shape)

Tamaño del conjunto de entrenamiento:
x_train: (75568, 14)
y_train: (75568,)
Tamaño del conjunto de prueba:
x_test: (18893, 14)
y_test: (18893,)


## Modelos de clasificación

[Test] **GridSearchCV** es una utilidad de scikit-learn para la búsqueda exhaustiva de hiperparámetros. Entrena y evalúa mediante validación cruzada cada combinación de valores definida en la cuadrícula de parámetros y conserva la combinación que obtiene la mejor puntuación.

### [Test] **Random Forest Classifier**

In [10]:
# Exhaustive hyperparameter search for a random forest on the training split.
# random_state pins the forest's internal sampling so reruns pick the same model.
rfc = RandomForestClassifier(random_state=1234)

# Integer grids are built with np.arange (NumPy has no `np.range`).
# max_features starts at 1 because scikit-learn rejects 0; the upper bound is
# the number of feature columns in x_train.
forest_params = [{'n_estimators': [50, 100, 200],
                  'max_depth': np.arange(5, 15),
                  'max_features': np.arange(1, x_train.shape[1] + 1)}]

# Every candidate is fitted once per fold (cv=10), which makes this search
# expensive; n_jobs=-1 spreads the fits over all available cores.
clf = GridSearchCV(rfc, forest_params, cv=10, scoring='accuracy', n_jobs=-1)
clf.fit(x_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print(f'Best hyperparameters are: {clf.best_params_}')
print(f'Best score is: {clf.best_score_}')

# Estimator refitted on the whole training split with the best hyperparameters.
best_rfc = clf.best_estimator_

KeyboardInterrupt: 

### [Test] **DecisionTreeClassifier**

In [None]:
# Exhaustive hyperparameter search for a single decision tree on the training split.
tree_clas = DecisionTreeClassifier(random_state=1024)

# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt', and current scikit-learn releases no longer accept it.
# max_depth uses np.arange (NumPy has no `np.range`).
param_grid = {'max_features': ['sqrt', 'log2'],
              'ccp_alpha': [0.1, .01, .001],
              'max_depth': np.arange(5, 10),
              'criterion': ['gini', 'entropy'],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10],
             }

# 10-fold cross-validated search; verbose=True prints progress while fitting.
dec_tree = GridSearchCV(estimator=tree_clas, param_grid=param_grid, cv=10, verbose=True)
dec_tree.fit(x_train, y_train)

# Estimator refitted on the whole training split with the best hyperparameters.
final_model = dec_tree.best_estimator_

### Naive Bayes

In [8]:
# Three Naive Bayes variants to compare on the same features.
modelGNB = GaussianNB()
# binarize=0.5 thresholds each feature at 0.5 before the Bernoulli model sees it.
modelBNB = BernoulliNB(binarize=0.5)
modelMNB = MultinomialNB()

In [9]:
# Fit each Naive Bayes model on the training split.
modelGNB.fit(x_train, y_train)
modelBNB.fit(x_train, y_train)
modelMNB.fit(x_train, y_train)

In [10]:
# Predict test-split labels with each fitted Naive Bayes model, in the same
# GNB / BNB / MNB order as the target names on the left.
y_modelGNB, y_modelBNB, y_modelMNB = (
    modelo.predict(x_test) for modelo in (modelGNB, modelBNB, modelMNB)
)

In [11]:
# Print the test accuracy of each Naive Bayes model, one line per model.
predicciones_nb = (
    ('modelGNB:', y_modelGNB),
    ('modelBNB:', y_modelBNB),
    ('modelMNB:', y_modelMNB),
)
for etiqueta_nb, prediccion_nb in predicciones_nb:
    print(etiqueta_nb, metrics.accuracy_score(y_test, prediccion_nb))

modelGNB: 0.7728259143598158
modelBNB: 0.4287831471973747
modelMNB: 0.36727888636002753


### DecisionTreeClassifier

In [12]:
# Search space for the decision tree: depth limit (None = unlimited) and the
# minimum sample counts required to split a node / to form a leaf.
param_grid = dict(
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [13]:
# Base decision tree for the grid search; the fixed random_state keeps runs repeatable.
dt = DecisionTreeClassifier(random_state=1234)

In [14]:
# 5-fold cross-validated search over `param_grid`, ranking candidates by accuracy.
grid_search_tree = GridSearchCV(dt, param_grid, scoring='accuracy', cv=5)

In [15]:
# Run the decision-tree grid search on the training split.
grid_search_tree.fit(x_train, y_train)

In [16]:
# Keep the selected tree and the hyperparameter combination that produced it.
dt_model, params_dt = grid_search_tree.best_estimator_, grid_search_tree.best_params_

In [17]:
# Predict labels for the test split with the selected decision tree.
y_pred_dt = dt_model.predict(x_test)

In [18]:
# Test accuracy of the decision tree, printed as a percentage.
accuracy = metrics.accuracy_score(y_test, y_pred_dt)
# NOTE(review): the recorded run prints 100.00%. A perfect score often means a
# feature encodes the target; check `price` before trusting this figure.
print(f"Exactitud del modelo: {accuracy * 100:.2f}%")

Exactitud del modelo: 100.00%


* Cross validation

In [19]:
# Re-score the selected tree with 5-fold cross-validation on the training split.
# NOTE(review): this is the same data the grid search used to pick the tree,
# so these scores are not an independent estimate.
scores_tree = cross_val_score(dt_model, x_train, y_train, cv=5)

In [20]:
# Print the score of each fold, numbering folds from 1.
for fold, score in enumerate(scores_tree, start=1):
    print(f'fold {fold}: {score}')

fold 1: 1.0
fold 2: 1.0
fold 3: 1.0
fold 4: 1.0
fold 5: 1.0


In [21]:
# Mean of the per-fold scores for the decision tree.
print(f'mean acc: {scores_tree.mean()}')

mean acc: 1.0


### Random Forest

In [22]:
# Search space for the random forest: number of trees and depth limit (None = unlimited).
# NOTE(review): this rebinds `param_grid`, replacing the decision-tree grid defined earlier.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
)

In [23]:
# Base random forest for the grid search; the fixed random_state keeps runs repeatable.
rf = RandomForestClassifier(random_state=1234)

In [24]:
# 5-fold cross-validated search over `param_grid`, ranking candidates by accuracy.
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5)

In [25]:
# Run the random-forest grid search on the training split.
grid_search.fit(x_train, y_train)

In [26]:
# Keep the selected forest and the hyperparameter combination that produced it.
rf_model, params = grid_search.best_estimator_, grid_search.best_params_

In [27]:
# Pair each feature name with the importance the fitted forest assigned to it.
importancia = pd.DataFrame(dict(predictor=x.columns,
                                importancia=rf_model.feature_importances_))
# Most influential features first; left as the last expression so the notebook displays it.
importancia.sort_values(by='importancia', ascending=False)

Unnamed: 0,predictor,importancia
1,price,0.940632
0,age,0.047752
2,gender_Male,0.002042
3,payment_method_Credit Card,0.001997
4,payment_method_Debit Card,0.001781
8,shopping_mall_Kanyon,0.000767
7,shopping_mall_Istinye Park,0.000726
10,shopping_mall_Metrocity,0.000701
9,shopping_mall_Mall of Istanbul,0.000696
13,shopping_mall_Zorlu Center,0.000636


In [28]:
# Summarise the random-forest grid search: one row per candidate, showing its
# hyperparameters and its cross-validated score, best candidates first.
resultados = (
    pd.DataFrame(grid_search.cv_results_)
    # Anchored prefixes select the per-parameter columns (`param_...`) and the
    # mean/std test-score columns. Unlike the pattern `param*`, which reads as
    # "para" followed by any number of "m", this does not pick up the `params`
    # dict column, so no separate drop is needed.
    .filter(regex=r'^(param_|mean_t|std_t)')
    .sort_values('mean_test_score', ascending=False)
)
resultados

Unnamed: 0,param_max_depth,param_n_estimators,mean_test_score,std_test_score
1,,100,0.976895,0.002898
2,,200,0.976419,0.001985
0,,50,0.975916,0.002287
8,10.0,200,0.821948,0.002897
7,10.0,100,0.82175,0.005295
6,10.0,50,0.821274,0.00614
5,5.0,200,0.714879,0.008972
4,5.0,100,0.713358,0.015775
3,5.0,50,0.710195,0.017983


In [29]:
# Report the hyperparameter combination selected by the random-forest grid search.
print(f'Los mejores parametros para el RandomForest son: {params}')

Los mejores parametros para el RandomForest son: {'max_depth': None, 'n_estimators': 100}


In [30]:
# Predict labels for the test split with the selected random forest.
y_pred_rf = rf_model.predict(x_test)

In [31]:
# Test accuracy of the random forest, printed as a percentage.
# NOTE(review): this rebinds `accuracy`, which previously held the decision-tree score.
accuracy = metrics.accuracy_score(y_test, y_pred_rf)
print(f"Exactitud del modelo: {accuracy * 100:.2f}%")

Exactitud del modelo: 98.40%


* Cross validation

In [32]:
# Re-score the selected forest with 5-fold cross-validation on the training split.
# NOTE(review): this is the same data the grid search used to pick the forest,
# so these scores are not an independent estimate.
scores = cross_val_score(rf_model, x_train, y_train, cv=5)

In [33]:
# Print the score of each fold, numbering folds from 1.
for fold, score in enumerate(scores, start=1):
    print(f'fold {fold}: {score}')

fold 1: 0.981341802302501
fold 2: 0.9737329628159322
fold 3: 0.9747254201402673
fold 4: 0.9792231853371269
fold 5: 0.9754515979620194


In [34]:
# Mean of the per-fold scores for the random forest.
print(f'mean acc: {scores.mean()}')

mean acc: 0.9768949937115693


### AdaBoost

### Gradient Boosting

### Redes neuronales

## Resultados

In [35]:
# Test-split predictions of every trained model, keyed by the label shown in the table.
predicciones = {
    'GaussianNB': y_modelGNB,
    'MultinomialNB': y_modelMNB,
    'BernoulliNB': y_modelBNB,
    'DecisionTreeClassifier': y_pred_dt,
    'RandomForestClassifier': y_pred_rf,
}

# Model labels and their test accuracies, in matching order.
modelos = list(predicciones)
sc = [metrics.accuracy_score(y_test, y_hat) for y_hat in predicciones.values()]

dic = {'Modelo': modelos, 'Scores': sc}

# Comparison table ordered from the most to the least accurate model.
resultados = pd.DataFrame(dic).sort_values(by='Scores', ascending=False)

resultados

Unnamed: 0,Modelo,Scores
3,DecisionTreeClassifier,1.0
4,RandomForestClassifier,0.983962
0,GaussianNB,0.772826
2,BernoulliNB,0.428783
1,MultinomialNB,0.367279
