In [1]:
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split 

### Como los dos modelos son de clasificación utilizamos la base de datos en la que las variables numéricas (edad y peso) se han agrupado en categorías

In [2]:
ENS_2017_C2_agr = pd.read_csv("../../data/Bases_trabajo/ENS_2017_C2_agr.csv", sep =',')

In [3]:
ENS_2017_C2_agr_R = ENS_2017_C2_agr

In [4]:
ENS_2017_C2_agr_R= ENS_2017_C2_agr.loc[:,['Edad_ag','Actividad_economica_actual', 'Freq_ActividadFísica', 'Nivel_estudios','Freq_Consumo_VerdurasEnsaladasHortalizas','ApoyoAfectivoPersonal_AmigosFamiliares', 'Salud_percibida']]

In [5]:
ENS_2017_C2_agr_R.columns

Index(['Edad_ag', 'Actividad_economica_actual', 'Freq_ActividadFísica',
       'Nivel_estudios', 'Freq_Consumo_VerdurasEnsaladasHortalizas',
       'ApoyoAfectivoPersonal_AmigosFamiliares', 'Salud_percibida'],
      dtype='object')

In [6]:
ENS_muestra = ENS_2017_C2_agr_R.sample(n=1500, random_state=101)

In [7]:
X = ENS_muestra[['Edad_ag','Actividad_economica_actual', 'Freq_ActividadFísica', 'Nivel_estudios','Freq_Consumo_VerdurasEnsaladasHortalizas','ApoyoAfectivoPersonal_AmigosFamiliares']]

In [8]:
y = ENS_muestra['Salud_percibida']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Solicitamos con Gridsearch que seleccione el mejor modelo de clasificación entre los dos en los que se ha obtenido mejor score (DecissionTreeClassifier y SVC). Utilizamos en ambos casos los parámetros con los que se han obtenido los mejores score

In [10]:
pipe = Pipeline(steps=[('classifier', DecisionTreeClassifier())])


decision_tree_params = {
    'classifier': [DecisionTreeClassifier()],
    'classifier__max_depth': [3,]
    }

svm_params = {
    'classifier': [svm.SVC()],
    'classifier__kernel':('rbf',), 
    'classifier__C':[10,], 
    'classifier__gamma':[1,]
    }

# hypertuning 
# Create space of candidate learning algorithms and their hyperparameters
search_space = [
    decision_tree_params,
    svm_params
    ]

In [11]:
# Create grid search 
clf = GridSearchCV(estimator=pipe, param_grid=search_space, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_train, y_train)
prediction_train = best_model.predict(X_train)

# View best model
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
# Exactitud del modelo
print ("Exactitud training data:", accuracy_score (y_true = y_train, y_pred = prediction_train))

#SAVE MODEL
# save the model to disk
# filename = 'finished_model.sav'
# pickle.dump(best_model, open(filename, 'wb'))


############################

best estimator: DecisionTreeClassifier(max_depth=3)

############################

clf.best_params_ {'classifier': DecisionTreeClassifier(max_depth=3), 'classifier__max_depth': 3}

############################

clf.best_score 0.6758333333333333
Exactitud training data: 0.6916666666666667


In [12]:
# Create grid search 
clf = GridSearchCV(estimator=pipe, param_grid=search_space, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_test, y_test)
prediction_test = best_model.predict(X_test)

# View best model
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
# Exactitud del modelo
print ("Exactitud test data:", accuracy_score (y_true = y_test, y_pred = prediction_test))


#SAVE MODEL
# save the model to disk
# filename = 'finished_model.sav'
# pickle.dump(best_model, open(filename, 'wb'))


############################

best estimator: DecisionTreeClassifier(max_depth=3)

############################

clf.best_params_ {'classifier': DecisionTreeClassifier(max_depth=3), 'classifier__max_depth': 3}

############################

clf.best_score 0.6200000000000001
Exactitud test data: 0.69


## Gridsearch con todas las variables del cluster 2

In [13]:
ENS_2017_C2_agr = pd.read_csv("../../data/Bases_trabajo/ENS_2017_C2_agr.csv", sep =',')

In [14]:
ENS_muestra = ENS_2017_C2_agr.sample(n=1500, random_state=101)

In [15]:
X = ENS_muestra[['CCAA', 'Sexo', 'Edad_ag',
       'Actividad_economica_actual', 'Nacionalidad_española', 'Convivencia',
       'Estado_civil', 'Nivel_estudios', 'Vacunación_gripe',
       'Toma_tensiónArterial_profesional', 'Medición_colesterol',
       'Prueba_sangreHeces', 'Colonoscopia', 'Peso_ag',
       'Freq_ActividadFísica', 'Freq_Consumo_FrutaFresca',
       'Freq_Consumo_Carne', 'Freq_Consumo_Huevos', 'Freq_Consumo_Pescado',
       'Freq_Consumo_PastaArrozPatatas', 'Freq_Consumo_PanCereales',
       'Freq_Consumo_VerdurasEnsaladasHortalizas', 'Freq_Consumo_Legumbres',
       'Freq_Consumo_EmbutidosFiambres', 'Freq_Consumo_Lácteos',
       'Freq_Consumo_Dulces', 'Freq_Consumo_ComidaRápida',
       'Freq_Consumo_ZumoNatural', 'Freq_Diaria_CepilladoDientes',
       '¿Fuma actualmente', 'Freq_Consumo_Alcohol',
       'ApoyoAfectivoPersonal_AmigosFamiliares', 'ClaseSocial_BasadaOcupación',
       'Índice_MasaCorporal']]

In [16]:
y = ENS_muestra['Salud_percibida']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [18]:
pipe = Pipeline(steps=[('classifier', DecisionTreeClassifier())])


decision_tree_params = {
    'classifier': [DecisionTreeClassifier()],
    'classifier__max_depth': [3,]
    }

svm_params = {
    'classifier': [svm.SVC()],
    'classifier__kernel':('rbf',), 
    'classifier__C':[10,], 
    'classifier__gamma':[1,]
    }

# hypertuning 
# Create space of candidate learning algorithms and their hyperparameters
search_space = [
    decision_tree_params,
    svm_params
    ]

In [19]:

# Create grid search 
clf = GridSearchCV(estimator=pipe, param_grid=search_space, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_train, y_train)
prediction_train = best_model.predict(X_train)

# View best model
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
# Exactitud del modelo
print ("Exactitud training data:", accuracy_score (y_true = y_train, y_pred = prediction_train))

#SAVE MODEL
# save the model to disk
# filename = 'finished_model.sav'
# pickle.dump(best_model, open(filename, 'wb'))


############################

best estimator: SVC(C=10, gamma=1)

############################

clf.best_params_ {'classifier': SVC(C=10, gamma=1), 'classifier__C': 10, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

############################

clf.best_score 0.6683333333333332
Exactitud training data: 1.0


In [20]:
# Create grid search 
clf = GridSearchCV(estimator=pipe, param_grid=search_space, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_test, y_test)
prediction_test = best_model.predict(X_test)

# View best model
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
# Exactitud del modelo
print ("Exactitud test data:", accuracy_score (y_true = y_test, y_pred = prediction_test))


#SAVE MODEL
# save the model to disk
# filename = 'finished_model.sav'
# pickle.dump(best_model, open(filename, 'wb'))


############################

best estimator: DecisionTreeClassifier(max_depth=3)

############################

clf.best_params_ {'classifier': DecisionTreeClassifier(max_depth=3), 'classifier__max_depth': 3}

############################

clf.best_score 0.6599999999999999
Exactitud test data: 0.72
