In [41]:
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split 

### Como los dos modelos son de clasificación utilizamos la base de datos en la que las variables numéricas (edad y peso) se han agrupado en categorías

In [42]:
ENS_2017_C1_agr = pd.read_csv("../../data/Bases_trabajo/ENS_2017_C1_agr.csv", sep =',')

In [43]:
ENS_2017_C1_agr_R = ENS_2017_C1_agr

In [44]:
ENS_2017_C1_agr_R= ENS_2017_C1_agr.loc[:,['Edad_ag','Actividad_economica_actual', 'Estado_civil', 'Freq_ActividadFísica', 'Freq_Consumo_VerdurasEnsaladasHortalizas','Freq_Consumo_Alcohol', 'Salud_percibida']]

In [45]:
ENS_2017_C1_agr_R.columns

Index(['Edad_ag', 'Actividad_economica_actual', 'Estado_civil',
       'Freq_ActividadFísica', 'Freq_Consumo_VerdurasEnsaladasHortalizas',
       'Freq_Consumo_Alcohol', 'Salud_percibida'],
      dtype='object')

In [46]:
ENS_muestra = ENS_2017_C1_agr_R.sample(n=1500, random_state=101)

In [47]:
X = ENS_muestra[['Edad_ag',
       'Actividad_economica_actual', 
       'Estado_civil', 'Freq_ActividadFísica', 'Freq_Consumo_VerdurasEnsaladasHortalizas', 'Freq_Consumo_Alcohol']]

In [48]:
y = ENS_muestra['Salud_percibida']

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
# Le podemos poner cualquier clasificador. Irá cambiando según va probando pero necesita 1.
pipe = Pipeline(steps=[('classifier', DecisionTreeClassifier())])


decision_tree_params = {
    'classifier': [DecisionTreeClassifier()]
    }

svm_params = {
    'classifier': [svm.SVC()],
    'classifier__kernel':('rbf',), 
    'classifier__C':[10,], 
    'classifier__gamma': [1,]
    }

# hypertuning 
# Create space of candidate learning algorithms and their hyperparameters
search_space = [
    decision_tree_params,
    svm_params
    ]

In [51]:
# Create grid search 
clf = GridSearchCV(estimator=pipe, param_grid=search_space, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_train, y_train)
prediction_train = best_model.predict(X_train)

# View best model
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
# Exactitud del modelo
print ("Exactitud training data:", accuracy_score (y_true = y_train, y_pred = prediction_train))

#SAVE MODEL
# save the model to disk
# filename = 'finished_model.sav'
# pickle.dump(best_model, open(filename, 'wb'))


############################

best estimator: SVC(C=10, gamma=1)

############################

clf.best_params_ {'classifier': SVC(C=10, gamma=1), 'classifier__C': 10, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

############################

clf.best_score 0.6391666666666668
Exactitud training data: 0.8733333333333333


In [52]:
# Create grid search 
clf = GridSearchCV(estimator=pipe, param_grid=search_space, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_test, y_test)
prediction_test = best_model.predict(X_test)

# View best model
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
# Exactitud del modelo
print ("Exactitud test data:", accuracy_score (y_true = y_test, y_pred = prediction_test))

#SAVE MODEL
# save the model to disk
# filename = 'finished_model.sav'
# pickle.dump(best_model, open(filename, 'wb'))


############################

best estimator: SVC(C=10, gamma=1)

############################

clf.best_params_ {'classifier': SVC(C=10, gamma=1), 'classifier__C': 10, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

############################

clf.best_score 0.6433333333333333
Exactitud test data: 0.9433333333333334


## Gridsearch con todas las variables del cluster 1

In [53]:
ENS_2017_C1_agr = pd.read_csv("../../data/Bases_trabajo/ENS_2017_C1_agr.csv", sep =',')

In [54]:
ENS_muestra = ENS_2017_C1_agr.sample(n=5000, random_state=101)

In [55]:
X = ENS_muestra[['CCAA', 'Sexo', 
       'Actividad_economica_actual', 'Nacionalidad_española', 'Convivencia',
       'Estado_civil', 'Nivel_estudios', 'Vacunación_gripe',
       'Toma_tensiónArterial_profesional', 'Medición_colesterol',
       'Prueba_sangreHeces', 'Colonoscopia', 
       'Freq_ActividadFísica', 'Freq_Consumo_FrutaFresca',
       'Freq_Consumo_Carne', 'Freq_Consumo_Huevos', 'Freq_Consumo_Pescado',
       'Freq_Consumo_PastaArrozPatatas', 'Freq_Consumo_PanCereales',
       'Freq_Consumo_VerdurasEnsaladasHortalizas', 'Freq_Consumo_Legumbres',
       'Freq_Consumo_EmbutidosFiambres', 'Freq_Consumo_Lácteos',
       'Freq_Consumo_Dulces', 'Freq_Consumo_ComidaRápida',
       'Freq_Consumo_ZumoNatural', 'Freq_Diaria_CepilladoDientes',
       '¿Fuma actualmente', 'Freq_Consumo_Alcohol',
       'ApoyoAfectivoPersonal_AmigosFamiliares', 'ClaseSocial_BasadaOcupación',
       'Índice_MasaCorporal','Edad_ag', 'Peso_ag']]

In [56]:
y = ENS_muestra['Salud_percibida']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [58]:

pipe = Pipeline(steps=[('classifier', DecisionTreeClassifier())])


decision_tree_params = {
    'classifier': [DecisionTreeClassifier()],
    }

svm_params = {
    'classifier': [svm.SVC()],
    'classifier__kernel':('rbf', ), 
    'classifier__C':[10, ], 
    'classifier__gamma': [1, ]
    }

# hypertuning 
# Create space of candidate learning algorithms and their hyperparameters
search_space = [
    decision_tree_params,
    svm_params
    ]

In [59]:
# Create grid search 
clf = GridSearchCV(estimator=pipe, param_grid=search_space, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_train, y_train)
prediction_train = best_model.predict(X_train)

# View best model
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
# Exactitud del modelo
print ("Exactitud training data:", accuracy_score (y_true = y_train, y_pred = prediction_train))

#SAVE MODEL
# save the model to disk
# filename = 'finished_model.sav'
# pickle.dump(best_model, open(filename, 'wb'))


############################

best estimator: SVC(C=10, gamma=1)

############################

clf.best_params_ {'classifier': SVC(C=10, gamma=1), 'classifier__C': 10, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

############################

clf.best_score 0.6690000000000002
Exactitud training data: 1.0


In [60]:
clf = GridSearchCV(estimator=pipe, param_grid=search_space, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_test, y_test)
prediction_test = best_model.predict(X_test)

# View best model
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
# Exactitud del modelo
print ("Exactitud test data:", accuracy_score (y_true = y_test, y_pred = prediction_test))

#SAVE MODEL
# save the model to disk
# filename = 'finished_model.sav'
# pickle.dump(best_model, open(filename, 'wb'))


############################

best estimator: SVC(C=10, gamma=1)

############################

clf.best_params_ {'classifier': SVC(C=10, gamma=1), 'classifier__C': 10, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

############################

clf.best_score 0.6779999999999999
Exactitud test data: 1.0
