In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn

# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split


In [2]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# evaluate random forest algorithm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold
from sklearn.ensemble import RandomForestClassifier

# Modelos mixtos. Admiten ser utilizados para regresión y clasificación

In [3]:
ENS_2017_C2 = pd.read_csv("../../data/Bases_trabajo/ENS_2017_C2.csv", sep =',')

In [4]:
X = ENS_2017_C2[['CCAA', 'Sexo', 'Edad',
       'Actividad_economica_actual', 'Nacionalidad_española', 'Convivencia',
       'Estado_civil', 'Nivel_estudios', 'Vacunación_gripe',
       'Toma_tensiónArterial_profesional', 'Medición_colesterol',
       'Prueba_sangreHeces', 'Colonoscopia', 'Peso(Kg)',
       'Freq_ActividadFísica', 'Freq_Consumo_FrutaFresca',
       'Freq_Consumo_Carne', 'Freq_Consumo_Huevos', 'Freq_Consumo_Pescado',
       'Freq_Consumo_PastaArrozPatatas', 'Freq_Consumo_PanCereales',
       'Freq_Consumo_VerdurasEnsaladasHortalizas', 'Freq_Consumo_Legumbres',
       'Freq_Consumo_EmbutidosFiambres', 'Freq_Consumo_Lácteos',
       'Freq_Consumo_Dulces', 'Freq_Consumo_ComidaRápida',
       'Freq_Consumo_ZumoNatural', 'Freq_Diaria_CepilladoDientes',
       '¿Fuma actualmente', 'Freq_Consumo_Alcohol',
       'ApoyoAfectivoPersonal_AmigosFamiliares', 'ClaseSocial_BasadaOcupación',
       'Índice_MasaCorporal']]

In [5]:
y = ENS_2017_C2['Salud_percibida']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

## Utilización de los modelos para regresión

## Regression tree

In [7]:
dtr = DecisionTreeRegressor(max_depth=3)
dtr.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=3)

In [8]:
dtr.score(X_train,y_train)

0.18284143567894107

In [9]:
predictions_train = dtr.predict(X_train)
errores_train = abs(predictions_train - y_train)
np.mean(errores_train)

0.46955656810896185

In [10]:
prop = 100 * (errores_train/y_train)
mape = np.mean(prop)
mape

32.95450136104199

In [11]:
import_dict = {
    'features': X_train.columns,
    'importance': dtr.feature_importances_
}

pd.DataFrame(import_dict).sort_values('importance', ascending=False)

Unnamed: 0,features,importance
2,Edad,0.681889
14,Freq_ActividadFísica,0.199433
7,Nivel_estudios,0.063971
3,Actividad_economica_actual,0.038458
31,ApoyoAfectivoPersonal_AmigosFamiliares,0.016248
0,CCAA,0.0
25,Freq_Consumo_Dulces,0.0
21,Freq_Consumo_VerdurasEnsaladasHortalizas,0.0
22,Freq_Consumo_Legumbres,0.0
23,Freq_Consumo_EmbutidosFiambres,0.0


In [12]:
dtr = DecisionTreeRegressor(max_depth=3)
dtr.fit(X_test, y_test)

DecisionTreeRegressor(max_depth=3)

In [13]:
dtr.score(X_test,y_test)

0.1868855423376129

In [14]:
predictions_test = dtr.predict(X_test)
errores_test = abs(predictions_test - y_test)
np.mean(errores_test)

0.4696244140412249

In [15]:
prop = 100 * (errores_test/y_test)
mape = np.mean(prop)
mape

33.00173814339996

In [16]:
import_dict = {
    'features': X_test.columns,
    'importance': dtr.feature_importances_
}

pd.DataFrame(import_dict).sort_values('importance', ascending=False)

Unnamed: 0,features,importance
2,Edad,0.63983
14,Freq_ActividadFísica,0.26203
31,ApoyoAfectivoPersonal_AmigosFamiliares,0.057568
21,Freq_Consumo_VerdurasEnsaladasHortalizas,0.040572
0,CCAA,0.0
25,Freq_Consumo_Dulces,0.0
20,Freq_Consumo_PanCereales,0.0
22,Freq_Consumo_Legumbres,0.0
23,Freq_Consumo_EmbutidosFiambres,0.0
24,Freq_Consumo_Lácteos,0.0


## Random Forest Regressor

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

model = RandomForestRegressor(n_estimators=500,
                               max_depth = 5,
                               random_state=42)

model.fit(X_train, y_train)

y_pred_reg = model.predict(X_train)
mean_absolute_error(y_train, y_pred_reg)

0.4367019701886198

In [18]:
model.score(X_train,y_train)

0.2911435279901874

In [19]:
model = RandomForestRegressor(n_estimators=500,
                               max_depth = 5,
                               random_state=42)

model.fit(X_test, y_test)

y_pred_reg = model.predict(X_test)
mean_absolute_error(y_test, y_pred_reg)

0.399038184419816

In [20]:
model.score(X_test,y_test)

0.43043904650215326

## Utilización de los modelos para clasificación

## Classification tree

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics


tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_train)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred_tree))


Accuracy: 1.0


In [22]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_test, y_test)
y_pred_tree = tree_clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_tree))



Accuracy: 1.0


## Random Forest Classifier

In [23]:
from sklearn.model_selection import KFold

# define the model
model = RandomForestClassifier()
# evaluate the model 
# THIS DOESN'T TRAIN THE MODEL
k_fold = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=k_fold, n_jobs=-1, error_score='raise')
# report performance
print(n_scores)
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

[0.69318182 0.68506494 0.70616883 0.70454545 0.69967532 0.66396104
 0.68780488 0.70081301 0.69593496 0.68617886 0.68019481 0.67207792
 0.68506494 0.68506494 0.69318182 0.69805195 0.70406504 0.70569106
 0.69430894 0.69756098]
Accuracy: 0.692 (0.011)


In [24]:
max(n_scores)

0.7061688311688312

In [25]:
min(n_scores)

0.663961038961039

In [26]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators = 500,
                                max_depth = 4,
                                random_state = 42)

rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_train)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred_rf))

Accuracy: 0.6775503573749188


In [27]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators = 500,
                                max_depth = 4,
                                random_state = 42)

rnd_clf.fit(X_test, y_test)

y_pred_rf = rnd_clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_rf))

### .....................................................

## Conclusion. Modelos mixtos (Regresión y clasificación)

### Regresión. Los dos modelos de regresión son los que obtienen peores resultados.

###     - Regression tree. Score_train = .1748, Score_test = .1899
###     - Random Forest Regressor. Score_train = .2703, Score_test = .3762

### Clasificación. Entre los dos modelos de clasificación, el de Classification tree es que mejor clasifica los registros.
###     - Classification tree. Accuracy_train = 1.0, Accuracy_test = 1.0
###     - Random Forest Classifier. Accuracy_train = .6771, Accuracy_test = .6843

### Las variables que tienen mayor peso al realizar la clasificación son, por orden de importancia: Edad, Frecuencia de Actividad física, Actividad económica actual, Frecuencia de consumo de alcohol. En la muestra de test se incluyen después de las indicadas: Estado civil y Frecuencia de consumo de verduras, ensaladas y hortalizas