In [27]:
################# SATURDAYSAI #################
### Importación de las librerías necesarias ###
###############################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

from pylab import rcParams

from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedBaggingClassifier
from collections import Counter

rcParams['figure.figsize'] = 14, 8.7 
LABELS = ["1","2"]

from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest

%matplotlib inline

# Carga de datos

In [28]:
### Importación de la Base de datos ###
file = "../Machine Learning Python (ML)/BBDD/EstudioPrevalencia2015.xlsx"
xl = pd.ExcelFile(file)

In [29]:
print(xl.sheet_names)

['BBDD fichero total', 'títulos y códigos', 'Hoja1']


In [30]:
df=xl.parse('BBDD fichero total')

# Limpieza de los datos

In [31]:
# Declaración de funciones para limpiar el Dataframe

def change_99_100(x):
    if (type(x) == int or type(x) == float) and x >= 99.0:
      return None
    else :
      return x

def change_column_P23(x):
    if (type(x) == int or type(x) == float) and x >= 2.0:
      return 2
    else :
      return 1

def cleanDataset(df):
  print("Tamaño del dataframe antes de limpieza ", df.shape)

  new_df = df[df['P23'] <= 3]
  print("Eliminadas filas que no han contestado a la variable objetivo ", new_df.shape )

  new_df = new_df.apply(change_99_100)
  print("Eliminados los valores de 99 o 100", new_df.shape)

  new_df['P23'] = new_df['P23'].apply(change_column_P23)
  print("Unificados Valores de P23 2 y 3 en solo 2", new_df.shape)

  new_df = new_df.dropna(axis=1, how='any')
  print("Eliminadas todas las columnas con al menos un valor nulo", new_df.shape)

  new_df = new_df.drop(['numero', 'modalidad'],1)
  print("Eliminadas columnas que no aportan valor ", new_df.shape)

#   new_df = new_df[['P0R1','P0B','POB1','P0C','P3','P4','P5','P6','P7','P8','P9','P11',
#                    'P13','P15A','P16A','P18A','P18B','P18C','P20','P20','P22','P24','P26',
#                    'P28','P29','P30','P31','P32','P33','P34','P35','P38SP1','P38SP2',
#                    'P38SP3','P38SP4','P38SP5','P38SP6','P38SP7','P38SP8','P38SP9',
#                    'P39','P41','P43','P46','DB0','DB1','DB3','DB4','DB5','DB6','DB7','DB8', 'P23'
#                   ]]
  print("Después del análisis del investigador dejamos las columnas más interesantes ", new_df.shape)

  return new_df

In [32]:
new_df = cleanDataset(df)
# new_df.describe()
print(type(new_df))
print(type(df))
# Variables del experto ya eliminadas previamente
# df[['P50', 'DB2', 'P47', 'P38SP10', 'POF3']].describe()

Tamaño del dataframe antes de limpieza  (6816, 240)
Eliminadas filas que no han contestado a la variable objetivo  (5159, 240)
Eliminados los valores de 99 o 100 (5159, 240)
Unificados Valores de P23 2 y 3 en solo 2 (5159, 240)
Eliminadas todas las columnas con al menos un valor nulo (5159, 127)
Eliminadas columnas que no aportan valor  (5159, 125)
Después del análisis del investigador dejamos las columnas más interesantes  (5159, 125)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [33]:
new_df['P23'].unique()

array([1, 2], dtype=int64)

In [34]:
print(*new_df.columns.to_list(), sep=', ')
# print(new_df.columns.to_list())

P0R1, P0B, POB1, P0B2, P0C, P0F1, POF2, P1P1A, P1P1B, P1P1C, P1P1D, P1P1E, P1P1F, P1P1G, P1P1H, P1P1I, P1P1J, P1P1K, P1P1L, P1P1M, P1P1N, P1P1O, P2P2A, P2P2B, P2P2C, P2P2D, P2P2E, P2P2F, P2P2G, P2P2H, P2P2I, P2P2J, P2P2K, P2P2L, P2P2M, P2P2N, P2P2O, P3, P4, P5, P6, P7, P8, P9, P11, P12A_1, P13, P14A1, P14A2, P14A3, P14A4, P14A5, P14A6, P14A7, P14A8, P14A9, P14A10, P14A11, P14A12, P14A13, P14A14, P14A15, P14A16, P14A17, P15A, P16A, P17P17A, P17P17B, P17P17C, P17P17D, P17P17E, P17P17F, P17P17G, P17P17H, P17P17I, P18A, P18B, P18C, P20, P21, P22, P23, P24, P25P25A, P25P25B, P25P25C, P25P25D, P25P25E, P25P25F, P25P25G, P25P25H, P25P25I, P25P25J, P26, P28, P29, P30, P31, P32, P33, P34, P35, P38SP1, P38SP2, P38SP3, P38SP4, P38SP5, P38SP6, P38SP7, P38SP8, P38SP9, P39, P41, P43, P46, DB0, DB1, DB3, DB4, DB5, DB6, DB7, DB8, PESO, PESO1


In [35]:
new_df.dtypes

P0R1       int64
P0B        int64
POB1       int64
P0B2       int64
P0C        int64
          ...   
DB6        int64
DB7        int64
DB8        int64
PESO     float64
PESO1    float64
Length: 125, dtype: object

In [36]:
new_df=new_df.dropna()

In [37]:
new_df["P23"].value_counts()

1    4820
2     339
Name: P23, dtype: int64

# Creación de los conjuntos de Train y Test

In [38]:
y = new_df['P23']
X = new_df.drop('P23', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)


In [39]:
predictores= X_train.columns.tolist()

In [40]:
print('X train Shape: ' , X_train.shape, '; Y train Shape: ' , y_train.shape)
print('X test Shape: ' , X_test.shape, '; Y test Shape: ' , y_test.shape)

X train Shape:  (3611, 124) ; Y train Shape:  (3611,)
X test Shape:  (1548, 124) ; Y test Shape:  (1548,)


In [41]:
### Variables más importantes ###
class_raw = new_df["P23"]
features_raw = new_df.drop(["P23"], axis = 1)

clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)
clf.fit(features_raw, class_raw)

feature_list = list(zip(new_df.columns.values, clf.feature_importances_))
sorted_by_importance = sorted(feature_list, key=lambda x: x[1], reverse=True)

for feat, value in sorted_by_importance:
    print(feat, value)

P14A6 0.08418690451660359
P15A 0.08325513081716275
P14A5 0.06284130293472906
P14A11 0.056828949377033214
P14A14 0.04638067368785934
P16A 0.042255671747133
P14A2 0.039099581162902505
P14A4 0.037039159096484225
P5 0.0337260050106488
P14A1 0.03313538952855264
P1P1H 0.03117488320150953
P14A3 0.028992509111595458
P14A10 0.028718386172075182
P14A9 0.026997431956394088
P14A8 0.02386412763555732
P4 0.02329297037645778
P23 0.017289279935330228
P1P1K 0.015889920109420665
P14A16 0.014793458481099562
P3 0.01288193767492443
P14A17 0.012399649223242033
P8 0.012328402890510966
P14A7 0.010603888782842276
P14A12 0.010001896804539992
P7 0.008621389858803535
P25P25C 0.006812066025812055
P17P17H 0.006581917156060576
P18C 0.006530312926493012
P2P2I 0.006205701237851759
P25P25G 0.005823857072150132
P22 0.0054277971189488575
P1P1J 0.005257332976554464
P2P2M 0.004942380645560029
P26 0.004688325256127007
P11 0.004333171829586073
P18A 0.004281558440571253
P2P2K 0.004268986046356909
PESO 0.004223162823032384
P17

In [42]:
#### Selección de variables con Boruta ####
from boruta import boruta_py
from sklearn.ensemble import RandomForestClassifier

In [43]:
train_x_b  = X_train
train_y_b = y_train

In [44]:
train_x_b=np.matrix(train_x_b)
train_y_b=np.transpose(np.matrix(train_y_b))

In [45]:
train_y_b.shape

(3611, 1)

In [46]:
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

In [47]:
boruta_feature_selector = boruta_py.BorutaPy(rf, n_estimators='auto', verbose=2, random_state=4242, max_iter = 50, perc = 90)
boruta_feature_selector.fit(train_x_b, train_y_b)

  y = column_or_1d(y, warn=True)
  self.estimator.fit(X, y)


Iteration: 	1 / 50
Confirmed: 	0
Tentative: 	124
Rejected: 	0


  self.estimator.fit(X, y)


Iteration: 	2 / 50
Confirmed: 	0
Tentative: 	124
Rejected: 	0


  self.estimator.fit(X, y)


Iteration: 	3 / 50
Confirmed: 	0
Tentative: 	124
Rejected: 	0


  self.estimator.fit(X, y)


Iteration: 	4 / 50
Confirmed: 	0
Tentative: 	124
Rejected: 	0


  self.estimator.fit(X, y)


Iteration: 	5 / 50
Confirmed: 	0
Tentative: 	124
Rejected: 	0


  self.estimator.fit(X, y)


Iteration: 	6 / 50
Confirmed: 	0
Tentative: 	124
Rejected: 	0


  self.estimator.fit(X, y)


Iteration: 	7 / 50
Confirmed: 	0
Tentative: 	124
Rejected: 	0


  self.estimator.fit(X, y)


Iteration: 	8 / 50
Confirmed: 	40
Tentative: 	41
Rejected: 	43


  self.estimator.fit(X, y)


Iteration: 	9 / 50
Confirmed: 	40
Tentative: 	41
Rejected: 	43


  self.estimator.fit(X, y)


Iteration: 	10 / 50
Confirmed: 	40
Tentative: 	41
Rejected: 	43


  self.estimator.fit(X, y)


Iteration: 	11 / 50
Confirmed: 	40
Tentative: 	41
Rejected: 	43


  self.estimator.fit(X, y)


Iteration: 	12 / 50
Confirmed: 	41
Tentative: 	40
Rejected: 	43


  self.estimator.fit(X, y)


Iteration: 	13 / 50
Confirmed: 	41
Tentative: 	37
Rejected: 	46


  self.estimator.fit(X, y)


Iteration: 	14 / 50
Confirmed: 	41
Tentative: 	37
Rejected: 	46


  self.estimator.fit(X, y)


Iteration: 	15 / 50
Confirmed: 	41
Tentative: 	37
Rejected: 	46


  self.estimator.fit(X, y)


Iteration: 	16 / 50
Confirmed: 	42
Tentative: 	36
Rejected: 	46


  self.estimator.fit(X, y)


Iteration: 	17 / 50
Confirmed: 	42
Tentative: 	33
Rejected: 	49


  self.estimator.fit(X, y)


Iteration: 	18 / 50
Confirmed: 	42
Tentative: 	33
Rejected: 	49


  self.estimator.fit(X, y)


Iteration: 	19 / 50
Confirmed: 	42
Tentative: 	28
Rejected: 	54


  self.estimator.fit(X, y)


Iteration: 	20 / 50
Confirmed: 	42
Tentative: 	28
Rejected: 	54


  self.estimator.fit(X, y)


Iteration: 	21 / 50
Confirmed: 	42
Tentative: 	28
Rejected: 	54


  self.estimator.fit(X, y)


Iteration: 	22 / 50
Confirmed: 	43
Tentative: 	27
Rejected: 	54


  self.estimator.fit(X, y)


Iteration: 	23 / 50
Confirmed: 	43
Tentative: 	27
Rejected: 	54


  self.estimator.fit(X, y)


Iteration: 	24 / 50
Confirmed: 	43
Tentative: 	27
Rejected: 	54


  self.estimator.fit(X, y)


Iteration: 	25 / 50
Confirmed: 	43
Tentative: 	26
Rejected: 	55


  self.estimator.fit(X, y)


Iteration: 	26 / 50
Confirmed: 	43
Tentative: 	24
Rejected: 	57


  self.estimator.fit(X, y)


Iteration: 	27 / 50
Confirmed: 	43
Tentative: 	24
Rejected: 	57


  self.estimator.fit(X, y)


Iteration: 	28 / 50
Confirmed: 	43
Tentative: 	24
Rejected: 	57


  self.estimator.fit(X, y)


Iteration: 	29 / 50
Confirmed: 	43
Tentative: 	24
Rejected: 	57


  self.estimator.fit(X, y)


Iteration: 	30 / 50
Confirmed: 	43
Tentative: 	24
Rejected: 	57


  self.estimator.fit(X, y)


Iteration: 	31 / 50
Confirmed: 	43
Tentative: 	24
Rejected: 	57


  self.estimator.fit(X, y)


Iteration: 	32 / 50
Confirmed: 	43
Tentative: 	22
Rejected: 	59


  self.estimator.fit(X, y)


Iteration: 	33 / 50
Confirmed: 	43
Tentative: 	22
Rejected: 	59


  self.estimator.fit(X, y)


Iteration: 	34 / 50
Confirmed: 	43
Tentative: 	20
Rejected: 	61


  self.estimator.fit(X, y)


Iteration: 	35 / 50
Confirmed: 	43
Tentative: 	20
Rejected: 	61


  self.estimator.fit(X, y)


Iteration: 	36 / 50
Confirmed: 	43
Tentative: 	20
Rejected: 	61


  self.estimator.fit(X, y)


Iteration: 	37 / 50
Confirmed: 	43
Tentative: 	20
Rejected: 	61


  self.estimator.fit(X, y)


Iteration: 	38 / 50
Confirmed: 	43
Tentative: 	20
Rejected: 	61


  self.estimator.fit(X, y)


Iteration: 	39 / 50
Confirmed: 	43
Tentative: 	19
Rejected: 	62


  self.estimator.fit(X, y)


Iteration: 	40 / 50
Confirmed: 	43
Tentative: 	17
Rejected: 	64


  self.estimator.fit(X, y)


Iteration: 	41 / 50
Confirmed: 	43
Tentative: 	17
Rejected: 	64


  self.estimator.fit(X, y)


Iteration: 	42 / 50
Confirmed: 	43
Tentative: 	17
Rejected: 	64


  self.estimator.fit(X, y)


Iteration: 	43 / 50
Confirmed: 	43
Tentative: 	17
Rejected: 	64


  self.estimator.fit(X, y)


Iteration: 	44 / 50
Confirmed: 	43
Tentative: 	17
Rejected: 	64


  self.estimator.fit(X, y)


Iteration: 	45 / 50
Confirmed: 	43
Tentative: 	17
Rejected: 	64


  self.estimator.fit(X, y)


Iteration: 	46 / 50
Confirmed: 	43
Tentative: 	16
Rejected: 	65


  self.estimator.fit(X, y)


Iteration: 	47 / 50
Confirmed: 	43
Tentative: 	16
Rejected: 	65


  self.estimator.fit(X, y)


Iteration: 	48 / 50
Confirmed: 	43
Tentative: 	16
Rejected: 	65


  self.estimator.fit(X, y)


Iteration: 	49 / 50
Confirmed: 	43
Tentative: 	15
Rejected: 	66


BorutaPy finished running.

Iteration: 	50 / 50
Confirmed: 	43
Tentative: 	2
Rejected: 	66


BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                          class_weight='balanced',
                                          criterion='gini', max_depth=5,
                                          max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=217, n_jobs=-1,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x3F0930E048,
                                          verbose=0, warm_start=Fa

In [48]:
boruta_feature_selector.support_

array([ True, False, False,  True, False, False, False, False, False,
       False, False, False, False, False,  True, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True,  True,  True, False,  True,  True, False, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True, False, False,
       False,  True,  True, False,  True, False, False, False, False,
        True,  True,  True,  True, False,  True, False, False,  True,
        True, False, False,  True,  True, False, False,  True,  True,
        True, False, False, False, False, False, False,  True, False,
       False,  True, False, False, False, False, False, False, False,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True])

In [49]:
final_features = list()
indexes = np.where(boruta_feature_selector.support_ == True)
for x in np.nditer(indexes):
    final_features.append(predictores[x])
print(final_features)

['P0R1', 'P0B2', 'P1P1H', 'P1P1K', 'P3', 'P4', 'P5', 'P7', 'P8', 'P12A_1', 'P14A1', 'P14A2', 'P14A3', 'P14A4', 'P14A5', 'P14A6', 'P14A7', 'P14A8', 'P14A9', 'P14A10', 'P14A11', 'P14A14', 'P15A', 'P16A', 'P17P17B', 'P17P17G', 'P17P17H', 'P17P17I', 'P18A', 'P18C', 'P22', 'P24', 'P25P25C', 'P25P25D', 'P25P25G', 'P25P25H', 'P25P25I', 'P32', 'P35', 'P38SP8', 'P38SP9', 'PESO', 'PESO1']


In [50]:
len(boruta_feature_selector.ranking_[boruta_feature_selector.support_ == True])

43

In [51]:
# Las que están en el "1" son las aceptadas por Boruta
importancia_boruta= pd.DataFrame()
importancia_boruta['variables']= predictores
importancia_boruta['ranking']= boruta_feature_selector.ranking_
importancia_boruta = importancia_boruta.sort_values('ranking')
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(importancia_boruta)

    variables  ranking
0        P0R1        1
60     P14A14        1
57     P14A11        1
56     P14A10        1
55      P14A9        1
54      P14A8        1
53      P14A7        1
52      P14A6        1
51      P14A5        1
50      P14A4        1
49      P14A3        1
48      P14A2        1
47      P14A1        1
45     P12A_1        1
42         P8        1
41         P7        1
122      PESO        1
64       P15A        1
65       P16A        1
67    P17P17B        1
109    P38SP9        1
108    P38SP8        1
100       P35        1
97        P32        1
90    P25P25I        1
89    P25P25H        1
88    P25P25G        1
39         P5        1
85    P25P25D        1
81        P24        1
80        P22        1
77       P18C        1
75       P18A        1
74    P17P17I        1
73    P17P17H        1
72    P17P17G        1
84    P25P25C        1
38         P4        1
123     PESO1        1
37         P3        1
14      P1P1H        1
3        P0B2        1
17      P1P