In [20]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [21]:
# Load scikit-learn's bundled diabetes dataset. Later cells index the result
# by 'data', 'target' and 'feature_names'.
diabetes = datasets.load_diabetes()

In [22]:
# Put the feature matrix and the raw target side by side in one DataFrame,
# with the target as the last column.
diabetes_df = pd.DataFrame(
    data=np.column_stack((diabetes['data'], diabetes['target'])),
    columns=[*diabetes['feature_names'], 'target'],
)

In [23]:
# Show the assembled frame (features plus the still-continuous target).
diabetes_df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


In [24]:
# Convert the continuous target into a binary label:
# 1 when strictly above the column median, 0 otherwise.
mediana_objetivo = diabetes_df['target'].median()
es_superior = diabetes_df['target'].gt(mediana_objetivo)
diabetes_df['target'] = es_superior.astype(int)
diabetes_df['target']

0      1
1      0
2      1
3      1
4      0
      ..
437    1
438    0
439    0
440    1
441    0
Name: target, Length: 442, dtype: int64

In [25]:
# Feature matrix: every column except the label.
X = diabetes_df.drop(columns=['target'])
X

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [26]:
# Label vector: the 'target' column, already binarized against its median
# in an earlier cell.
y = diabetes_df['target']
y

0      1
1      0
2      1
3      1
4      0
      ..
437    1
438    0
439    0
440    1
441    0
Name: target, Length: 442, dtype: int64

In [27]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
X_entrena, X_prueba, y_entrena, y_prueba = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Univariate feature selector: keep the 4 features that score highest
# under f_classif.
selector = SelectKBest(f_classif, k=4)

In [29]:
# Fit the selector on the training split only, then reduce that split to
# the selected columns.
X_entrena_seleccionada = selector.fit(X_entrena, y_entrena).transform(X_entrena)
X_entrena_seleccionada


array([[ 0.01211685,  0.0563009 ,  0.03430886,  0.02736405],
       [-0.01806189, -0.03321323,  0.07120998,  0.00027248],
       [ 0.04984027,  0.09761511, -0.00259226,  0.01703607],
       ...,
       [-0.02021751, -0.00567042, -0.03949338, -0.01090325],
       [-0.02345095, -0.04009893, -0.00259226, -0.03845972],
       [ 0.02828403, -0.01599898, -0.03949338, -0.00514219]])

In [30]:
# Apply the already-fitted selector to the test split (transform only, no
# refit), so the test rows do not influence which features are kept.
X_prueba_seleccionada = selector.transform(X_prueba)
X_prueba_seleccionada

array([[-0.00620595, -0.01599898,  0.03430886,  0.03243232],
       [ 0.03690653,  0.02187239, -0.03949338, -0.02251653],
       [-0.00405033, -0.01255612, -0.00259226,  0.08449153],
       [ 0.0519959 ,  0.07926471,  0.14132211,  0.09864806],
       [-0.02021751, -0.00222757,  0.03430886, -0.00514219],
       [-0.02452876, -0.02632753, -0.00259226, -0.02139531],
       [ 0.17055523,  0.01498668,  0.03430886,  0.03365381],
       [ 0.04552903,  0.02187239,  0.03430886,  0.0741909 ],
       [-0.0902753 , -0.05731319, -0.00259226,  0.02405509],
       [ 0.01535029, -0.01944183, -0.00259226, -0.03074792],
       [-0.03315126, -0.02288468, -0.02473293, -0.02595311],
       [-0.05794093, -0.02288468, -0.00259226,  0.04289704],
       [-0.06117437, -0.02632753, -0.0763945 , -0.09393727],
       [ 0.01427248,  0.0631866 ,  0.03430886,  0.04666178],
       [-0.05578531, -0.00222757, -0.03949338, -0.01705628],
       [ 0.00564998, -0.00567042, -0.03949338, -0.04542404],
       [ 0.07139652,  0.

In [31]:
# Two-stage model: standardize the features, then classify with a
# 100-tree random forest (seeded for repeatability).
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

In [32]:
# Train scaler + classifier on the selected training features.
pipeline.fit(X_entrena_seleccionada, y_entrena)

In [33]:
# Predict binary labels for the held-out test rows.
predicciones = pipeline.predict(X_prueba_seleccionada)
predicciones

array([1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1])

In [34]:
# Score the fitted pipeline on the held-out test split.
puntaje = pipeline.score(X_prueba_seleccionada, y_prueba)
puntaje

0.7415730337078652

In [35]:
# 5-fold cross-validation on the RAW features, with feature selection as the
# first pipeline step. cross_val_score refits the whole pipeline per fold, so
# SelectKBest only ever sees that fold's training rows. Cross-validating on
# columns picked by a selector that was fitted beforehand (with labels) would
# leak held-out rows into the feature choice and bias the scores.
# NOTE: re-run this cell and the one below to refresh the displayed scores.
pipeline_cv = Pipeline(
    steps=[
        ('selector', SelectKBest(score_func=f_classif, k=4)),
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)
validacion_cruzada = cross_val_score(pipeline_cv, X, y, cv=5)
validacion_cruzada

array([0.69662921, 0.74157303, 0.67045455, 0.61363636, 0.68181818])

In [36]:
# Average accuracy across the cross-validation folds.
validacion_cruzada.mean()

np.float64(0.6808222676200204)