Información de UCI

UCI dataset_repository es el repostorio de datos de la Universidad de California Irvine, con los datasets más populares para el aprendizaje automático. 

Enlace: [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php)

In [149]:
## instalamos ucimlrepo
#!pip install ucimlrepo

In [182]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
Obesidad = fetch_ucirepo(id=544) 
  
# data (as pandas dataframes) 
X = Obesidad.data.features 
y = Obesidad.data.targets 
  
X= X.drop(['Height','Weight'],axis=1)

# metadata 
#print(Obesidad.metadata) 
  
# variable information 
print(Obesidad.variables) 



                              name     role         type demographic  \
0                           Gender  Feature  Categorical      Gender   
1                              Age  Feature   Continuous         Age   
2                           Height  Feature   Continuous        None   
3                           Weight  Feature   Continuous        None   
4   family_history_with_overweight  Feature       Binary        None   
5                             FAVC  Feature       Binary        None   
6                             FCVC  Feature      Integer        None   
7                              NCP  Feature   Continuous        None   
8                             CAEC  Feature  Categorical        None   
9                            SMOKE  Feature       Binary        None   
10                            CH2O  Feature   Continuous        None   
11                             SCC  Feature       Binary        None   
12                             FAF  Feature   Continuous        

In [151]:
Obesidad.variables.head()

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Gender,Feature,Categorical,Gender,,,no
1,Age,Feature,Continuous,Age,,,no
2,Height,Feature,Continuous,,,,no
3,Weight,Feature,Continuous,,,,no
4,family_history_with_overweight,Feature,Binary,,Has a family member suffered or suffers from o...,,no


In [152]:
#X['FAF'].plot(kind='hist', bins=30, color='c', edgecolor='black', linewidth=1.2, alpha=0.5, label='NCP')

### Tipos de obesidad

In [153]:
#y.value_counts().plot(kind='bar', color='c', edgecolor='black', linewidth=1.2, alpha=0.5, label='NObeyesdad')

In [154]:
#### Hagamos una taba para estudiar las variables

import pandas as pd
import numpy as np

def describe_data(dataframe):
    columnas = dataframe.columns
    categorias = []
    faltantes=[]
    atipicos_iqr=[]
    unicos=[]
    minimo=[]
    q1=[]
    q2=[]
    q3=[]
    maximo=[]


    for col in columnas:
        categoria=dataframe[col].dtype
        categorias.append(categoria)
        faltante=dataframe[col].isna().sum()
        faltantes.append(faltante)
        unico=dataframe[col].nunique()
        unicos.append(unico)
        if categoria == 'int64' or categoria == 'float64':
            q75, q25 = np.percentile(dataframe[col], [75 ,25])
            iqr = q75 - q25
            atipico = dataframe[(dataframe[col] < (q25 - 1.5 * iqr)) | (dataframe[col] > (q75 + 1.5 * iqr))].shape[0]
            atipicos_iqr.append(atipico)
            minimo.append(dataframe[col].min())
            q1.append(dataframe[col].quantile(0.25))
            q2.append(dataframe[col].quantile(0.5))
            q3.append(dataframe[col].quantile(0.75))
            maximo.append(dataframe[col].max())
        else:
            atipicos_iqr.append('No aplica')
            minimo.append('No aplica')
            q1.append('No aplica')
            q2.append('No aplica')
            q3.append('No aplica')
            maximo.append('No aplica')

    resumen = pd.DataFrame({'Variable':columnas, 'Categoria':categorias, 'Faltantes':faltantes, 'Valores_unicos':unicos, 'Minimo':minimo, 'Q1':q1, 'Q2':q2, 'Q3':q3, 'Maximo':maximo, 'Atipicos_iqr':atipicos_iqr})
    return resumen
        
        



In [155]:
#describe_data(X).to_excel('resumen.xlsx', index=False)

### Definición del tipo de variable

In [171]:
Numerica = ["Age","FCVC","NCP","CH2O","TUE"]
Dummy = ["Gender", "family_history_with_overweight", "FAVC", "SMOKE", "SCC"]
Cat_Ordinal = ["CAEC", "FAF", "CALC"]
Cat_Nominal = ["MTRANS"]

X_Numerica = X[Numerica]
X_Dummy = X[Dummy]
X_Cat_Ordinal = X[Cat_Ordinal]
X_Cat_Nominal = X[Cat_Nominal]

X = pd.concat([X_Numerica, X_Dummy, X_Cat_Ordinal, X_Cat_Nominal], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

### Pipeline

In [173]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder,  OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

Trans_dummy_variables = Pipeline(
    steps=[("dummy",  OneHotEncoder()),
              
    ]
)


Trans_num_variables = Pipeline(
    steps=[("scaler", StandardScaler())]
)

Trans_cat_variables = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
              
    ]
)

Trans_nominal_variables = Pipeline(
    steps=[('nominal', OrdinalEncoder()),
              
    ]
)



In [174]:
Preprocesamiento = ColumnTransformer(
    transformers=[
        ("Numerica", Trans_num_variables, X_Numerica),
        ("Dummy", Trans_dummy_variables, X_Dummy),
        ("Ordinal", Trans_cat_variables, X_Cat_Ordinal),
        ("Nominal", Trans_nominal_variables, X_Cat_Nominal),
    ]
)
Preprocesamiento

In [160]:
Modelo = {'Random Forest': RandomForestClassifier(n_estimators = 60)}

Modelo = Pipeline(
    steps=[("Preprocesamiento", Preprocesamiento), ("RF", Modelo['Random Forest'])])
Modelo

In [161]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, random_state = 42)

In [162]:
X_train

Unnamed: 0,Gender,Age,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
162,Female,21.000000,yes,yes,3.000000,3.000000,Always,yes,2.000000,no,2.000000,0.000000,Sometimes,Public_Transportation
2001,Female,20.924956,yes,yes,3.000000,3.000000,Sometimes,no,2.887659,no,1.480919,0.779641,Sometimes,Public_Transportation
1435,Female,22.899740,yes,yes,1.203754,1.355354,Sometimes,no,2.765593,no,0.128342,1.659476,Sometimes,Public_Transportation
649,Female,21.837996,no,no,3.000000,1.696080,Frequently,no,2.550307,no,1.098862,0.000000,no,Public_Transportation
1280,Male,25.994746,yes,yes,3.000000,3.000000,Sometimes,no,2.858171,no,1.813318,0.680215,Sometimes,Public_Transportation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,Male,32.290160,yes,yes,2.967300,3.000000,Sometimes,no,2.530035,no,0.955317,1.339232,Sometimes,Automobile
1095,Male,23.000000,yes,yes,2.000000,1.729553,Sometimes,no,1.400247,no,0.887923,1.011983,Sometimes,Public_Transportation
1130,Female,22.989846,yes,yes,2.000000,3.000000,Sometimes,no,2.000000,no,0.146919,2.000000,no,Public_Transportation
1294,Female,23.000000,yes,yes,2.058687,2.962004,Sometimes,no,2.010596,no,0.851059,0.630866,no,Public_Transportation


In [163]:
X_train

Unnamed: 0,Gender,Age,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
162,Female,21.000000,yes,yes,3.000000,3.000000,Always,yes,2.000000,no,2.000000,0.000000,Sometimes,Public_Transportation
2001,Female,20.924956,yes,yes,3.000000,3.000000,Sometimes,no,2.887659,no,1.480919,0.779641,Sometimes,Public_Transportation
1435,Female,22.899740,yes,yes,1.203754,1.355354,Sometimes,no,2.765593,no,0.128342,1.659476,Sometimes,Public_Transportation
649,Female,21.837996,no,no,3.000000,1.696080,Frequently,no,2.550307,no,1.098862,0.000000,no,Public_Transportation
1280,Male,25.994746,yes,yes,3.000000,3.000000,Sometimes,no,2.858171,no,1.813318,0.680215,Sometimes,Public_Transportation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,Male,32.290160,yes,yes,2.967300,3.000000,Sometimes,no,2.530035,no,0.955317,1.339232,Sometimes,Automobile
1095,Male,23.000000,yes,yes,2.000000,1.729553,Sometimes,no,1.400247,no,0.887923,1.011983,Sometimes,Public_Transportation
1130,Female,22.989846,yes,yes,2.000000,3.000000,Sometimes,no,2.000000,no,0.146919,2.000000,no,Public_Transportation
1294,Female,23.000000,yes,yes,2.058687,2.962004,Sometimes,no,2.010596,no,0.851059,0.630866,no,Public_Transportation


In [164]:
Modelo.fit(X_train, y_train)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

### Pipeline Dummy

### Pipeline Cat_Ordinal

### Pipeline Cat_Nominal