In [None]:
import pandas as pd
import numpy as np
# Fixed seed, passed as random_state to every shuffle in hold_out_estratificado
# so that the train/test split is reproducible.
seed = 42

## Obtener los datos del "Credit Approval Dataset"

In [75]:
# Load the Credit Approval CSV; '?' entries are parsed as missing values (NaN).
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# not run on another machine until it points to a configurable/relative location.
ruta = "/home/jair/Escritorio/Tec. Av. Patrones/Tareas/Tarea1/datasets/crx.csv"
df = pd.read_csv(ruta, na_values = '?')
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Class
0,b,3083.0,0.0,u,g,w,v,125.0,t,t,1,f,g,202.0,0,positive
1,a,5867.0,446.0,u,g,q,h,304.0,t,t,6,f,g,43.0,560,positive
2,a,245.0,5.0,u,g,q,h,15.0,t,f,0,f,g,280.0,824,positive
3,b,2783.0,154.0,u,g,w,v,375.0,t,t,5,t,g,100.0,3,positive
4,b,2017.0,5625.0,u,g,w,v,171.0,t,f,0,f,s,120.0,0,positive


In [76]:
df.shape
# 690 rows (vectors)
# 16 columns: 15 features plus the class label

(690, 16)

## Limpieza de nulos, repetidos e indiscernibles

In [None]:
# Missing values: drop every row that contains at least one NaN.
df = df.dropna()

# Exact duplicates: keep only the first occurrence of each repeated row.
df = df.drop_duplicates(keep = 'first')

# Indiscernible rows: identical feature values but a different class label.
# Exact duplicates are already gone, so rows that still share all feature
# values must disagree on the class; keep = False discards every one of them.
#
# The class column is located by its stripped name so the cell keeps working
# when it is re-run after the column names have lost their leading space.
# If no such column exists we fall back to the original label, so the drop
# below raises the same KeyError as before.
columna_clase = next((col for col in df.columns if col.strip() == 'Class'), ' Class')
características = df.drop(columna_clase, axis = 1)

# `subset` is documented to take column labels, so pass the label list
# explicitly instead of the feature DataFrame itself.
df = df[~df.duplicated(subset = características.columns.tolist(), keep = False)]

In [78]:
df.shape
# 653 rows (vectors)
# 16 columns: 15 features plus the class label
# 37 rows were removed (690 - 653)

(653, 16)

In [79]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Class
0,b,3083.0,0.0,u,g,w,v,125.0,t,t,1,f,g,202.0,0,positive
1,a,5867.0,446.0,u,g,q,h,304.0,t,t,6,f,g,43.0,560,positive
2,a,245.0,5.0,u,g,q,h,15.0,t,f,0,f,g,280.0,824,positive
3,b,2783.0,154.0,u,g,w,v,375.0,t,t,5,t,g,100.0,3,positive
4,b,2017.0,5625.0,u,g,w,v,171.0,t,f,0,f,s,120.0,0,positive


## Separar numéricas y categóricas

In [80]:
# Inspect the column labels before splitting them into numeric and categorical.
df.columns

Index(['A1', ' A2', ' A3', ' A4', ' A5', ' A6', ' A7', ' A8', ' A9', ' A10',
       ' A11', ' A12', ' A13', ' A14', ' A15', ' Class'],
      dtype='object')

In [81]:
# The column names carry a stray space; strip surrounding whitespace from them.
df.columns = df.columns.str.strip()
df.columns

Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11',
       'A12', 'A13', 'A14', 'A15', 'Class'],
      dtype='object')

In [84]:
# Columns handled as numeric features in this notebook.
numericos = ['A2','A3','A8','A11','A14','A15']
numericos

['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

In [85]:
# Every remaining column except the target 'Class' is handled as categorical.
categoricos = [col for col in df.columns if col not in numericos and col != 'Class']
categoricos

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

## Corregir formato de numéricos y categóricos

In [90]:
# Give each group a uniform dtype: numeric features as float, categorical ones as str.
df_num = df[numericos].astype(float)
df_cat = df[categoricos].astype(str)
df_cat.dtypes

A1     object
A4     object
A5     object
A6     object
A7     object
A9     object
A10    object
A12    object
A13    object
dtype: object

## Hold Out Estratificado

In [None]:
def hold_out_estratificado(df, proporcion = 0.8, target_col = 'Class', random_state = None):
    """Split ``df`` into stratified train/test partitions (hold-out).

    Each class found in ``target_col`` is shuffled and cut separately, so both
    partitions keep (up to rounding) the class proportions of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features and the ``target_col`` column.
    proporcion : float, default 0.8
        Fraction of each class sent to the training partition; must lie in
        [0, 1]. The per-class training size is ``int(n_class * proporcion)``,
        i.e. it is truncated, so the remainder goes to the test partition.
    target_col : str, default 'Class'
        Name of the column holding the class label.
    random_state : optional
        Seed (or anything ``DataFrame.sample`` accepts as ``random_state``)
        used for every shuffle. When ``None`` the notebook-level ``seed``
        variable is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    X_Train, Y_Train, X_Test, Y_Test
        Feature frames (without ``target_col``) and the matching target
        series. Within each partition X and Y share the same shuffled index.

    Raises
    ------
    ValueError
        If ``proporcion`` is outside [0, 1].

    Notes
    -----
    Rows whose target is missing (NaN) never match the per-class equality
    filter, so they end up in neither partition.
    """
    # A negative value would make ``iloc[:n]`` slice from the end and silently
    # produce a wrong split; reject out-of-range proportions up front.
    if not 0 <= proporcion <= 1:
        raise ValueError(f"proporcion must be within [0, 1], got {proporcion!r}")

    # Fall back to the notebook-level seed so existing calls keep producing
    # exactly the same split.
    if random_state is None:
        random_state = seed

    train_list = []
    test_list = []

    for clase in df[target_col].unique():
        # Isolate the rows of this class and shuffle them before cutting, so
        # the cut does not depend on the original row order.
        df_clase = (
            df[df[target_col] == clase]
            .sample(frac = 1, random_state = random_state)
            .reset_index(drop = True)
        )
        n_filas = int(len(df_clase) * proporcion)

        train_list.append(df_clase.iloc[:n_filas, :])  # leading share -> train
        test_list.append(df_clase.iloc[n_filas:, :])   # remainder     -> test

    if not train_list:
        # No classes at all (e.g. an empty frame): pd.concat([]) would raise,
        # so return empty partitions with the expected columns instead.
        vacio = df.iloc[0:0]
        return (
            vacio.drop(columns = target_col),
            vacio[target_col],
            vacio.drop(columns = target_col),
            vacio[target_col],
        )

    # Stack the per-class pieces into the final partitions.
    train_df = pd.concat(train_list).reset_index(drop = True)
    test_df = pd.concat(test_list).reset_index(drop = True)

    # The stacked frames are grouped by class; shuffle once more to mix them.
    train_df = train_df.sample(frac = 1, random_state = random_state)
    test_df = test_df.sample(frac = 1, random_state = random_state)

    # Separate features from the target.
    X_Train = train_df.drop(columns = target_col)
    Y_Train = train_df[target_col]
    X_Test = test_df.drop(columns = target_col)
    Y_Test = test_df[target_col]

    return X_Train, Y_Train, X_Test, Y_Test