In [237]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

seed = 42

## Obtener los datos del "Credit Approval Dataset"

In [238]:
ruta = "/home/jair/Escritorio/Tec. Av. Patrones/Tareas/Tarea1/datasets/crx.csv"
df = pd.read_csv(ruta, na_values = '?')
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Class
0,b,3083.0,0.0,u,g,w,v,125.0,t,t,1,f,g,202.0,0,positive
1,a,5867.0,446.0,u,g,q,h,304.0,t,t,6,f,g,43.0,560,positive
2,a,245.0,5.0,u,g,q,h,15.0,t,f,0,f,g,280.0,824,positive
3,b,2783.0,154.0,u,g,w,v,375.0,t,t,5,t,g,100.0,3,positive
4,b,2017.0,5625.0,u,g,w,v,171.0,t,f,0,f,s,120.0,0,positive


In [239]:
df.shape
# 690 vectores
# 15 características

(690, 16)

In [240]:
# Tienen un espacio los nombres de las oclumnas
df.columns = df.columns.str.strip()
df.columns

Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11',
       'A12', 'A13', 'A14', 'A15', 'Class'],
      dtype='object')

## Ubicar valores nulos, repetidos e indiscernibles

In [241]:
# Valores nulos
df.isnull().sum()

A1       12
A2       12
A3        0
A4        6
A5        6
A6        9
A7        9
A8        0
A9        0
A10       0
A11       0
A12       0
A13       0
A14      13
A15       0
Class     0
dtype: int64

In [242]:
# Valores repetidos
df.duplicated().sum()

np.int64(0)

In [243]:
# Valores indiscernibles
X = df.drop('Class', axis = 1)
Y = df['Class']

df_copia = df.copy()

indiscernibles = df_copia.groupby(list(X.columns))['Class'].nunique()
indiscernibles = indiscernibles[indiscernibles > 1]
len(indiscernibles)


0

## Limpieza de nulos, repetidos e indiscernibles  

In [244]:
# nulos
df = df.dropna()

# repetidos
df = df.drop_duplicates(keep = 'first')

# indiscernibles
mask = df.groupby(list(X.columns))['Class'].transform('nunique')>1
df = df[~mask]

df.shape

(653, 16)

In [246]:
df.shape
# 653 vectores
# 15 características
# Se eliminaron 59 vectores

(653, 16)

In [247]:
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Class
0,b,3083.0,0.0,u,g,w,v,125.0,t,t,1,f,g,202.0,0,positive
1,a,5867.0,446.0,u,g,q,h,304.0,t,t,6,f,g,43.0,560,positive
2,a,245.0,5.0,u,g,q,h,15.0,t,f,0,f,g,280.0,824,positive
3,b,2783.0,154.0,u,g,w,v,375.0,t,t,5,t,g,100.0,3,positive
4,b,2017.0,5625.0,u,g,w,v,171.0,t,f,0,f,s,120.0,0,positive


In [248]:
# Etiqueta de clase numérica con un diccionario
df['Class'] = df['Class'].replace({'positive': 1, 'negative':0})
df.head()

  df['Class'] = df['Class'].replace({'positive': 1, 'negative':0})


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Class
0,b,3083.0,0.0,u,g,w,v,125.0,t,t,1,f,g,202.0,0,1
1,a,5867.0,446.0,u,g,q,h,304.0,t,t,6,f,g,43.0,560,1
2,a,245.0,5.0,u,g,q,h,15.0,t,f,0,f,g,280.0,824,1
3,b,2783.0,154.0,u,g,w,v,375.0,t,t,5,t,g,100.0,3,1
4,b,2017.0,5625.0,u,g,w,v,171.0,t,f,0,f,s,120.0,0,1


## Separar numéricas, categóricos y las clases

In [249]:
numericos = ['A2','A3','A8','A11','A14','A15']
numericos

['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

In [250]:
categoricos = [col for col in df.columns if col not in numericos and col != 'Class']
categoricos

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [251]:
clases = df['Class']
clases = pd.DataFrame(clases)
clases.head()

Unnamed: 0,Class
0,1
1,1
2,1
3,1
4,1


## Corregir formato de numéricos y categóricos

In [252]:
df[numericos] = df[numericos].astype(float)
df[categoricos] = df[categoricos].astype(str)
df.dtypes

A1        object
A2       float64
A3       float64
A4        object
A5        object
A6        object
A7        object
A8       float64
A9        object
A10       object
A11      float64
A12       object
A13       object
A14      float64
A15      float64
Class      int64
dtype: object

## Pasar las características Categoricas a Numéricas

In [255]:
# Aplicar Bynary encoding a el df

encoder = ce.BinaryEncoder(cols = categoricos)
df_encoded = encoder.fit_transform(df[categoricos])

df_encoded.head()

Unnamed: 0,A1_0,A1_1,A4_0,A4_1,A5_0,A5_1,A6_0,A6_1,A6_2,A6_3,...,A7_2,A7_3,A9_0,A9_1,A10_0,A10_1,A12_0,A12_1,A13_0,A13_1
0,0,1,0,1,0,1,0,0,0,1,...,0,1,0,1,0,1,0,1,0,1
1,1,0,0,1,0,1,0,0,1,0,...,1,0,0,1,0,1,0,1,0,1
2,1,0,0,1,0,1,0,0,1,0,...,1,0,0,1,1,0,0,1,0,1
3,0,1,0,1,0,1,0,0,0,1,...,0,1,0,1,0,1,1,0,0,1
4,0,1,0,1,0,1,0,0,0,1,...,0,1,0,1,1,0,0,1,1,0


## Estandarización de las características numericas


In [257]:
# Estandarizar para que no afecte a los algoritmos basados en distancias como el SVM o los KNN
scaler = StandardScaler()

df_numericos_scaled = scaler.fit_transform(df[numericos])
df_numericos_scaled = pd.DataFrame(df_numericos_scaled, columns = numericos)
df_numericos_scaled.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15
0,0.260028,-0.391141,-0.244389,-0.302596,0.128682,-0.193125
1,2.052963,-0.24821,-0.11854,0.704516,-0.816802,-0.086443
2,-1.567684,-0.389539,-0.321726,-0.504019,0.592504,-0.03615
3,0.066824,-0.341788,-0.068623,0.503093,-0.477855,-0.192553
4,-0.426491,1.41153,-0.212048,-0.504019,-0.358926,-0.193125


## Unir los categoricos y los numericos a estandarizados con la clase en un df final

In [260]:
df_final = pd.concat([df_encoded.reset_index(drop = True), 
                      df_numericos_scaled.reset_index(drop = True), 
                      clases.reset_index(drop = True)], axis = 1)
df_final.head()

Unnamed: 0,A1_0,A1_1,A4_0,A4_1,A5_0,A5_1,A6_0,A6_1,A6_2,A6_3,...,A12_1,A13_0,A13_1,A2,A3,A8,A11,A14,A15,Class
0,0,1,0,1,0,1,0,0,0,1,...,1,0,1,0.260028,-0.391141,-0.244389,-0.302596,0.128682,-0.193125,1
1,1,0,0,1,0,1,0,0,1,0,...,1,0,1,2.052963,-0.24821,-0.11854,0.704516,-0.816802,-0.086443,1
2,1,0,0,1,0,1,0,0,1,0,...,1,0,1,-1.567684,-0.389539,-0.321726,-0.504019,0.592504,-0.03615,1
3,0,1,0,1,0,1,0,0,0,1,...,0,0,1,0.066824,-0.341788,-0.068623,0.503093,-0.477855,-0.192553,1
4,0,1,0,1,0,1,0,0,0,1,...,1,1,0,-0.426491,1.41153,-0.212048,-0.504019,-0.358926,-0.193125,1


## Guardar Dataset FInal

In [263]:
df_final.to_csv('datasets/crx_limpio.csv', index = False)