# Análisis exploratorio de datos

**Objetivos**:

1. Encontrar irregularidades en los datos y determinar qué transformaciones efectuar

    - Duplicados
    - Valores faltantes
    - Posibles errores de registro de datos
    - Desbalance de clases

2. Encontrar relaciones entre variables

    - Predictora - Objetivo
    - Predictora - Predictora

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
# Apply seaborn's default theme to the plots drawn below.
sns.set()

## Cargar datos

In [3]:
# Load the dataset from CSV and preview its first rows.
datos = pd.read_csv("datos/heart.csv")
datos.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Revisar integridad de datos

In [67]:
# Duplicate records
def obtener_duplicados(datos, mantener_ultima_col=True, mantener_duplicado='first'):
    """Find duplicated rows and return a de-duplicated copy of the data.

    Args:
        datos: DataFrame to inspect.
        mantener_ultima_col: When True, every column takes part in the
            comparison. When False, the last column is left out, so rows that
            agree on all the other columns count as duplicates of each other.
        mantener_duplicado: Which occurrence of each duplicated row survives in
            the cleaned frame. Forwarded as ``keep`` to
            ``DataFrame.drop_duplicates`` ('first', 'last' or False).

    Returns:
        A tuple ``(n_groups, groups, cleaned)`` where ``groups`` has one row
        per duplicated combination of values plus a ``size`` column with its
        number of occurrences, ``n_groups`` is the number of rows in
        ``groups``, and ``cleaned`` is ``datos`` with duplicates dropped.
    """
    subset = list(datos.columns if mantener_ultima_col else datos.columns[:-1])

    # keep=False flags every member of a duplicated group, not just the repeats,
    # so the group sizes below count all occurrences.
    duplicados = datos.duplicated(subset=subset, keep=False)
    datos_duplicados = datos[duplicados].groupby(by=subset, as_index=False).size()

    num_registros_con_duplicados = datos_duplicados.shape[0]

    # drop_duplicates takes the policy through its `keep` argument.
    datos_sin_duplicados = datos.drop_duplicates(subset=subset, keep=mantener_duplicado)

    return num_registros_con_duplicados, datos_duplicados, datos_sin_duplicados

# Leave the last column out of the comparison, so rows that match on every
# other column are reported even when their last-column values differ.
obtener_duplicados(datos, mantener_ultima_col=False)

(0,
 Empty DataFrame
 Columns: [Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope, size]
 Index: [],
      Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  \
 0     40   M           ATA        140          289          0     Normal   
 1     49   F           NAP        160          180          0     Normal   
 2     37   M           ATA        130          283          0         ST   
 3     48   F           ASY        138          214          0     Normal   
 4     54   M           NAP        150          195          0     Normal   
 ..   ...  ..           ...        ...          ...        ...        ...   
 913   45   M            TA        110          264          0     Normal   
 914   68   M           ASY        144          193          1     Normal   
 915   57   M           ASY        130          131          0     Normal   
 916   57   F           ATA        130          236          0    

In [69]:
# Run the helper on a second CSV (datos/tmp.csv), comparing all columns.
caca = pd.read_csv("datos/tmp.csv")
obtener_duplicados(caca)

(3,
    col1  col2  col3  size
 0     0     0     0     3
 1     0     1     0     5
 2     1     1     1     2,
    col1  col2  col3
 0     1     1     1
 2     1     1     0
 3     1     0     1
 4     1     0     0
 5     0     1     0
 6     0     1     1
 7     0     0     1
 8     0     0     0)

## Separar conjuntos de entrenamiento y prueba

In [4]:
# Every column but the last is a predictor; the last column is the target.
X, y = datos.iloc[:, :-1], datos.iloc[:, -1]

# Hold out 20% of the rows for testing, stratifying on the target and fixing
# the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

## Explorar datos de entrenamiento

In [None]:
# Explore the training split only, so rows from the held-out test set do not
# influence decisions made during exploration.
df = X_train.copy()
# y_train shares X_train's index, so the assignment aligns row by row.
df['HeartDisease'] = y_train
sns.pairplot(
    df, hue='HeartDisease',
    kind='scatter',
    diag_kind='hist'
)
plt.show()