# **PREPARACIÓN DE DATOS - DIABETES**


## Importar librerías

In [None]:
# pandas: manejo de dataFrame
import pandas as pd
# sklearn: scikit-learn - Librería de python para Machine Learning
# Normalizar: permite escalar los datos a un rango de [0...1]
from sklearn.preprocessing import MinMaxScaler

## Importación de datos **diabetes.csv**

In [None]:
# Load the diabetes dataset from the CSV file (path is relative to the working directory)
datos = pd.read_csv("diabetes.csv")
# Display the first 10 rows as a quick visual check of the loaded data
datos.head(10)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive
5,55,Male,Yes,Yes,No,Yes,Yes,No,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Positive
6,57,Male,Yes,Yes,No,Yes,Yes,Yes,No,No,No,Yes,Yes,No,No,No,Positive
7,66,Male,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,No,Yes,Yes,No,No,Positive
8,67,Male,Yes,Yes,No,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,No,Yes,Positive
9,70,Male,No,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No,No,No,Yes,No,Positive


# **1. ELIMINACIÓN DE DATOS FALTANTES**

## Determinar si existen datos faltantes

In [None]:
# Show the dataset summary: column names, non-null counts and dtypes
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 non-null    object

In [None]:
# Number of null values for each attribute (column)
datos.isnull().sum()

Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64

In [None]:
# Select the rows that have at least one null attribute.
# The axis is passed by keyword: the positional form `any(1)` is deprecated
# in pandas 1.x (FutureWarning) and rejected by pandas >= 2.0.
nulos = datos[datos.isnull().any(axis=1)]
nulos

  nulos = datos[datos.isnull().any(1)]


Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class


In [None]:
# Report how many rows contain at least one null value
print ('número de registros con valores nulos: ', len(nulos))

número de registros con valores nulos:  0


## Eliminación de valores nulos

**PODEMOS VERIFICAR QUE EN ESTE DATASET NO TENEMOS VALORES NULOS; POR LO TANTO, NO SERÁ NECESARIO ELIMINAR NINGÚN REGISTRO Y TAMPOCO SERÁ NECESARIA LA IMPUTACIÓN DE DATOS FALTANTES.**

# **2. CONVERSIÓN DE DATOS CATEGÓRICOS A NUMÉRICOS**


## Implementación de un módulo que convierte un atributo categórico a numérico utilizando las funciones **astype("category")** y **cat.codes**

In [None]:
def Categorico_a_numerico(atributo):
  """Encode a categorical attribute as integer codes.

  Args:
    atributo: pandas Series holding the categorical values (e.g. "Yes"/"No").

  Returns:
    Series with one integer code per row. For plain string values the codes
    follow the sorted order of the distinct values (e.g. "Female" -> 0,
    "Male" -> 1; "No" -> 0, "Yes" -> 1). Missing values are encoded as -1.
  """
  # One cast to the categorical dtype is sufficient; `cat.codes` then exposes
  # the integer position of each value within the category list.
  return atributo.astype("category").cat.codes

In [None]:
# Data before the categorical-to-numeric conversion
datos.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [None]:
# Encode each categorical attribute as integer codes, one column at a time
# ('Age' is not in the list, so it is left untouched).
columnas_categoricas = [
  'Gender',
  'Polyuria',
  'Polydipsia',
  'sudden weight loss',
  'weakness',
  'Polyphagia',
  'Genital thrush',
  'visual blurring',
  'Itching',
  'Irritability',
  'delayed healing',
  'partial paresis',
  'muscle stiffness',
  'Alopecia',
  'Obesity',
  'class',
]
for columna in columnas_categoricas:
  datos[columna] = Categorico_a_numerico(datos[columna])

In [None]:
# Data after the categorical-to-numeric conversion
datos.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


# **3. NORMALIZACIÓN DE DATOS**

## Implementar la normalización de datos por amplitud y distribución

In [None]:
# Amplitude (min-max) scaling to the range [0..1]
def Normalizacion_Amplitud(atributo):
  """Rescale an attribute linearly so its minimum maps to 0 and its maximum to 1.

  Args:
    atributo: numeric pandas Series (a DataFrame is scaled column by column).

  Returns:
    Object of the same shape with values in [0, 1]. An attribute whose values
    are all equal is mapped to 0.0 instead of NaN.
  """
  minimo = atributo.min()
  rango = atributo.max() - minimo
  # A constant attribute has zero range; dividing by it would turn every row
  # into NaN (0/0). Use a unit divisor instead so the result is 0.0, the same
  # convention scikit-learn's MinMaxScaler applies.
  if isinstance(rango, pd.Series):
    rango = rango.where(rango != 0, 1)
  elif rango == 0:
    rango = 1
  return (atributo - minimo) / rango

In [None]:
# Distribution-based scaling (z-score): zero mean and unit standard deviation.
# The result is NOT bounded to a fixed interval such as [-1..1].
def Normalizacion_Distribucion(atributo):
  """Standardize an attribute by subtracting its mean and dividing by its
  standard deviation.

  Args:
    atributo: numeric pandas Series (a DataFrame is scaled column by column).

  Returns:
    Object of the same shape holding the z-scores. An attribute whose values
    are all equal is mapped to 0.0 instead of NaN.
  """
  # Divide by the standard deviation, not the variance: dividing by the
  # variance does not give unit spread. pandas' default sample estimator
  # (ddof=1) is used.
  desviacion = atributo.std()
  # Zero spread means every value equals the mean; use a unit divisor so the
  # centered values (all 0) are returned rather than 0/0 = NaN.
  if isinstance(desviacion, pd.Series):
    desviacion = desviacion.where(desviacion != 0, 1)
  elif desviacion == 0:
    desviacion = 1
  return (atributo - atributo.mean()) / desviacion

## Normalizar todos los atributos por amplitud

In [None]:
# List the column names of the dataset
datos.columns

Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')

In [None]:
# Apply amplitude (min-max) normalization to every attribute, one column
# at a time.
columnas_a_normalizar = [
  'Age',
  'Gender',
  'Polyuria',
  'Polydipsia',
  'sudden weight loss',
  'weakness',
  'Polyphagia',
  'Genital thrush',
  'visual blurring',
  'Itching',
  'Irritability',
  'delayed healing',
  'partial paresis',
  'muscle stiffness',
  'Alopecia',
  'Obesity',
  'class',
]
for columna in columnas_a_normalizar:
  datos[columna] = Normalizacion_Amplitud(datos[columna])

In [None]:
# Data after the amplitude normalization
datos.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,0.324324,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
1,0.567568,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.337838,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
3,0.391892,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.594595,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# **4. GRABACIÓN DE DATOS PRE-PROCESADOS**


In [None]:
# Persist the pre-processed dataset as a CSV file: comma-separated, with a
# header row and without the row index.
datos.to_csv(
  "datos_diabetes_norm.csv",
  index=False,
  header=True,
  sep=',',
)