# Missing Values

## 1. Importar paquetes

In [67]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

## 2. Carga de los datos

In [2]:
ruta = 'C:/Users/matia/OneDrive/Escritorio/Water_Quality'
trabajo = 'trabajo_resultado_calidad.pickle'
df = pd.read_pickle(ruta + '/02_Datos/03_Trabajo/' + trabajo)

## 3. Missing Values

In [3]:
df.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [4]:
df.isnull().mean() * 100

ph                 14.987790
Hardness            0.000000
Solids              0.000000
Chloramines         0.000000
Sulfate            23.840049
Conductivity        0.000000
Organic_carbon      0.000000
Trihalomethanes     4.945055
Turbidity           0.000000
Potability          0.000000
dtype: float64

## 4. Imputar los datos

### 4.1 Imputar manual por la media

In [56]:
x = df.drop('Potability', axis = 1)
y = df['Potability']

x = x.fillna(x.mean())

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

model = RandomForestClassifier()
model.fit(x_train, y_train)

In [57]:
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred) * 100
f1 = f1_score(y_test, y_pred) * 100

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 67.1414
F1 Score: 42.2182


### 4.2 Imputar manual por la mediana

In [58]:
x = df.drop('Potability', axis = 1)
y = df['Potability']

x = x.fillna(x.median())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)
model = RandomForestClassifier()
model.fit(x_train, y_train)

In [59]:
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred) * 100
f1 = f1_score(y_test, y_pred) * 100

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 68.2604
F1 Score: 44.6809


### 4.3 Imputar por SimpleImputer utilizando la mediana

In [60]:
imputer = SimpleImputer(strategy = 'median')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

imputer.fit(x_train)
x_train = imputer.transform(x_train)
x_test = imputer.transform(x_test)

model = RandomForestClassifier()
model.fit(x_train, y_train)

In [62]:
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred) * 100
f1 = f1_score(y_test, y_pred) * 100

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 67.3449
F1 Score: 42.5760


### 4.4 Imputar utilizando KNN 

In [65]:
knn_imputer = KNNImputer(n_neighbors = 8)
x_imputed = knn_imputer.fit_transform(x)
x = pd.DataFrame(x_imputed, columns = x.columns)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

model = RandomForestClassifier()
model.fit(x_train, y_train)

In [66]:
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred) * 100
f1 = f1_score(y_test, y_pred) * 100
print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 67.9552
F1 Score: 43.2432


### 4.5 Imputar por MICE

In [68]:
imputer = IterativeImputer(max_iter = 10, random_state = 0)
x_imputed = knn_imputer.fit_transform(x)
x = pd.DataFrame(x_imputed, columns = x.columns)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

model = RandomForestClassifier()
model.fit(x_train, y_train)

In [69]:
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred) * 100
f1 = f1_score(y_test, y_pred) * 100
print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 67.3449
F1 Score: 43.1858
