## Caso: Consumo de energía

## Lectura de datos

In [None]:
import pandas as pd
import numpy  as np
import plotly.express as px

In [None]:
#from google.colab import drive
#drive.mount('/gdrive')

In [None]:
# Load the energy-consumption CSV; parse_dates=[0] asks pandas to parse the
# first column as datetimes.
# NOTE(review): the path lives under /gdrive, and the cell that mounts Google
# Drive is commented out — this only works if Drive is already mounted; confirm.
energydata = pd.read_csv('/gdrive/My Drive/Colab Notebooks/datos/KAG_energydata_complete.csv', parse_dates=[0])

In [None]:
# Preview the first three rows to check that the file was parsed as expected.
energydata.head(3)

## Estadística descriptiva

In [None]:
# Descriptive statistics, one row per variable (transposed describe()).
# The summary is restricted to numeric columns: recent pandas versions also
# summarise datetime columns by default, which reorders the statistics and puts
# Timestamp values in the table, breaking the arithmetic done on it afterwards.
resumen = energydata.describe(include=[np.number]).T
resumen

In [None]:
# Coefficient of variation (%): standard deviation relative to the mean.
# The statistics are selected by label instead of by position, so the result
# does not silently use the wrong column if describe() emits them in a
# different order (e.g. when a datetime column is part of the summary).
resumen['cv'] = resumen['std'] / resumen['mean'] * 100
resumen

In [None]:
# Pearson's second skewness coefficient: 3 * (mean - median) / std.
# '50%' is the median row produced by describe(); label-based access avoids
# depending on the positional order of the statistics.
resumen['as'] = 3 * (resumen['mean'] - resumen['50%']) / resumen['std']
resumen

In [None]:
# Kurtosis
# The column must come from kurt(), not skew(): skew() measures asymmetry, not
# tail weight. pandas' kurt() reports excess kurtosis (a normal distribution
# scores 0), so 3 is added to make the value comparable with the classic
# threshold of 3 used in the interpretation notes of this notebook.
# numeric_only=True keeps the datetime column out of the computation.
resumen['cur'] = energydata.kurt(numeric_only=True) + 3
resumen

## Interpretaciones de estadísticos descriptivos

#### La media del gasto energético es de 97.7 Wh (Watt-hora)
#### La desviación estándar indica que el gasto energético se aleja de la media, en promedio, en 102 Wh (Watt-hora)
#### Appliances tiene un valor de curtosis de 3.38 (>3), entonces su distribución es leptocúrtica

## Gráficos estadísticos y valores atípicos

In [None]:
# Box plot of Appliances on the full dataset, used to eyeball outliers.
fig = px.box(energydata, y="Appliances" , title="Box plot of Appliances")
fig.show()

In [None]:
# plotly.express is already imported as px at the top of the notebook.
# Histogram of Appliances on the full dataset.
fig = px.histogram(energydata, x ="Appliances" , title="Histogram of Appliances")
fig.show()

In [None]:
# Pull the Appliances column out as a plain NumPy array for the percentile
# calculations below.
var = energydata['Appliances'].values
var

In [None]:
# Outlier detection is based on the interquartile range (IQR), so the first
# step is to get the 25th and 75th percentiles of Appliances in a single call.
Q1, Q3 = np.percentile(var, [25, 75])

In [None]:
# Show the first and third quartiles.
Q1, Q3

In [None]:
# Interquartile range and the lower/upper fences placed 1.5 * IQR beyond the
# quartiles; these fences are the cut-offs used to filter Appliances.
IRQ = Q3 - Q1
margen = 1.5 * IRQ
Limite_inf, Limite_sub = Q1 - margen, Q3 + margen

In [None]:
# Show the lower and upper fences.
Limite_inf, Limite_sub

In [None]:
# Keep the rows whose Appliances value lies within the fences. The comparison
# is inclusive: under the 1.5 * IQR rule only values strictly beyond a fence
# are outliers, so observations sitting exactly on a fence must be kept.
energydata_sin_outliers = energydata[
    (energydata.Appliances >= Limite_inf) & (energydata.Appliances <= Limite_sub)
]

In [None]:
# Compare dimensions before and after filtering to see how many rows were dropped.
energydata.shape, energydata_sin_outliers.shape

In [None]:
# Box plot of Appliances after removing the rows outside the fences.
fig = px.box(energydata_sin_outliers, y="Appliances" , title="Box plot of Appliances cleaned")
fig.show()

In [None]:
# plotly.express is already imported as px at the top of the notebook.
# Histogram of Appliances after removing the rows outside the fences.
fig = px.histogram(energydata_sin_outliers, x ="Appliances" , title="Histogram of Appliances cleaned")
fig.show()

#### Cuando uno remueve los valores atípicos, suele suceder que otras observaciones pasen a ser los nuevos valores outliers, pero
#### como se ha notado, la visualización y observación de la distribución de los datos ha mejorado muchísimo

## Valores perdidos e imputación

In [None]:
import random

In [None]:
# Dimensions (rows, columns) of the full dataset.
energydata.shape

In [None]:
# Fix the seed so the same rows are selected on every run.
random.seed(6)
# Draw 1000 distinct row positions. The population size comes from the data
# itself rather than a hard-coded row count, so the cell keeps working if the
# dataset is reloaded with a different number of rows.
lista_aleatoria = random.sample(range(len(energydata)), 1000)

In [None]:
# Show the sampled row positions.
lista_aleatoria

In [None]:
# Work on a copy so the original energydata keeps its observed values.
energydata_missing = energydata.copy()
# Blank out Appliances on the sampled rows. .loc selects by index label, so this
# assumes the frame still has the default 0..n-1 index — TODO confirm.
energydata_missing.loc[lista_aleatoria,"Appliances"] = np.nan

In [None]:
## 1000 missing values have been generated in Appliances so that the
## imputation methods below have something to fill in.
energydata_missing.describe().T

Imputación por medidas de tendencia central

In [None]:
# Mean imputation: fill the Appliances gaps with the mean of the observed
# Appliances values. The per-column dict restricts the fill to Appliances, so
# a NaN in any other column is never overwritten with an Appliances statistic.
# fillna() already returns a new DataFrame, so no extra copy is needed.
energydata_missing_media = energydata_missing.fillna(
    {'Appliances': energydata_missing['Appliances'].mean()}
)

In [None]:
# Median imputation: fill the Appliances gaps with the median of the observed
# Appliances values. The per-column dict restricts the fill to Appliances, so
# a NaN in any other column is never overwritten with an Appliances statistic.
# fillna() already returns a new DataFrame, so no extra copy is needed.
energydata_missing_median = energydata_missing.fillna(
    {'Appliances': energydata_missing['Appliances'].median()}
)

Imputación por algoritmo KNN

In [None]:
from sklearn.impute import KNNImputer

energydata_missing_knn = energydata_missing.copy()

# Use every numeric column as a feature. Fitting the imputer on Appliances
# alone leaves rows with a missing Appliances value without any observed
# feature, so no neighbour distance can be computed and scikit-learn falls back
# to the column mean — i.e. the "KNN" result would just repeat mean imputation.
columnas_numericas = energydata_missing_knn.select_dtypes(include='number').columns

# Build the model.
# NOTE(review): features are used unscaled, so columns with larger ranges weigh
# more in the distance — consider standardising them first.
imputer = KNNImputer(n_neighbors=3, weights="uniform")

# Fit the model and impute the missing values; only the Appliances column is
# written back so the other columns keep their original values and dtypes.
valores_imputados = imputer.fit_transform(energydata_missing_knn[columnas_numericas])
energydata_missing_knn['Appliances'] = valores_imputados[:, columnas_numericas.get_loc('Appliances')]

In [None]:
# Baseline: Appliances statistics on the original data (no missing values injected).
energydata[['Appliances']].describe().T

In [None]:
# Appliances statistics after mean imputation.
energydata_missing_media[['Appliances']].describe().T

In [None]:
# Appliances statistics after median imputation.
energydata_missing_median[['Appliances']].describe().T

In [None]:
# Appliances statistics after KNN imputation.
energydata_missing_knn[['Appliances']].describe().T

Imputación por Regresión Lineal

In [None]:
# Separate copy for the linear-regression imputation so energydata_missing stays untouched.
energydata_missing_lm = energydata_missing.copy()

In [None]:
# Summary before imputation; the Appliances count reflects the injected gaps.
energydata_missing_lm.describe().T

In [None]:
# Training subset: only the rows where Appliances was actually observed.
filas_observadas = energydata_missing_lm.Appliances.notna()
energydata_missing_lm_complete = energydata_missing_lm[filas_observadas]
energydata_missing_lm_complete.describe().transpose()

In [None]:
# Print the dimensions of the full frame and of the complete-case subset, in that order.
for tabla in (energydata_missing_lm, energydata_missing_lm_complete):
    print(tabla.shape)

In [None]:
## Predictor columns for the regression models: every column except the target
## ('Appliances', which cannot predict itself) and 'date' (dropped because it is
## not a numeric variable).
columnas_excluidas = ('date', 'Appliances')
parameters = [
    columna
    for columna in energydata_missing_lm.columns
    if columna not in columnas_excluidas
]

In [None]:
# Show the list of predictor column names.
parameters

In [None]:
from sklearn import linear_model

# Ordinary least-squares regression trained on the complete cases:
# predictors -> observed Appliances.
X_entrenamiento = energydata_missing_lm_complete[parameters]
y_entrenamiento = energydata_missing_lm_complete['Appliances']

model = linear_model.LinearRegression()
model.fit(X=X_entrenamiento, y=y_entrenamiento)

In [None]:
# Boolean mask marking the rows whose Appliances value is missing.
energydata_missing_lm['Appliances'].isnull()

In [None]:
# Predict Appliances only for the rows where it is missing. The mask is applied
# to the DataFrame before predicting, which avoids scoring every row just to
# discard most of the output and avoids indexing a NumPy array with a pandas
# Series. Row order is preserved, so the values line up with the same mask.
filas_faltantes_lm = energydata_missing_lm['Appliances'].isnull()
variable_imputada = model.predict(energydata_missing_lm.loc[filas_faltantes_lm, parameters])

In [None]:
# Show the regression predictions that will replace the missing values.
variable_imputada

In [None]:
# Start the imputed column as a copy of Appliances (gaps included); the gaps
# are filled in a later cell.
energydata_missing_lm['Appliances_imp'] = energydata_missing_lm['Appliances']

In [None]:
# Summary before filling: Appliances_imp still has the same gaps as Appliances.
energydata_missing_lm.describe().T

In [None]:
# Write the regression predictions into Appliances_imp on the rows where
# Appliances is missing; Appliances itself is left with its gaps.
energydata_missing_lm.loc[energydata_missing_lm['Appliances'].isnull(), 'Appliances_imp'] = variable_imputada

In [None]:
# Summary after filling: compare the Appliances and Appliances_imp rows.
energydata_missing_lm.describe().T

Imputación con Random Forest

In [None]:
# Separate copy for the random-forest imputation, plus the complete-case subset
# (rows with an observed Appliances value) used to train the model.
energydata_missing_rf = energydata_missing.copy()
filas_observadas_rf = energydata_missing_rf.Appliances.notna()
energydata_missing_rf_complete = energydata_missing_rf[filas_observadas_rf]

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Training takes a while because the model builds n_estimators trees.
# random_state is fixed so the fitted forest is reproducible.
X_entrenamiento_rf = energydata_missing_rf_complete[parameters]
y_entrenamiento_rf = energydata_missing_rf_complete['Appliances']

model_rf = RandomForestRegressor(n_estimators=100, random_state=9)
model_rf.fit(X=X_entrenamiento_rf, y=y_entrenamiento_rf)

In [None]:
# Predict Appliances only for the rows where it is missing. The mask is applied
# to the DataFrame before predicting, which avoids running the whole forest
# over every row just to discard most of the output. Row order is preserved, so
# the values line up with the same mask.
filas_faltantes_rf = energydata_missing_rf['Appliances'].isnull()
variable_imputada_rf = model_rf.predict(energydata_missing_rf.loc[filas_faltantes_rf, parameters])

In [None]:
# Start the imputed column as a copy of Appliances (gaps included); the gaps
# are filled in a later cell.
energydata_missing_rf['Appliances_imp'] = energydata_missing_rf['Appliances']

In [None]:
# Write the random-forest predictions into Appliances_imp on the rows where
# Appliances is missing; Appliances itself is left with its gaps.
energydata_missing_rf.loc[energydata_missing_rf['Appliances'].isnull(), 'Appliances_imp'] = variable_imputada_rf

In [None]:
# Statistics of the column imputed with linear regression.
energydata_missing_lm[['Appliances_imp']].describe().T

In [None]:
# Statistics of the column imputed with the random forest, for comparison with
# the linear-regression result.
energydata_missing_rf[['Appliances_imp']].describe().T