# Preprocesamiento

En esta notebook se hace lo siguiente:  
1.- Se calcula la media (mean) de los features que tienen valores nulos para luego imputarlos.  
2.- Se eliminan las variables categóricas.  
3.- Se calcula el Z-score para eliminar los outliers.  
4.- Se calcula el valor VIF para usarlo al eliminar las variables multicolineales.  
5.- Se determinan y corrigen los features sesgados (skewed).  
6.- Se estandarizan las columnas usando un StandardScaler.  


In [86]:
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [87]:
# Lift pandas' display truncation: show every column and every row of a frame.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [88]:
# Load the raw training dataset; the path is relative to the notebook's working directory.
raw_train_csv = '../../data/raw/train_dataset.csv'
df_full = pd.read_csv(raw_train_csv)

In [89]:
# The means are captured before imputing so that the very same values can be
# used to impute both datasets.
# a..f follow the order of the column list below; the imputation cell reads them.
mean_imputed_columns = [
    'year_built',
    'energy_star_rating',
    'direction_max_wind_speed',
    'direction_peak_wind_speed',
    'max_wind_speed',
    'days_with_fog',
]
a, b, c, d, e, f = (df_full[column].mean() for column in mean_imputed_columns)

In [90]:
# Impute the missing values of each column with its previously computed mean
# (a..f, one per column, in the order they were calculated).
#
# The result is assigned back to df_full instead of calling
# `df_full[col].fillna(value, inplace=True)`: that form is chained assignment on
# a temporary Series, which emits a FutureWarning on pandas >= 2.1 and leaves
# df_full untouched when Copy-on-Write is enabled, so the NaNs would silently
# survive into the later z-score step.
df_full = df_full.fillna({
    'year_built': a,
    'energy_star_rating': b,
    'direction_max_wind_speed': c,
    'direction_peak_wind_speed': d,
    'max_wind_speed': e,
    'days_with_fog': f,
})

In [91]:
# Count how many rows fall under each distinct State_Factor value.
df_full['State_Factor'].value_counts()

In [92]:
# Count how many rows fall under each distinct building_class value.
df_full['building_class'].value_counts()

In [93]:
# Preview the first rows of the frame after the mean-imputation cell.
df_full.head()

In [94]:
# Remove the columns that are not carried further in this notebook.
# Re-running this cell alone raises KeyError because the columns are already gone.
unused_columns = ['facility_type', 'id']
df_full = df_full.drop(columns=unused_columns)

In [None]:
# Preview the frame once `facility_type` and `id` have been dropped.
df_full.head()

In [None]:
# (rows, columns) before the one-hot columns are appended.
df_full.shape

In [None]:
# One-hot encode `building_class`: one indicator column per category, named
# "Temp_building_class_<category>", appended to the right of df_full.
ohc = OneHotEncoder()
building_class_2d = df_full.building_class.values.reshape(-1, 1)  # encoder expects a 2-D input
ohs1 = ohc.fit_transform(building_class_2d).toarray()             # sparse result -> dense ndarray
indicator_names = [f"Temp_building_class_{category}" for category in ohc.categories_[0]]
dfs1 = pd.DataFrame(ohs1, columns=indicator_names)
# concat(axis=1) aligns on the index; dfs1 carries a fresh 0..n-1 index, which
# matches df_full because no rows have been removed since read_csv.
df_full = pd.concat((df_full, dfs1), axis=1)
df_full.head()

In [None]:
# (rows, columns) after appending the one-hot columns.
df_full.shape

In [None]:
# The original categorical column is redundant once its one-hot columns exist.
df_full = df_full.drop(columns='building_class')
df_full.head()

In [None]:
# (rows, columns) after removing `building_class`.
df_full.shape

In [None]:
# Encode State_Factor as integer codes in a new column; the fitted encoder is
# not kept, so the code -> label mapping is not available afterwards.
state_factor_codes = LabelEncoder().fit_transform(df_full['State_Factor'])
df_full['TEMP_State_Factor'] = state_factor_codes

In [None]:
# Preview the frame with the new `TEMP_State_Factor` column.
df_full.head()

In [None]:
# The textual State_Factor column is replaced by its encoded counterpart.
df_full = df_full.drop(columns='State_Factor')
df_full.head()

In [None]:
# Keep `site_eui` aside as a one-column frame so it can be re-attached later
# as the last column.
aux_site_eui = df_full[['site_eui']]
aux_site_eui.head()

In [None]:
# Take `site_eui` out of its current position in the frame.
df_full = df_full.drop(columns='site_eui')
df_full.head()

In [None]:
# (rows, columns) while `site_eui` is detached from the frame.
df_full.shape

In [None]:
# Put `site_eui` back as the right-most column (aligned on the row index).
frames_to_join = (df_full, aux_site_eui)
df_full = pd.concat(frames_to_join, axis=1)
df_full.head()

In [None]:
# (rows, columns) once `site_eui` has been re-appended.
df_full.shape

Se calcula el Z-score para eliminar los outliers

In [None]:
from scipy import stats
import numpy as np

In [None]:
# Absolute z-score of every cell, computed per column over the whole frame
# (this includes the one-hot, label-encoded and `site_eui` columns).
signed_zscores = stats.zscore(df_full)
z = np.abs(signed_zscores)

In [None]:
# (rows, columns) before the outlier filter, for comparison with the filtered frame.
df_full.shape

In [None]:
# Preview the frame that the z-scores were computed from.
df_full.head()

Se eliminan aquellas filas que no cumplan con el umbral (threshold): se conservan solo las filas con |z| < 3 en todas las columnas

In [None]:
# Keep only the rows whose absolute z-score is below the threshold in every column.
# A NaN z-score compares as False, so such rows are dropped as well.
Z_SCORE_THRESHOLD = 3
rows_within_threshold = (z < Z_SCORE_THRESHOLD).all(axis=1)
df_full_z_scores = df_full[rows_within_threshold]

In [None]:
# (rows, columns) that survive the z-score filter.
df_full_z_scores.shape

In [None]:
# Preview the filtered frame; it keeps the original row labels of df_full.
df_full_z_scores.head()

Probamos con una estandarización común y corriente (StandardScaler), ya que en teoría ya se eliminaron los outliers

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the outlier-free frame and standardize that same frame.
# Every column is scaled, `site_eui` included; the result is a plain ndarray.
transformer = StandardScaler()
transformer.fit(df_full_z_scores)
df_full_scaled = transformer.transform(df_full_z_scores)

In [None]:
# `transform` returns an ndarray, so the result is wrapped back into a DataFrame.
# The column labels are taken from df_full_z_scores, the frame that was actually
# scaled; the previous reference to `df_full_SKW` pointed at a name that is never
# defined in this notebook and raised NameError on a fresh kernel.
# The new frame gets a fresh 0..n-1 row index.
df_full_scaled_df = pd.DataFrame(df_full_scaled, columns=df_full_z_scores.columns)
df_full_scaled_df.describe()

In [None]:
# (rows, columns) of the standardized frame.
df_full_scaled_df.shape

In [None]:
# PCA on the standardized features (the target `site_eui` is excluded) to see
# how many components are needed to reach 95% of the cumulative explained variance.
aux = df_full_scaled_df.drop(columns=['site_eui'])
pca = PCA().fit(aux)
plt.rcParams["figure.figsize"] = (12,6)
fig, ax = plt.subplots()

y = np.cumsum(pca.explained_variance_ratio_)
# One x position per fitted component (1-based). Deriving the length from `y`
# instead of hard-coding 48 keeps plt.plot from failing with a shape mismatch
# whenever the number of feature columns changes.
xi = np.arange(1, len(y) + 1, step=1)
plt.ylim(0.0,1.1)
plt.plot(xi, y, marker='o', linestyle='--', color='b')
plt.xlabel('Number of Components')
# Spread roughly a dozen ticks over the full component range so the point where
# the curve crosses the 95% line can be read off the axis.
tick_step = max(1, len(y) // 12)
plt.xticks(np.arange(0, len(y) + 1, step=tick_step))
plt.ylabel('Cumulative variance (%)')
plt.title('The number of components needed to explain variance')
plt.axhline(y=0.95, color='r', linestyle='-')
plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)
ax.grid(axis='x')
plt.show()

In [None]:
# Persist the standardized frame as CSV (the row index is written as the first column).
# NOTE(review): this path is relative to the current working directory, whereas the
# input was read from '../../data/raw/'; confirm the target directory is the intended one.
# NOTE(review): the file name mentions multicollinearity/skew removal, which the
# visible cells do not perform; confirm the name is still accurate.
output_csv_path = 'data/preprocesada/Full_Data_sin_MultiC_sin_Skew.csv'
df_full_scaled_df.to_csv(output_csv_path)