# Preprocesamiento

En esta notebook se hace lo siguiente:  
1.- Se calcula la media de los features que tienen valores nulos, para luego imputarlos con ella.  
2.- Se eliminan las variables categóricas.  
3.- Se calcula el Z-score para eliminar los outliers.  
4.- Se calcula el valor VIF, que se usa para eliminar las variables multicolineales.  
5.- Se estandarizan las columnas usando un StandardScaler.  


In [None]:
import pandas as pd

In [None]:
# Lift pandas' display limits so that neither columns nor rows are truncated
# when a frame is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Read the training CSV into the working frame.
df_full = pd.read_csv(filepath_or_buffer='data/raw/train_dataset.csv')

In [None]:
# The means are taken before imputing so that the very same values can be
# used to impute both data sets.
a, b, c, d, e, f = (
    df_full[column].mean()
    for column in (
        'year_built',
        'energy_star_rating',
        'direction_max_wind_speed',
        'direction_peak_wind_speed',
        'max_wind_speed',
        'days_with_fog',
    )
)

In [None]:
# Replace the missing values of each column with the mean computed for it.
# The fill is done on the frame itself and assigned back: calling
# `fillna(..., inplace=True)` on `df_full[col]` operates on an intermediate
# Series, which pandas warns about and which leaves `df_full` untouched once
# Copy-on-Write is enabled.
df_full = df_full.fillna(
    value={
        'year_built': a,
        'energy_star_rating': b,
        'direction_max_wind_speed': c,
        'direction_peak_wind_speed': d,
        'max_wind_speed': e,
        'days_with_fog': f,
    }
)

In [None]:
# Build the frame without the columns the notebook treats as categorical and
# without the `id` column.
# NOTE(review): the original comment also mentioned removing the target
# variable, but no other column is dropped here -- confirm the intent.
df_full = df_full.drop(
    ['State_Factor', 'building_class', 'facility_type', 'id'],
    axis=1,
)

In [None]:
# Display the column labels currently present in df_full.
df_full.columns

Se calcula el Z-score para tratar de eliminar los outliers

In [None]:
from scipy import stats
import numpy as np

In [None]:
# Absolute z-score of every value in df_full (scipy's default settings).
z = np.absolute(stats.zscore(df_full))
print(z)

In [None]:
# Keep only the rows in which every absolute z-score is below 3.
df_full_z_scores = df_full.loc[(z < 3).all(axis=1)]

In [None]:
# Display the (rows, columns) size of the frame left after the z-score filter.
df_full_z_scores.shape

In [None]:
# Display summary statistics of the z-score-filtered frame.
df_full_z_scores.describe()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
def compute_vif(df, considered_features):
    """Compute the variance inflation factor (VIF) of the selected features.

    A constant ``intercept`` column (all ones) is appended to the selected
    columns before the VIFs are computed; the row belonging to that constant
    is removed from the returned table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the feature columns.
    considered_features : list of str
        Names of the columns of ``df`` to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per considered feature, with the columns ``Variable`` (the
        feature name) and ``VIF`` (its variance inflation factor).
    """
    # `assign` returns a new frame, so the constant is added without writing
    # into a selection of `df` (the earlier `X['intercept'] = 1` raised
    # pandas' SettingWithCopyWarning). `df` itself is never modified.
    X = df[considered_features].assign(intercept=1)

    # Materialise the matrix once instead of once per column.
    design_matrix = X.values
    vif = pd.DataFrame({
        "Variable": X.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, i)
            for i in range(design_matrix.shape[1])
        ],
    })

    # The intercept is only a helper column; drop its row from the report.
    return vif[vif["Variable"] != "intercept"]

In [None]:
# Features flagged as highly correlated in the EDA, which should be removed.
considered_features = [
    'january_avg_temp',
    'february_min_temp',
    'february_avg_temp',
    'march_min_temp',
    'heating_degree_days',
    'cooling_degree_days',
    'july_avg_temp',
    'august_avg_temp',
    'days_below_20F',
    'days_below_30F',
    'days_below_10F',
    'direction_peak_wind_speed',
    'direction_max_wind_speed',
    'max_wind_speed',
]
# VIF of each of those features, largest first.
compute_vif(df_full_z_scores, considered_features).sort_values(by='VIF', ascending=False)

In [None]:
# Remove the highly correlated features. The list defined for the VIF
# inspection is reused instead of repeating the same 14 names, so the two
# cannot drift apart.
df_full_SC = df_full_z_scores.drop(columns=considered_features)

Probamos una estandarización común y corriente (StandardScaler), ya que en teoría ya se eliminaron los outliers.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the reduced frame and scale that same frame in one step;
# `transformer` is left fitted for later use.
transformer = StandardScaler()
df_full_scaled = transformer.fit_transform(df_full_SC)

In [None]:
# `transform` hands back a plain ndarray, so wrap it in a DataFrame again and
# restore the column labels.
df_full_scaled_df = pd.DataFrame(
    data=df_full_scaled,
    columns=df_full_SC.columns,
)
df_full_scaled_df.describe()

In [None]:
# Write the scaled frame to disk as CSV.
df_full_scaled_df.to_csv(path_or_buf='data/preprocesada/Full_Data.csv')