# Preprocesamiento

En esta notebook se hace lo siguiente:  
1.- Se calcula la media de los features que tienen valores nulos para luego imputarlos.  
2.- Se eliminan las variables categóricas (tras codificarlas).  
3.- Se calcula el Z-score para eliminar los outliers.  
4.- Se calcula el valor VIF, que se usa para eliminar las variables multicolineales.  
5.- Se determinan y corrigen los features sesgados (skewed).  
6.- Se estandarizan las columnas usando un StandardScaler.  


In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import TargetEncoder

In [47]:
# Lift pandas' display limits so wide/long frames are rendered in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [48]:
# Load the raw inputs, in order: training set, test features, test targets.
# NOTE(review): z_test is not referenced again in the cells visible here.
df_TRAIN, df_TEST, z_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/train_dataset.csv',
        '../../data/raw/x_test.csv',
        '../../data/raw/y_test.csv',
    )
)

In [49]:
# Tag every row with the split it came from so the frames can be separated
# again after joint preprocessing. The test frame also receives a NaN
# `site_eui` column so that both frames carry that column.
df_TRAIN = df_TRAIN.assign(origen='train')
df_TEST = df_TEST.assign(site_eui=np.nan, origen='test')

In [50]:
# Stack the train rows followed by the test rows under a fresh 0..n-1 index.
df_all = pd.concat([df_TRAIN, df_TEST], ignore_index=True)

In [51]:
# Sanity check: (rows, columns) of the combined train + test frame.
df_all.shape

(85462, 65)

In [52]:
# Fill missing values in the columns below with each column's mean.
# The filled Series is assigned back to `df_all` rather than calling
# `df_all[col].fillna(..., inplace=True)`: that chained form operates on an
# intermediate Series, emits a FutureWarning on recent pandas releases and
# leaves `df_all` unmodified once Copy-on-Write is enabled.
# NOTE(review): each mean is computed over train and test rows together;
# confirm that sharing this statistic across the two splits is intended.
mean_imputed_columns = [
    'year_built',
    'energy_star_rating',
    'direction_max_wind_speed',
    'direction_peak_wind_speed',
    'max_wind_speed',
    'days_with_fog',
]
for column in mean_imputed_columns:
    df_all[column] = df_all[column].fillna(df_all[column].mean())

In [53]:
# Encode the categorical columns.

# building_class: one-hot encoding, one `building_class_<value>` column per
# category. The dtype is pinned to uint8 so the indicators are 0/1 on every
# pandas version (pandas >= 2.0 would otherwise return booleans).
OneHot_dumies = pd.get_dummies(df_all['building_class'], dummy_na=False, dtype='uint8')
for d in OneHot_dumies:
    df_all[f'building_class_{d}'] = OneHot_dumies[d]

# State_Factor: one integer label per distinct value.
df_all['label_State_Factor'] = LabelEncoder().fit_transform(df_all['State_Factor'])

# facility_type: target encoding against site_eui.
# The encoder is fitted only on rows whose target is known. Test rows carry a
# NaN placeholder in `site_eui`, and a supervised encoder should not be handed
# missing targets. All rows are then transformed with the fitted mapping.
# NOTE(review): facility types that appear only in rows without a target are
# encoded according to the encoder's `handle_unknown` setting; confirm the
# default is acceptable for the installed category_encoders version.
has_target = df_all['site_eui'].notna()
encoder = TargetEncoder(cols=['facility_type'])
encoder.fit(df_all.loc[has_target, ['facility_type']], df_all.loc[has_target, 'site_eui'])
df_all['target_facility_type'] = encoder.transform(df_all[['facility_type']])['facility_type']

In [54]:
# Preview the first rows, including the newly added encoded columns.
df_all.head()

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,january_max_temp,february_min_temp,february_avg_temp,february_max_temp,march_min_temp,march_avg_temp,march_max_temp,april_min_temp,april_avg_temp,april_max_temp,may_min_temp,may_avg_temp,may_max_temp,june_min_temp,june_avg_temp,june_max_temp,july_min_temp,july_avg_temp,july_max_temp,august_min_temp,august_avg_temp,august_max_temp,september_min_temp,september_avg_temp,september_max_temp,october_min_temp,october_avg_temp,october_max_temp,november_min_temp,november_avg_temp,november_max_temp,december_min_temp,december_avg_temp,december_max_temp,cooling_degree_days,heating_degree_days,precipitation_inches,snowfall_inches,snowdepth_inches,avg_temp,days_below_30F,days_below_20F,days_below_10F,days_below_0F,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id,origen,building_class_Commercial,building_class_Residential,label_State_Factor,target_facility_type
0,1,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,36,50.5,68,35,50.589286,73,40,53.693548,80,41,55.5,78,46,56.854839,84,50,60.5,90,52,62.725806,84,52,62.16129,85,52,64.65,90,47,63.016129,83,43,53.8,72,36,49.274194,71,115,2960,16.59,0.0,0,56.972603,0,0,0,0,14,0,0,0,1.0,1.0,1.0,109.94301,248.682615,0,train,1,0,0,241.135162
1,1,State_1,Commercial,Warehouse_Distribution_or_Shipping_center,274000.0,1955.0,45.0,1.8,36,50.5,68,35,50.589286,73,40,53.693548,80,41,55.5,78,46,56.854839,84,50,60.5,90,52,62.725806,84,52,62.16129,85,52,64.65,90,47,63.016129,83,43,53.8,72,36,49.274194,71,115,2960,16.59,0.0,0,56.972603,0,0,0,0,14,0,0,0,1.0,70.750627,1.0,12.0,26.50015,1,train,1,0,0,39.559542
2,1,State_1,Commercial,Retail_Enclosed_mall,280025.0,1951.0,97.0,1.8,36,50.5,68,35,50.589286,73,40,53.693548,80,41,55.5,78,46,56.854839,84,50,60.5,90,52,62.725806,84,52,62.16129,85,52,64.65,90,47,63.016129,83,43,53.8,72,36,49.274194,71,115,2960,16.59,0.0,0,56.972603,0,0,0,0,14,0,0,0,1.0,70.750627,1.0,12.0,24.693619,2,train,1,0,0,100.965103
3,1,State_1,Commercial,Education_Other_classroom,55325.0,1980.0,46.0,1.8,36,50.5,68,35,50.589286,73,40,53.693548,80,41,55.5,78,46,56.854839,84,50,60.5,90,52,62.725806,84,52,62.16129,85,52,64.65,90,47,63.016129,83,43,53.8,72,36,49.274194,71,115,2960,16.59,0.0,0,56.972603,0,0,0,0,14,0,0,0,1.0,70.750627,1.0,12.0,48.406926,3,train,1,0,0,69.441531
4,1,State_1,Commercial,Warehouse_Nonrefrigerated,66000.0,1985.0,100.0,2.4,36,50.5,68,35,50.589286,73,40,53.693548,80,41,55.5,78,46,56.854839,84,50,60.5,90,52,62.725806,84,52,62.16129,85,52,64.65,90,47,63.016129,83,43,53.8,72,36,49.274194,71,115,2960,16.59,0.0,0,56.972603,0,0,0,0,14,0,0,0,1.0,1.0,1.0,109.94301,3.899395,4,train,1,0,0,38.209399


In [55]:
# Sanity check: (rows, columns) of the combined frame after encoding.
df_all.shape

(85462, 69)

In [56]:
# Separate the combined frame back into its train and test parts using the
# `origen` tag, giving each part a fresh 0..n-1 index.
train = df_all.loc[df_all['origen'].eq('train')].reset_index(drop=True)
test = df_all.loc[df_all['origen'].eq('test')].reset_index(drop=True)

# Targets.
# NOTE(review): `site_eui` was filled with NaN for the test rows, so y_test
# holds only NaN here; confirm whether the real test targets should be used.
y_train, y_test = train['site_eui'], test['site_eui']

# Features: drop the target, the identifier, the split tag and the raw
# categorical columns (their encoded counterparts are kept).
non_feature_columns = [
    'site_eui',
    'id',
    'origen',
    'building_class',
    'State_Factor',
    'facility_type',
]
X_train = train.drop(columns=non_feature_columns)
X_test = test.drop(columns=non_feature_columns)

In [57]:
# Report the shape of each split: features first, then targets.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(75757, 63)
(9705, 63)
(75757,)
(9705,)


In [None]:
# Persist the preprocessed splits as CSV files (the index is written too,
# as with a plain `to_csv(path)` call).
for split_part, csv_path in (
    (X_train, '../../data/preprocesada/X_train.csv'),
    (X_test, '../../data/preprocesada/X_test.csv'),
    (y_train, '../../data/preprocesada/y_train.csv'),
    (y_test, '../../data/preprocesada/y_test.csv'),
):
    split_part.to_csv(csv_path)