# Preprocesamiento

En esta notebook se hace lo siguiente:  
1.- Se calcula la media de los features que tienen valores nulos para luego imputarlos.  
2.- Se eliminan las variables categóricas.  
3.- Se calcula el Z-score para eliminar los outliers.  
4.- Se calcula el valor VIF, que se usa para eliminar las variables multicolineales.  
5.- Se determinan y corrigen los features sesgados (skewed).  
6.- Se estandarizan las columnas usando un StandardScaler.  


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import TargetEncoder

In [None]:
# Lift pandas' display limits so DataFrames are shown with every column and
# every row instead of being truncated.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load the raw CSV files: the training set, the test features and the test
# targets (kept in a separate file, loaded as `z_test`).
df_TRAIN, df_TEST, z_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/train_dataset.csv',
        '../../data/raw/x_test.csv',
        '../../data/raw/y_test.csv',
    )
)

In [None]:
# Give the test frame a `site_eui` column filled with NaN, then tag every row
# of each frame with the dataset it came from in the `origen` column.
df_TEST['site_eui'] = np.nan
for frame, tag in ((df_TRAIN, 'train'), (df_TEST, 'test')):
    frame['origen'] = tag

In [None]:
# Stack train and test row-wise into a single frame with a fresh 0..n-1 index.
df_all = pd.concat((df_TRAIN, df_TEST), ignore_index=True)

In [None]:
# Display the (rows, columns) shape of the combined train + test frame.
df_all.shape

In [None]:
# Fill the missing values of the columns below with each column's mean.
#
# The filled columns are assigned back to `df_all` rather than calling
# `df_all[col].fillna(..., inplace=True)`: that form is chained assignment,
# which pandas warns about and which no longer updates `df_all` when
# Copy-on-Write is enabled.
#
# NOTE(review): the means are computed over `df_all`, i.e. over train and test
# rows together — confirm this is intended and not an unwanted leak of test
# statistics into the training data.
mean_imputed_cols = [
    'year_built',
    'energy_star_rating',
    'direction_max_wind_speed',
    'direction_peak_wind_speed',
    'max_wind_speed',
    'days_with_fog',
]
# `fillna` with a Series fills each column with the value indexed by its name.
df_all[mean_imputed_cols] = df_all[mean_imputed_cols].fillna(
    df_all[mean_imputed_cols].mean()
)

In [None]:
# Encode the categorical columns.

# building_class: one-hot encoding, one `building_class_<category>` indicator
# column per category (missing values get no indicator column of their own).
building_class_dummies = pd.get_dummies(df_all['building_class'], prefix='building_class')
for dummy_col in building_class_dummies.columns:
    df_all[dummy_col] = building_class_dummies[dummy_col]

# State_Factor: integer label encoding.
df_all['label_State_Factor'] = LabelEncoder().fit_transform(df_all['State_Factor'])

# facility_type: target encoding against `site_eui`.
# The encoder is fitted on the train rows only, because the test rows of
# `df_all` carry a NaN `site_eui`; fitting on them would make the result depend
# on how the encoder happens to treat missing targets. The fitted mapping is
# then applied to every row.
# NOTE(review): facility types that appear only in test rows are now handled by
# the encoder's unknown-category policy — confirm the default is acceptable.
encoder = TargetEncoder()
is_train = df_all['origen'] == 'train'
encoder.fit(df_all.loc[is_train, 'facility_type'], df_all.loc[is_train, 'site_eui'])
df_all['target_facility_type'] = encoder.transform(df_all['facility_type'])

In [None]:
# Preview the first five rows of the combined frame.
df_all.head(n=5)

In [None]:
# Columns removed from the feature matrices: the target, the row id, the
# origin tag and the three raw categorical columns.
non_feature_cols = ['site_eui', 'id', 'origen', 'building_class', 'State_Factor', 'facility_type']

# Split the combined frame back by origin, renumbering each part from 0.
train = df_all.loc[df_all['origen'] == 'train'].reset_index(drop=True)
test = df_all.loc[df_all['origen'] == 'test'].reset_index(drop=True)

# Targets: train targets come from the combined frame, test targets from the
# separately loaded `z_test`.
y_train = train['site_eui']
y_test = z_test['site_eui']

# Feature matrices.
X_train = train.drop(columns=non_feature_cols)
X_test = test.drop(columns=non_feature_cols)

In [None]:
# Print the shape of each split, in the order X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
from pathlib import Path

# Persist the preprocessed splits as CSV files.
# The output directory is created first (no error if it already exists),
# because `to_csv` does not create missing directories and would fail on a
# fresh checkout.
Path('../../data/preprocesada').mkdir(parents=True, exist_ok=True)

# The index is written as well (pandas' default), so each file starts with an
# unnamed index column.
X_train.to_csv('../../data/preprocesada/X_train.csv')
X_test.to_csv('../../data/preprocesada/X_test.csv')
y_train.to_csv('../../data/preprocesada/y_train.csv')
y_test.to_csv('../../data/preprocesada/y_test.csv')