In [1]:
import pandas as pd
from sklearn import metrics
import pickle
import numpy as np
from datetime import date

In [2]:
from sklearn import preprocessing

In [3]:
# Read the raw submission set; this file is semicolon-delimited.
submission = pd.read_csv('../data/raw/submission.csv', delimiter=';')

In [4]:
# Index the submission rows by id_pos (the column itself is kept) so the
# later index-on-index merge can line them up with the pos features.
submission.index = submission['id_pos'].to_numpy()

In [5]:
# Read the raw pos table (comma-delimited, which is the read_csv default).
pos = pd.read_csv('../data/raw/pos.csv', sep=',')

In [6]:
# Keep only the rows that actually carry an id_pos value.
pos = pos[pos['id_pos'].notna()]

In [7]:
# Discard rows whose id_pos is the literal placeholder 'Not Available'.
pos = pos[pos['id_pos'].ne('Not Available')]

In [8]:
# Cast id_pos to int so it can be joined with the sales information.
# assign() returns a new frame with the converted column in the same position.
pos = pos.assign(id_pos=pos['id_pos'].astype(int))

In [9]:
# Index pos by its (now integer) id_pos values; the column itself is kept.
pos.index = pos['id_pos'].to_numpy()

#### Capping outliers (values beyond the 5th/95th percentiles are clipped, not removed)

In [10]:
# Numeric columns of `pos` that are outlier-capped and then standardized.
numvars = [
    'competidores',
    'ingreso_mediana',
    'ingreso_promedio',
    'densidad_poblacional',
    # pct_<from>a<to> columns
    'pct_0a5',
    'pct_5a9',
    'pct_10a14',
    'pct_15a19',
    'pct_20a24',
    'pct_25a29',
    'pct_30a34',
    'pct_35a39',
    'pct_40a44',
    'pct_45a49',
    'pct_50a54',
    'pct_55a59',
    'pct_60a64',
    'pct_65a69',
    'pct_70a74',
    'pct_75a79',
    'pct_80a84',
    'pct_85ainf',
    # remaining pct_* columns
    'pct_bachelors',
    'pct_doctorados',
    'pct_secundario',
    'pct_master',
    'pct_bicicleta',
    'pct_omnibus',
    'pct_subtes',
    'pct_taxi',
    'pct_caminata',
    'mediana_valor_hogar',
]

In [11]:
# Cap every numeric column at its own 5th and 95th percentiles: values beyond
# a bound are replaced by that bound; no rows are dropped.
for col in numvars:
    # nanpercentile skips missing values when computing the two bounds
    lower, upper = np.nanpercentile(pos[col].values, [5, 95])
    # clip applies both bounds in a single pass and leaves NaN entries as NaN
    pos[col] = pos[col].clip(lower=lower, upper=upper)

#### Standardize numeric values (StandardScaler)

In [12]:
# Build the standardizer; centering and unit-variance scaling are the
# StandardScaler defaults, spelled out here for the reader.
scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)

In [13]:
# Learn the scaling statistics from the numeric pos columns and apply them
# in the same call.
scaled_df = scaler.fit_transform(pos.loc[:, numvars])

In [14]:
# Wrap the scaled values in a DataFrame again, restoring the column names
# and the id_pos-based index of pos.
scaled_df = pd.DataFrame(data=scaled_df, index=pos.index, columns=numvars)

In [15]:
# Drop any numvars columns already present in submission so the merge with
# the scaled features does not produce overlapping column names.
submission = submission.loc[:, ~submission.columns.isin(numvars)]

In [16]:
# Attach the scaled features to each submission row by matching the indexes.
# NOTE(review): how='inner' drops submission rows whose index has no match
# in scaled_df — confirm that no submission ids are lost here.
submission = submission.merge(
    scaled_df,
    how='inner',
    left_index=True,
    right_index=True,
)

In [17]:
# Write the result as semicolon-delimited CSV, without the index.
submission.to_csv(
    '../data/processed/submission_norm.csv',
    sep=';',
    index=False,
)

In [18]:
with open('../data/processed/submission_norm.pkl', 'wb') as pkl_file:
    # Serialize the merged submission DataFrame. No protocol argument is
    # passed, so pickle's default protocol is used.
    pickle.dump(submission, pkl_file)