# MIT 6.867 Final Project: Data Cleaning & Standardization
Irina Degtiar


In [15]:
##########################################################################################################
### Set up workspace
##########################################################################################################
# Ensure re-load of all helper file code:
# autoreload mode 2 re-imports modules before each cell runs, so edits made to
# helper .py files are picked up without restarting the kernel.
# NOTE: re-running this cell in a live kernel prints a harmless
# "autoreload extension is already loaded" notice.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Import libraries - general
import numpy as np
import pylab as pl
import pandas as pd
import random
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
import pickle # Save data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
##########################################################################################################
### Load data
##########################################################################################################
# Both files are parsed with identical options: pandas' default NA markers are
# switched off and only the literal '-1' is read as a missing value.
csv_read_options = dict(keep_default_na=False, na_values='-1')

train = pd.read_csv('../Data/train.csv', **csv_read_options)
test = pd.read_csv('../Data/test.csv', **csv_read_options)

In [13]:
##########################################################################################################
### Data transformations - train_X
##########################################################################################################
# Pipeline for the training covariates:
#   1. drop the outcome column,
#   2. append one 0/1 "<column>_missing" indicator per original column,
#   3. mean-impute the remaining NaNs,
#   4. standardise every column with a StandardScaler fitted on the training data.
# The fitted `scaler` and the un-imputed `train_X1` are reused by the test cell.

### Covariates
# Use the `columns=` keyword: passing the axis positionally (`drop('target', 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
train_X = train.drop(columns='target')

### Add indicator for missing
train_X_missingindic = train_X.isnull().astype(int).add_suffix('_missing')
train_X1 = pd.concat([train_X, train_X_missingindic], axis=1)

### Impute missing values - fill with mean for now until Greyson replaces
# Indicator columns contain no NaNs, so only the original covariates are filled.
train_X1_means = train_X1.mean()
train_X_imputed = train_X1.fillna(train_X1_means)

### Scale data for use in algorithms
# NOTE(review): every column is standardised, including `id` and the
# `*_missing` indicators (see the displayed output) - confirm that is intended.
scaler = StandardScaler().fit(train_X_imputed)
train_X_scaled = scaler.transform(train_X_imputed)
# transform() returns a bare ndarray; restore the row index and column labels.
train_X_scaled = pd.DataFrame(train_X_scaled, index=train_X_imputed.index, columns=train_X_imputed.columns)
train_X_scaled.head()

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11_missing,ps_calc_12_missing,ps_calc_13_missing,ps_calc_14_missing,ps_calc_15_bin_missing,ps_calc_16_bin_missing,ps_calc_17_bin_missing,ps_calc_18_bin_missing,ps_calc_19_bin_missing,ps_calc_20_bin_missing
0,-1.732308,0.050218,0.965507,0.213594,1.182507,-0.311919,-0.805893,1.700163,-0.442786,-0.476919,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.732303,-0.453868,-0.542624,0.954362,-0.845779,-0.311919,-0.805893,-0.588179,2.258425,-0.476919,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.732294,1.562477,3.981771,1.69513,1.182507,-0.311919,-0.805893,-0.588179,2.258425,-0.476919,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.732287,-0.957955,-0.542624,-0.897559,-0.845779,-0.311919,1.24086,-0.588179,-0.442786,-0.476919,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.732284,-0.957955,0.965507,-1.638327,1.182507,-0.311919,1.24086,-0.588179,-0.442786,-0.476919,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
##########################################################################################################
### Data transformations - test
##########################################################################################################
# Mirrors the training pipeline, but all statistics (imputation means, scaling
# means and SDs) come from the training data; nothing is re-fitted on test.

### Add indicator for missing
test_missingindic = test.isnull().astype(int).add_suffix('_missing')
test1 = pd.concat((test, test_missingindic), axis='columns')

### Impute missing values - fill with mean for now until Greyson replaces
training_means = train_X1.mean()  # Using training dataset means
test_imputed = test1.fillna(value=training_means)

### Scale data for use in algorithms
# Using training dataset means, SDs via the already-fitted `scaler`; the result
# is wrapped back into a DataFrame carrying the test index and column labels.
test_scaled = pd.DataFrame(
    scaler.transform(test_imputed),
    index=test_imputed.index,
    columns=test_imputed.columns,
)
test_scaled.head()

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11_missing,ps_calc_12_missing,ps_calc_13_missing,ps_calc_14_missing,ps_calc_15_bin_missing,ps_calc_16_bin_missing,ps_calc_17_bin_missing,ps_calc_18_bin_missing,ps_calc_19_bin_missing,ps_calc_20_bin_missing
0,-1.732324,-0.957955,-0.542624,1.324746,1.182507,-0.311919,-0.805893,1.700163,-0.442786,-0.476919,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.732322,1.05839,0.965507,0.213594,1.182507,-0.311919,-0.805893,-0.588179,-0.442786,2.096794,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.732319,1.562477,-0.542624,-0.527174,-0.845779,-0.311919,-0.805893,-0.588179,-0.442786,2.096794,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.732317,-0.957955,-0.542624,0.583978,-0.845779,-0.311919,1.24086,-0.588179,-0.442786,-0.476919,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.732315,1.562477,-0.542624,0.954362,-0.845779,-0.311919,-0.805893,-0.588179,-0.442786,2.096794,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
##########################################################################################################
### Save output
##########################################################################################################
# Persist the standardised frames for downstream notebooks.
# The scaled training covariates are held in `train_X_scaled`; the previously
# referenced `train_scaled` is never defined in this notebook and raised
# NameError on a fresh kernel.
# NOTE(review): the pickled training frame holds covariates only (no `target`
# column) - confirm downstream loaders obtain the labels elsewhere.
train_X_scaled.to_pickle('../Data/Cleaned/train_scaled.pickle')
test_scaled.to_pickle('../Data/Cleaned/test_scaled.pickle')

#Load data:
#train_scaled = pd.read_pickle('../Data/Cleaned/train_scaled.pickle')