In [1]:
# Notebook display configuration.
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell rather than only the
# last one; later cells rely on this to display several frames per cell.
InteractiveShell.ast_node_interactivity = "all"

# (Re)load the rpy2 IPython extension. NOTE(review): presumably for %R / %%R
# magics -- none are used in the visible cells, confirm it is still needed.
%reload_ext rpy2.ipython

In [79]:
import numpy as np
import pandas as pd
import scipy.stats as stats

from kaggle.house_prices import helpers
from kaggle.house_prices import missing
from kaggle.house_prices import outliers

# Build the working dataset in one expression: load the raw data, apply the
# project's missing-value fixes, remove outliers, then discard the 'Id' column.
combined_dataset = (
    outliers.remove_outliers(
        missing.fix_all(
            helpers.load_data()
        )
    )
    .drop(columns=['Id'])
)

# Sanity check on the resulting dimensions.
combined_dataset.shape

(2917, 81)

In [29]:
# Candidate transforms to compare against the untransformed variable.
# Keys become the labels of the 'tran' column produced in later cells; every
# value is a vectorised callable that accepts a Series/ndarray.
trans = {
    # log(1 + x): log1p stays accurate for very small x and maps 0 -> 0.
    'log':    np.log1p,
    # Square root; negative inputs yield NaN.
    'sqrt':   np.sqrt,
    # NOTE: despite the key, this is the cube ROOT, not an inverse cube. The
    # key is kept unchanged because later cells and outputs refer to it.
    # np.cbrt is defined for negative inputs, whereas x ** (1/3) returns NaN.
    'invcube': np.cbrt,
}

In [35]:
# Numeric predictors only: the target 'SalePrice' is removed before filtering
# on dtype.
numeric_features = (
    combined_dataset
    .drop(columns=['SalePrice'])
    .select_dtypes(include=[np.number])
)

# Long format: one row per (variable, observed value); missing values dropped.
df1 = (
    numeric_features
    .melt(var_name='var', value_name='x')
    .dropna(subset=['x'])
)

# Peek at the untransformed long frame and its row count.
df1.query("var == 'LotFrontage'").head()
len(df1)

# Append one column per transform, each computed from the raw value 'x'.
df1 = df1.assign(**{
    label: transform(df1['x'])
    for label, transform in trans.items()
})

df1.query("var == 'LotFrontage'").head()

Unnamed: 0,var,x
55423,LotFrontage,65.0
55424,LotFrontage,80.0
55425,LotFrontage,68.0
55426,LotFrontage,60.0
55427,LotFrontage,84.0


99178

Unnamed: 0,var,x,log,sqrt,invcube
55423,LotFrontage,65.0,4.189655,8.062258,4.020726
55424,LotFrontage,80.0,4.394449,8.944272,4.308869
55425,LotFrontage,68.0,4.234107,8.246211,4.081655
55426,LotFrontage,60.0,4.110874,7.745967,3.914868
55427,LotFrontage,84.0,4.442651,9.165151,4.379519


In [44]:
# Stack the raw value and every transformed column into a single 'value'
# column, labelled by 'tran' (the source column name: x, log, sqrt, invcube).
stacked = df1.melt(id_vars=['var'], var_name='tran', value_name='value')

# Standardise within each (variable, transform) series: subtract the series
# mean and divide by the series standard deviation.
standardised = (
    stacked
    .groupby(['var', 'tran'])['value']
    .transform(lambda series: (series - series.mean()) / series.std())
)

# Keep only the identifiers and the standardised value.
df2 = (
    stacked
    .drop(columns=['value'])
    .assign(value_normed=standardised)
)

df2.query("var == 'LotArea'").groupby(['var', 'tran']).head(2)

Unnamed: 0,var,tran,value_normed
52506,LotArea,x,-0.2164
52507,LotArea,x,-0.069097
151684,LotArea,log,-0.101744
151685,LotArea,log,0.14941
250862,LotArea,sqrt,-0.210815
250863,LotArea,sqrt,0.024445
350040,LotArea,invcube,-0.182045
350041,LotArea,invcube,0.067009


In [63]:
# Count how many observations share each distinct standardised value within a
# (variable, transform) series; the count is stored in column 'k'.
distinct_value_keys = ['var', 'tran', 'value_normed']
df3 = df2.groupby(distinct_value_keys).size().rename('k').to_frame()

df3.query("var == 'YrSold'")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,k
var,tran,value_normed,Unnamed: 3_level_1
YrSold,invcube,-1.363338,619
YrSold,invcube,-0.602723,691
YrSold,invcube,0.15764,621
YrSold,invcube,0.91775,647
YrSold,invcube,1.677608,339
YrSold,log,-1.363451,619
YrSold,log,-0.602662,691
YrSold,log,0.157748,621
YrSold,log,0.91778,647
YrSold,log,1.677434,339


In [103]:
# Counts grouped per (variable, transform) series, and the standardised values
# taken from the innermost index level.
counts_by_series = df3.groupby(['var', 'tran'])['k']
standardised_values = df3.index.get_level_values('value_normed')

df4 = (
    df3
    .assign(
        # Series size and running count up to (and including) each value.
        sum=counts_by_series.transform('sum'),
        cumsum=counts_by_series.cumsum(),
        # Standard normal CDF evaluated at the standardised value.
        theoretical=stats.norm.cdf(standardised_values),
    )
    # Empirical CDF: fraction of the series at or below each value.
    .assign(empirical=lambda frame: frame['cumsum'] / frame['sum'])
    # Squared CDF gap, weighted by the number of observations at that value.
    .assign(
        diff_L2=lambda frame: (
            frame['k'] * (frame['empirical'] - frame['theoretical'])**2
        )
    )
)

df4.query("var == 'YrSold'")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,k,sum,cumsum,theoretical,empirical,diff_L2
var,tran,value_normed,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
YrSold,invcube,-1.363338,619,2917,619,0.086388,0.212204,9.798608
YrSold,invcube,-0.602723,691,2917,1310,0.273347,0.449092,21.342424
YrSold,invcube,0.15764,621,2917,1931,0.56263,0.661981,6.129756
YrSold,invcube,0.91775,647,2917,2578,0.820625,0.883785,2.580972
YrSold,invcube,1.677608,339,2917,2917,0.953288,1.0,0.739697
YrSold,log,-1.363451,619,2917,619,0.08637,0.212204,9.801384
YrSold,log,-0.602662,691,2917,1310,0.273367,0.449092,21.337515
YrSold,log,0.157748,621,2917,1931,0.562672,0.661981,6.124481
YrSold,log,0.91778,647,2917,2578,0.820633,0.883785,2.580324
YrSold,log,1.677434,339,2917,2917,0.953271,1.0,0.740235


In [110]:
# Total weighted squared CDF gap for each (variable, transform) pair, kept as
# a one-column frame named 'diff_L2'.
df5 = df4.groupby(['var', 'tran'])[['diff_L2']].sum()

df5.query("var == 'LotArea'")

Unnamed: 0_level_0,Unnamed: 1_level_0,diff_L2
var,tran,Unnamed: 2_level_1
LotArea,invcube,10.788623
LotArea,log,13.313755
LotArea,sqrt,13.390061
LotArea,x,50.032779
