In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell, not only
# the last one. Cells that contain more than one bare expression (for example
# consecutive `.head()` calls) rely on this to show all of their tables.
InteractiveShell.ast_node_interactivity = "all"

# (Re)load the rpy2 IPython extension.
# NOTE(review): no R magic appears in the cells visible here - confirm the
# extension is still needed elsewhere in the notebook.
%reload_ext rpy2.ipython

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats

from kaggle.house_prices import helpers
from kaggle.house_prices import missing
from kaggle.house_prices import outliers
from kaggle.house_prices import transform_numeric as TN

# Build the working frame in one expression: load the data, pass it through
# missing.fix_all and outliers.remove_outliers, then drop the 'Id' column.
combined_dataset = outliers.remove_outliers(
    missing.fix_all(helpers.load_data())
).drop(columns=['Id'])

# Shape of the resulting frame, shown as a sanity check.
combined_dataset.shape

(2917, 81)

In [4]:
# Candidate transforms tried on each numeric variable, keyed by the short name
# that shows up in the `tran` column of the result tables.
Trans = {
    # log(1 + x). np.log1p keeps precision for tiny x, where `x + 1` would
    # round to exactly 1 before the logarithm is taken.
    'log':  lambda x: np.log1p(x),
    'sqrt': lambda x: np.sqrt(x),
    # Cube root. The key stays 'inv3' so existing results and configs keep
    # matching, even though this is not an inverse. np.cbrt returns the real
    # root for negative x, whereas x**(1/3) yields NaN there.
    'inv3': lambda x: np.cbrt(x),
}

Trans

{'inv3': <function __main__.<lambda>>,
 'log': <function __main__.<lambda>>,
 'sqrt': <function __main__.<lambda>>}

In [5]:
# Numeric columns only, with the 'SalePrice' column left out.
df0 = combined_dataset.select_dtypes(include=[np.number]).drop(columns=['SalePrice'])

# Step 1 of the transformation-config calculation. Judging by the displayed
# columns, each row carries a variable name, its raw value `x` and one column
# per entry of `Trans`.
df1 = TN.calc_tran_config.step1(df0, Trans)

# Spot-check a single variable.
df1[df1['var'] == 'LotFrontage'].head()

Unnamed: 0,var,x,log,sqrt,inv3
55423,LotFrontage,65.0,4.189655,8.062258,4.020726
55424,LotFrontage,80.0,4.394449,8.944272,4.308869
55425,LotFrontage,68.0,4.234107,8.246211,4.081655
55426,LotFrontage,60.0,4.110874,7.745967,3.914868
55427,LotFrontage,84.0,4.442651,9.165151,4.379519


In [6]:
# Step 2 of the transformation-config calculation, fed with the step-1 frame.
df2 = TN.calc_tran_config.step2(df1)

# First two rows of every (var, tran) group for one variable.
(
    df2[df2['var'] == 'LotArea']
    .groupby(['var', 'tran'])
    .head(2)
)

Unnamed: 0,var,tran,value,value_normed
52506,LotArea,x,8450.0,-0.2164
52507,LotArea,x,9600.0,-0.069097
151684,LotArea,log,9.04204,-0.101744
151685,LotArea,log,9.169623,0.14941
250862,LotArea,sqrt,91.923882,-0.210815
250863,LotArea,sqrt,97.97959,0.024445
350040,LotArea,inv3,20.368181,-0.182045
350041,LotArea,inv3,21.253171,0.067009


In [7]:
# Step 3 of the transformation-config calculation, fed with the step-2 frame.
df3 = TN.calc_tran_config.step3(df2)

# Rows of a single variable, selected with a boolean mask on the `var` column.
df3.loc[df3['var'] == 'YrSold']

Unnamed: 0,var,tran,value_normed,k
43932,YrSold,inv3,-1.363338,619
43933,YrSold,inv3,-0.602723,691
43934,YrSold,inv3,0.15764,621
43935,YrSold,inv3,0.91775,647
43936,YrSold,inv3,1.677608,339
43937,YrSold,log,-1.363451,619
43938,YrSold,log,-0.602662,691
43939,YrSold,log,0.157748,621
43940,YrSold,log,0.91778,647
43941,YrSold,log,1.677434,339


In [8]:
# Step 4 of the transformation-config calculation, fed with the step-3 frame.
df4 = TN.calc_tran_config.step4(df3)

# The displayed header shows `var` and `tran` as index levels of this frame,
# so the rows of one variable are picked through the `var` level.
df4.loc[df4.index.get_level_values('var') == 'LotArea']

Unnamed: 0_level_0,Unnamed: 1_level_0,diff_L2
var,tran,Unnamed: 2_level_1
LotArea,inv3,10.788623
LotArea,log,13.313755
LotArea,sqrt,13.390061
LotArea,x,50.032779


In [9]:
# Step 5 of the transformation-config calculation, fed with the step-4 frame.
# The displayed result has one row per variable with a `progress_score` and a
# transform name (`tran`) that matches a key of `Trans`.
# NOTE(review): presumably the best-scoring transform per variable - confirm in TN.
df5 = TN.calc_tran_config.step5(df4)
df5

Unnamed: 0,var,progress_score,tran
0,GrLivArea,91.157942,log
1,1stFlrSF,90.771657,log
2,BsmtUnfSF,79.976031,sqrt
3,LotArea,78.436891,inv3
4,TotRmsAbvGrd,30.685015,log
5,OverallQual,29.944357,log
6,GarageCars,24.956765,log
7,BedroomAbvGr,22.375086,log
8,OverallCond,16.247239,log
9,FullBath,8.797333,inv3


In [10]:
# Last step of the stepwise calculation.
tran_config1 = TN.calc_tran_config.step6(df5, Trans)

# The same calculation through the one-call entry point, starting again from a
# freshly selected set of numeric columns without 'SalePrice'.
tran_config2 = TN.get_transformation_config(
    df=combined_dataset.select_dtypes(include=[np.number]).drop(columns=['SalePrice']),
    trans=Trans,
)

# Both routes must produce the same frame; columns are sorted first so that
# column order does not matter.
pd.testing.assert_frame_equal(
    left=tran_config1.sort_index(axis='columns'),
    right=tran_config2.sort_index(axis='columns'),
)

tran_config2

Unnamed: 0,var,progress_score,tran,tran_fn
0,GrLivArea,91.157942,log,<function <lambda> at 0x7f2091853a60>
1,1stFlrSF,90.771657,log,<function <lambda> at 0x7f2091853a60>
2,BsmtUnfSF,79.976031,sqrt,<function <lambda> at 0x7f20915fed08>
3,LotArea,78.436891,inv3,<function <lambda> at 0x7f20915fee18>
4,TotRmsAbvGrd,30.685015,log,<function <lambda> at 0x7f2091853a60>
5,OverallQual,29.944357,log,<function <lambda> at 0x7f2091853a60>
6,GarageCars,24.956765,log,<function <lambda> at 0x7f2091853a60>
7,BedroomAbvGr,22.375086,log,<function <lambda> at 0x7f2091853a60>
8,OverallCond,16.247239,log,<function <lambda> at 0x7f2091853a60>
9,FullBath,8.797333,inv3,<function <lambda> at 0x7f20915fee18>


In [12]:
%%time
tran_config = TN.get_transformation_config(
    df=(
        combined_dataset
        .select_dtypes(include=[np.number])
        .drop(columns=['SalePrice'])
    ),
    trans=Trans
)

CPU times: user 351 ms, sys: 23.9 ms, total: 375 ms
Wall time: 376 ms


In [135]:
# Work on a copy of two columns so that mutating `df` cannot touch combined_dataset.
df = combined_dataset[['GrLivArea', 'LotArea']].copy()
# Shown before the transform is applied: the raw values.
df.head()

# apply_transform mutates df in place: the final df.head() of this cell shows
# the same transformed values as df2.head().
# NOTE(review): this rebinds `df2`, which earlier held the step-2 frame, so
# re-running the step-3 cell after this one would read the wrong frame.
df2 = TN.apply_transform(df, tran_config)

df2.head()
df.head()

Unnamed: 0,GrLivArea,LotArea
0,1710,8450
1,1262,9600
2,1786,11250
3,1717,9550
4,2198,14260


Unnamed: 0,GrLivArea,LotArea
0,7.444833,20.368181
1,7.141245,21.253171
2,7.488294,22.407024
3,7.448916,21.216209
4,7.695758,24.249708


Unnamed: 0,GrLivArea,LotArea
0,7.444833,20.368181
1,7.141245,21.253171
2,7.488294,22.407024
3,7.448916,21.216209
4,7.695758,24.249708
