In [1]:
# Echo the value of every top-level expression in a cell, not only the last
# one. Any bare expression added to a later cell will therefore be displayed.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# (Re)load the rpy2 IPython extension. NOTE(review): no R cell is visible in
# this part of the notebook - confirm the extension is still needed.
%reload_ext rpy2.ipython

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats

from kaggle.house_prices import helpers
from kaggle.house_prices import missing
from kaggle.house_prices import outliers
from kaggle.house_prices import transform_numeric as TN
from kaggle.house_prices import api

# Build the working frame in a single expression: load the data, pass it
# through `missing.fix_all` and `outliers.remove_outliers`, then replace the
# raw `SalePrice` target with its natural log and discard the `Id` column.
combined_dataset = (
    outliers.remove_outliers(
        missing.fix_all(
            helpers.load_data()
        )
    )
    .assign(price_log=lambda frame: np.log(frame['SalePrice']))
    .drop(columns=['Id', 'SalePrice'])
)

# Display (rows, columns) as a quick sanity check.
combined_dataset.shape

(2917, 81)

In [3]:
# Candidate transformations, keyed by the label that is reported in the
# `tran` column of the configuration tables.
# Despite their names, 'inv3' and 'inv4' are the cube root and fourth root,
# not reciprocals.
Trans = dict(
    log=lambda x: np.log(x + 1),   # shifted by one so that x == 0 maps to 0
    sqrt=lambda x: np.sqrt(x),
    inv3=lambda x: x**(1/3),
    inv4=lambda x: x**(1/4),
)

# Configuration computed from the numeric columns only, with the target
# (`price_log`) removed.
trans_config = TN.get_transformation_config(
    trans=Trans,
    df=combined_dataset.select_dtypes(include=[np.number]).drop(columns=['price_log']),
)

# Same call, but handing over every non-target column regardless of dtype.
trans_config2 = TN.get_transformation_config(
    trans=Trans,
    df=combined_dataset.drop(columns=['price_log']),
)

# The two results must match once column order is ignored, i.e. pre-filtering
# to numeric columns makes no difference to the returned configuration.
pd.testing.assert_frame_equal(
    left=trans_config.sort_index(axis=1),
    right=trans_config2.sort_index(axis=1),
)

trans_config

Unnamed: 0,var,progress_score,tran,tran_fn
0,GrLivArea,91.157942,log,<function <lambda> at 0x7f24f8129bf8>
1,X1stFlrSF,90.771657,log,<function <lambda> at 0x7f24f8129bf8>
2,BsmtUnfSF,79.976031,sqrt,<function <lambda> at 0x7f24f7f54950>
3,LotArea,78.864026,inv4,<function <lambda> at 0x7f24f7f549d8>
4,TotRmsAbvGrd,30.685015,log,<function <lambda> at 0x7f24f8129bf8>
5,OverallQual,29.944357,log,<function <lambda> at 0x7f24f8129bf8>
6,GarageCars,24.956765,log,<function <lambda> at 0x7f24f8129bf8>
7,BedroomAbvGr,22.375086,log,<function <lambda> at 0x7f24f8129bf8>
8,OverallCond,16.247239,log,<function <lambda> at 0x7f24f8129bf8>
9,FullBath,12.073778,inv4,<function <lambda> at 0x7f24f7f549d8>


In [4]:
# Configuration built through the `api` module against the `price_log` target.
# NOTE(review): judging by the displayed result, `threshold=0` appears to keep
# only variables whose `r2_tran` exceeds `r2_x` - confirm against the api docs.
trans_config3 = api.get_functional_transformation_config(
    threshold=0,
    trans=Trans,
    target="price_log",
    data=combined_dataset,
)

trans_config3

Unnamed: 0,var,progress_score,tran,tran_fn,r2_x,r2_tran
0,GrLivArea,91.157942,log,<function <lambda> at 0x7f24f8129bf8>,0.525931,0.543804
3,LotArea,78.864026,inv4,<function <lambda> at 0x7f24f7f549d8>,0.067883,0.159187
4,TotRmsAbvGrd,30.685015,log,<function <lambda> at 0x7f24f8129bf8>,0.289123,0.293868
11,Fireplaces,3.801495,inv4,<function <lambda> at 0x7f24f7f549d8>,0.242062,0.265132


In [70]:
# Long-format table with one row per (variable, observation) for the variables
# listed in `trans_config3`, holding the raw value next to the value produced
# by that variable's configured transformation function.
x = (
    combined_dataset[trans_config3['var']]
    .melt(var_name='var', value_name='value')
    .dropna(subset=['value'])
    .set_index('var')
    # Attach each variable's configuration row (including `tran_fn`).
    .join(trans_config3.set_index('var'), how='inner')
    .assign(
        # Apply the row's own transformation function to the row's value.
        value_transformed=lambda joined: [
            fn(raw) for fn, raw in zip(joined['tran_fn'], joined['value'])
        ]
    )
    .loc[:, ['value', 'value_transformed']]
    .reset_index()
)

# Stack the raw and transformed values of `x` into one `value` column, label
# each row as 'original' or 'transformed', and add a z-score computed within
# each (variable, label) group.
y = (
    pd.melt(
        frame=x,
        id_vars=['var'],
        var_name='tran',
        value_name='value'
    )
    .assign(
        # Vectorised relabel: rows that came from the 'value' column are the
        # originals, every other row is a transformed value. This also works
        # on an empty frame, where a row-wise apply would not yield a Series.
        tran=lambda df: np.where(df['tran'] == 'value', 'original', 'transformed'),
        # Relies on `tran` having been relabelled by the keyword above.
        normed_value=lambda df: (
            df.groupby(['var', 'tran'])['value']
            .transform(lambda group: (group - group.mean()) / group.std())
        )
    )
)

y

Unnamed: 0,var,tran,value,normed_value
0,Fireplaces,original,0.000000,-0.924604
1,Fireplaces,original,1.000000,0.626328
2,Fireplaces,original,1.000000,0.626328
3,Fireplaces,original,1.000000,0.626328
4,Fireplaces,original,1.000000,0.626328
5,Fireplaces,original,0.000000,-0.924604
6,Fireplaces,original,1.000000,0.626328
7,Fireplaces,original,2.000000,2.177260
8,Fireplaces,original,2.000000,2.177260
9,Fireplaces,original,2.000000,2.177260


In [5]:
# Apply the transformations listed in `trans_config3` to the dataset.
transformed_dataset = api.functional_transform(combined_dataset, trans_config3)

# Spot checks on two configured variables: GrLivArea is configured with 'log'
# (log(x + 1)) and LotArea with 'inv4' (fourth root).
pd.testing.assert_series_equal(
    left=transformed_dataset['GrLivArea'],
    right=np.log(combined_dataset['GrLivArea'] + 1),
)
pd.testing.assert_series_equal(
    left=transformed_dataset['LotArea'],
    right=np.power(combined_dataset['LotArea'], 1/4),
)

# Non-numeric columns must come through unchanged.
pd.testing.assert_frame_equal(
    left=transformed_dataset.select_dtypes(exclude=[np.number]),
    right=combined_dataset.select_dtypes(exclude=[np.number]),
)

# Numeric columns that are not listed in the configuration must also be
# unchanged.
pd.testing.assert_frame_equal(
    left=transformed_dataset.select_dtypes(include=[np.number]).drop(columns=trans_config3['var']),
    right=combined_dataset.select_dtypes(include=[np.number]).drop(columns=trans_config3['var']),
)