In [8]:
import os
import sys

# Make project code importable: append the path stored in the PY_SOURCES
# environment variable to sys.path unless it is already listed there.
# Indexing os.environ directly means a missing PY_SOURCES raises KeyError.
if os.environ['PY_SOURCES'] not in sys.path:
    sys.path.append(os.environ['PY_SOURCES'])

# Echo the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to show several results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [2]:
import IPython
import numpy as np
import pandas as pd

In [16]:
%%R

library(dplyr)
library(tidyr)

In [14]:
import kaggle.house_prices.helpers as helpers

# Load the working frame through the project helper; rows are tagged by
# origin in the `dataSource` column ('train' / 'test').
combined_dataset = helpers.load_data()

# Shape of the whole frame, then of each split.
combined_dataset.shape
combined_dataset.loc[combined_dataset['dataSource'] == 'train'].shape
combined_dataset.loc[combined_dataset['dataSource'] == 'test'].shape

# Keep the columns reported by helpers.get_character_colnames (presumably the
# string-typed ones -- confirm in the helper), replace missing entries with an
# explicit '__missing__' level, and attach the SalePrice column alongside.
categ_data = (
    combined_dataset[helpers.get_character_colnames(combined_dataset)]
    .fillna('__missing__')
    .assign(SalePrice=combined_dataset['SalePrice'])
)

categ_data.iloc[:2]

(2919, 82)

(1460, 82)

(1459, 82)

Unnamed: 0,Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,...,Neighborhood,PavedDrive,PoolQC,RoofMatl,RoofStyle,SaleCondition,SaleType,Street,Utilities,SalePrice
0,__missing__,1Fam,TA,No,GLQ,Unf,Gd,Y,Norm,Norm,...,CollgCr,Y,__missing__,CompShg,Gable,Normal,WD,Pave,AllPub,208500.0
1,__missing__,1Fam,TA,Gd,ALQ,Unf,Gd,Y,Feedr,Norm,...,Veenker,Y,__missing__,CompShg,Gable,Normal,WD,Pave,AllPub,181500.0


In [24]:
%%R -i categ_data -o R_result

# Long table with one row per (sale, categorical feature): log price plus the
# feature name (`var`) and its level (`value`), grouped by feature and level.
df <- categ_data %>%
    filter(!is.na(SalePrice)) %>%
    transmute(Alley, ExterQual, price_log = log(SalePrice)) %>%
    gather(var, value, -price_log) %>%
    group_by(var, value)

# Grand mean of the log price.
global_mean <- mean(df$price_log)
print(global_mean)

# Sums of squares for every level of every feature.
x <- summarise(
    df,
    n = n(),
    mean = mean(price_log),
    Q_within_group = sum((price_log - mean)^2),
    Q_of_group = n * (mean - global_mean)^2,
    Q_total = sum((price_log - global_mean)^2)
)

# Roll the levels up per feature; Q_control_sum re-adds the between-group and
# within-group parts so it can be compared against Q_total.
R_result <- x %>%
    group_by(var) %>%
    summarise(
        num_levels = n(),
        num_observ = sum(n),
        Q_within_groups = sum(Q_within_group),
        Q_groups = sum(Q_of_group),
        Q_total = sum(Q_total),
        Q_control_sum = Q_groups + Q_within_groups
    )

[1] 12.02405


In [25]:
# Show the per-feature summary that the preceding %%R cell exported to Python
# through its `-o R_result` option.
# NOTE(review): the recorded output lists several all-empty rows after the two
# feature rows -- presumably an artefact of the R -> pandas conversion; confirm.
R_result

var,num_levels,num_observ,Q_within_groups,Q_groups,Q_total,Q_control_sum
'Alley',3.0,1460.0,226.366695,6.433964,232.800659,232.800659
'ExterQual',4.0,1460.0,125.451163,107.349496,232.800659,232.800659
,,,,,,
,,,,,,
,,,,,,
,,,,,,
,,,,,,


In [27]:
# pandas version of the sum-of-squares decomposition computed in the %%R cell:
# for each categorical feature, split the total variation of log(SalePrice)
# into a between-level part (Q_groups) and a within-level part
# (Q_within_groups); Q_control_sum re-adds the two for comparison with Q_total.
df = (
    categ_data
    [['Alley', 'ExterQual', 'SalePrice']]
    .dropna(subset=['SalePrice'])
    .assign(
        price_log=lambda frame: np.log(frame['SalePrice'])
    )
    # `axis` is passed by keyword: the positional form `.drop(label, 1)` is
    # rejected by current pandas.
    .drop('SalePrice', axis=1)
)

global_mean = df['price_log'].mean()
global_mean

# One row per (observation, feature), grouped by feature name and level.
# Despite its name, `df_long` is the GroupBy object, not the melted frame.
# The group keys go to the index and are restored as columns with an explicit
# reset_index() below.
df_long = pd.melt(
    frame=df,
    id_vars=['price_log'],
    var_name='var',
    value_name='value'
).groupby(['var', 'value'])

# Per-level statistics. Output names are given as (name, function) pairs
# rather than the deprecated renaming-dict form of `.agg`.
x = (
    df_long
    ['price_log']
    .agg([
        # 'size' counts rows; np.count_nonzero would silently skip any
        # observation whose log price is exactly 0.
        ('n', 'size'),
        ('mean', 'mean'),
        ('Q_within_group', lambda vec: ((vec - vec.mean()) ** 2).sum()),
        ('Q_total', lambda vec: ((vec - global_mean) ** 2).sum()),
    ])
    .assign(
        Q_of_group=lambda stats: stats['n'] * (stats['mean'] - global_mean) ** 2
    )
    .reset_index()
)

x

# Roll the levels up per feature with built-in reductions (keeps the counts as
# integers, unlike building a mixed-type pd.Series per group via `.apply`).
(
    x
    .groupby('var')
    .agg({
        'value': 'size',           # number of levels of the feature
        'n': 'sum',                # number of observations
        'Q_within_group': 'sum',
        'Q_of_group': 'sum',
        'Q_total': 'sum',
    })
    .rename(columns={
        'value': 'num_levels',
        'n': 'num_observ',
        'Q_within_group': 'Q_within_groups',
        'Q_of_group': 'Q_groups',
    })
    .assign(
        Q_control_sum=lambda totals: totals['Q_groups'] + totals['Q_within_groups']
    )
    .reset_index()
)

12.024050901109383

Unnamed: 0,var,value,n,mean,Q_within_group,Q_total,Q_of_group
0,Alley,Grvl,50.0,11.673354,4.196369,10.345798,6.149429
1,Alley,Pave,41.0,11.996812,3.570696,3.601116,0.03042
2,Alley,__missing__,1369.0,12.037675,218.59963,218.853746,0.254116
3,ExterQual,Ex,52.0,12.764044,5.410069,33.884762,28.474693
4,ExterQual,Fa,14.0,11.304541,2.147263,9.394987,7.247724
5,ExterQual,Gd,488.0,12.311282,39.830407,80.091366,40.260959
6,ExterQual,TA,906.0,11.837985,78.063423,109.429543,31.36612


Unnamed: 0,var,Q_groups,Q_total,Q_within_groups,num_levels,num_observ,Q_control_sum
0,Alley,6.433964,232.800659,226.366695,3.0,1460.0,232.800659
1,ExterQual,107.349496,232.800659,125.451163,4.0,1460.0,232.800659
