In [115]:
# Make the notebook display the value of every top-level expression in a cell,
# not just the last one (later cells rely on this to show several results).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load the rpy2 IPython extension; the recorded output below shows it was
# already loaded in this kernel session.
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [116]:
import numpy as np
import pandas as pd

from statsmodels.distributions.empirical_distribution import ECDF

from kaggle.house_prices import helpers
from kaggle.house_prices import missing
from kaggle.house_prices import outliers

# Build the working frame in one pipeline: load the data, run the project's
# missing-value and outlier helpers (see kaggle.house_prices for what they do),
# add the natural-log price as `price_log`, then drop the raw price and Id.
combined_dataset = (
    helpers.load_data()
    .pipe(missing.fix_all)
    .pipe(outliers.remove_outliers)
    .assign(price_log=lambda frame: np.log(frame['SalePrice']))
    .drop(['SalePrice', 'Id'], axis=1)
)
combined_dataset.shape

(2917, 81)

In [37]:
def calc_quantiles(sample, probs=None):
    """Return the quantiles of ``sample`` at the requested probability levels.

    Missing values are removed before the quantiles are computed.

    Parameters
    ----------
    sample : pandas.Series
        Observations to summarise.
    probs : float or list of float, optional
        Probability levels passed to ``Series.quantile``. When omitted, the
        three quartiles ``[.25, .5, .75]`` are used.

    Returns
    -------
    Whatever ``Series.quantile`` returns for ``probs``: a Series indexed by
    probability level for a list, a scalar for a single level.
    """
    levels = [.25, .5, .75] if probs is None else probs
    observed = sample.dropna()
    return observed.quantile(levels)

# Quartiles of the log sale price; calc_quantiles ignores rows where
# price_log is missing and uses the .25/.5/.75 levels by default.
calc_quantiles(combined_dataset['price_log'])

0.25    11.774713
0.50    12.001505
0.75    12.273731
Name: price_log, dtype: float64

In [110]:
def calc_rating_for_sample(sample, rating_quantiles):
    """Score a sample by the expected index of the bin its values fall into.

    The cut points in ``rating_quantiles`` split the real line into
    ``len(rating_quantiles) + 1`` bins, numbered from 1. The share of the
    sample in each bin is read off the sample's empirical CDF, and the rating
    is the share-weighted mean bin number. For ascending cut points the
    result lies between 1 and ``len(rating_quantiles) + 1``.

    Parameters
    ----------
    sample : pandas.Series
        Observations to rate; missing values are dropped first.
    rating_quantiles : array-like of float
        Cut points (a Series, ndarray, list or scalar), expected in ascending
        order.

    Returns
    -------
    float
        The rating, or ``nan`` when ``sample`` has no non-missing values.
    """
    sample = sample.dropna()
    if sample.empty:
        # ECDF divides by the number of observations and raises
        # ZeroDivisionError on empty input; report "no rating" instead.
        return np.nan

    cdf = ECDF(sample)
    # Accept any array-like, not only objects that expose ``.tolist()``.
    thresholds = np.asarray(rating_quantiles, dtype=float).ravel().tolist()
    # Pad with 0 and 1 so consecutive differences cover the two open-ended bins.
    cdf_points = [0] + [cdf(q) for q in thresholds] + [1]
    probs = [upper - lower for lower, upper in zip(cdf_points[:-1], cdf_points[1:])]
    return sum(bin_number * p for bin_number, p in enumerate(probs, start=1))

# Cut points estimated from only the first 100 non-missing log prices.
quantiles = calc_quantiles(combined_dataset['price_log'].dropna().head(100))
quantiles

# Rate the whole price_log column against those cut points.
calc_rating_for_sample(combined_dataset['price_log'], quantiles)

0.25    11.770372
0.50    11.943082
0.75    12.244089
Name: price_log, dtype: float64

2.5816186556927296

In [117]:
def calc_rating_for_groups(df, columns=None):
    """Rate every level of the chosen categorical columns by its ``price_log`` values.

    Quantiles of ``price_log`` over the whole frame are used as cut points.
    The selected columns are then melted to long format, and each
    (column, level) pair is rated with ``calc_rating_for_sample`` on the
    ``price_log`` values of the rows carrying that level.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``price_log`` and every column named in ``columns``.
        Rows with a missing ``price_log`` are ignored.
    columns : str or iterable of str, optional
        Categorical columns to rate. Defaults to ``['Alley', 'BldgType']``,
        the set this function originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per (column, level) with the columns ``var``, ``value`` and
        ``rating``.
    """
    if columns is None:
        columns = ['Alley', 'BldgType']
    elif isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    global_quantiles = calc_quantiles(df['price_log'])
    priced = df[columns + ['price_log']].dropna(subset=['price_log'])
    long_format = pd.melt(
        frame=priced,
        id_vars=['price_log'],
        var_name='var',
        value_name='value'
    )
    return (
        long_format
        # Select price_log before apply so the callback receives only the
        # prices and never the grouping columns (applying over grouping
        # columns is deprecated in recent pandas).
        .groupby(['var', 'value'])['price_log']
        .apply(lambda prices: calc_rating_for_sample(prices, global_quantiles))
        .to_frame('rating')
        .reset_index()
    )

# Per-level ratings for the full dataset; the recorded table below covers the
# Alley and BldgType columns. The bare `x` displays the resulting frame.
x = calc_rating_for_groups(combined_dataset)
x

Unnamed: 0,var,value,rating
0,Alley,Grvl,1.5
1,Alley,Pave,2.536585
2,Alley,_none_,2.531822
3,BldgType,1Fam,2.564039
4,BldgType,2fmCon,1.580645
5,BldgType,Duplex,1.692308
6,BldgType,Twnhs,1.883721
7,BldgType,TwnhsE,2.622807


In [124]:
# Character columns: preview the first two rows of the object-dtype columns.
combined_dataset.select_dtypes(include=['object']).head(2)

# Names of the columns whose dtype kind is 'O'.
[name for name, column in combined_dataset.items() if column.dtype.kind == 'O']

Unnamed: 0,Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,...,Neighborhood,PavedDrive,PoolQC,RoofMatl,RoofStyle,SaleCondition,SaleType,Street,Utilities,dataSource
0,_none_,1Fam,TA,No,GLQ,Unf,Gd,Y,Norm,Norm,...,CollgCr,Y,_none_,CompShg,Gable,Normal,WD,Pave,AllPub,train
1,_none_,1Fam,TA,Gd,ALQ,Unf,Gd,Y,Feedr,Norm,...,Veenker,Y,_none_,CompShg,Gable,Normal,WD,Pave,AllPub,train


['Alley',
 'BldgType',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual',
 'CentralAir',
 'Condition1',
 'Condition2',
 'Electrical',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Fence',
 'FireplaceQu',
 'Foundation',
 'Functional',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'Heating',
 'HeatingQC',
 'HouseStyle',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotConfig',
 'LotShape',
 'MSSubClass',
 'MSZoning',
 'MasVnrType',
 'MiscFeature',
 'MoSold',
 'Neighborhood',
 'PavedDrive',
 'PoolQC',
 'RoofMatl',
 'RoofStyle',
 'SaleCondition',
 'SaleType',
 'Street',
 'Utilities',
 'dataSource']