In [36]:
# Notebook-wide display setup.
# With ast_node_interactivity set to "all", IPython displays the value of
# every top-level expression statement in a cell, not only the last one.
# Keep this in mind when adding bare expressions to later cells.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load the rpy2 IPython extension. When the extension is already loaded in
# the running kernel, this line only prints a notice (see the cell output).
# NOTE(review): no R magics are used in the cells visible here — confirm the
# extension is still needed.
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [37]:
import numpy as np
import pandas as pd

from kaggle.house_prices import helpers
from kaggle.house_prices import missing
from kaggle.house_prices import outliers

# Build the working frame in one pass: load the data, then run it through
# missing.fix_all and outliers.remove_outliers (in that order).
combined_dataset = outliers.remove_outliers(
    missing.fix_all(helpers.load_data())
)

# The target used from here on is the natural log of SalePrice; the raw
# SalePrice column and the Id column are dropped afterwards.
combined_dataset['price_log'] = np.log(combined_dataset['SalePrice'])
combined_dataset = combined_dataset.drop(labels=['SalePrice', 'Id'], axis='columns')

# Displayed as the cell result: (rows, columns) of the prepared frame.
combined_dataset.shape

(2917, 83)

In [38]:
#
# Quantiles of price_log at probabilities 0.25 / 0.5 / 0.75,
# computed over the whole combined dataset
#
from kaggle.house_prices.quantile_rating import calc_quantiles

calc_quantiles(probs=[0.25, 0.5, 0.75], sample=combined_dataset['price_log'])

0.25    11.774713
0.50    12.001505
0.75    12.273731
Name: price_log, dtype: float64

In [45]:
#
# Quantile ratings calculation:
# compute the ratings once through the explicit quantile_rating call and once
# through the topapi shortcut, then check that both give the same frame.
#
from kaggle.house_prices import quantile_rating as QR
from kaggle.house_prices import topapi

# Inputs for the explicit call, named so each one can be inspected on its own.
price_log_quantiles = calc_quantiles(
    sample=combined_dataset['price_log'],
    probs=[0.25, 0.5, 0.75],
)
character_columns = helpers.get_character_colnames(combined_dataset)

ratings1 = QR.calc_ratings(
    df=combined_dataset,
    target_var='price_log',
    rating_quantiles=price_log_quantiles,
    categ_vars=character_columns,
)
ratings2 = topapi.calc_ratings(combined_dataset, 'price_log')

# Columns are sorted on both sides so that column order alone cannot fail
# the comparison.
pd.testing.assert_frame_equal(
    ratings1.sort_index(axis=1),
    ratings2.sort_index(axis=1),
)

In [40]:
# First nine ratings after ordering by variable name, then by rating.
ratings1.sort_values(by=['var', 'rating']).head(n=9)

Unnamed: 0,var,value,rating
0,Alley,Grvl,1.5
2,Alley,_none_,2.531822
1,Alley,Pave,2.536585
4,BldgType,2fmCon,1.580645
5,BldgType,Duplex,1.692308
6,BldgType,Twnhs,1.883721
3,BldgType,1Fam,2.564039
7,BldgType,TwnhsE,2.622807
10,BsmtCond,Po,1.0


In [41]:
# Last nine ratings after ordering by variable name, then by rating.
# NOTE(review): the final row has an empty var/value and rating 2.5 —
# presumably a default rating; confirm against calc_ratings.
ratings1.sort_values(by=['var', 'rating']).tail(n=9)

Unnamed: 0,var,value,rating
288,SaleType,WD,2.421468
281,SaleType,CWD,3.0
286,SaleType,New,3.583333
282,SaleType,Con,4.0
289,Street,Grvl,1.833333
290,Street,Pave,2.499311
292,Utilities,NoSeWa,2.0
291,Utilities,AllPub,2.496911
293,,,2.5


In [47]:
#
# Quantile ratings transformation:
# apply the ratings to the character columns through the explicit
# transform_categ call and through the topapi shortcut, then check that both
# give the same frame.
#
from kaggle.house_prices import topapi
from kaggle.house_prices.transform_categ import rating_transform

ratings = topapi.calc_ratings(combined_dataset, 'price_log')
character_columns = helpers.get_character_colnames(combined_dataset)

df1 = rating_transform(
    dataset=combined_dataset,
    columns=character_columns,
    ratings=ratings,
)
df2 = topapi.rating_transform(combined_dataset, 'price_log')

# Columns are sorted on both sides so that column order alone cannot fail
# the comparison.
pd.testing.assert_frame_equal(
    df1.sort_index(axis=1),
    df2.sort_index(axis=1),
)

# Preview of the transformed frame (head() defaults to five rows).
df1.head()

Unnamed: 0,Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,...,PoolArea,ScreenPorch,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,dataSource,price_log
0,2.531822,2.564039,2.540871,2.320042,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,...,0,0,8,856.0,0,2003,2003,2008,train,12.247694
1,2.531822,2.564039,2.540871,3.295455,2.231818,2.553429,3.066343,2.57887,1.9875,2.50277,...,0,0,6,1262.0,298,1976,1976,2007,train,12.109011
2,2.531822,2.564039,2.540871,2.675439,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,...,0,0,6,920.0,0,2001,2002,2008,train,12.317167
3,2.531822,2.564039,3.046154,2.320042,2.231818,2.553429,1.847458,2.57887,2.547619,2.50277,...,0,0,7,756.0,0,1915,1970,2006,train,11.849398
4,2.531822,2.564039,2.540871,2.900452,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,...,0,0,9,1145.0,192,2000,2000,2008,train,12.429216
