In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time

In [2]:
import IPython, graphviz, sklearn_pandas

In [3]:
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from sklearn import metrics

In [None]:
!pwd

In [4]:
from pandas.api.types import is_string_dtype, is_numeric_dtype
import os
import feather
import math
import re
import sklearn

In [5]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.ensemble import forest
from sklearn.tree import export_graphviz

In [6]:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

## Helper Functions

In [7]:
def parallel_trees(m, fn, n_jobs=8):
    """Apply ``fn`` to every fitted estimator of ``m`` using a pool of worker processes.

    Parameters
    ----------
    m : object exposing ``estimators_`` (an iterable of fitted estimators).
    fn : callable taking one estimator. It is sent to worker processes, so it
        must be picklable (e.g. a module-level function).
    n_jobs : maximum number of worker processes.

    Returns
    -------
    list with one result per estimator, in the order of ``m.estimators_``.
    """
    # The context manager shuts the pool down once the results are collected;
    # the previous version never released its worker processes.
    with ProcessPoolExecutor(n_jobs) as pool:
        return list(pool.map(fn, m.estimators_))

In [8]:
def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """Display one fitted decision tree inline as a Graphviz diagram.

    t: the fitted tree passed to ``export_graphviz``.
    df: data frame whose columns are used as the feature names.
    size, ratio: values written into the graph's ``size`` / ``ratio`` attributes.
    precision: forwarded to ``export_graphviz``.
    """
    dot_source = export_graphviz(
        t,
        out_file=None,
        feature_names=df.columns,
        filled=True,
        special_characters=True,
        rotate=True,
        precision=precision,
    )
    # Inject the size/ratio attributes right after the opening of the graph body.
    graph_header = f'Tree {{ size = {size}; ratio={ratio}'
    sized_source = re.sub('Tree {', graph_header, dot_source)
    IPython.display.display(graphviz.Source(sized_source))

In [9]:
def rf_feature_imp(m, df):
    """Return a frame of (cols, importance) pairs sorted by descending importance.

    m: fitted model exposing ``feature_importances_``.
    df: data frame whose columns line up with those importances.
    """
    importance_table = pd.DataFrame({
        'cols': df.columns,
        'importance': m.feature_importances_,
    })
    return importance_table.sort_values('importance', ascending=False)

In [None]:
def add_datepart(df, fldname, drop=True, time=False):
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace.

    Parameters:
    -----------
    df: A pandas data frame. df gains several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true time features: Hour, Minute, Second will be added.

    Notes:
    ------
    New column names are fldname with a trailing 'date'/'Date' removed, followed by
    the attribute name (e.g. 'saledate' -> 'saleYear').
    Week is the ISO week number. Elapsed is the number of whole seconds since
    1970-01-01 (measured in UTC for timezone-aware columns).

    Examples:
    ---------

    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']) })
    >>> df

        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13

    >>> add_datepart(df, 'A')
    >>> df

        AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0   2000  3      10    11   5          71         False         False           False           False             False        False          952732800
    1   2000  3      10    12   6          72         False         False           False           False             False        False          952819200
    2   2000  3      11    13   0          73         False         False           False           False             False        False          952905600
    """
    fld = df[fldname]
    # Public dtype check that treats naive and tz-aware datetime columns alike
    # (replaces the private pd.core.dtypes import path plus np.issubdtype).
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # infer_datetime_format is deprecated in pandas 2.x; format inference
        # is the default there, so the argument is simply dropped.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0. isocalendar().week is
            # its replacement but returns a nullable UInt32 column, so cast it
            # back to a plain numpy dtype (float64 only when dates are missing).
            week = fld.dt.isocalendar().week
            df[targ_pre + n] = week.astype('float64' if week.isna().any() else 'int64')
        else:
            df[targ_pre + n] = getattr(fld.dt, n.lower())
    # Compute Elapsed from a timedelta instead of reinterpreting the raw
    # integer storage, so the result does not depend on the column being
    # stored at nanosecond resolution.
    if fld.dt.tz is None:
        since_epoch = fld - pd.Timestamp('1970-01-01')
    else:
        since_epoch = fld.dt.tz_convert('UTC') - pd.Timestamp('1970-01-01', tz='UTC')
    df[targ_pre + 'Elapsed'] = since_epoch // pd.Timedelta(seconds=1)
    if drop:
        df.drop(fldname, axis=1, inplace=True)


In [10]:
def train_cats(df):
    """Convert every string-typed column of ``df`` to an ordered categorical, in place."""
    for col_name, values in df.items():
        if not is_string_dtype(values):
            continue
        as_category = values.astype('category')
        df[col_name] = as_category.cat.as_ordered()
            
            
def apply_cats(df, trn):
    """Re-encode columns of ``df`` with the category sets used in ``trn``, in place.

    Only columns that also exist in ``trn`` and are categorical there are
    touched; each becomes an ordered categorical with ``trn``'s categories.
    """
    for col_name, values in df.items():
        if col_name not in trn.columns:
            continue
        reference = trn[col_name]
        if reference.dtype.name != 'category':
            continue
        df[col_name] = pd.Categorical(values, categories=reference.cat.categories, ordered=True)

In [11]:
def numericalize(df, col, name, max_n_cat):
    """Replace a categorical column with its integer codes plus one, in place.

    Numeric columns are left alone. When ``max_n_cat`` is given, columns with
    at most that many categories are left alone as well. The +1 shifts the
    code for missing values (-1) to 0.
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is not None and len(col.cat.categories) <= max_n_cat:
        return
    df[name] = col.cat.codes + 1
        
        
def get_sample(df, n):
    """Return a copy of up to ``n`` randomly chosen rows of ``df``, kept in their original order."""
    shuffled_positions = np.random.permutation(len(df))
    chosen = np.sort(shuffled_positions[:n])
    return df.iloc[chosen].copy()


def fix_missing(df, col, name, na_dict):
    """Fill missing values of a numeric column and flag them, in place.

    A numeric column is processed when it has missing values or when ``name``
    already has an entry in ``na_dict``. A boolean ``<name>_na`` column is
    added, gaps are filled with the stored value (or the column median when
    none is stored), and the filler is recorded in ``na_dict``.

    Returns the same ``na_dict`` object that was passed in.
    """
    if not is_numeric_dtype(col):
        return na_dict
    missing_mask = pd.isnull(col)
    already_known = name in na_dict
    if not (missing_mask.sum() or already_known):
        return na_dict
    df[name + '_na'] = missing_mask
    if already_known:
        filler = na_dict[name]
    else:
        filler = col.median()
    df[name] = col.fillna(filler)
    na_dict[name] = filler
    return na_dict


def scale_vars(df, mapper):
    """Standard-scale the numeric columns of ``df`` in place.

    Parameters
    ----------
    df : data frame whose numeric columns are overwritten with scaled values.
    mapper : a fitted ``DataFrameMapper`` to reuse, or ``None`` to fit a new
        one (a ``StandardScaler`` per numeric column) on ``df``.

    Returns
    -------
    The mapper that was applied, so the same scaling can be reused on other data.
    """
    # Imported locally: the visible import cells never import `warnings`, so
    # the original call raised NameError.
    import warnings
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    # transform() must run first: DataFrameMapper fills `transformed_names_`
    # (plural; the original `transformed_name_` does not exist) during it.
    scaled = mapper.transform(df)
    df[mapper.transformed_names_] = scaled
    return mapper
        
        
def proc_df(df, y_fld, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """Turn a raw data frame into an all-numeric feature frame plus target.

    The input ``df`` is never modified; the work happens on a copy (or on a
    random sample of ``subset`` rows).

    Parameters
    ----------
    df : source data frame.
    y_fld : name of the target column, or None when there is no target.
    skip_flds : columns to drop before processing.
    ignore_flds : columns set aside untouched and prepended to the result.
    do_scale : when true, numeric columns are standard-scaled via scale_vars.
    na_dict : mapping column name -> fill value from an earlier call; it is
        copied, never mutated.
    preproc_fn : optional callable applied to the working copy in place.
    max_n_cat : categorical columns with more categories than this are
        replaced by integer codes; the rest are one-hot encoded.
        None encodes every categorical column as integer codes.
    subset : if truthy, number of rows to sample with get_sample.
    mapper : fitted scaler mapper to reuse when do_scale is true.

    Returns
    -------
    list ``[df, y, na_dict]``, with the mapper appended as a fourth element
    when ``do_scale`` is true.
    """
    # Build fresh lists so that appending y_fld below never leaks into the
    # list object the caller passed in.
    if not skip_flds:
        skip_flds = []
    elif isinstance(skip_flds, str):
        skip_flds = [skip_flds]
    else:
        skip_flds = list(skip_flds)
    if not ignore_flds: ignore_flds=[]
    if subset:
        df = get_sample(df, subset)
    else:
        df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn:
        preproc_fn(df)
    if y_fld is None:
        y = None
    else:
        if not is_numeric_dtype(df[y_fld]):
            df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds = skip_flds + [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None:
        na_dict = {}
    else:
        na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n, c in df.items():
        na_dict = fix_missing(df, c, n, na_dict)
    if len(na_dict_initial.keys()) > 0:
        # When a na_dict was supplied, drop the *_na indicator columns for
        # columns it did not already contain, so the output columns match
        # those produced by the call that created the dict.
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale:
        mapper = scale_vars(df, mapper)
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale:
        # Wrap in a list: `res + mapper` tried to concatenate a list with the
        # mapper object itself and raised TypeError.
        res = res + [mapper]
    return res

In [None]:
def split_vals(a, n):
    """Split ``a`` at position ``n`` and return independent copies of both parts."""
    head, tail = a[:n], a[n:]
    return head.copy(), tail.copy()

In [None]:
def rmse(x, y):
    """Root of the mean squared difference between ``x`` and ``y``."""
    squared_error = (x - y) ** 2
    mean_squared_error = squared_error.mean()
    return math.sqrt(mean_squared_error)

def log(x):
    """Return log(1 + x), computed with ``np.log1p``."""
    return np.log1p(x)


def kaggle_pred(x):
    """Return ``x`` divided by 100."""
    return x / 100

def print_score(m, X_train, y_train, X_test, y_test):
    """Print a list of scores for the fitted model ``m``.

    Order: train RMSE, test RMSE, ``m.score`` on train, ``m.score`` on test,
    then ``m.oob_score_`` when the model has that attribute.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    test_rmse = rmse(m.predict(X_test), y_test)
    scores = [train_rmse, test_rmse, m.score(X_train, y_train), m.score(X_test, y_test)]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

In [None]:
# Directory that holds train.csv and test.csv; the reads below are relative to it.
PATH = "data/"

In [None]:
# Load the training and test CSVs and print their (rows, columns) shapes.
df_train = pd.read_csv(f'{PATH}train.csv')
df_test =  pd.read_csv(f'{PATH}test.csv')
print(df_train.shape)
print(df_test.shape)

In [None]:
# Shell peek at the first two lines of the training file.
# NOTE(review): the path is hard-coded here instead of being built from PATH.
!head -2 data/train.csv

In [None]:
df_train.columns

In [None]:
# Second read of train.csv; df_raw is the frame that later cells transform
# and model, while df_train is only inspected.
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)

In [None]:
def display_all(df):
    """Display ``df`` with the row and column display limits raised to 1000."""
    wide_options = ('display.max_rows', 1000, 'display.max_columns', 1000)
    with pd.option_context(*wide_options):
        display(df)

In [None]:
# Transposed head/tail so every column is visible as a row.
display_all(df_raw.head().T)

In [None]:
display_all(df_raw.tail().T)

In [None]:
# Per-column summary statistics (all dtypes) for the test and training frames.
display_all(df_test.describe(include='all').T)

In [None]:
display_all(df_raw.describe(include='all').T)

In [None]:
# df_raw.SalePrice = np.log(df_raw.SalePrice)

In [None]:
# Replace the target with log(1 + SalePrice); solution() inverts this with
# np.expm1 when writing predictions.
# NOTE(review): this overwrites the column in place, so running the cell a
# second time applies the transform twice.
df_raw.SalePrice = np.log1p(df_raw.SalePrice)

In [None]:
# Convert string columns of df_raw to ordered categoricals (in place).
train_cats(df_raw)

In [None]:
df_raw.SaleCondition.cat.categories

In [None]:
# Fraction of missing values per column, listed alphabetically by column name.
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))

In [None]:
# Numeric feature frame, target array, and the fill values used for missing data.
df, y, nas = proc_df(df_raw, 'SalePrice')

In [None]:
len(df.columns), df.columns

In [None]:
df.head()

In [None]:
# NOTE(review): DataFrame.info() prints its report and returns None, so
# display_all() is handed None here.
display_all(df.info())

In [None]:
# Baseline: fit on every row and score on those same rows, so this number
# measures fit to the training data only.
m = RandomForestRegressor(n_jobs=-1, n_estimators=10)
m.fit(df, y)
m.score(df,y)

In [None]:
# Hold out the last n_valid rows as a validation set; everything before
# them is training data. The same cut is applied to raw frame, features and target.
n_valid = 400
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)
X_train.shape, y_train.shape, X_valid.shape

In [None]:
# Same 10-tree forest, now fit on the training part only and scored on both parts.
m = RandomForestRegressor(n_jobs=-1, n_estimators=10)
%time m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# def get_result(m, df_test):
#     df_raw_test = df_test.copy()
#     apply_cats(df_raw_test, df_raw)
#     df_t, _, _ = proc_df(df_raw_test, None, na_dict=nas)
#     y_pred = m.predict(df_t)
    
#     submission = pd.DataFrame({
#         "Id": df_test["Id"],
#         "SalePrice": y_pred
#     })
    
#     submission.to_csv('submission-Feb-2-2.csv', index=False)

In [None]:
# Re-process a random sample of up to 10000 rows (reusing the stored fill
# values) and keep its first 1200 rows for training.
# NOTE(review): X_valid / y_valid still come from the earlier split at n_trn;
# confirm these 1200 training rows do not overlap the validation rows.
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice', subset=10000, na_dict=nas)
X_train, _ = split_vals(df_trn, 1200)
y_train, _ = split_vals(y_trn, 1200)

In [None]:
m = RandomForestRegressor(n_estimators=10, n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# One shallow tree (depth 3) without bootstrapping, small enough to draw.
m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=1)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
draw_tree(m.estimators_[0], df_trn, precision=3)

In [None]:
# One tree again, this time with no depth limit.
m = RandomForestRegressor(n_estimators=1, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
m = RandomForestRegressor(n_estimators=20, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# Stack each tree's validation predictions: one row per tree, one column per
# validation row. Then compare the per-tree predictions for the first
# validation row, their mean, and the true value.
preds = np.stack([t.predict(X_valid) for t in m.estimators_])
preds[:, 0], np.mean(preds[:, 0]), y_valid[0]

In [None]:
preds.shape

In [None]:
# Validation R^2 when averaging only the first i+1 trees; range(20) matches
# the n_estimators=20 used for the forest above.
plt.plot([metrics.r2_score(y_valid, np.mean(preds[:i+1], axis=0)) for i in range(20)])

In [None]:
# The next cells repeat the same fit with 20, 40, 80 and 100 trees to see how
# the scores change with forest size.
m = RandomForestRegressor(n_estimators=20, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
m = RandomForestRegressor(n_estimators=80, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
m = RandomForestRegressor(n_estimators=100, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# oob_score=True makes the fitted model expose oob_score_, which print_score
# appends as a fifth value.
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# Back to the full data set: re-process every row and redo the train/validation
# split at n_trn for both features and target.
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')
X_train, X_valid = split_vals(df_trn, n_trn)
# Was `y_valif`: the typo left y_valid pointing at the value from an earlier cell.
y_train, y_valid = split_vals(y_trn, n_trn)

In [None]:
# 40-tree forest with out-of-bag scoring on the full training split.
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# Same forest, additionally requiring at least 3 samples in every leaf.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# And additionally considering only half of the features at each split.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

## Reduce Overfitting

In [None]:
# Re-process the full data and split at row 1000: the first 1000 rows train,
# all remaining rows validate.
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')
X_train, X_valid = split_vals(df_trn, 1000)
y_train, y_valid = split_vals(y_trn, 1000)

In [None]:
X_valid.shape

In [None]:
# Unconstrained 40-tree forest on the smaller training set ...
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# ... versus the same forest with min_samples_leaf=3 and max_features=0.5.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

## Confidence based on tree Variance

In [None]:
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')

In [None]:
n_valid = 460
n_trn = len(df_trn) - n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df_trn, n_trn)
y_train, y_valid = split_vals(y, n_trn)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

In [None]:
# Forest whose individual trees are inspected below.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# Per-tree validation predictions (one row per tree), computed serially and
# timed; then the mean and spread across trees for the first validation row.
%time preds = np.stack([t.predict(X_valid) for t in m.estimators_])
np.mean(preds[:, 0]), np.std(preds[:, 0])

In [None]:
def get_preds(t):
    """Return the predictions of estimator ``t`` for the notebook-global ``X_valid``."""
    tree_predictions = t.predict(X_valid)
    return tree_predictions

# Same per-tree predictions as the previous cell, computed through
# parallel_trees and timed for comparison.
%time preds = np.stack(parallel_trees(m, get_preds))
np.mean(preds[:, 0]), np.std(preds[:, 0])

In [None]:
# Attach to each raw validation row the mean prediction and the standard
# deviation across trees (axis 0 of preds indexes the trees).
x = raw_valid.copy()
x['pred_std'] = np.std(preds, axis=0)
x['pred'] = np.mean(preds, axis=0)
x.MSZoning.value_counts().plot.barh()

In [None]:
# Average actual value, prediction and tree disagreement per MSZoning value.
flds = ['MSZoning', 'SalePrice', 'pred', 'pred_std']
zone_summ = x[flds].groupby('MSZoning', as_index=False).mean()
zone_summ

In [None]:
raw_valid.MSSubClass.value_counts().plot.barh()

In [None]:
# The same summary per MSSubClass value.
flds = ['MSSubClass', 'SalePrice', 'pred', 'pred_std']
class_summ = x[flds].groupby('MSSubClass', as_index=False).mean()
class_summ

In [None]:
# Tree disagreement relative to the predicted value, largest first.
(class_summ.pred_std/ class_summ.pred).sort_values(ascending=False)

In [None]:
# NOTE(review): this cell and the next adjust the layout of the current
# matplotlib figure, but no plot is created in either cell — confirm they are
# still needed.
plt.subplots_adjust(bottom=.25, left=.25)

In [None]:
plt.tight_layout()

In [None]:
# Feature importances of the current forest; show the ten largest.
fi = rf_feature_imp(m, df_trn); fi[:10]

In [None]:
fi.plot('cols', 'importance', figsize=(17, 10), legend=False)

In [None]:
def plot_fi(fi):
    """Plot ``importance`` against ``cols`` as a horizontal bar chart and return the plot object."""
    chart_options = dict(figsize=(12, 7), legend=False)
    return fi.plot(x='cols', y='importance', kind='barh', **chart_options)

# Bar chart of the 30 most important features.
plot_fi(fi[:30])

In [None]:
# Keep only features whose importance exceeds 0.004.
to_keep = fi[fi.importance>0.004].cols; len(to_keep)

In [None]:
# Refit the same forest on the reduced feature set, using the same row split
# (n_trn) and the existing y_train / y_valid.
df_keep = df_trn[to_keep].copy()

X_train, X_valid = split_vals(df_keep, n_trn)

m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# Importances recomputed for the reduced model.
fi = rf_feature_imp(m, df_keep)
plot_fi(fi)

In [None]:
# Copy the reduced feature frame and randomly permute one column
# (OverallQual), breaking its relationship with the target while leaving
# every other column untouched.
df_shuffle = df_keep.copy()
df_shuffle.OverallQual = sklearn.utils.shuffle(df_shuffle.OverallQual).values
# Sanity check: the shuffled value next to the original value for the same row.
# (The original cell compared the shuffled value with itself and carried a
# stray no-op `np.percentile` expression.)
df_shuffle.OverallQual[15], df_keep.OverallQual[15]

In [None]:
# Refit on the frame with the shuffled OverallQual column; the change in
# score against the previous fit shows how much the model relied on it.
X_train, X_valid = split_vals(df_shuffle, n_trn)

m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
def shuffle_test(df, flds=None):
    """Refit and score a forest once per column, with that column randomly permuted.

    df: feature frame covering training and validation rows; it is split at
        the notebook-global ``n_trn``.
    flds: columns to test; defaults to every column of ``df``.

    Prints the column name followed by the ``print_score`` output. Uses the
    notebook globals ``y_train`` and ``y_valid`` as targets.
    """
    columns_to_test = df.columns.values if flds is None else flds
    for fld in columns_to_test:
        shuffled = df.copy()
        shuffled[fld] = sklearn.utils.shuffle(shuffled[fld]).values
        trn_part, val_part = split_vals(shuffled, n_trn)
        forest_model = RandomForestRegressor(
            n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True
        )
        forest_model.fit(trn_part, y_train)
        print(fld, end='=> ')
        print_score(forest_model, trn_part, y_train, val_part, y_valid)

In [None]:
# Run the shuffle test for every column of the reduced feature frame
# (one forest fit per column).
shuffle_test(df_keep)

## One Hot Encoding

In [None]:
# max_n_cat=4: categorical columns with more than 4 categories become integer
# codes, the remaining ones are one-hot encoded by proc_df.
df_trn2, y_trn, nas = proc_df(df_raw, 'SalePrice', max_n_cat=4)
X_train, X_valid = split_vals(df_trn2, n_trn)

m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.6, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
# Importances for the one-hot encoded feature set; plot the top 25.
fi = rf_feature_imp(m, df_trn2)
plot_fi(fi[:25])

In [None]:
# NOTE(review): the threshold is inclusive (>=) here but exclusive (>) in the
# earlier selection cell — confirm which is intended.
to_keep = fi[fi.importance>=0.004].cols; len(to_keep)

In [None]:
# df_keep now refers to the selected columns of the one-hot encoded frame.
df_keep = df_trn2[to_keep].copy()

## Remove Redundant Features

In [None]:
from scipy.cluster import hierarchy as hc
import scipy

In [None]:
# Spearman rank correlation between all kept columns, rounded to 4 decimals.
corr = np.round(scipy.stats.spearmanr(df_keep).correlation, 4)
# Use 1 - correlation as the distance and convert the square matrix to the
# condensed form that linkage() expects.
corr_condensed = hc.distance.squareform(1-corr)
# Average-linkage hierarchical clustering of the columns.
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(16, 10))
# Columns that join early in the dendrogram are highly rank-correlated.
dendrogram = hc.dendrogram(z, labels=df_keep.columns, orientation='left', leaf_font_size=16)

In [None]:
def get_oob(df):
    """Fit a 30-tree forest on the first ``n_trn`` rows of ``df`` and return its out-of-bag score.

    Uses the notebook globals ``n_trn`` (split position) and ``y_train`` (target).
    """
    train_rows, _ = split_vals(df, n_trn)
    forest_model = RandomForestRegressor(
        n_estimators=30, min_samples_leaf=5, max_features=0.6, n_jobs=-1, oob_score=True
    )
    forest_model.fit(train_rows, y_train)
    return forest_model.oob_score_

In [None]:
# Out-of-bag score with all kept columns, as the reference value.
get_oob(df_keep)

In [None]:
# Out-of-bag score after removing each candidate column on its own.
for c in ('GarageCars', 'ExterQual_TA', 'FullBath', '1stFlrSF', 'LotFrontage', 'BsmtFinSF1'):
    print(c, get_oob(df_keep.drop(c, axis=1)))

In [None]:
# Out-of-bag score after removing the three chosen columns together.
to_drop = ['1stFlrSF', 'LotFrontage', 'GarageCars']
get_oob(df_keep.drop(to_drop, axis=1))

In [None]:
# Make the removal permanent and redo the split.
# NOTE(review): the in-place drop means re-running this cell raises KeyError,
# because the columns are already gone.
df_keep.drop(to_drop, axis=1, inplace=True)
X_train, X_valid = split_vals(df_keep, n_trn)

In [None]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.7, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

In [None]:
def solution(m, df_test, df_raw, nas, df, i):
    """Predict on the test frame and write a submission CSV.

    Parameters
    ----------
    m : fitted model with ``predict``.
    df_test : raw test data frame; must contain an "Id" column. Its string
        columns are re-encoded in place by ``apply_cats``.
    df_raw : training frame whose categorical columns define the encoding.
    nas : fill values for missing data, passed to ``proc_df`` as ``na_dict``.
    df : frame whose columns are exactly the features ``m`` was fit on.
    i : suffix used in the output file name ``submission-Feb-8-{i}.csv``.

    The predictions are passed through ``np.expm1`` before being written
    (the inverse of ``np.log1p``).
    """
    # Capture the ids from the argument before df_test is rebound to the
    # processed feature frame. The original read them from the notebook
    # global `df_test2`, which made the function depend on hidden state.
    ids = df_test["Id"].copy()
    apply_cats(df_test, df_raw)
    df_test, _, _ = proc_df(df_test, None, na_dict=nas, max_n_cat=4)
    # Select the model's features, in the same order as the training frame.
    df_test = df_test[list(df.columns.values)]
    y_pred = m.predict(df_test)
    submission = pd.DataFrame({
            "Id": ids,
            "SalePrice": np.expm1(y_pred)
        })
    submission.to_csv(f'submission-Feb-8-{i}.csv', index=False)

In [None]:
# Keep an unmodified copy of the test frame before df_test is overwritten
# with its processed version in the cells below.
df_test2 = df_test.copy()

In [None]:
# Final model: fit on every row of the kept features (no validation hold-out)
# and write the submission file with suffix 3.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.7, n_jobs=-1, oob_score=True)
m.fit(df_keep, y)
solution(m, df_test, df_raw, nas, df_keep, 3)

In [None]:
# Manual repeat of the processing done inside solution(); from here on
# df_test is the processed frame, not the raw one.
apply_cats(df_test, df_raw)
df_test, _, _ = proc_df(df_test, None, na_dict=nas, max_n_cat=4)

In [None]:
df_test[list(df_keep.columns.values)].shape

In [None]:
list(df_keep.columns.values)

In [None]:
# NOTE(review): df_test was already processed above; this cell and the next
# process it a second time (now with max_n_cat=7) — confirm this is intended.
apply_cats(df_test, df_raw)

In [None]:
df_test, _, _ = proc_df(df_test, None, na_dict=nas, max_n_cat=7)

In [None]:
# NOTE(review): m was fit on df_keep's columns, while df_test here holds every
# processed column — confirm the feature sets match before relying on this call.
y_pred = m.predict(df_test)

In [None]:
df_trn[df_trn2.columns.values]

In [None]:
df_test

In [None]:
nas

In [None]:
# NOTE(review): no visible cell assigns a notebook-level `mapper` (it only
# exists as a parameter/local inside scale_vars and proc_df), so this
# presumably raises NameError on a fresh kernel — confirm and remove.
mapper

In [None]:
pd.get_dummies(df_trn2)