In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, auc, roc_curve, roc_auc_score
%matplotlib inline

In [2]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
X = data.drop('SalePrice', axis=1)
Y = data['SalePrice']

In [5]:
from sklearn.model_selection import train_test_split
d_train, d_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [6]:
cat_feat = list(data.dtypes[data.dtypes == object].index)

num_feat = [f for f in data if f not in (cat_feat + ['ID', 'SalePrice'])]

# Создаем дамми-переменные для категорий
dummy_train = pd.get_dummies(d_train[cat_feat], columns=cat_feat)
dummy_test = pd.get_dummies(d_test[cat_feat], columns=cat_feat)

dummy_cols = list(set(dummy_train) & set(dummy_test))

dummy_train = dummy_train[dummy_cols]
dummy_test = dummy_test[dummy_cols]


# Заменяем пропуски на специальное значение -999, чтобы деревья могли их отличить
X_train = pd.concat([d_train[num_feat].fillna(-999),
                     dummy_train], axis=1)

X_test = pd.concat([d_test[num_feat].fillna(-999),
                     dummy_test], axis=1)

In [10]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(n_estimators=300, max_depth=30, random_state=0)

In [22]:
def get_meta_features(clf, X_train, y_train, X_test, stack_cv):
    meta_train = np.zeros_like(y_train, dtype=float)
    meta_test = np.zeros_like(y_test, dtype=float)
    
    for i, (train_ind, test_ind) in enumerate(stack_cv.split(X_train, y_train)):
        
        clf.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
        meta_train[test_ind] = clf.predict(X_train.iloc[test_ind])
        meta_test += clf.predict(X_test)
    
    return meta_train, meta_test / stack_cv.n_splits

In [43]:
from sklearn.model_selection import StratifiedKFold

stack_cv = StratifiedKFold(n_splits=10, random_state=555)


RForest_tr, RForest_te = get_meta_features(clf, X_train, y_train, X_test, stack_cv)




In [45]:
from sklearn.metrics import mean_squared_error

print (len(meta_test),len(y_test.values))
print ("mean_squared_error = ", mean_squared_error(RForest_te, y_test.values))

292 292
mean_squared_error =  927355336.5203816


In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
clf_rf = RandomForestRegressor(n_estimators=300, max_depth=30, random_state=0)
clf_lr = LinearRegression()
clf_tr = DecisionTreeRegressor(random_state=0)

In [49]:
meta_train = []
meta_test = []
col_names = []

print('RF features...')
meta_tr, meta_te = get_meta_features(clf_rf, X_train, y_train, X_test, stack_cv)

meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('rf_pred')

print('LR features...')
meta_tr, meta_te = get_meta_features(clf_lr, X_train, y_train, X_test, stack_cv)

meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('lr_pred')

print('TR features...')
meta_tr, meta_te = get_meta_features(clf_tr, X_train, y_train, X_test, stack_cv)

meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('tr_pred')

RF features...




LR features...




TR features...


In [50]:
X_meta_train = pd.DataFrame(np.stack(meta_train, axis=1), columns=col_names)
X_meta_test = pd.DataFrame(np.stack(meta_test, axis=1), columns=col_names)

In [52]:
clf_lr_meta = LinearRegression()
clf_lr_meta.fit(X_meta_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [53]:
print ("mean_squared_error = ", mean_squared_error(clf_lr_meta.predict(X_meta_test), y_test.values))

mean_squared_error =  739442969.9276597
