In [113]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from jupyterthemes import jtplot
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
%matplotlib inline
jtplot.style()

In [114]:
train = pd.read_csv('train.csv')

In [115]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [116]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [117]:
test = pd.read_csv('test.csv')

In [118]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-

In [119]:
cat_feat_train = list(train.dtypes[train.dtypes == object].index)
cat_feat_test = list(test.dtypes[test.dtypes == object].index)

In [120]:
#закодируем пропущенные значений строкой, факт пропущенного значения тоже может нести в себе информацию
train[cat_feat_train] = train[cat_feat_train].fillna('nan')
test[cat_feat_test] = test[cat_feat_test].fillna('nan')

In [121]:
#отфильтруем непрерывные признаки
num_feat = [f for f in train if f not in (cat_feat_train + ['ID', 'SalePrice'])]

# Смотрим сколько у нас значений по каждому категориальному признаку
cat_nunique = train[cat_feat_train].nunique()
#print(cat_nunique)

#Чтобы в разы не увеличивать число признаков при построении dummy,
#будем использовать категориальные признаки с < 10 уникальных значений
cat_feat_train = list(cat_nunique[cat_nunique < 10].index)

In [122]:
# Создаем дамми-переменные для категорий
dummy_train = pd.get_dummies(train[cat_feat_train], columns=cat_feat_train)
dummy_cols = list(set(dummy_train))
dummy_train = dummy_train[dummy_cols]

In [123]:
# Заменяем пропуски на специальное значение -999, чтобы деревья могли их отличить
train = pd.concat([train[num_feat].fillna(-999), dummy_train, train['SalePrice']], axis=1)

In [124]:
X = train.loc[:, train.columns != 'SalePrice']
y = train['SalePrice']

### Посчитаем важной фичей с помощью случайного леса

In [127]:
from sklearn.ensemble import RandomForestRegressor
kfold = KFold(n_splits=10, random_state=7)
#cart = RandomForestClassifier()
rfc_model = RandomForestRegressor(n_estimators=30, max_depth=5, min_samples_leaf=20, max_features=0.5, n_jobs=-1)

results = cross_val_score(rfc_model, X, y, cv=kfold)
print(results.mean())

0.8226953834899755


In [128]:
rfc_model.fit(X, y)
imp = pd.Series(rfc_model.feature_importances_)
imp.sort_values(ascending=False)

4      0.418859
16     0.129340
26     0.097357
6      0.081116
12     0.046455
144    0.039850
13     0.034614
27     0.028078
208    0.024967
9      0.020877
19     0.015715
248    0.011312
3      0.007512
233    0.005593
23     0.005093
24     0.004111
7      0.004104
14     0.003478
62     0.002327
234    0.001965
8      0.001913
25     0.001731
116    0.001559
51     0.001197
221    0.001184
53     0.001163
209    0.000906
17     0.000892
130    0.000761
11     0.000572
         ...   
105    0.000000
104    0.000000
103    0.000000
102    0.000000
123    0.000000
125    0.000000
148    0.000000
126    0.000000
147    0.000000
146    0.000000
145    0.000000
201    0.000000
143    0.000000
142    0.000000
141    0.000000
140    0.000000
139    0.000000
138    0.000000
137    0.000000
136    0.000000
135    0.000000
134    0.000000
133    0.000000
132    0.000000
131    0.000000
202    0.000000
129    0.000000
128    0.000000
127    0.000000
124    0.000000
Length: 249, dtype: floa

Разобьем на обучающую и тестовую выборку и проведём нормализацию

In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 777)

In [130]:
scaler = StandardScaler()
scaler.fit(X_train[num_feat])

X_train[num_feat] = scaler.transform(X_train[num_feat])
X_test[num_feat] = scaler.transform(X_test[num_feat])

  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Оставим только те фичи, которые имеют ненулевую значимость с т.з. RandomForest

In [131]:
imp_index = imp[imp != 0].index

In [132]:
X_train = X_train.iloc[:,imp_index]
X_test = X_test.iloc[:,imp_index]

## Обучить стекинг как минимум 3х моделей, использовать хотя бы 1 линейную модель и 1 нелинейную

In [175]:
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from mlxtend.regressor import StackingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
import xgboost as xgb

In [159]:
#Функция для кросс валидации
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [171]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print('R2 score LinearRegression ', r2_score(lr_pred, y_test))
print('mean square error LinearRegression ', mean_squared_error(lr_pred, y_test))
print('Variance score ', lr.score(X_test, y_test))


R2 score LinearRegression  0.8636053664248666
mean square error LinearRegression  585516717.1262553
Variance score  0.8818274910338828


In [172]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
print('R2 score LinearRegression ', r2_score(lasso_pred, y_test))
print('mean square error LinearRegression ', mean_squared_error(y_test, lasso_pred))
print('Variance score ', lasso.score(X_test, y_test))


R2 score LinearRegression  0.8638397401516993
mean square error LinearRegression  583523474.3464411
Variance score  0.8822297792920649




In [176]:
xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
print('R2 score mlp ', r2_score(xgb_pred, y_test))
print('mean square error mlp ', mean_squared_error(y_test, xgb_pred))
print('Variance score ', xgb_model.score(X_test, y_test))

  if getattr(data, 'base', None) is not None and \


R2 score mlp  0.879272321600838
mean square error mlp  529243680.396709
Variance score  0.8931848540996397


In [177]:
rfr =  RandomForestRegressor(n_estimators=30, max_depth=5, min_samples_leaf=20, max_features=0.5, n_jobs=-1)
rfr.fit(X_train, y_train)
rfr_pred = rfr.predict(X_test)
print('R2 score svr_rbf ', r2_score(rfr_pred, y_test))
print('mean square error svr_rbf ', mean_squared_error(rfr_pred, y_test))
print('Variance score ', rfr.score(X_test, y_test))

R2 score svr_rbf  0.8246624619173843
mean square error svr_rbf  639734851.8998853
Variance score  0.8708848606524493


Выделим метафичи

In [186]:
def get_meta_features(clf, X_train, y_train, X_test, stack_cv):
    meta_train = np.zeros_like(y_train, dtype=float)
    meta_test = np.zeros_like(y_test, dtype=float)
    
    for i, (train_ind, test_ind) in enumerate(stack_cv.split(X_train, y_train)):
        
        clf.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
        meta_train[test_ind] = clf.predict(X_train.iloc[test_ind])
        meta_test += clf.predict(X_test)
    
    return meta_train, meta_test / stack_cv.n_splits

In [188]:
from sklearn.model_selection import StratifiedKFold

stack_cv = StratifiedKFold(n_splits=10, random_state=555)

meta_train = []
meta_test = []
col_names = []

print('LR features...')
meta_tr, meta_te = get_meta_features(lr, X_train, y_train, X_test, stack_cv)

meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('lr_pred')

print('xgb_model features...')
meta_tr, meta_te = get_meta_features(xgb_model, X_train, y_train, X_test, stack_cv)

meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('xgb_pred')

print('lasso features...')
meta_tr, meta_te = get_meta_features(lasso, X_train, y_train, X_test, stack_cv)

meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('lasso_pred')

LR features...




xgb_model features...


  if getattr(data, 'base', None) is not None and \


lasso features...




In [189]:
X_meta_train = pd.DataFrame(np.stack(meta_train, axis=1), columns=col_names)
X_meta_test = pd.DataFrame(np.stack(meta_test, axis=1), columns=col_names)

Используем линейную регрессию в качестве модели второго уровня

In [190]:
lr_meta = LinearRegression()
lr_meta.fit(X_meta_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [191]:
y_pred_meta_test = lr_meta.predict(X_meta_test)

print('R2 score stacking ', r2_score(y_pred_meta_test, y_test))
print('mean square error stacking ', mean_squared_error(y_pred_meta_test, y_test))

R2 score stacking  0.8994692373404942
mean square error stacking  553022050.8718418


R2 метрика стэка моделей получилась выше, чем каждой модели по отдельности   