In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso, ElasticNet
import xgboost as xgb
import lightgbm as lgb
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
len(train.columns)

81

In [3]:
len(test.columns)

80

In [4]:
for col in ('Alley', 'Utilities', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'):
    train[col] = train[col].fillna('None')
    test[col] = test[col].fillna('None')

In [5]:
for col in ('MSZoning', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType', 'Functional'):
    train[col] = train[col].fillna(train[col].mode()[0])
    test[col] = test[col].fillna(test[col].mode()[0])

In [6]:
train.select_dtypes(include = ['int', 'float']).isnull().sum()
test.select_dtypes(include = ['int', 'float']).isnull().sum()

Id                 0
MSSubClass         0
LotFrontage      227
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea        15
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       2
BsmtHalfBath       2
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       78
GarageCars         1
GarageArea         1
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64

In [7]:
for col in ('MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea'):
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)

In [8]:
train['LotFrontage'] = train['LotFrontage'].fillna(train['LotFrontage'].mean())
test['LotFrontage'] = test['LotFrontage'].fillna(test['LotFrontage'].mean())

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize = [50, 20])
sns.heatmap(train.corr(), annot = True)

<matplotlib.axes._subplots.AxesSubplot at 0x1a106e07b8>

In [10]:
train.drop(columns = ['2ndFlrSF', '1stFlrSF', 'TotRmsAbvGrd', 'BsmtFullBath', 'BedroomAbvGr', 'GarageArea'], inplace = True)
test.drop(columns = ['2ndFlrSF', '1stFlrSF', 'TotRmsAbvGrd', 'BsmtFullBath', 'BedroomAbvGr', 'GarageArea'], inplace = True)

In [11]:
train['MSSubClass'] = train['MSSubClass'].astype(str)
test['MSSubClass'] = test['MSSubClass'].astype(str)

In [12]:
from scipy.stats import skew
house = pd.concat([train, test], sort = False)
skew = house.select_dtypes(include = ['int64', 'float']).apply(lambda x: skew(x.dropna())).sort_values(ascending = False)
skew_df = pd.DataFrame({'Skew':skew})
print(skew_df)
skewed_df = skew_df[(skew_df['Skew'] > 0.5) | (skew_df['Skew'] < -0.5)]

                    Skew
MiscVal        21.947195
PoolArea       16.898328
LotArea        12.822431
LowQualFinSF   12.088761
3SsnPorch      11.376065
KitchenAbvGr    4.302254
BsmtFinSF2      4.146143
EnclosedPorch   4.003891
ScreenPorch     3.946694
BsmtHalfBath    3.931594
MasVnrArea      2.613592
OpenPorchSF     2.535114
SalePrice       1.880941
WoodDeckSF      1.842433
LotFrontage     1.643732
BsmtFinSF1      1.425230
GrLivArea       1.269358
TotalBsmtSF     1.156894
BsmtUnfSF       0.919339
Fireplaces      0.733495
HalfBath        0.694566
OverallCond     0.570312
OverallQual     0.197110
MoSold          0.195884
FullBath        0.167606
YrSold          0.132399
Id              0.000000
GarageCars     -0.219581
YearRemodAdd   -0.451020
YearBuilt      -0.599806
GarageYrBlt    -3.906205


In [13]:
skewed_df.index

Index(['MiscVal', 'PoolArea', 'LotArea', 'LowQualFinSF', '3SsnPorch',
       'KitchenAbvGr', 'BsmtFinSF2', 'EnclosedPorch', 'ScreenPorch',
       'BsmtHalfBath', 'MasVnrArea', 'OpenPorchSF', 'SalePrice', 'WoodDeckSF',
       'LotFrontage', 'BsmtFinSF1', 'GrLivArea', 'TotalBsmtSF', 'BsmtUnfSF',
       'Fireplaces', 'HalfBath', 'OverallCond', 'YearBuilt', 'GarageYrBlt'],
      dtype='object')

In [14]:
from scipy.special import boxcox1p
lam = 0.1
for col in ('MiscVal', 'PoolArea', 'LotArea', 'LowQualFinSF', '3SsnPorch',  'KitchenAbvGr', 'BsmtFinSF2', 'EnclosedPorch', 'ScreenPorch', 'BsmtHalfBath', 'MasVnrArea', 'OpenPorchSF', 'WoodDeckSF', 'LotFrontage', 'BsmtFinSF1', 'GrLivArea', 'TotalBsmtSF', 'BsmtUnfSF', 
            'Fireplaces', 'HalfBath', 'OverallCond', 'YearBuilt', 'GarageYrBlt'):
    house[col] = boxcox1p(house[col], lam)
    

In [15]:
house = pd.get_dummies(house)

In [16]:
train = house[:len(train)]
test = house[len(train):]

In [17]:
train['SalePrice'] = np.log(train['SalePrice'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
train.drop('Id', axis = 1, inplace = True)
Id = test['Id']
test.drop('Id', axis = 1, inplace = True)
X = train.drop('SalePrice', axis = 1)
y = train['SalePrice']
test = test.drop('SalePrice', axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [19]:
#cv score after model is trained
n_folds = 5
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle = True).get_n_splits(train.values)
    rmse = np.sqrt(-cross_val_score(model, X.values, y, scoring = 'neg_mean_squared_error', cv = kf))
    return rmse

In [20]:
sc = RobustScaler()
la = make_pipeline(RobustScaler(), Lasso())
parameters = {'alpha': [1E-3, 1E-2, 1E-1, 1, 10, 100, 1000]}

clf = GridSearchCV(la, parameters, cv = 5)
clf.fit(X,y)
clf.best_params_

In [21]:
el = ElasticNet()
parameters = {'l1_ratio' : [0, 0.2, 0.4, 0.6, 0.8, 1],'alpha': [1E-3, 1E-2, 1E-1, 1, 10, 100]}
clf2 = GridSearchCV(el, parameters, cv = 5)
clf2.fit(X,y)
clf2.best_params_



{'alpha': 0.001, 'l1_ratio': 0.6}

In [22]:
from sklearn.ensemble import GradientBoostingRegressor

lasso = make_pipeline(RobustScaler(), Lasso(alpha = 0.001))
enet = make_pipeline(RobustScaler(), ElasticNet(alpha = 0.001, l1_ratio = 0.6))
model_lgb = lgb.LGBMRegressor()
model_xgb = xgb.XGBRegressor()

  from numpy.core.umath_tests import inner1d


In [23]:
class StackingAverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds = 5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
    
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits = self.n_folds, shuffle = True)
        out_of_hold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        
        for i, model in enumerate(self.base_models):
            for train_ind, hold_out_ind in kfold.split(X,y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X.loc[train_ind], y.loc[train_ind])
                y_pred = instance.predict(X.loc[hold_out_ind])
                out_of_hold_predictions[hold_out_ind, i] = y_pred
                
        self.meta_model_.fit(out_of_hold_predictions, y)
        return self
    
    def prediction(self, X):
        meta_features = np.column_stack([np.column_stack([model.predict(X) for model in self.base_models]).mean(axis = 1) for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)

In [24]:
stack_averaged_models = StackingAverageModels(base_models = (enet, model_xgb, model_lgb), meta_model = lasso)

In [25]:
model_lgb.fit(X,y)
lgb_pred = np.expm1(model_lgb.predict(test))

In [26]:
model_xgb.fit(X,y)
xgb_pred = np.expm1(model_xgb.predict(test))

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




In [27]:
stack_averaged_models.fit(X, y)
stack_pred = np.expm1(stack_averaged_models.prediction(test))

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




NotFittedError: This RobustScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
stack_model_prediction = stack_pred * 0.6 + lgb_pred * 0.2 + xgb_pred * 0.2
stack_output = pd.DataFrame({'Id':Id, 'SalesPrice': stack_model_prediction})
stack_output.to_csv('submission', index = False)

prediction = clf2.predict(test)
prediction = np.exp(prediction)
output = pd.DataFrame({'Id': Id, 'SalePrice': prediction})
output.to_csv('sub.csv', index = False)