In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso, ElasticNet
import xgboost as xgb
import lightgbm as lgb
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))


In [None]:
# Number of columns in the training frame.
train.shape[1]

In [None]:
# Number of columns in the test frame.
test.shape[1]

In [None]:
# Replace missing values in these columns with the literal label 'None'
# (presumably NaN here means the feature is absent -- confirm against the data description).
none_fill_columns = [
    'Alley', 'Utilities', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
for frame in (train, test):
    for column in none_fill_columns:
        frame[column] = frame[column].fillna('None')

In [None]:
# Fill the remaining gaps in these columns with the most frequent value.
# The mode is taken from the training set only and reused for the test set, so
# both frames are imputed with the same value and no test-set statistic is used.
for col in ('MSZoning', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType', 'Functional'):
    most_frequent = train[col].mode()[0]
    train[col] = train[col].fillna(most_frequent)
    test[col] = test[col].fillna(most_frequent)

In [None]:
# Missing-value counts per numeric column, train and test side by side.
# A notebook cell only displays its last expression, so the two counts are
# combined into one table instead of being evaluated as separate statements.
pd.concat(
    [
        train.select_dtypes(include = ['int', 'float']).isnull().sum(),
        test.select_dtypes(include = ['int', 'float']).isnull().sum(),
    ],
    axis = 1,
    keys = ['train', 'test'],
)

In [None]:
# Missing values in these numeric columns are replaced by 0.
zero_fill_columns = [
    'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
]
for frame in (train, test):
    frame[zero_fill_columns] = frame[zero_fill_columns].fillna(0)

In [None]:
# Impute missing LotFrontage with the mean of the training set.
# The same training mean is applied to the test set so both frames are
# imputed consistently and no test-set statistic is used.
lot_frontage_mean = train['LotFrontage'].mean()
train['LotFrontage'] = train['LotFrontage'].fillna(lot_frontage_mean)
test['LotFrontage'] = test['LotFrontage'].fillna(lot_frontage_mean)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated correlation heatmap of the training features.
# Only numeric columns are correlated: the frame still holds string columns,
# and recent pandas versions raise when corr() meets non-numeric data.
plt.figure(figsize = [50, 20])
sns.heatmap(train.select_dtypes(include = 'number').corr(), annot = True);

In [None]:
# Remove the same set of columns from both frames, in place.
dropped_columns = ['2ndFlrSF', '1stFlrSF', 'TotRmsAbvGrd', 'BsmtFullBath', 'BedroomAbvGr', 'GarageArea']
for frame in (train, test):
    frame.drop(columns = dropped_columns, inplace = True)

In [None]:
# Store MSSubClass as strings in both frames so it is no longer a numeric column.
for frame in (train, test):
    frame['MSSubClass'] = frame['MSSubClass'].astype(str)

In [None]:
from scipy.stats import skew
# Stack train and test and measure the skew of every numeric column.
house = pd.concat([train, test], sort = False)
numeric_columns = house.select_dtypes(include = ['int64', 'float'])
# The result is deliberately NOT named `skew`: rebinding that name would hide
# the imported scipy.stats.skew function and break the cell when it is re-run.
skewness = numeric_columns.apply(lambda x: skew(x.dropna())).sort_values(ascending = False)
skew_df = pd.DataFrame({'Skew': skewness})
print(skew_df)
# Keep only the columns whose skew is above 0.5 in absolute value.
skewed_df = skew_df[skew_df['Skew'].abs() > 0.5]

In [None]:
# Names of the columns whose skew lies outside [-0.5, 0.5].
skewed_df.index

In [None]:
from scipy.special import boxcox1p
# Apply the Box-Cox transform of 1 + x, with a fixed lambda, to the listed columns of `house`.
lam = 0.1
boxcox_columns = [
    'MiscVal', 'PoolArea', 'LotArea', 'LowQualFinSF', '3SsnPorch',
    'KitchenAbvGr', 'BsmtFinSF2', 'EnclosedPorch', 'ScreenPorch', 'BsmtHalfBath',
    'MasVnrArea', 'OpenPorchSF', 'WoodDeckSF', 'LotFrontage', 'BsmtFinSF1',
    'GrLivArea', 'TotalBsmtSF', 'BsmtUnfSF', 'Fireplaces', 'HalfBath',
    'OverallCond', 'YearBuilt', 'GarageYrBlt',
]
for column in boxcox_columns:
    house[column] = house[column].pipe(boxcox1p, lam)
    

In [None]:
# Expand the non-numeric columns of the combined frame into indicator columns.
house = pd.get_dummies(data = house)

In [None]:
# Split the combined frame back into its train and test parts by row position.
# The row count is fixed before `train` is rebound, and explicit copies are
# taken because both frames are modified afterwards; writing into a plain
# slice of `house` triggers pandas' SettingWithCopyWarning.
n_train = len(train)
train = house.iloc[:n_train].copy()
test = house.iloc[n_train:].copy()

In [None]:
# Replace the target with its natural logarithm (invert predictions with np.exp).
# Running this cell twice would apply the logarithm twice.
train['SalePrice'] = train['SalePrice'].transform(np.log)

In [None]:
# Keep the test identifiers for the submission files, then strip 'Id' from both frames.
Id = test['Id']
for frame in (train, test):
    frame.drop(columns = 'Id', inplace = True)

# Feature matrix and target for training; the test frame loses its 'SalePrice' column.
y = train['SalePrice']
X = train.drop(columns = 'SalePrice')
test = test.drop(columns = 'SalePrice')

In [None]:
# Cross-validated error of a model on the training data.
n_folds = 5
def rmsle_cv(model):
    """Return the per-fold RMSE of `model` on the notebook-level `X` and `y`.

    The KFold splitter itself is passed to cross_val_score. Calling
    `.get_n_splits()` on it would only return the integer number of folds,
    which silently drops the shuffling. A fixed seed keeps the folds identical
    between calls so scores of different models are comparable.

    Parameters
    ----------
    model : estimator accepted by sklearn's cross_val_score.

    Returns
    -------
    numpy.ndarray with one RMSE value per fold (`n_folds` entries).
    """
    kf = KFold(n_folds, shuffle = True, random_state = 42)
    rmse = np.sqrt(-cross_val_score(model, X.values, y, scoring = 'neg_mean_squared_error', cv = kf))
    return rmse

In [None]:
# Grid-search the Lasso regularisation strength inside a scaling pipeline.
sc = RobustScaler()
la = make_pipeline(sc, Lasso())
# Parameters of a pipeline step are addressed as '<step name>__<parameter>';
# make_pipeline names each step after its lowercased class name, hence 'lasso__alpha'.
# A bare 'alpha' is not a parameter of the Pipeline and makes fit() raise.
parameters = {'lasso__alpha': [1E-3, 1E-2, 1E-1, 1, 10, 100, 1000]}

clf = GridSearchCV(la, parameters, cv = 5)
clf.fit(X,y)
clf.best_params_

In [None]:
# Grid-search ElasticNet over the L1/L2 mix and the regularisation strength (5-fold CV).
el = ElasticNet()
parameters = {
    'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1],
    'alpha': [1E-3, 1E-2, 1E-1, 1, 10, 100],
}
clf2 = GridSearchCV(estimator = el, param_grid = parameters, cv = 5)
clf2.fit(X, y)
clf2.best_params_

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Linear models, each behind a RobustScaler, with fixed hyper-parameters.
lasso = make_pipeline(RobustScaler(), Lasso(alpha = 1e-3))
enet = make_pipeline(RobustScaler(), ElasticNet(l1_ratio = 0.6, alpha = 1e-3))

# Gradient-boosting models with library defaults.
model_xgb = xgb.XGBRegressor()
model_lgb = lgb.LGBMRegressor()

In [None]:
class StackingAverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: base models feed out-of-fold predictions to a meta model.

    During `fit` every base model is cloned and trained once per fold; the
    predictions on each held-out fold become one feature column per base
    model, and a clone of the meta model is trained on those columns.
    During `predict` the fold clones of each base model are averaged to
    rebuild the same feature columns, which the meta model turns into the
    final prediction.

    Parameters
    ----------
    base_models : sequence of estimators
        Unfitted regressors; they are cloned, never fitted in place.
    meta_model : estimator
        Regressor trained on the out-of-fold predictions.
    n_folds : int, default 5
        Number of KFold splits used to build the out-of-fold predictions.
    random_state : int or None, default None
        Seed forwarded to KFold; None keeps the folds random on every fit.
    """

    def __init__(self, base_models, meta_model, n_folds = 5, random_state = None):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.random_state = random_state

    @staticmethod
    def _take_rows(data, positions):
        """Select rows by integer position from a pandas object or an array-like."""
        # KFold yields positional indices, so pandas inputs need .iloc;
        # label-based .loc is only right when the index happens to be 0..n-1.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    def fit(self, X, y):
        """Fit the fold clones of every base model, then the meta model.

        Returns
        -------
        self
        """
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits = self.n_folds, shuffle = True, random_state = self.random_state)
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))

        for i, model in enumerate(self.base_models):
            for train_ind, hold_out_ind in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(self._take_rows(X, train_ind), self._take_rows(y, train_ind))
                # Each row is predicted by a clone that never saw it during training.
                out_of_fold_predictions[hold_out_ind, i] = instance.predict(self._take_rows(X, hold_out_ind))

        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the meta model on the averaged base-model predictions."""
        # One column per base model: the mean over that model's *fitted* fold
        # clones. The templates in self.base_models are never fitted by fit().
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in fold_models]).mean(axis = 1)
            for fold_models in self.base_models_
        ])
        return self.meta_model_.predict(meta_features)

    def prediction(self, X):
        """Backward-compatible alias of `predict`."""
        return self.predict(X)

In [None]:
# Stack ElasticNet, XGBoost and LightGBM under a Lasso meta model.
stack_averaged_models = StackingAverageModels(
    base_models = (enet, model_xgb, model_lgb),
    meta_model = lasso,
)

In [None]:
# Fit LightGBM on the full training set and predict the test set.
model_lgb.fit(X,y)
# The target was transformed with np.log, so np.exp is its inverse;
# np.expm1 would return every prediction one unit too low.
lgb_pred = np.exp(model_lgb.predict(test))

In [None]:
# Fit XGBoost on the full training set and predict the test set.
model_xgb.fit(X,y)
# The target was transformed with np.log, so np.exp is its inverse;
# np.expm1 would return every prediction one unit too low.
xgb_pred = np.exp(model_xgb.predict(test))

In [None]:
# Fit the stacked ensemble on the full training set and predict the test set.
stack_averaged_models.fit(X, y)
# The target was transformed with np.log, so np.exp is its inverse;
# np.expm1 would return every prediction one unit too low.
stack_pred = np.exp(stack_averaged_models.prediction(test))

In [None]:
# Blend the three predictions: 60% stacked ensemble, 20% LightGBM, 20% XGBoost.
stack_model_prediction = stack_pred * 0.6 + lgb_pred * 0.2 + xgb_pred * 0.2
# The prediction column is named 'SalePrice', matching the target column of the
# training data (it was previously misspelled 'SalesPrice').
stack_output = pd.DataFrame({'Id':Id, 'SalePrice': stack_model_prediction})
# NOTE(review): the file is written as 'submission' with no '.csv' extension -- confirm this is intended.
stack_output.to_csv('submission', index = False)

# Predict with the grid-searched ElasticNet and undo the log transform of the target.
prediction = np.exp(clf2.predict(test))

# Write the identifiers and predicted prices to 'sub.csv' without the index column.
output = pd.DataFrame({'Id': Id, 'SalePrice': prediction})
output.to_csv('sub.csv', index = False)