In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
test_ID = test.Id
y = train.SalePrice
qualitative = list(train.columns[train.dtypes == "object"])
quantitative = list(test.select_dtypes(include=['float64',"int64"]).columns)
quantitative.remove("Id")




In [84]:
cols = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold']
train[cols] = train[cols].fillna(0)
test[cols] = test[cols].fillna(0)
old_cols = [x for x in test.columns if x not in cols]

In [85]:
key = pd.DataFrame(columns=["index"], index=pd.MultiIndex.from_tuples([], names=['variable','value']))

for x in cols:
    kf = train.groupby(x)["SalePrice"].mean().to_frame().sort_values("SalePrice")
    kf = kf.reset_index().reset_index().drop(columns=["SalePrice"])
    kf["variable"] = x
    kf.index = kf.index + 1
    key = pd.concat([key, kf.rename(columns={x:"value"}).set_index(["variable","value"])])



In [86]:
train = train.drop("SalePrice", axis=1)
features = pd.concat([train, test])

reshape_df = pd.melt(features, id_vars="Id", value_vars=cols).merge(key, on=["variable","value"])
qualDf= reshape_df.pivot(index='Id', columns='variable')["index"]
#qualDf.columns = [str(x)+"_E" for x in qualDf.columns]


features = qualDf.merge(features[old_cols], on=["Id"])

In [87]:
#features = features.drop(features[features[cols].isnull().any(axis=1)].index)
features[cols] = features[cols].fillna(0)
features[cols] = features[cols].astype('int')

In [88]:
features = features.drop(columns=["Id"])



In [89]:
from scipy.stats import norm, skew
numeric_feats = features.dtypes[features.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(15)


Skew in numerical features: 



Unnamed: 0,Skew
MiscVal,21.947195
PoolQC,20.723994
PoolArea,16.898328
LotArea,12.822431
LowQualFinSF,12.088761
3SsnPorch,11.376065
LandSlope,4.975157
KitchenAbvGr,4.302254
BsmtFinSF2,4.145323
EnclosedPorch,4.003891


In [90]:
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    features[feat] = boxcox1p(features[feat], lam)

There are 58 skewed numerical features to Box Cox transform


In [91]:
features[quantitative] = features[quantitative].fillna(0)

In [92]:
features = pd.get_dummies(features)

In [93]:
train = features[:len(y)]
test = features[len(y):]

In [12]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [156]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, np.log1p(y), scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [157]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Kernel Ridge score: 0.1304 (0.0140)



In [158]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
score = rmsle_cv(lasso)
print("Lasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Lasso score: 0.1283 (0.0157)



In [159]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.1282 (0.0157)



In [160]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.1256 (0.0126)



In [161]:
y_train = np.log1p(y)
LassoMd = lasso.fit(train.values,y_train)
ENetMd = ENet.fit(train.values,y_train)
KRRMd = KRR.fit(train.values,y_train)
GBoostMd = GBoost.fit(train.values,y_train)

In [166]:
finalMd = (np.expm1(LassoMd.predict(test.values)) + np.expm1(ENetMd.predict(test.values)) + np.expm1(KRRMd.predict(test.values)) + np.expm1(GBoostMd.predict(test.values)) ) / 4
finalMd = np.expm1(LassoMd.predict(test.values))



In [167]:
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = finalMd
sub.to_csv('../output/lasso_submission.csv',index=False)