In [131]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
from scipy.special import boxcox1p
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew #for some statistics
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from tune_sklearn import TuneGridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
%matplotlib inline

In [132]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(x_train.values)
    rmse= np.sqrt(-cross_val_score(model, x_train.values, y_train.values, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]

        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self

    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [133]:
test_df = pd.read_csv('test.csv' )
train_df = pd.read_csv('train.csv' )

In [134]:
upperlimit = np.percentile(train_df.SalePrice.values, 99.5)
train_df['SalePrice'].loc[train_df['SalePrice']>upperlimit] = upperlimit

# GrLivArea Outliter,
train_df = train_df[train_df.GrLivArea < 4500]
train_df.reset_index(drop=True, inplace=True)
train_df["SalePrice"] = np.log1p(train_df["SalePrice"])
y = train_df['SalePrice'].reset_index(drop=True)

train_features = train_df.drop(['SalePrice'], axis=1)
test_features = test_df
total_features = pd.concat([train_features, test_features]).reset_index(drop=True)

# Removing for strange GarageCars
train_df = train_df.drop(train_df[(train_df['GarageCars']>3)
                                  & (train_df['SalePrice']<350000)].index).reset_index(drop=True)

train_features = train_df.drop(['SalePrice'], axis=1)
test_features = test_df
total_features = pd.concat([train_features, test_features]).reset_index(drop=True)

# Since these column are actually a category , using a numerical number will lead the model to assume
# that it is numerical , so we convert to string .
total_features['MSSubClass'] = total_features['MSSubClass'].apply(str)
total_features['YrSold'] = total_features['YrSold'].astype(str)
total_features['MoSold'] = total_features['MoSold'].astype(str)



## Filling these columns With most suitable value for these columns
total_features['Functional'] = total_features['Functional'].fillna('Typ')
total_features['Electrical'] = total_features['Electrical'].fillna("SBrkr")
total_features['KitchenQual'] = total_features['KitchenQual'].fillna("TA")
total_features["PoolQC"] = total_features["PoolQC"].fillna("None")



## Filling these with MODE , i.e. , the most frequent value in these columns .
total_features['Exterior1st'] = total_features['Exterior1st'].fillna(total_features['Exterior1st'].mode()[0])
total_features['Exterior2nd'] = total_features['Exterior2nd'].fillna(total_features['Exterior2nd'].mode()[0])
total_features['SaleType'] = total_features['SaleType'].fillna(total_features['SaleType'].mode()[0])

## Missing data in GarageYrBit most probably means missing Garage , so replace NaN with zero .

for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    total_features[col] = total_features[col].fillna(0)

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    total_features[col] = total_features[col].fillna('None')


## Same with basement features

for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    total_features[col] = total_features[col].fillna('None')

total_features['MSZoning'] = total_features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

objects = []
for i in total_features.columns:
    if total_features[i].dtype == object:
        objects.append(i)
total_features.update(total_features[objects].fillna('None'))

# We are still filling up missing values
total_features['LotFrontage'] = total_features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in total_features.columns:
    if total_features[i].dtype in numeric_dtypes:
        numerics.append(i)
total_features.update(total_features[numerics].fillna(0))

In [135]:
#MSSubClass=The building class
total_features['MSSubClass'] = total_features['MSSubClass'].apply(str)


#Changing OverallCond into a categorical variable
total_features['OverallCond'] = total_features['OverallCond'].astype(str)


#Year and month sold are transformed into categorical features.
total_features['YrSold'] = total_features['YrSold'].astype(str)
total_features['MoSold'] = total_features['MoSold'].astype(str)

# Removing features that are not very useful . This can be understood only by doing proper EDA on data

total_features = total_features.drop(['Utilities', 'Street', 'PoolQC',], axis=1)


cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond','HeatingQC', 'KitchenQual', 'BsmtFinType1',
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive',  'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder()
    lbl.fit(list(total_features[c].values))
    total_features[c] = lbl.transform(list(total_features[c].values))

print('Shape all_data: {}'.format(total_features.shape))

Shape all_data: (2912, 77)


In [136]:
# Adding new features . Make sure that you understand this.

total_features['YrBltAndRemod']=total_features['YearBuilt']+total_features['YearRemodAdd']
total_features['TotalSF']=total_features['TotalBsmtSF'] + total_features['1stFlrSF'] + total_features['2ndFlrSF']

total_features['Total_sqr_footage'] = (total_features['BsmtFinSF1'] + total_features['BsmtFinSF2'] +
                                       total_features['1stFlrSF'] + total_features['2ndFlrSF'])

total_features['Total_Bathrooms'] = (total_features['FullBath'] + (0.5 * total_features['HalfBath']) +
                                     total_features['BsmtFullBath'] + (0.5 * total_features['BsmtHalfBath']))

total_features['Total_porch_sf'] = (total_features['OpenPorchSF'] + total_features['3SsnPorch'] +
                                    total_features['EnclosedPorch'] + total_features['ScreenPorch'] +
                                    total_features['WoodDeckSF'])

print(total_features.shape)
## For ex, if PoolArea = 0 , Then HasPool = 0 too

total_features['haspool'] = total_features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
total_features['has2ndfloor'] = total_features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
total_features['hasgarage'] = total_features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
total_features['hasbsmt'] = total_features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
total_features['hasfireplace'] = total_features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

print(total_features.shape)
numeric_feats = total_features.dtypes[total_features.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = total_features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness = skewness[abs(skewness) > 0.75]


skewed_features = skewness.index
lam = 0.15

for feat in skewed_features:
    total_features[feat] = boxcox1p(total_features[feat], lam)

numeric_feats = total_features.dtypes[total_features.dtypes != "object"].index

skewed_feats = total_features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})

final_features = pd.get_dummies(total_features).reset_index(drop=True)
final_features.shape

(2912, 82)
(2912, 87)

Skew in numerical features: 


(2912, 228)

In [137]:
total_features

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,LandSlope,...,YrBltAndRemod,TotalSF,Total_sqr_footage,Total_Bathrooms,Total_porch_sf,haspool,has2ndfloor,hasgarage,hasbsmt,hasfireplace
0,0.730463,2.885846,RL,5.831328,19.212182,0.730463,1.540963,Lvl,Inside,0.000000,...,16.471644,14.976591,14.781997,1.687259,5.714669,0.0,0.730463,0.730463,0.730463,0.000000
1,1.194318,2.055642,RL,6.221214,19.712205,0.730463,1.540963,Lvl,FR2,0.000000,...,16.424600,14.923100,14.540127,1.378202,9.010206,0.0,0.000000,0.730463,0.730463,0.730463
2,1.540963,2.885846,RL,5.914940,20.347241,0.730463,0.000000,Lvl,Inside,0.000000,...,16.469044,15.149678,14.585277,1.687259,5.053371,0.0,0.730463,0.730463,0.730463,0.730463
3,1.820334,3.011340,RL,5.684507,19.691553,0.730463,0.000000,Lvl,Corner,0.000000,...,16.365466,14.857121,14.076603,1.194318,9.080098,0.0,0.730463,0.730463,0.730463,0.730463
4,2.055642,2.885846,RL,6.314735,21.325160,0.730463,0.000000,Lvl,FR2,0.000000,...,16.466443,15.852312,15.323415,1.687259,8.831514,0.0,0.730463,0.730463,0.730463,0.730463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2907,15.394418,1.194318,RM,3.932510,14.081426,0.730463,1.540963,Lvl,Inside,0.000000,...,16.414072,13.567978,12.374860,0.982247,0.000000,0.0,0.730463,0.000000,0.730463,0.000000
2908,15.395553,1.194318,RM,3.932510,14.013314,0.730463,1.540963,Lvl,Inside,0.000000,...,16.414072,13.567978,12.976752,0.982247,4.137711,0.0,0.730463,0.730463,0.730463,0.000000
2909,15.396687,2.055642,RL,7.620056,22.782058,0.730463,1.540963,Lvl,Inside,0.000000,...,16.428104,14.824355,14.824355,1.194318,10.137338,0.0,0.000000,0.730463,0.730463,0.730463
2910,15.397821,3.340760,RL,5.744420,20.046557,0.730463,1.540963,Lvl,Inside,0.000000,...,16.452543,13.993617,12.894732,0.982247,6.881187,0.0,0.000000,0.000000,0.730463,0.000000


In [53]:
train = train_df
y_train = np.log1p(train.pop('SalePrice'))
x_train = train

In [54]:
# Lasso with 5 fold cross-validation
model = LassoCV(cv=5, random_state=1, max_iter=10000)

# Fit model
model.fit(x_train, y_train)
LassoCV(cv=5, max_iter=10000, random_state=1)
model.alpha_

1.0252347246350564

In [44]:
# Use grid search to tune the parameters:
parametersGrid = {"max_iter": [1, 5, 10],
                  "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                  "l1_ratio": np.arange(0.0, 1.0, 0.1),
                  "random_state": np.arange(1, 10, 1)}

eNet = ElasticNet()
grid = GridSearchCV(eNet, parametersGrid, scoring='accuracy', cv=10)
grid.fit(x_train, y_train)
print(grid.best_params_)

{'alpha': 0.0001, 'l1_ratio': 0.0, 'max_iter': 1, 'random_state': 1}


In [73]:
#    'n_estimators': np.arange(100, 5000, 100),
gbr = GradientBoostingRegressor()

# Define the parameter grid
parameters = {
    'n_estimators': np.arange(1000, 5000, 1000),
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': np.arange(1, 5, 1)
}

# Create a GridSearchCV object
grid_search = TuneGridSearchCV(gbr, parameters)

# Fit the GridSearchCV object to the data
grid_search.fit(x_train, y_train)

# Print the best parameters and the best score
print(grid_search.best_params_)
print(grid_search.best_score_)

2023-04-23 21:05:06,190	ERROR tune.py:794 -- Trials did not complete: [_Trainable_c58c4_00030, _Trainable_c58c4_00031, _Trainable_c58c4_00032, _Trainable_c58c4_00033, _Trainable_c58c4_00034, _Trainable_c58c4_00035, _Trainable_c58c4_00037, _Trainable_c58c4_00038, _Trainable_c58c4_00039, _Trainable_c58c4_00040, _Trainable_c58c4_00041, _Trainable_c58c4_00042, _Trainable_c58c4_00043, _Trainable_c58c4_00044, _Trainable_c58c4_00045, _Trainable_c58c4_00046, _Trainable_c58c4_00047]


{'n_estimators': 2000, 'learning_rate': 0.1, 'max_depth': 1}
0.9051947569819359


In [None]:
# Create a Lasso regression object
lasso = Lasso()

# Define the parameter grid
param_grid = {
    'alpha': np.arange(0.0001, 5, 0.0004)
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid)

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best parameters and the best score
print(grid_search.best_params_)
print(grid_search.best_score_)

In [75]:
ridge = Ridge()
param_grid = {
    'alpha': np.arange(0.1, 3, 0.1)
}
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid)
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'alpha': 2.9000000000000004}
0.8564960534094646


In [82]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0001892547, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.003752674, l1_ratio=0, random_state=1))
ridge = Ridge(2.9000000000000004)
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.1,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state =5)

In [83]:
score = rmsle_cv(lasso)
print("Lasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(ridge)
print("Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

averaged_models = AveragingModels(models = (ENet, GBoost, ridge, lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Lasso score: 0.1483 (0.0269)

Ridge score: 0.1485 (0.0274)

ElasticNet score: 0.1483 (0.0271)

Gradient Boosting score: 0.1271 (0.0135)

 Averaged base models score: 0.1373 (0.0231)


In [15]:
averaged_models = AveragingModels(models = (ENet, GBoost, ridge, lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

 Averaged base models score: 0.1374 (0.0241)


In [80]:
ridge = Ridge(2.9000000000000004)
score = rmsle_cv(ridge)
print("Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Ridge score: 0.1485 (0.0274)


In [None]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0001892547, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.003752674, l1_ratio=0, random_state=1))
ridge = Ridge(2.9000000000000004)
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.1,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state =5)