# Assignment 2
```Berle, Bastian``` ```Holzapfel, Ron```

## Imports
- import packages
- set options

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

In [51]:
pd.set_option('display.max_columns', None)
RANDOM = 42

## Read Data

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/maschere/public-jupyter/master/data/ames%20housing/ames.csv')

## Prepare Dataset
- Split dataset in train and test data
- retrieve numeric and catergorical columns

In [6]:
# randomly select train and test sample in order to give no impact to time series
X_train, X_test = train_test_split(df, train_size=0.66, random_state=RANDOM)

In [7]:
y_train = X_train['Sale_Price']
X_train.drop(columns=['Sale_Price'], inplace=True)

In [32]:
y_test = X_test['Sale_Price']
X_test.drop(columns=['Sale_Price'], inplace=True)

In [8]:
num_cols = {column_name for column_name in X_train.columns if (X_train[column_name].dtype == 'int64' or X_train[column_name].dtype == 'float64')} # columns with numerical data types
cat_cols = set(X_train.columns)-num_cols # columns with object data types
num_cols = list(num_cols)
cat_cols = list(cat_cols)

## Analyse Dataset

In [9]:
print(f'{round(sum(df.memory_usage()/1e6), 2)} MB')

1.9 MB


In [10]:
df.describe()

Unnamed: 0,Lot_Frontage,Lot_Area,Year_Built,Year_Remod_Add,Mas_Vnr_Area,BsmtFin_SF_1,BsmtFin_SF_2,Bsmt_Unf_SF,Total_Bsmt_SF,First_Flr_SF,Second_Flr_SF,Low_Qual_Fin_SF,Gr_Liv_Area,Bsmt_Full_Bath,Bsmt_Half_Bath,Full_Bath,Half_Bath,Bedroom_AbvGr,Kitchen_AbvGr,TotRms_AbvGrd,Fireplaces,Garage_Cars,Garage_Area,Wood_Deck_SF,Open_Porch_SF,Enclosed_Porch,Three_season_porch,Screen_Porch,Pool_Area,Misc_Val,Mo_Sold,Year_Sold,Sale_Price,Longitude,Latitude
count,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0
mean,57.647782,10147.921843,1971.356314,1984.266553,101.096928,4.177474,49.705461,559.071672,1051.255631,1159.557679,335.455973,4.676792,1499.690444,0.431058,0.061092,1.566553,0.379522,2.854266,1.044369,6.443003,0.599317,1.766212,472.658362,93.751877,47.533447,23.011604,2.592491,16.002048,2.243345,50.635154,6.216041,2007.790444,180796.060068,-93.642897,42.034482
std,33.499441,7880.017759,30.245361,20.860286,178.634545,2.233372,169.142089,439.540571,440.968018,391.890885,428.395715,46.31051,505.508887,0.524762,0.245175,0.552941,0.502629,0.827731,0.214076,1.572964,0.647921,0.761137,215.187196,126.361562,67.4834,64.139059,25.141331,56.08737,35.597181,566.344288,2.714492,1.316613,79886.692357,0.0257,0.01841
min,0.0,1300.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,12789.0,-93.693153,41.986498
25%,43.0,7440.25,1954.0,1965.0,0.0,3.0,0.0,219.0,793.0,876.25,0.0,0.0,1126.0,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1.0,320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129500.0,-93.660217,42.022088
50%,63.0,9436.5,1973.0,1993.0,0.0,3.0,0.0,465.5,990.0,1084.0,0.0,0.0,1442.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,2.0,480.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,160000.0,-93.641806,42.034662
75%,78.0,11555.25,2001.0,2004.0,162.75,7.0,0.0,801.75,1301.5,1384.0,703.75,0.0,1742.75,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2.0,576.0,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,213500.0,-93.622113,42.049853
max,313.0,215245.0,2010.0,2010.0,1600.0,7.0,1526.0,2336.0,6110.0,5095.0,2065.0,1064.0,5642.0,3.0,2.0,4.0,2.0,8.0,3.0,15.0,4.0,5.0,1488.0,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0,755000.0,-93.577427,42.063388


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 81 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MS_SubClass         2930 non-null   object 
 1   MS_Zoning           2930 non-null   object 
 2   Lot_Frontage        2930 non-null   int64  
 3   Lot_Area            2930 non-null   int64  
 4   Street              2930 non-null   object 
 5   Alley               2930 non-null   object 
 6   Lot_Shape           2930 non-null   object 
 7   Land_Contour        2930 non-null   object 
 8   Utilities           2930 non-null   object 
 9   Lot_Config          2930 non-null   object 
 10  Land_Slope          2930 non-null   object 
 11  Neighborhood        2930 non-null   object 
 12  Condition_1         2930 non-null   object 
 13  Condition_2         2930 non-null   object 
 14  Bldg_Type           2930 non-null   object 
 15  House_Style         2930 non-null   object 
 16  Overal

In [52]:
df.nunique()

MS_SubClass         16
MS_Zoning            7
Lot_Frontage       129
Lot_Area          1960
Street               2
                  ... 
Sale_Type           10
Sale_Condition       6
Sale_Price        1032
Longitude         2776
Latitude          2762
Length: 81, dtype: int64

## Build Pipeline

In [15]:
cat_transf = Pipeline(steps=[('imputer', SimpleImputer(strategy = 'most_frequent')), ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])
num_transf = Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])

In [16]:
preproc = ColumnTransformer(transformers=[('num', num_transf, num_cols), ('cat', cat_transf, cat_cols)])

### Linear Regression Model

In [18]:
linear = LinearRegression()
pipeline = Pipeline(steps=[('preproc', preproc), ('model', linear)])
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preproc', 'model', 'preproc__n_jobs', 'preproc__remainder', 'preproc__sparse_threshold', 'preproc__transformer_weights', 'preproc__transformers', 'preproc__verbose', 'preproc__verbose_feature_names_out', 'preproc__num', 'preproc__cat', 'preproc__num__memory', 'preproc__num__steps', 'preproc__num__verbose', 'preproc__num__imputer', 'preproc__num__scaler', 'preproc__num__imputer__add_indicator', 'preproc__num__imputer__copy', 'preproc__num__imputer__fill_value', 'preproc__num__imputer__missing_values', 'preproc__num__imputer__strategy', 'preproc__num__imputer__verbose', 'preproc__num__scaler__copy', 'preproc__num__scaler__with_mean', 'preproc__num__scaler__with_std', 'preproc__cat__memory', 'preproc__cat__steps', 'preproc__cat__verbose', 'preproc__cat__imputer', 'preproc__cat__onehot', 'preproc__cat__imputer__add_indicator', 'preproc__cat__imputer__copy', 'preproc__cat__imputer__fill_value', 'preproc__cat__imputer__missing_values', 'preproc__cat_

In [23]:
grid = GridSearchCV(
    estimator=pipeline,
    param_grid={
        'model__fit_intercept': [True, False], # whether to calculate the intercept for this model (i.e. data is expected to be centered).
        'model__n_jobs': [10, 100, 200],
        'model__positive': [False] # do not force coefficient to be positive
    },
    cv=3,
    n_jobs=1,
    scoring = 'neg_root_mean_squared_error',
    refit = True,
    verbose = 4
)

In [24]:
# build linear model and evaluate
grid.fit(X_train, y_train)
print(f'Best: {grid.best_score_} using {grid.best_params_}')

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END model__fit_intercept=True, model__n_jobs=10, model__positive=False;, score=-25890.128 total time=   0.5s
[CV 2/3] END model__fit_intercept=True, model__n_jobs=10, model__positive=False;, score=-32385.149 total time=   0.4s
[CV 3/3] END model__fit_intercept=True, model__n_jobs=10, model__positive=False;, score=-26836.038 total time=   0.4s
[CV 1/3] END model__fit_intercept=True, model__n_jobs=100, model__positive=False;, score=-25890.128 total time=   0.3s
[CV 2/3] END model__fit_intercept=True, model__n_jobs=100, model__positive=False;, score=-32385.149 total time=   0.5s
[CV 3/3] END model__fit_intercept=True, model__n_jobs=100, model__positive=False;, score=-26836.038 total time=   0.4s
[CV 1/3] END model__fit_intercept=True, model__n_jobs=200, model__positive=False;, score=-25890.128 total time=   0.4s
[CV 2/3] END model__fit_intercept=True, model__n_jobs=200, model__positive=False;, score=-32385.149 total time

In [33]:
grid.score(X_test, y_test) # neg_root_mean_squared_error

-34947.745705143025

## Stacked Model

In [37]:
ada = AdaBoostRegressor(random_state=RANDOM)
grad = GradientBoostingRegressor(random_state=RANDOM)
rf = RandomForestRegressor(random_state=RANDOM)
linear = LinearRegression()

In [38]:
stack = StackingCVRegressor(regressors=(ada, grad, rf), meta_regressor=linear)

In [39]:
pipeline = Pipeline(steps=[('preproc', preproc), ('model', stack)])

In [43]:
# get all parameters of the models used that can be adjusted
list(filter(lambda x: x.startswith('model_'), pipeline.get_params().keys())) 

['model__cv',
 'model__meta_regressor__copy_X',
 'model__meta_regressor__fit_intercept',
 'model__meta_regressor__n_jobs',
 'model__meta_regressor__normalize',
 'model__meta_regressor__positive',
 'model__meta_regressor',
 'model__multi_output',
 'model__n_jobs',
 'model__pre_dispatch',
 'model__random_state',
 'model__refit',
 'model__regressors',
 'model__shuffle',
 'model__store_train_meta_features',
 'model__use_features_in_secondary',
 'model__verbose',
 'model__adaboostregressor',
 'model__gradientboostingregressor',
 'model__randomforestregressor',
 'model__adaboostregressor__base_estimator',
 'model__adaboostregressor__learning_rate',
 'model__adaboostregressor__loss',
 'model__adaboostregressor__n_estimators',
 'model__adaboostregressor__random_state',
 'model__gradientboostingregressor__alpha',
 'model__gradientboostingregressor__ccp_alpha',
 'model__gradientboostingregressor__criterion',
 'model__gradientboostingregressor__init',
 'model__gradientboostingregressor__learning_

In [46]:
grid = GridSearchCV(
    estimator=pipeline,
    param_grid={
        'model__randomforestregressor__n_estimators': [15, 75],
        'model__adaboostregressor__loss': ['linear', 'square', 'exponential'],
    },
    cv=3,
    n_jobs=1,
    scoring = 'neg_root_mean_squared_error',
    refit = True,
    verbose = 4
)

In [47]:
# build model and evaluate
grid.fit(X_train, target)
print(f'Best: {grid.best_score_} using {grid.best_params_}')

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END model__adaboostregressor__loss=linear, model__randomforestregressor__n_estimators=15;, score=-28008.268 total time=  35.0s
[CV 2/3] END model__adaboostregressor__loss=linear, model__randomforestregressor__n_estimators=15;, score=-32435.917 total time=  38.7s
[CV 3/3] END model__adaboostregressor__loss=linear, model__randomforestregressor__n_estimators=15;, score=-23283.122 total time=  34.4s
[CV 1/3] END model__adaboostregressor__loss=linear, model__randomforestregressor__n_estimators=75;, score=-27837.000 total time= 1.7min
[CV 2/3] END model__adaboostregressor__loss=linear, model__randomforestregressor__n_estimators=75;, score=-32031.501 total time= 1.6min
[CV 3/3] END model__adaboostregressor__loss=linear, model__randomforestregressor__n_estimators=75;, score=-23664.055 total time= 1.5min
[CV 1/3] END model__adaboostregressor__loss=square, model__randomforestregressor__n_estimators=15;, score=-28065.084 total t

In [48]:
grid.score(X_test, y_test) # neg_root_mean_squared_error

-26332.194838630097