<a href="https://colab.research.google.com/github/Mikd14/Projects/blob/main/Machine-learning/Housing_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
# Let pandas display up to 500 rows before truncating printed frames/series.
pd.set_option('display.max_rows', 500)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Default figure size (width, height) used by every plot that does not set its own.
plt.rcParams['figure.figsize'] = [25, 15]

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the CSV files
# read in the next cell are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the training and test CSVs from the mounted Drive folder.
train = pd.read_csv('/content/drive/MyDrive/Housing Prices/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Housing Prices/test.csv')

# Target: an independent copy of the SalePrice column.
y = train['SalePrice'].copy()

# Features: every column except the row identifier (and, for train, the target).
X = train.drop(columns=['Id', 'SalePrice'])
X_test = test.drop(columns='Id')

In [None]:
# Drop columns in which more than 50% of the training rows are missing.
# The missing share is taken relative to the actual number of rows in X
# (isna().mean()) rather than a hard-coded row count, so the threshold really
# is "more than half missing" whatever the size of the dataset.
missing_share = X.isna().mean()
sparse_cols = missing_share[missing_share > 0.5].index.tolist()
X = X.drop(columns=sparse_cols)

# Remove the same columns from the test features so both frames keep matching columns.
X_test = X_test.drop(columns=sparse_cols)

In [None]:
# Numeric feature names. Any integer/float width counts as numeric, so the
# result stays the same if this cell is re-run after the columns have been
# downcast (e.g. int64 -> int8) by reduce_mem_usage.
num_cols = X.select_dtypes(include='number').columns.tolist()
# Categorical feature names: columns stored with the generic object dtype.
cat_cols = [col for col in X.columns if X[col].dtype == 'object']

In [None]:
def reduce_mem_usage(df, verbose=True, allow_float16=False):
    """Downcast the numeric columns of ``df`` to smaller dtypes, in place.

    Boolean columns are first converted to integers. Every integer column is
    then cast to the narrowest of int8/int16/int32 whose range contains the
    column's min and max (bounds inclusive). Float columns are cast to float32
    when their min/max fit; float16 is only considered when ``allow_float16``
    is true, because float16 cannot represent integers above 2048 exactly and
    would silently round typical feature values. A column is never cast to a
    dtype that is as wide as, or wider than, the one it already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.
    allow_float16 : bool, default False
        Opt in to float16 for float columns whose range fits (lossy).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16',
                'float32', 'float64']
    for col in df.columns:
        if df[col].dtype == 'bool':
            df[col] = df[col].astype(int)
    start_mem = df.memory_usage().sum() / 1024**2

    # Candidate target dtypes, narrowest first.
    int_targets = (np.int8, np.int16, np.int32)
    float_targets = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue
        # min()/max() skip NaN; for an empty or all-NaN column they are NaN,
        # every comparison below is then False and the column is left as is.
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            targets, limits_of = int_targets, np.iinfo
        else:
            targets, limits_of = float_targets, np.finfo
        for target in targets:
            # Stop as soon as the candidates would no longer save memory.
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            limits = limits_of(target)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Shrink the numeric dtypes of the training features first, then the test features.
X, X_test = [reduce_mem_usage(frame) for frame in (X, X_test)]


Mem. usage decreased to  0.51 Mb (38.2% reduction)
Mem. usage decreased to  0.51 Mb (37.7% reduction)


In [None]:
# Histograms of the numeric columns.
# DataFrame.hist() creates its own figure, sized by the notebook-wide
# plt.rcParams['figure.figsize']. Opening another figure after plotting would
# only add an empty one, so the cell just draws and shows the histograms.
X[num_cols].hist()
plt.show()


In [None]:
# Pairwise scatter plots for every numeric column.
# Author's note: this frame has too many variables for the grid to be readable.
pd.plotting.scatter_matrix(frame=X[num_cols])
plt.show()

In [None]:
# Prepare the model inputs.
# Numeric columns: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('Scaler', StandardScaler()),
])

# Numeric columns go through num_pipeline; categorical columns are one-hot
# encoded, with categories unseen during fitting ignored rather than raising.
full_pipeline = ColumnTransformer(transformers=[
    ('num_transformer', num_pipeline, num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

# Fit the transformers on the training features only, then reuse them on the test set.
preped_X = full_pipeline.fit_transform(X)
preped_X_test = full_pipeline.transform(X_test)

In [None]:

def display_scores(scores):
    """Print an array of scores followed by its mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    # Each statistic is computed right before it is printed, one line per entry.
    report = (
        ('Scores:', lambda values: values),
        ('Mean:', lambda values: values.mean()),
        ('Standard Deviation:', lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

# NOTE(review): in the cells shown, `scores` is first assigned by the
# cross_val_score call in a later cell, so on a fresh kernel this line raises
# NameError — TODO confirm and move this call after that cell.
display_scores(scores)

Scores: [-14709.34670175 -15976.78058425 -15466.30197081]
Mean: -15384.143085602025
Standard Deviation: 520.6788667664578


In [None]:
# Grid search over the number of trees to tune the hyperparameters.
params = {'n_estimators': [20, 50, 100, 200]}

# NOTE(review): `model` is not assigned in any cell shown in this notebook;
# presumably it is the RandomForestRegressor imported at the top — TODO confirm
# and define it before this cell so the notebook runs on a fresh kernel.
grid_search = GridSearchCV(model, params, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(preped_X, y)

# The scoring is negative MSE, so negate it and take the square root to report RMSE.
# Separate loop names are used so that the `params` grid above is not overwritten.
cvres = grid_search.cv_results_
for mean_score, candidate in zip(cvres['mean_test_score'], cvres['params']):
    print(((-mean_score)**0.5), candidate)

# Last expression of the cell, so the best setting is actually displayed.
grid_search.best_params_

31254.02039407175 {'n_estimators': 20}
30554.113317611907 {'n_estimators': 50}
30380.79782904145 {'n_estimators': 100}
30317.57588529509 {'n_estimators': 200}


In [None]:
# Display each feature importance next to the name of the column it belongs to.
feature_importances = grid_search.best_estimator_.feature_importances_

# The ColumnTransformer outputs the numeric columns first, followed by the
# one-hot columns of every categorical feature in cat_cols order. Names are
# built for ALL categorical features (not only the first one) so the label list
# lines up with the importance vector instead of being silently cut short by zip().
cat_encoder = full_pipeline.named_transformers_['cat']
cat_one_hot_attribs = [
    '{}_{}'.format(col, category)
    for col, categories in zip(cat_cols, cat_encoder.categories_)
    for category in categories
]
attribs = num_cols + cat_one_hot_attribs

# Most important features first.
sorted(zip(feature_importances, attribs), reverse=True)

In [None]:
#try XGBRegressor
from xgboost.sklearn import XGBRegressor
import xgboost as xgb 
from sklearn import metrics

#function to test effect of changing hyperparameters
def modelfit(alg, X, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=10):
  """Fit an XGBoost sklearn-style estimator and print a short training report.

  Parameters
  ----------
  alg : estimator exposing get_xgb_params / get_params / set_params / fit /
      predict / get_booster (e.g. XGBRegressor). Its ``n_estimators`` is
      overwritten when ``useTrainCV`` is true.
  X, y : training features and target; passed to ``xgb.DMatrix`` and ``alg.fit``.
  useTrainCV : bool, default True
      When true, run ``xgb.cv`` with early stopping on MAE and set
      ``n_estimators`` to the number of rows in the returned CV table.
      When false, ``alg`` is fitted with its current ``n_estimators``.
  cv_folds : int, default 5
      Number of folds passed to ``xgb.cv``.
  early_stopping_rounds : int, default 10
      Early-stopping patience passed to ``xgb.cv``.

  Returns
  -------
  None. The estimator is fitted in place and the report is printed.
  """
  if useTrainCV:
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(X, y)
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
              metrics='mae', early_stopping_rounds=early_stopping_rounds)
    alg.set_params(n_estimators=cvresult.shape[0])

  #Fit the algorithm on the data
  # No eval_set is supplied, so an eval_metric argument would not be evaluated
  # on anything; it is omitted because passing it to fit() is deprecated in xgboost.
  alg.fit(X, y)

  #Predict training set:
  X_train_predictions = alg.predict(X)

  # The value printed under the "Accuracy" label is the mean absolute error
  # measured on the training data itself.
  print("\nModel Report")
  print("Accuracy : %.7g" % metrics.mean_absolute_error(y, X_train_predictions))
  print(alg.get_booster().best_iteration)

  #feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
  #feat_imp.plot(kind='bar', title='Feature Importances')
  #plt.ylabel('Feature Importance Score')

# NOTE(review): in the cells shown, `xgb1` is only assigned in the final cell
# of the notebook, so on a fresh kernel this call raises NameError — TODO
# confirm and move the xgb1 definition above this cell.
modelfit(xgb1, preped_X, y)


Model Report
Accuracy : 4803.835
299


In [None]:
# Grid-search gamma over 0.0 .. 0.4 in steps of 0.1.
param_test1 = {
    'gamma': [i/10.0 for i in range(0, 5)]
}

gsearch1 = GridSearchCV(
    estimator=XGBRegressor(learning_rate=0.1, n_estimators=137, max_depth=5,
                           min_child_weight=2, gamma=0, subsample=0.8,
                           colsample_bytree=0.8, seed=27),
    param_grid=param_test1, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5)
gsearch1.fit(preped_X, y)

# The scoring is negative MAE; negate it to print the MAE of every candidate.
# Dedicated loop names keep the notebook-level `params` variable intact.
cvresults = gsearch1.cv_results_
for mean_score, candidate in zip(cvresults['mean_test_score'], cvresults['params']):
    print((-mean_score), candidate)

# Last expression of the cell: displays the best gamma.
gsearch1.best_params_

15578.073450877568 {'gamma': 0.0}
15578.073450877568 {'gamma': 0.1}
15578.073450877568 {'gamma': 0.2}
15578.073450877568 {'gamma': 0.3}
15578.073450877568 {'gamma': 0.4}


{'gamma': 0.0}

In [None]:
# Grid-search the row (subsample) and column (colsample_bytree) sampling
# ratios over 0.6 .. 0.9 in steps of 0.1.
param_test4 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
}

gsearch4 = GridSearchCV(
    estimator=XGBRegressor(learning_rate=0.1, n_estimators=137, max_depth=5,
                           min_child_weight=2, gamma=0, subsample=0.8,
                           colsample_bytree=0.8, seed=27),
    param_grid=param_test4, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5)
gsearch4.fit(preped_X, y)

# The scoring is negative MAE; negate it to print the MAE of every candidate.
# This cell only reads gsearch4 results (no leftover references to gsearch1).
cvresults = gsearch4.cv_results_
for mean_score, candidate in zip(cvresults['mean_test_score'], cvresults['params']):
    print((-mean_score), candidate)

# Last expression of the cell: displays the best sampling ratios.
gsearch4.best_params_

15135.722270976028 {'colsample_bytree': 0.6, 'subsample': 0.6}
15549.11831656678 {'colsample_bytree': 0.6, 'subsample': 0.7}
15674.794790774828 {'colsample_bytree': 0.6, 'subsample': 0.8}
15423.304433326199 {'colsample_bytree': 0.6, 'subsample': 0.9}
15257.6826760488 {'colsample_bytree': 0.7, 'subsample': 0.6}
15423.744271725172 {'colsample_bytree': 0.7, 'subsample': 0.7}
15522.226763163528 {'colsample_bytree': 0.7, 'subsample': 0.8}
15675.820558647261 {'colsample_bytree': 0.7, 'subsample': 0.9}
15104.542206228594 {'colsample_bytree': 0.8, 'subsample': 0.6}
15709.588088613014 {'colsample_bytree': 0.8, 'subsample': 0.7}
15578.073450877568 {'colsample_bytree': 0.8, 'subsample': 0.8}
15731.774579944347 {'colsample_bytree': 0.8, 'subsample': 0.9}
15326.178994541953 {'colsample_bytree': 0.9, 'subsample': 0.6}
15764.25783390411 {'colsample_bytree': 0.9, 'subsample': 0.7}
15658.971385380992 {'colsample_bytree': 0.9, 'subsample': 0.8}
15658.344226241437 {'colsample_bytree': 0.9, 'subsample': 0

{'colsample_bytree': 0.8, 'subsample': 0.6}

In [None]:
# The previous search picked subsample=0.6 and colsample_bytree=0.8; search
# around those values in finer steps of 0.05.
param_test5 = {
    'subsample': [0.5, 0.55, 0.6, 0.65],
    'colsample_bytree': [0.7, 0.75, 0.8, 0.85],
}

gsearch5 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=91,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        seed=27,
    ),
    param_grid=param_test5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
)
gsearch5.fit(preped_X, y)

# Scores are negative MAE; flip the sign to print the MAE per candidate.
cvresults = gsearch5.cv_results_
for mean, params in zip(cvresults['mean_test_score'], cvresults['params']):
    print(-mean, params)

gsearch5.best_params_

15639.348975278253 {'colsample_bytree': 0.7, 'subsample': 0.5}
15493.761683968321 {'colsample_bytree': 0.7, 'subsample': 0.55}
15479.368073095035 {'colsample_bytree': 0.7, 'subsample': 0.6}
15451.416293878425 {'colsample_bytree': 0.7, 'subsample': 0.65}
15923.769263698628 {'colsample_bytree': 0.75, 'subsample': 0.5}
15593.373432148972 {'colsample_bytree': 0.75, 'subsample': 0.55}
15727.492604880137 {'colsample_bytree': 0.75, 'subsample': 0.6}
15862.731375749145 {'colsample_bytree': 0.75, 'subsample': 0.65}
16368.725390625 {'colsample_bytree': 0.8, 'subsample': 0.5}
15887.68283390411 {'colsample_bytree': 0.8, 'subsample': 0.55}
15330.715156785101 {'colsample_bytree': 0.8, 'subsample': 0.6}
15909.433243257707 {'colsample_bytree': 0.8, 'subsample': 0.65}
15867.921872324485 {'colsample_bytree': 0.85, 'subsample': 0.5}
15816.667923266268 {'colsample_bytree': 0.85, 'subsample': 0.55}
15328.644560680652 {'colsample_bytree': 0.85, 'subsample': 0.6}
15968.441170804794 {'colsample_bytree': 0.85,

{'colsample_bytree': 0.85, 'subsample': 0.6}

In [None]:
# Tune reg_alpha. Author's note: [1e-5, 1e-2, 0.1, 1, 100] was checked first
# and 1 was found to be best.
param_test6 = {'reg_alpha': [0.6, 0.55, 0.5]}

gsearch6 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=91,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.6,
        colsample_bytree=0.85,
        seed=27,
    ),
    param_grid=param_test6,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
)
gsearch6.fit(preped_X, y)

# Scores are negative MAE; flip the sign to print the MAE per candidate.
cvresults = gsearch6.cv_results_
for mean, params in zip(cvresults['mean_test_score'], cvresults['params']):
    print(-mean, params)

gsearch6.best_params_



15314.421176690925 {'reg_alpha': 0.6}
15328.644491117293 {'reg_alpha': 0.55}
15328.644715860446 {'reg_alpha': 0.5}


{'reg_alpha': 0.6}

In [None]:
# Tune reg_lambda around 1, with reg_alpha fixed at 0.6.
param_test7 = {'reg_lambda': [0.8, 0.95, 1, 1.05, 1.1, 1.3]}

gsearch7 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=148,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.6,
        colsample_bytree=0.85,
        seed=27,
        reg_alpha=0.6,
    ),
    param_grid=param_test7,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
)
gsearch7.fit(preped_X, y)

# Scores are negative MAE; flip the sign to print the MAE per candidate.
cvresults = gsearch7.cv_results_
for mean, params in zip(cvresults['mean_test_score'], cvresults['params']):
    print(-mean, params)

gsearch7.best_params_



15471.661732127566 {'reg_lambda': 0.8}
15342.473482983734 {'reg_lambda': 0.95}
15236.029818600171 {'reg_lambda': 1}
15343.068929259414 {'reg_lambda': 1.05}
15587.062168236302 {'reg_lambda': 1.1}
15621.319520547946 {'reg_lambda': 1.3}


{'reg_lambda': 1}

In [None]:

# Cross-validate the tuned model with 3 folds (no shuffle argument is passed)
# and report the negative-MAE scores.
# NOTE(review): in the cells shown, `xgb1` is only assigned in the final cell
# of the notebook, so on a fresh kernel this cell raises NameError — TODO
# confirm and move the xgb1 definition above this cell.
kfold = KFold(n_splits=3)
scores = cross_val_score(xgb1, preped_X, y, scoring='neg_mean_absolute_error', cv=kfold, n_jobs=-1)
display_scores(scores)
# Refit on the full training set, then predict the prepared test features.
xgb1.fit(preped_X, y,verbose=1)
preds = xgb1.predict(preped_X_test)
# Last expression of the cell: displays the predictions.
preds


Scores: [-14709.34670175 -15976.78058425 -15466.30197081]
Mean: -15384.143085602025
Standard Deviation: 520.6788667664578


array([127420.28, 159132.  , 181317.7 , ..., 170747.02, 115442.66,
       218206.88], dtype=float32)

In [None]:
# Build the submission table: one row per test Id with its predicted SalePrice,
# and write it to a CSV file without the index column.
sub = pd.DataFrame({'Id': test['Id'], 'SalePrice': preds})
sub.to_csv('Submission.csv', index=False)

# Download the written file from the Colab runtime through the browser.
from google.colab import files

files.download('Submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Final model: the tuned hyperparameters combined with a lowered learning rate.
# Author's note: once all tuning is done, lower the learning rate and check the
# effect; best lr=0.06 (4800).
xgb1 = XGBRegressor(
    n_estimators=299,
    learning_rate=0.06,
    max_depth=5,
    min_child_weight=2,
    gamma=0,
    subsample=0.6,
    colsample_bytree=0.85,
    reg_alpha=0.6,
    reg_lambda=1,
    seed=27,
)