 * [Пример моделирования](https://www.kaggle.com/godzill22/house-price-prediction-improved-model-top-8)<br>

In [None]:
import numpy as np
import pandas as pd

import missingno
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

import scipy as sp
from scipy.stats import skew

import warnings
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")

# Read both competition files from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# Convenience list holding both frames.
comb = [train_df, test_df]

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

# Create function for model evaluation
def model_evaluation(algo, algoname):
    """
    Fit an estimator on the training split and score it on the test split.

    The data is taken from the notebook-level globals ``scaled_Xtrain``,
    ``y_train``, ``scaled_Xtest`` and ``y_test``.

    Parameters
    ----------
    algo : estimator instance
        An already instantiated model exposing ``fit`` and ``predict``
        (e.g. ``SGDRegressor(random_state=101)``).
    algoname : str
        Label used in the printed report.

    Returns
    -------
    tuple
        ``(mae, rmse, r2score, y_pred, algo)`` where ``algo`` is the same
        object that was passed in, now fitted.
    """
    # Fit on the training split and predict the held-out split
    algo.fit(scaled_Xtrain, y_train)
    y_pred = algo.predict(scaled_Xtest)

    # Error metrics on the held-out split
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Coefficient of determination
    r2score = r2_score(y_test, y_pred)

    print(f"**{algoname} Metrics**")
    print(f"**MAE: {mae}")
    print(f"**RMSE: {rmse}")
    # R-squared is a unitless score, not a percentage, so no "%" suffix.
    print(f"**R-squared: {r2score:.2f}")

    return mae, rmse, r2score, y_pred, algo

A great place to start for anyone asking which algorithm to use is the scikit-learn algorithm cheat-sheet: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html. Following its recommendation, I am going to start with Stochastic Gradient Descent, so let's create an SGD regressor first.

**SGDRegressor**

In [None]:
# Create a base model
from sklearn.linear_model import SGDRegressor

# Baseline SGD regressor with a fixed seed.
sgd_base_model = SGDRegressor(random_state=101)

# The returned (fitted) model is discarded; sgd_base_model is the same object.
(
    sgd_base_mae,
    sgd_base_rmse,
    sgd_base_r2score,
    sgd_y_pred,
    _,
) = model_evaluation(sgd_base_model, "SGDRegressor")

In [None]:
def plot_residuals(y_pred, algoname):
    """
    Draw four residual diagnostics for a model on a 2x2 grid.

    Residuals are computed as ``y_test - y_pred``, where ``y_test`` is a
    notebook-level global. The panels are: probability plot (top-left),
    kernel density estimate (top-right), residuals against the actual
    values (bottom-left) and a box plot (bottom-right).

    Parameters
    ----------
    y_pred : array-like
        Predictions for the test split, aligned with ``y_test``.
    algoname : str
        Model label shown as the figure title.
    """
    residuals = pd.Series(y_test - y_pred, name="residuals")

    fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(14, 4), dpi=120)
    # Label the figure so plots of different models can be told apart.
    fig.suptitle(f"{algoname} residuals")

    # Probability plot of the residuals
    sp.stats.probplot(residuals, plot=axes[0, 0])
    # Density of the residuals (kdeplot replaces the deprecated distplot)
    sns.kdeplot(x=residuals, ax=axes[0, 1])
    # Residuals against the actual target values, with a zero reference line
    sns.scatterplot(x=y_test, y=residuals, ax=axes[1, 0])
    axes[1, 0].axhline(y=0, c="red", ls="--")
    # Spread and outliers of the residuals
    sns.boxplot(x=residuals, ax=axes[1, 1])

    # Leave room at the top for the figure title.
    plt.tight_layout(rect=(0, 0, 1, 0.95))

In [None]:
# Residual diagnostics for the baseline SGD regressor
plot_residuals(sgd_y_pred, "SGDRegressor")

**Gradient Boosting Regressor**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Baseline gradient boosting model with library defaults.
gbr_model = GradientBoostingRegressor()

# Label spelling fixed ("GradientBostingRegressor" -> "GradientBoostingRegressor").
(
    gbr_base_mae,
    gbr_base_rmse,
    gbr_base_r2score,
    gbr_y_pred,
    gbr_model,
) = model_evaluation(gbr_model, "GradientBoostingRegressor")

In [None]:
# Residual diagnostics for the baseline gradient boosting model
plot_residuals(gbr_y_pred, "Gradient Boosting Regressor")

**Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with library defaults.
rfr_model = RandomForestRegressor()

(
    rfr_base_mae,
    rfr_base_rmse,
    rfr_base_r2score,
    rfr_y_pred,
    rfr_model,
) = model_evaluation(rfr_model, "RandomForestRegressor")

In [None]:
# Residual diagnostics for the baseline random forest
plot_residuals(rfr_y_pred, "Random Forest Regressor")

**Extreme Gradient Boosting**

In [None]:
from xgboost import XGBRegressor

# Baseline XGBoost regressor with library defaults.
xgboost_model = XGBRegressor()

(
    xgboost_base_mae,
    xgboost_base_rmse,
    xgboost_base_r2score,
    xgboost_y_pred,
    xgboost_model,
) = model_evaluation(xgboost_model, "Extreme Gradient Boosting")

In [None]:
# Residual diagnostics for the baseline XGBoost model
plot_residuals(xgboost_y_pred, "Extreme Gradient Boosting")

**KNeighbors**

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline k-nearest-neighbours regressor with library defaults.
knn_model = KNeighborsRegressor()

(
    knn_base_mae,
    knn_base_rmse,
    knn_base_r2score,
    knn_y_pred,
    knn_model,
) = model_evaluation(knn_model, "KNeighborsRegressor")

In [None]:
# Residual diagnostics for the baseline k-nearest-neighbours model
plot_residuals(knn_y_pred, "KNeighborsRegressor")

**Submit the best-scoring model to the Kaggle competition**

In [None]:
# Standardise the numeric columns of the full training set and of the
# cleaned test set. The scaler is fitted on the training data only and
# then re-used, unchanged, for the test data.
sc = StandardScaler()

scaled_X = X.copy()
scaled_X[num_feat] = sc.fit_transform(X[num_feat])

scaled_test = clean_test_df.copy()
scaled_test[num_feat] = sc.transform(clean_test_df[num_feat])

# Instantiate the final model
# final_base_model = GradientBoostingRegressor()

# Fit the model
# final_base_model.fit(scaled_X, y)

# final_predictions = final_base_model.predict(scaled_test)


# Make predictions and save it to the dataframe
# final_base_model_df = pd.DataFrame({"id":row_id,
                                    # "SalePrice": np.expm1(final_predictions)})

In [None]:
# final_base_model_df.to_csv("house_price_final_base_sub.csv", index=False)

**ElasticNetCV**

I am going to include ElasticNetCV among the baseline models because it lets me balance Ridge (L2 regularization) against Lasso (L1 regularization). The benefit is that elastic net allows a mix of both penalties, which on some problems performs better than a model using only one of them.

In [None]:
from sklearn.linear_model import ElasticNetCV

# Elastic net with cross-validated choice of the L1/L2 mixing ratio.
elastic_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1])

(
    el_base_mae,
    el_base_rmse,
    el_base_r2score,
    el_base_y_pred,
    elastic_model,
) = model_evaluation(elastic_model, "ElasticNetCV")

In [None]:
# Residual diagnostics for the ElasticNetCV model
plot_residuals(el_base_y_pred, "ElasticNetCV")

In [None]:
# Show the L1 ratio selected by ElasticNetCV from the candidates above
elastic_model.l1_ratio_

In [None]:
from sklearn.linear_model import LassoCV

# Lasso with a 10-fold cross-validated search over 200 alphas.
lasso_cv_model = LassoCV(eps=0.01, n_alphas=200, cv=10, max_iter=1000000)

(
    lassoCV_mae,
    lassoCV_rmse,
    lassoCV_r2score,
    lassoCV_y_pred,
    lasso_cv_model,
) = model_evaluation(lasso_cv_model, "LassoCV")

In [None]:
# Residual diagnostics for the LassoCV model
plot_residuals(lassoCV_y_pred, "LassoCV")

**RidgeCV**

In [None]:
from sklearn.linear_model import RidgeCV

# Ridge regression with cross-validated choice among three alphas.
ridge_model = RidgeCV(alphas=[0.1, 1.0, 10.0])

(
    ridge_cv_mae,
    ridge_cv_rmse,
    ridge_cv_r2,
    ridge_cv_y_pred,
    ridge_model,
) = model_evaluation(ridge_model, "RidgeCV")

In [None]:
# Show the alpha selected by RidgeCV from the candidates above
ridge_model.alpha_

In [None]:
# Residual diagnostics for the RidgeCV model
plot_residuals(ridge_cv_y_pred, "RidgeCV")

In [None]:
from sklearn.svm import SVR

# Baseline support vector regressor with library defaults.
svr_base_model = SVR()

# NOTE: the prediction variable is spelled "svr_base_y_red" (sic); the name
# is kept because the residual-plot cell refers to it.
(
    svr_base_mae,
    svr_base_rmse,
    svr_base_r2score,
    svr_base_y_red,
    svr_base_model,
) = model_evaluation(svr_base_model, "Support Vector Regressor")

In [None]:
# Residual diagnostics for the baseline SVR (variable name "y_red" is as defined above)
plot_residuals(svr_base_y_red, "SVR")

**CatBoostRegressor**

In [None]:
from catboost import CatBoostRegressor

# Baseline CatBoost regressor: silent training, fixed seed.
cat_base = CatBoostRegressor(verbose=0, random_state=101)

# cat_base_model is the same object as cat_base, returned after fitting.
(
    cat_base_mae,
    cat_base_rmse,
    cat_base_r2,
    cat_base_y_pred,
    cat_base_model,
) = model_evaluation(cat_base, "CatBoostRegressor")

In [None]:
# Feature importances of the fitted CatBoost model as a DataFrame
# (columns "Feature Id" and "Importances").
feat_imp = cat_base.get_feature_importance(prettified=True)

# Plot the first 20 rows of the importance table as horizontal bars.
# x/y are passed by keyword: current seaborn releases no longer accept
# them positionally.
plt.figure(figsize=(12, 8))
sns.barplot(x="Importances", y="Feature Id", data=feat_imp[:20],
            orient="h", palette="coolwarm_r")
plt.title("Feature Importance")
plt.show()

In [None]:
import shap
from catboost import Pool

# SHAP summary plot of feature contributions for the CatBoost model

# Wrap both splits as CatBoost Pools (val_pool is not used further in this cell)
train_pool = Pool(scaled_Xtrain)
val_pool = Pool(scaled_Xtest)

explainer = shap.TreeExplainer(cat_base_model) # explainer built on the fitted CatBoost model
shap_values = explainer.shap_values(train_pool) # SHAP values for the training Pool

shap.summary_plot(shap_values, scaled_Xtrain)

### Base model scores metrics

In [None]:
# One row per baseline model: (label, R-squared, RMSE, MAE).
_base_rows = [
    ("SGDRegressor", sgd_base_r2score, sgd_base_rmse, sgd_base_mae),
    ("GradientBoostingRegressor", gbr_base_r2score, gbr_base_rmse, gbr_base_mae),
    ("RandomForestRegressor", rfr_base_r2score, rfr_base_rmse, rfr_base_mae),
    ("Extreme Gradient Boosting", xgboost_base_r2score, xgboost_base_rmse, xgboost_base_mae),
    ("KNeighborsRegressor", knn_base_r2score, knn_base_rmse, knn_base_mae),
    ("LassoCV", lassoCV_r2score, lassoCV_rmse, lassoCV_mae),
    ("SVR", svr_base_r2score, svr_base_rmse, svr_base_mae),
    ("RidgeCV", ridge_cv_r2, ridge_cv_rmse, ridge_cv_mae),
    ("CatBoost", cat_base_r2, cat_base_rmse, cat_base_mae),
]

base_score_df = pd.DataFrame(_base_rows,
                             columns=["Model", "R-square", "RMSE", "MAE"])

# Best R-squared first, with a clean 0..n-1 index.
base_score_df = base_score_df.sort_values(by=["R-square"], ascending=False)
base_score_df = base_score_df.reset_index(drop=True)

In [None]:
# Display the baseline metrics table (already sorted by R-square, best first)
print("**Base Models Metrics**")
base_score_df

In [None]:
# Bar chart of R-squared per baseline model, with RMSE and MAE drawn as
# lines on the same axes.
fig, ax = plt.subplots(figsize=(8, 5))

sns.barplot(data=base_score_df, x="Model", y="R-square", palette="magma", ax=ax)
sns.lineplot(data=base_score_df, x="Model", y="RMSE",
             color="red", label="rmse", legend="brief", ax=ax)
sns.lineplot(data=base_score_df, x="Model", y="MAE",
             color="green", label="mae", legend="brief", ax=ax)

plt.xticks(rotation=45, horizontalalignment="right")
ax.set_title("Regression Model Performance Metrics")
ax.set_ylabel("R_squared")
ax.legend();

#### Submit Voting Ensemble Model with base models

In [None]:
from sklearn.ensemble import VotingRegressor

# Voting ensemble over six of the baseline models.
_ensemble1_members = [
    ("ridgecv", ridge_model),
    ("catboost", cat_base_model),
    ("gbr", gbr_model),
    ("lassocv", lasso_cv_model),
    ("svr", svr_base_model),
    ("forest", rfr_model),
]
ensemble1_model = VotingRegressor(estimators=_ensemble1_members)

In [None]:
# Fit and score the baseline voting ensemble; its predictions are not kept.
(
    ensemble1_mae,
    ensemble1_rmse,
    ensemble1_r2,
    _,
    ensemble1_model,
) = model_evaluation(ensemble1_model, "Voting Regressor")

**Submit ensemble model to the competition**

In [None]:
# Fit the model
# ensemble_model1.fit(scaled_X, y)

# final_ensemble = ensemble_model1.predict(scaled_test)


# Make predictions and save it to the dataframe
# final_base_ensemble_df = pd.DataFrame({"id":row_id,
                                       # "SalePrice": np.expm1(final_ensemble)})

In [None]:
# final_base_ensemble_df.to_csv("house_price_final_ensemble_base_sub.csv", index=False)

### GridSearchCV for the best hyperparameters

GridSearchCV is an exhaustive search over specified hyperparameter values for an estimator. It allows us to find the best combination of parameters for a chosen model. I will split the data again with a test size of 0.1.

In [None]:
from sklearn.model_selection import GridSearchCV

def model_gridsearchCV(algo, param, name):
    """
    Run a 3-fold grid search for an estimator class and score the result.

    The data is taken from the notebook-level globals ``scaled_Xtrain``,
    ``y_train``, ``scaled_Xtest`` and ``y_test``.

    Parameters
    ----------
    algo : estimator class
        The class itself (e.g. ``Ridge``); it is instantiated here with
        no arguments.
    param : dict
        Parameter grid handed to ``GridSearchCV``.
    name : str
        Label used in the printed report.

    Returns
    -------
    tuple
        ``(mae, rmse, r2score, y_pred, model_grid)`` where ``model_grid``
        is the fitted ``GridSearchCV`` object.
    """
    # Instantiate the base model with its default arguments
    model = algo()

    # Grid search scored by R-squared, using all available cores
    model_grid = GridSearchCV(model,
                              param,
                              scoring="r2",
                              # verbose=2,
                              n_jobs=-1,
                              cv=3)
    model_grid.fit(scaled_Xtrain, y_train)

    # Predict the held-out split with the fitted search object
    y_pred = model_grid.predict(scaled_Xtest)

    # Evaluate on the held-out split
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2score = r2_score(y_test, y_pred)

    print(f"**{name} with GridSearchCV**")
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse}")
    # R-squared is a unitless score, not a percentage, so no "%" suffix.
    print(f"R-squared: {r2score:.2f}")

    return mae, rmse, r2score, y_pred, model_grid

**GradientBoostingRegressor**

In [None]:
# Search space for gradient boosting (the loss function is left at its default).
param_grid = {
    # "loss": ["ls", "lad", "huber", "quantile"],
    "learning_rate": [0.01, 0.1, 0.3, 1],
    "subsample": [0.5, 0.2, 0.1],
    "n_estimators": [500, 1000],
    "max_depth": [3, 6, 8],
}

# Predictions from the search are not kept.
(
    gbr_grid_mae,
    gbr_grid_rmse,
    gbr_grid_r2,
    _,
    gbr_grid,
) = model_gridsearchCV(GradientBoostingRegressor, param_grid, "GradientBoostingRegressor")

In [None]:
# Show the best gradient boosting parameters found by the grid search
gbr_grid.best_params_

**Random Forest Regressor**

In [None]:
# Search space for the random forest.
# max_features: 1.0 (use all features) replaces the former 'auto' option,
# which meant the same thing for regressors but is no longer accepted by
# current scikit-learn releases.
param_grid = {"n_estimators": [500, 1000, 1500],
              "max_features": [1.0, 'sqrt'],
              "max_depth": [None, 5, 10],
              "min_samples_split": [2, 5, 10],
              "min_samples_leaf": [1, 2, 5, 10]}

# Predictions from the search are not kept.
rfr_grid_mae, rfr_grid_rmse, rfr_grid_r2, _ , rfr_grid_model = model_gridsearchCV(
    RandomForestRegressor,
    param_grid,
    "RandomForestRegressor")

In [None]:
# Show the best random forest parameters found by the grid search
rfr_grid_model.best_params_

**SVR**

In [None]:
# Search space for the support vector regressor.
param_grid = {
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"],
    "C": [0.1, 0.5, 1, 10],
    "epsilon": [0.1, 0.01, 0.001],
}

(
    svr_grid_mae,
    svr_grid_rmse,
    svr_grid_r2,
    svr_grid_y_pred,
    svr_grid_model,
) = model_gridsearchCV(SVR, param_grid, "SVR")

In [None]:
# Show the best SVR parameters found by the grid search
svr_grid_model.best_params_

In [None]:
# Show the best cross-validated score (the search is scored with R-squared)
svr_grid_model.best_score_

**Ridge**

RidgeCV had the best metrics and I want to see if GridSearchCV can improve the model.

In [None]:
from sklearn.linear_model import Ridge

# Search space for plain Ridge regression.
param_grid = {
    "solver": ["auto", "svd", "lsqr", "saga"],
    "max_iter": [1000, 10000],
    "tol": [1e-3, 1e-2],
    "alpha": [0.1, 1.0, 10.0, 30.0],
}

# Predictions from the search are not kept.
(
    ridge_gr_mae,
    ridge_gr_rmse,
    ridge_gr_r2,
    _,
    ridge_gr_model,
) = model_gridsearchCV(Ridge, param_grid, "Ridge")

In [None]:
# Show the best Ridge parameters found by the grid search
ridge_gr_model.best_params_

**Extreme Gradient Boosting**

There is a warning when running this algorithm, but it should not prevent your code from running, nor should it lead to different results.

In [None]:
# Search space for XGBoost.
param_grid = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
}

# Predictions from the search are not kept.
(
    xboost_gr_mae,
    xboost_gr_rmse,
    xboost_gr_r2,
    _,
    xboost_gr_model,
) = model_gridsearchCV(XGBRegressor, param_grid, "XGBoost")

In [None]:
# Show the best XGBoost parameters found by the grid search
xboost_gr_model.best_params_

**CatBoostRegressor**

In [None]:
# CatBoost grid search.
# NOTE: this cell was commented out, but the cells that follow read
# cat_grid_model, cat_grid_mae, cat_grid_rmse and cat_grid_r2, so it has to
# run for the rest of the notebook to work. The grid has 160 parameter
# combinations, each cross-validated with 3 folds, so expect a long run time.
param_grid = {'iterations': [250, 100, 500, 1000],
              'learning_rate': [0.01, 0.1, 0.2, 0.3],
              'depth': [4, 6],
              'l2_leaf_reg': [3, 1, 5, 10, 100]}

# Predictions from the search are not kept.
cat_grid_mae, cat_grid_rmse, cat_grid_r2, _ , cat_grid_model = model_gridsearchCV(
    CatBoostRegressor,
    param_grid,
    "CatBoost")

**CatBoost with GridSearchCV**
1. MAE: 0.08686650731538992
2. RMSE: 0.12366782337837257
3. R-squared: 0.92

In [None]:
# Show the best CatBoost parameters (requires cat_grid_model from the CatBoost grid-search cell)
cat_grid_model.best_params_

In [None]:
# Collect the grid-search metrics, one row per tuned model.
grCV_metrics_df = pd.DataFrame({"Model":["GradientBoostingRegressor", "RandomForestRegressor",
                                         "SVR", "Ridge", "XGBRegressor", "CatBoost"],

                                "R-square":[gbr_grid_r2, rfr_grid_r2, svr_grid_r2,
                                            ridge_gr_r2, xboost_gr_r2, cat_grid_r2],

                                "RMSE":[gbr_grid_rmse, rfr_grid_rmse, svr_grid_rmse,
                                        ridge_gr_rmse, xboost_gr_rmse, cat_grid_rmse],

                                "MAE":[gbr_grid_mae, rfr_grid_mae, svr_grid_mae,
                                       ridge_gr_mae, xboost_gr_mae, cat_grid_mae]})

# Sort by R-squared, best first. The result used to be stored only under the
# misspelled name "grCV_mertics_df", which left two frames (sorted and
# unsorted) in play; the sorted frame now lives under the correct name.
grCV_metrics_df = grCV_metrics_df.sort_values(by=["R-square"],
                                              ascending=False).reset_index(drop=True)
# Alias kept so cells that still use the misspelled name keep working.
grCV_mertics_df = grCV_metrics_df

print("**GridSearchCV Models Metrics**")
grCV_metrics_df

In [None]:
# Visualize the grid-search metrics table.
# All three layers read the same frame, the one sorted by R-square (stored
# under the name "grCV_mertics_df" (sic)), so the bar order and the plotted
# data can no longer come from two different frames.
fig, ax = plt.subplots(figsize=(8,5))

list_order = list(grCV_mertics_df['Model'].values)
# R-squared
sns.barplot(x="Model", y="R-square",
            data=grCV_mertics_df, ax=ax,
            palette="magma", order=list_order)
# Root Mean Squared Error
sns.lineplot(x="Model", y="RMSE", data=grCV_mertics_df,
             color="red", ax=ax, legend='brief', label="rmse")
# Mean Absolute Error
sns.lineplot(x="Model", y="MAE", data=grCV_mertics_df,
             color='green', ax=ax, legend='brief', label="mae")

plt.xticks(rotation=45, horizontalalignment="right")
plt.title("Regression Models with GridSearchCV Metrics")
plt.ylabel("R_squared")
plt.legend();

###  Ensemble model with best parameters

In [None]:
# Voting ensemble mixing tuned and baseline models.
# For the grid-searched members, `best_estimator_` is the model refitted with
# the best parameters; `.estimator` is only the untuned template that was
# handed to GridSearchCV, so using it would discard the search results.
# The forest entry uses the baseline forest itself: `rfr_model.base_estimator`
# is not a random forest.
ensemble2_model = VotingRegressor(estimators=[("ridgecv", ridge_gr_model.best_estimator_),
                                              ("catboost", cat_grid_model.best_estimator_),
                                              ("gbr", gbr_grid.best_estimator_),
                                              ("lassocv", lasso_cv_model),
                                              ("svr", svr_base_model),
                                              ("forest", rfr_model)])

In [None]:
# Fit the second voting ensemble on the training split
ensemble2_model.fit(scaled_Xtrain, y_train)

In [None]:
# Evaluate the second voting ensemble on the held-out split
ensemble2_y_pred = ensemble2_model.predict(scaled_Xtest)

ensemble2_mae = mean_absolute_error(y_test, ensemble2_y_pred)
ensemble2_rmse = np.sqrt(mean_squared_error(y_test, ensemble2_y_pred))

# R-squared
ensemble2_r2 = r2_score(y_test, ensemble2_y_pred)

print("**VotingRegressor Metrics**")
print(f"**MAE: {ensemble2_mae}")
print(f"**RMSE: {ensemble2_rmse}")
# R-squared is a unitless score, not a percentage, so no "%" suffix.
print(f"**R-squared: {ensemble2_r2:.2f}")

### Make predictions and submit to Kaggle

In [None]:
# Final voting ensemble built from the tuned models.
# `best_estimator_` carries the parameters selected by each grid search;
# `.estimator` is only the untuned template that was handed to GridSearchCV,
# so using it would throw the tuning away.
best_ensemble = VotingRegressor(estimators=[("gbr", gbr_grid.best_estimator_),
                                            ("forest", rfr_grid_model.best_estimator_),
                                            ("svr", svr_grid_model.best_estimator_),
                                            ("ridge", ridge_gr_model.best_estimator_),
                                            ("xgboost", xboost_gr_model.best_estimator_),
                                            ("catboost", cat_grid_model.best_estimator_)])

In [None]:
# Train the final ensemble on the full (scaled) training data.
best_ensemble.fit(scaled_X, y)

# Predict the competition test set.
final_ensemble2 = best_ensemble.predict(scaled_test)

# Build the submission frame. Predictions go through expm1 first
# (presumably the target was log1p-transformed upstream; confirm).
final_ensemble_df = pd.DataFrame(
    {
        "id": row_id,
        "SalePrice": np.expm1(final_ensemble2),
    }
)

In [None]:
# Write the submission file without the DataFrame index column
final_ensemble_df.to_csv("house_price_grid_ensemble_sub.csv", index=False)

I have to admit that spending all that time on tuning didn't help to improve the model in this case. After submission to the Kaggle competition I ended up in the top 8%. Now I need to figure out what to do next, or which techniques I could implement to make this model even more robust. I've got some ideas in mind already; anyway, back to reading and searching. So, if you Kagglers have some ideas, let me know. Don't forget to leave feedback.