In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from pandas.core.common import random_state
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
%pip install xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.exceptions import ConvergenceWarning
import pickle

Note: you may need to restart the kernel to use updated packages.


In [32]:
# This script contains:  
#   - prepared data loading,
#   - defining custom scoring metric,
#   - training and tuning of hyper parameters of such models as (see more in the FittingAndEvaluation notebook):
#                                           * Linear Regression,
#                                           * Ridge Regression,
#                                           * Lasso Regression,
#                                           * Elastic Net,
#                                           * Decision Tree Regressor,
#                                           * Random Forest Regressor, 
#                                           * XGBoost Regressor.
#   - saving models trained on data processed in EDA notebook


# Load the processed data
def load_processed_data(processed_data_path):
    data = pd.read_csv(processed_data_path, index_col = 'Id')
    X = data.copy()  
    y = X['SalePrice']
    X = X.drop(['SalePrice'], axis = 1)
    return X, y


# Define a custom scoring method. (See FittingAndEvaluation notebook why this scorer was choosen)

# My metric for cross validation
def rmse_log(model, X, y):
    y_pred = model.predict(X)
    return np.sqrt(mean_squared_error(np.log1p(y), np.log1p(y_pred)))

# My metric for GridSearch
def rmse_log_for_gridsearch(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.log(y_true), np.log(y_pred)))

custom_scorer = make_scorer(rmse_log_for_gridsearch, greater_is_better=False)


# Linear Regression
def linear_regression_fit(X, y, rmse_log):
    
    linear_regression = LinearRegression()

    # Get cross-validation score
    linear_regression_scores = cross_val_score(linear_regression,
                            X,
                            y,
                            cv = 5,
                            scoring = rmse_log)

    print(linear_regression_scores.mean())
    
    return linear_regression


# Ridge Regression
def ridge_regression_tune_fit(X, y, custom_scorer, alpha = [2,641]):
    # Create a model sample
    ridge_sample = Ridge()

    # Set the search area for GridSearch
    ridge_hyper_params = {'alpha': alpha, 'random_state': [0]}

    # Create a GridSearch sample and fit the model
    ridge_regression = GridSearchCV(ridge_sample, ridge_hyper_params, scoring = custom_scorer, cv = 5)
    ridge_regression.fit(X, y)

    # Print best parameters and best score
    print('Best value of λ: ', ridge_regression.best_params_)
    print('Best score: ', ridge_regression.best_score_)

    # Give model best parameters
    ridge_best_params = ridge_regression.best_params_
    ridge_regression = Ridge(**ridge_best_params)

    return ridge_regression



# Lasso Regression 
def lasso_regression_tune_fit(X, y, custom_scorer, alpha = [41]):
    # I don't want to overload the output of the Lasso regression
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    # Create a model sample
    lasso_sample = Lasso()

    # Set the search area for GridSearch
    lasso_hyper_params = {'alpha': alpha, 'random_state': [0]}
    
    # Create a GridSearch sample and fit the model
    lasso_regression = GridSearchCV(lasso_sample, lasso_hyper_params, scoring = custom_scorer, cv = 5)
    lasso_regression.fit(X, y)

    # Print best parameters and best score
    print('best alpha: ', lasso_regression.best_params_)
    print('score: ', lasso_regression.best_score_)

    # Give model best parameters
    lasso_best_params = lasso_regression.best_params_
    lasso_regression = Lasso(**lasso_best_params)

    return lasso_regression


# Elastic Net  
def elastic_net_tune_fit(X, y, custom_scorer, alpha = [41], l1_ratio = [1]):
    # I don't want to overload the output of the Elastic Net
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    
    # Create a model sample
    elastic_net_sample = ElasticNet()

    # Set the search area for GridSearch
    elnet_hyper_params = {'alpha': alpha, 'l1_ratio': l1_ratio, 'random_state': [0]}

    # Create a GridSearch sample and fit the model
    elastic_net = GridSearchCV(elastic_net_sample, elnet_hyper_params, scoring = custom_scorer, cv = 5)
    elastic_net.fit(X, y)

    # Print best parameters and best score
    print('best alpha and l1_ratio: ', elastic_net.best_params_)
    print('score: ', elastic_net.best_score_)

    # Give model best parameters
    elnet_best_params = elastic_net.best_params_
    elastic_net = ElasticNet(**elnet_best_params)

    return elastic_net


# DecisionTree   
def decision_tree_tune_fit(X, y, custom_scorer, max_depth = [6], min_samples_split = [2], min_samples_leaf = [6], max_features = [35], 
                           min_impurity_decrease = [0], ccp_alpha = [0]):
    # Create a model sample
    decision_tree_sample = DecisionTreeRegressor()

    # Set the search area for GridSearch
    decision_tree_hyper_params = {'max_depth': max_depth,
                                'min_samples_split': min_samples_split,
                                'min_samples_leaf': min_samples_leaf,
                                'max_features': max_features,
                                'random_state': [0],
                                'min_impurity_decrease': min_impurity_decrease,
                                'ccp_alpha': ccp_alpha
                                }
    
    # Create a GridSearch sample and fit the model
    decision_tree_regressor = GridSearchCV(decision_tree_sample, decision_tree_hyper_params, 
                                        scoring = custom_scorer, cv = 5)
    decision_tree_regressor.fit(X, y)

    # Print best parameters and best score
    print('Best DT params: ', decision_tree_regressor.best_params_)
    print('Best score: ', decision_tree_regressor.best_score_)

    # Give model best parameters
    decision_tree_best_params = decision_tree_regressor.best_params_
    decision_tree_regressor = DecisionTreeRegressor(**decision_tree_best_params)

    return decision_tree_regressor


# Random Forest   
def random_forest_tune_fit(X, y, custom_scorer, n_estimators = [1150], max_depth = [27], min_samples_split = [3], 
                           min_samples_leaf = [1], max_features = [12]):
    # Create a model sample
    random_forest_sample = RandomForestRegressor()

    # Set the search area for GridSearch
    random_forest_hyper_params = {'n_estimators': n_estimators,
                                'max_depth': max_depth, 
                                'min_samples_split': min_samples_split,
                                'min_samples_leaf': min_samples_leaf,
                                'max_features': max_features,
                                'random_state': [0],
                                'n_jobs': [-1]
                                }
    
    # Create a GridSearch sample and fit the model
    random_forest_regressor = GridSearchCV(random_forest_sample, random_forest_hyper_params, 
                                        scoring = custom_scorer, cv = 5)
    random_forest_regressor.fit(X, y)

    # Print best parameters and best score
    print('Best parameters: ', random_forest_regressor.best_params_)
    print('Best score: ', random_forest_regressor.best_score_)

    # Give model best parameters
    random_forest_best_params = random_forest_regressor.best_params_
    random_forest_regressor = RandomForestRegressor(**random_forest_best_params)

    return random_forest_regressor


# # Extreme Gradient Boosting  
def xgboost_tune_fit(X, y, custom_scorer, max_depth = [3], min_child_weight = [7], gamma = [0], subsample = [1], colsample_bytree = [1],
                     reg_alpha = [0], reg_lambda = [1]):
    # Split the data to use eval_set in selecting the best values for n_estimators and learning_rate
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 0)

    # Find the best values for n_estimators and learning_rate
    # The best learning_rate was found through experimentation and multiple code executions. Here you can only see the result
    xgb_regressor_presearch = XGBRegressor(n_estimators = 1000, learning_rate = 0.25, random_state = 0) 
    xgb_regressor_presearch.fit(X_train, y_train,
                    early_stopping_rounds = 100,
                    eval_set = [(X_valid, y_valid)],
                    verbose = False)

    print("Best value for n_estimators: ", xgb_regressor_presearch.best_iteration)

    # Create features for best iteretion and best learning rate
    xgb_best_iteration = xgb_regressor_presearch.best_iteration
    xgb_best_learning_rate = 0.25

    # Find the best values for all other parameters using GridSearchCV
    # All the values were found through experimentation and multiple code executions. Here you can only see the result
    xgb_regressor_sample = XGBRegressor()
    xgb_hyper_params = {'n_estimators': [xgb_best_iteration],
                        'learning_rate': [xgb_best_learning_rate],
                        'max_depth': max_depth, 
                        'min_child_weight': min_child_weight,  
                        'gamma': gamma,
                        'subsample': subsample,
                        'colsample_bytree': colsample_bytree,
                        'reg_alpha': reg_alpha,
                        'reg_lambda': reg_lambda,
                        'random_state': [0]
                    }

    # Create a GridSearch sample and fit the model
    xgb_regressor = GridSearchCV(xgb_regressor_sample, xgb_hyper_params, scoring = custom_scorer, cv = 10)
    xgb_regressor.fit(X, y)

    # Give model best parameters
    print('Best parameters: ', xgb_regressor.best_params_)
    print('Best score: ', xgb_regressor.best_score_)

    # Give model best parameters
    xgb_best_params = xgb_regressor.best_params_
    xgb_regressor = XGBRegressor(**xgb_best_params)

    return xgb_regressor




# Save models
def save_models(X, y, rmse_log, custom_scorer):
    # Create a list for storing models
    models = []

    # Add linear regression
    models.append(linear_regression_fit(X, y, rmse_log))

    # Add other models. (I exclude elastic net, because in my case it's equal to lasso regression)
    for function in (ridge_regression_tune_fit, lasso_regression_tune_fit, decision_tree_tune_fit,
                random_forest_tune_fit, xgboost_tune_fit):
        new_model = function(X, y, custom_scorer)
        models.append(new_model)
    
    # Check that right hyper parameters were saved
    print(models)
    
    # Save models to a file
    with open('models.pkl', 'wb') as f:
        pickle.dump(models, f)
    
    return models

In [29]:
data_file_path = '../Data/train_data_processed.csv'

X, y = load_processed_data(data_file_path)

In [33]:
save_models(X, y, rmse_log, custom_scorer)

0.11992314746501571
Best value of λ:  {'alpha': 2, 'random_state': 0}
Best score:  -0.11924837322151323
best alpha:  {'alpha': 41, 'random_state': 0}
score:  -0.11832225699293708
Best DT params:  {'ccp_alpha': 0, 'max_depth': 6, 'max_features': 35, 'min_impurity_decrease': 0, 'min_samples_leaf': 6, 'min_samples_split': 2, 'random_state': 0}
Best score:  -0.14731339342578062
Best parameters:  {'max_depth': 27, 'max_features': 12, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 1150, 'n_jobs': -1, 'random_state': 0}
Best score:  -0.11047975622091558




Best value for n_estimators:  18
Best parameters:  {'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.25, 'max_depth': 3, 'min_child_weight': 7, 'n_estimators': 18, 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 1}
Best score:  -0.11599717485490679
[LinearRegression(), Ridge(alpha=2, random_state=0), Lasso(alpha=41, random_state=0), DecisionTreeRegressor(ccp_alpha=0, max_depth=6, max_features=35,
                      min_impurity_decrease=0, min_samples_leaf=6,
                      random_state=0), RandomForestRegressor(max_depth=27, max_features=12, min_samples_split=3,
                      n_estimators=1150, n_jobs=-1, random_state=0), XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None, colsample_bytree=1,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, grow_policy=None,
             importance_type=No

[LinearRegression(),
 Ridge(alpha=2, random_state=0),
 Lasso(alpha=41, random_state=0),
 DecisionTreeRegressor(ccp_alpha=0, max_depth=6, max_features=35,
                       min_impurity_decrease=0, min_samples_leaf=6,
                       random_state=0),
 RandomForestRegressor(max_depth=27, max_features=12, min_samples_split=3,
                       n_estimators=1150, n_jobs=-1, random_state=0),
 XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None, colsample_bytree=1,
              device=None, early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.25, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=7, missing=nan,
              mon

In [25]:
linear_regression_fit(X, y, rmse_log)

0.11992314746501571


In [26]:
with open('linear_regression.pkl', 'rb') as f:
    linear_regression = pickle.load(f)

In [20]:
test_data = pd.read_csv('../Data/test_data_processed.csv')

In [27]:
linear_regression_prediction = linear_regression.predict(test_data)

NotFittedError: This LinearRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [1]:
def environment_setting():
    pip install -r '../requirements.txt'
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import LabelEncoder
    from sklearn.feature_selection import mutual_info_regression

SyntaxError: invalid syntax (Temp/ipykernel_11160/3049604703.py, line 2)