## Imports

In [8]:
%pip install lightgbm
%pip install catboost
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, cross_validate
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

In [10]:
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)

## Methods

In [11]:
def load_train():
    data = pd.read_csv("../HPP/train_preprocessed.csv")
    return data

def load_test():
    data = pd.read_csv("../HPP/test_preprocessed.csv")
    return data

######################################################
# Base Models
######################################################


def base_models(X, y, scoring = 'neg_root_mean_squared_error'):
    print("Base Models....")
    models = [('LR', LinearRegression()),
                   ("Ridge", Ridge()),
                   ("Lasso", Lasso()),
                   ("ElasticNet", ElasticNet()),
                   ('KNN', KNeighborsRegressor()),
                   ('CART', DecisionTreeRegressor()),
                   ('RF', RandomForestRegressor()),
                   ('SVR', SVR()),
                   ('GBM', GradientBoostingRegressor()),
                   ("XGBoost", XGBRegressor(objective = 'reg:squarederror')),
                   ("LightGBM", LGBMRegressor()),
                   ("CatBoost", CatBoostRegressor(verbose=False))
          ]

    for name, regressor in models:
        cv_results = cross_validate(regressor, X, y, cv = 5, scoring = scoring)
        print(f"{scoring}: {round(-cv_results['test_score'].mean(), 4)} ({name}) ")

    #for name, regressor in models:
        #rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=5, scoring=scoring)))
        #print(f"RMSE: {round(rmse, 4)} ({name}) ")



######################################################
# Automated Hyperparameter Optimization
######################################################

# Ridge Regression parameters
ridge_params = {"alpha": [0.1, 1.0, 10.0, 100.0]}

# Lasso Regression parameters
lasso_params = {"alpha": [0.01, 0.1, 1.0, 10.0]}

# ElasticNet parameters
elasticnet_params = {"alpha": [0.1, 1.0, 10.0],
                     "l1_ratio": [0.1, 0.5, 0.9]}

# K-Nearest Neighbors parameters
knn_params = {"n_neighbors": range(2, 50)}

# Decision Tree Regressor parameters
cart_params = {'max_depth': range(1, 20),
               "min_samples_split": range(2, 30)}

# Random Forest Regressor parameters
rf_params = {"max_depth": [8, 15, None],
             "max_features": [5, 7, "sqrt"],
             "min_samples_split": [15, 20],
             "n_estimators": [200, 300]}

# Support Vector Regressor parameters
svr_params = {'kernel': ['linear', 'rbf'],
              'C': [0.1, 1, 10],
              'gamma': ['scale', 'auto']}

# Gradient Boosting Regressor parameters
gbm_params = {"learning_rate": [0.01, 0.1],
              "n_estimators": [100, 200],
              "max_depth": [3, 5, 7]}

# XGBoost parameters
xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8],
                  "n_estimators": [100, 200]}

# LightGBM parameters
lightgbm_params = {"learning_rate": [0.01, 0.1],
                   "n_estimators": [300, 500]}

# CatBoost parameters
catboost_params = {"depth": [4, 6, 10],
                   "learning_rate": [0.01, 0.1],
                   "iterations": [100, 200]}

# Creating a list of models and their respective parameter grids
regressors = [
    ("Ridge", Ridge(), ridge_params),
    ("Lasso", Lasso(), lasso_params),
    ("ElasticNet", ElasticNet(), elasticnet_params),
    ('KNN', KNeighborsRegressor(), knn_params),
    ('CART', DecisionTreeRegressor(), cart_params),
    ('RF', RandomForestRegressor(), rf_params),
    #('SVR', SVR(), svr_params),
    ('GBM', GradientBoostingRegressor(), gbm_params),
    ("XGBoost", XGBRegressor(objective='reg:squarederror'), xgboost_params),
    ("LightGBM", LGBMRegressor(), lightgbm_params),
    ("CatBoost", CatBoostRegressor(verbose=False), catboost_params)
]



def hyperparameter_optimization(X, y, cv = 5, scoring = "neg_root_mean_squared_error"):
    print("Hyperparameter Optimization....")
    best_models = {}
    for name, regressor, params in regressors:
        print(f"########## {name} ##########")
        cv_results = cross_validate(regressor, X, y, cv = cv, scoring = scoring)
        print(f"{scoring} (Before): {round(-cv_results['test_score'].mean(), 4)}")

        gs_best = GridSearchCV(regressor, params, cv=cv, n_jobs=-1, verbose=False).fit(X, y)
        final_model = regressor.set_params(**gs_best.best_params_)

        cv_results = cross_validate(final_model, X, y, cv = cv, scoring = scoring)
        print(f"{scoring} (After): {round(-cv_results['test_score'].mean(), 4)}")
        print(f"{name} best params: {gs_best.best_params_}", end = "\n\n")
        best_models[name] = final_model
    return best_models


######################################################
# Stacking & Ensemble Learning
######################################################

def voting_regressor(best_models, X, y):
    print("Voting Regressor...")

    voting_reg = VotingRegressor(estimators=[('CatBoost', best_models["CatBoost"]),
                                              ('GBM', best_models["GBM"]),
                                              ('XGBoost', best_models["XGBoost"]),
                                              ('Lasso', best_models["Lasso"]),
                                              ('RF', best_models["RF"]),
                                              ('LightGBM', best_models["LightGBM"]),
                                              ('ElasticNet', best_models["ElasticNet"]),
                                              #('Ridge', best_models["Ridge"]),
                                              ], 
                                ).fit(X, y)

    cv_results = cross_validate(voting_reg, X, y, cv = 5, scoring = 'neg_root_mean_squared_error')
    print(f"RMSE: {-cv_results['test_score'].mean()}")
    return voting_reg

# MODEL

In [21]:
data = load_train()
columns = data.columns.tolist()

In [22]:
scaler = RobustScaler()

data = pd.DataFrame(scaler.fit_transform(data))

ValueError: could not convert string to float: 'LotFrontage'

In [20]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,193,194,195,196,197,198,199,200,201,202
0,-0.257516,-0.267660,0.5,0.0,0.652174,0.243243,1.033190,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.464671,0.029682,0.0,3.0,0.065217,-0.486486,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.100761,0.399288,0.5,0.0,0.608696,0.216216,0.996140,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.535329,0.017513,0.5,0.0,-1.260870,-0.648649,0.000000,0.0,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.634651,0.951802,1.0,0.0,0.586957,0.162162,1.146142,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-0.421564,-0.419488,0.0,0.0,0.565217,0.162162,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1456,0.675896,0.767380,0.0,1.0,0.108696,-0.162162,0.936248,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1457,-0.204486,-0.109865,0.5,4.0,-0.695652,0.324324,0.000000,2.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1458,-0.100761,0.057911,-0.5,1.0,-0.500000,0.054054,0.000000,0.0,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
X = data.drop('SalePrice', axis = 1)
y = data["SalePrice"]

KeyError: "['SalePrice'] not found in axis"

In [None]:
y = np.log1p(y)

In [None]:
y = np.ravel(y.values)

In [None]:
base_models(X, y)

In [None]:
best_models = hyperparameter_optimization(X, y)

In [None]:
voting_reg = voting_regressor(best_models, X, y)

In [None]:
test_df = load_test()
print(test_df.shape)
print(X.shape)
scaler_test = RobustScaler()
test_df = pd.DataFrame(scaler_test.fit_transform(test_df))

In [None]:
#model = GradientBoostingRegressor()
#model.fit(X, y)
#predictions = model.predict(test_df)

#predictions = final_model.predict(test_df, axis=1)

predictions = voting_reg.predict(test_df)

dictionary = {"Id":test_df.index + 1461, "SalePrice":predictions}
dfSubmission = pd.DataFrame(dictionary)

In [None]:
dfSubmission['SalePrice'] = np.exp(dfSubmission['SalePrice'])
dfSubmission['SalePrice'] = pd.DataFrame(scaler.inverse_transform(dfSubmission['SalePrice']))

In [None]:
dfSubmission.to_csv("housePricePredictions.csv", index=False)