In [1]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from xgboost import XGBRegressor


# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
warnings.filterwarnings('ignore')

  plt.style.use("seaborn-whitegrid")


In [2]:
#Read Data
def load_data():
    data_directory = Path("/kaggle/input/house-prices-advanced-regression-techniques/")
    df_train = pd.read_csv(data_directory / "train.csv", index_col = "Id")
    df_test = pd.read_csv(data_directory / "test.csv", index_col = "Id")

    #Merging data for preprocessing
    df = pd.concat([df_train, df_test])
    
    #Preprocessing
    df = clean(df)
    df = encode(df)
    #Reform splits
    df_train = df.loc[df_train.index, :]
    df_test = df.loc[df_test.index, :]

    #dropping NA in target column
    df_train.dropna(subset=["SalePrice"], inplace=True)

    #X_train, X_valid = train_test_split(df_train, test_size=0.3, random_state=6)
    #X_train, X_valid, df_test = impute(X_train, X_valid, df_test)
    #X = pd.concat([X_train, X_valid])
    return df_train, df_test

In [3]:
def clean(df):
    df["Exterior2nd"] = df["Exterior2nd"].replace({"Brk Cmn":"BrkComm", "Wd Shng":"WdShing", "CmentBd":"CemntBd"})
    df["MSZoning"] = df["MSZoning"].replace({"C (all)":"C"})
    df["Neighborhood"] = df["Neighborhood"].replace({"NAmes":"Names"})
    df["BldgType"] = df["BldgType"].replace({"2fmCon":"2FmCon", "Duplex":"Duplx", "Twnhs":"TwnhsI"})

    df["GarageYrBlt"] = df["GarageYrBlt"].where(df.GarageYrBlt <= 2010, df.YearBuilt)

    df.rename(columns = {
        "1stFlrSF":"FirstFlrSF",
        "2ndFlrSF":"SecondFlrSF",
        "3SsnPorch":"Threeseasonporch"
    }, inplace = True)
    return df

In [4]:
# The numeric features are already encoded correctly (`float` for
# continuous, `int` for discrete), but the categoricals we'll need to
# do ourselves. Note in particular, that the `MSSubClass` feature is
# read as an `int` type, but is actually a (nominative) categorical.

# The nominative (unordered) categorical features
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig", 
                "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", 
                "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", 
                "Heating", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]


# The ordinal (ordered) categorical features 

# Pandas calls the categories "levels"
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(10))

ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

#Add a none for missing values
ordered_levels = {key:["None"]+values for key,values in ordered_levels.items()}
ordered_cols = [col for col,values in ordered_levels.items()]

#high_cardinality_nom_cols = ["MSSubClass","Neighborhood","Exterior1st","Exterior2nd"]
#low_cardinality_nom_cols = df[features_nom].drop(high_cardinality_nom_cols, axis=1)

In [5]:
def encode(df):
    #Nominal categories
    for name in features_nom:
        df[name] = df[name].astype("category")
        #Add a none for missing values
        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")

    #Ordinal categories
    for name,values in ordered_levels.items():
        df[name] = df[name].astype(CategoricalDtype(values,
                                                    ordered=True))
    return df

In [6]:
def impute(X_train, X_valid, df_test):
    df = pd.concat([X_train, X_valid, df_test])
    cat_cols = features_nom + ordered_cols
    numerical_cols = df.drop(cat_cols, axis=1).select_dtypes("number").columns

    X_train = df.loc[X_train.index,:]
    X_valid = df.loc[X_valid.index,:]
    df_test = df.loc[df_test.index,:]

    num_imputer = SimpleImputer(strategy = "mean")
    cat_imputer = SimpleImputer(strategy = "constant", fill_value = "0")

    #numerical imputation
    imputed_num_X_train = pd.DataFrame(num_imputer.fit_transform(X_train[numerical_cols]))
    imputed_num_X_valid = pd.DataFrame(num_imputer.transform(X_valid[numerical_cols]))
    imputed_num_df_test = pd.DataFrame(num_imputer.transform(df_test[numerical_cols]))

    imputed_num_X_train.columns = X_train[numerical_cols].columns
    imputed_num_X_valid.columns = X_valid[numerical_cols].columns
    imputed_num_df_test.columns = df_test[numerical_cols].columns

    #categorical imputation
    imputed_cat_X_train = pd.DataFrame(cat_imputer.fit_transform(X_train[cat_cols]))
    imputed_cat_X_valid = pd.DataFrame(cat_imputer.transform(X_valid[cat_cols]))
    imputed_cat_df_test = pd.DataFrame(cat_imputer.transform(df_test[cat_cols]))

    imputed_cat_X_train.columns = X_train[cat_cols].columns
    imputed_cat_X_valid.columns = X_valid[cat_cols].columns
    imputed_cat_df_test.columns = df_test[cat_cols].columns

    #merging and return
    X_train = pd.concat([imputed_num_X_train, imputed_cat_X_train], axis=1, join="outer")
    X_valid = pd.concat([imputed_num_X_valid, imputed_cat_X_valid], axis=1, join="outer")
    df_test = pd.concat([imputed_num_df_test, imputed_cat_df_test], axis=1, join="outer")

    return X_train, X_valid, df_test

In [7]:
df_train, df_test = load_data()
X_train, X_valid = train_test_split(df_train, test_size=0.3, random_state=6)

In [8]:
#Imputation(Preprocessing)
df = pd.concat([X_train, X_valid, df_test])

cat_cols = features_nom + ordered_cols

numerical_cols = df.drop(cat_cols, axis=1).select_dtypes("number").columns

#num_col_missing = [col for col in numerical_cols if df[col].isnull().any()]

#cat_col_missing = [col for col in cat_cols if df[col].isnull().any()]
    

X_train = df.loc[X_train.index,:]

X_valid = df.loc[X_valid.index,:]

df_test = df.loc[df_test.index,:]


num_imputer = SimpleImputer(strategy = "median")

cat_imputer = SimpleImputer(strategy = "most_frequent")
    

#numerical imputation

imputed_num_X_train = pd.DataFrame(num_imputer.fit_transform(X_train[numerical_cols]))

imputed_num_X_valid = pd.DataFrame(num_imputer.transform(X_valid[numerical_cols]))

imputed_num_df_test = pd.DataFrame(num_imputer.transform(df_test[numerical_cols]))



imputed_num_X_train.columns = X_train[numerical_cols].columns

imputed_num_X_valid.columns = X_valid[numerical_cols].columns

imputed_num_df_test.columns = df_test[numerical_cols].columns




#categorical imputation

imputed_cat_X_train = pd.DataFrame(cat_imputer.fit_transform(X_train[cat_cols]))

imputed_cat_X_valid = pd.DataFrame(cat_imputer.transform(X_valid[cat_cols]))

imputed_cat_df_test = pd.DataFrame(cat_imputer.transform(df_test[cat_cols]))



imputed_cat_X_train.columns = X_train[cat_cols].columns

imputed_cat_X_valid.columns = X_valid[cat_cols].columns

imputed_cat_df_test.columns = df_test[cat_cols].columns


#merging and return

X_train = pd.concat([imputed_num_X_train, imputed_cat_X_train], axis=1, join="outer")

X_valid = pd.concat([imputed_num_X_valid, imputed_cat_X_valid], axis=1, join="outer")

df_test = pd.concat([imputed_num_df_test, imputed_cat_df_test], axis=1, join="outer")

df_train = pd.concat([X_train, X_valid])


In [9]:
df_train = pd.concat([X_train, X_valid])

In [10]:
def score_dataset(X, y, model=XGBRegressor()):
    #Label Encoding
    X = X.apply(lambda x : pd.factorize(x)[0])

    #we need RMSLE(Root mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error"
    )
    score = -1*score.mean()
    score = np.sqrt(score)
    return score

In [11]:
"""
X = df_train.copy()
y = X.pop("SalePrice")

BaselineScore = score_dataset(X, y)
print(f"Baseline Score: {BaselineScore:.5f} RMSLE")
"""

'\nX = df_train.copy()\ny = X.pop("SalePrice")\n\nBaselineScore = score_dataset(X, y)\nprint(f"Baseline Score: {BaselineScore:.5f} RMSLE")\n'

In [12]:
def make_mi_scores(X, y, discrete_fea=True):
    X = X.copy()
    #Label Encoding
    for colname in X.select_dtypes(["category", "object"]):
        X[colname],_ = X[colname].factorize()
    #All discrete features should now have integer data types
    if discrete_fea:
        discrete_fea = X.dtypes == "int"
    mi_scores = mutual_info_regression(X, y, discrete_features = discrete_fea, random_state = 6)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index = X.columns)
    mi_scores = mi_scores.sort_values(ascending = False)
    return mi_scores

def plot_mi_scores(scores):
    scores = scores.sort_values(ascending = True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

In [13]:
def drop_uninformative(X, mi_scores):
    return X.loc[:, mi_scores > 0.0]

In [14]:
"""
X = df_train.copy()
y = X.pop("SalePrice")
mi_scores = make_mi_scores(X, y)
X = drop_uninformative(X, mi_scores)

score_dataset(X, y)
#mi_scores.head(20)
"""

'\nX = df_train.copy()\ny = X.pop("SalePrice")\nmi_scores = make_mi_scores(X, y)\nX = drop_uninformative(X, mi_scores)\n\nscore_dataset(X, y)\n#mi_scores.head(20)\n'

In [15]:
#Feature Engineering_Creating Features
def label_encode(df):
    X = df.copy()
    for colname in X.select_dtypes(["category", "object"]):
        X[colname],_ = X[colname].factorize()
    return X

def mathematical_transforms(df):
    X = pd.DataFrame()
    X["LivLotRatio"] = df.GrLivArea / df.LotArea
    X["Spaciousness"] = (df.FirstFlrSF + df.SecondFlrSF) / df.TotRmsAbvGrd
    return X

def interactions(df):
    X1 = pd.get_dummies(df.BldgType, prefix = "Bldg")
    X1 = X1.mul(df.GrLivArea, axis = 0)
    return X1

def more_interactions(df):
    X = df.copy()
    X1 = pd.DataFrame()
    X["OverallQual"],_ = X["OverallQual"].factorize()
    X["OverallCond"],_ = X["OverallCond"].factorize()
    X1["ResultCond"] = (X.OverallQual * X.OverallCond)

    X["BsmtQual"],_ = X["BsmtQual"].factorize()
    X1["BsmtWeightage"] = (X.BsmtQual * X.TotalBsmtSF)
    return X1
    
def linearize_area(df):
    X = pd.DataFrame()
    area_features = ["WoodDeckSF","OpenPorchSF","EnclosedPorch",
                     "FirstFlrSF","SecondFlrSF","ScreenPorch","PoolArea","LotArea","GarageArea",
                     "GrLivArea","MasVnrArea","BsmtUnfSF","TotalBsmtSF"]
    for colname in area_features:
        X[colname + "_squareroot(in ft)"] = np.sqrt(df[colname])
    return X

def counts(df):
    X1 = pd.DataFrame()
    X1 = df[["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "Threeseasonporch", "ScreenPorch"]].gt(0.0).sum(axis=1)
    return X1

def break_down(df):
    X1 = pd.DataFrame()
    X1["MSClass"] = df.MSSubClass.str.split("_", n=1, expand = True)[0]
    return X1

def group_transforms(df):
    X1 = pd.DataFrame()
    X1["MedNhbdArea"] = df.groupby("Neighborhood")["GrLivArea"].transform("median")
    X1["VarFromMed"] = (df.GrLivArea - df.MedNhbdArea)
    return X1

In [16]:
#Clustering with K-Means
cluster_features = ["LotArea", "TotalBsmtSF", "FirstFlrSF", "SecondFlrSF", "GrLivArea", "GarageArea"]

def cluster_labels(df, features, n_clusters = 20):
    X = df.copy()
    X_scaled = X.loc[:, features]
    X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
    kmeans = KMeans(n_clusters = n_clusters, n_init = 50, random_state = 6)
    X_new = pd.DataFrame()
    X_new["Cluster"] = kmeans.fit_predict(X_scaled)
    return X_new

def cluster_distance(df, features, n_clusters = 20):
    X = df.copy()
    X_scaled = X.loc[:, features]
    X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
    kmeans = KMeans(n_clusters = n_clusters, n_init = 50, random_state = 6)
    X_cd = kmeans.fit_transform(X_scaled)
    X_cd = pd.DataFrame(
        X_cd, columns = [f"Centroid_{i}" for i in range(X_cd.shape[1])]
    )
    return X_cd

In [17]:
#Principal Component Analysis
def apply_pca(X, standardize = True):
    #Standardize
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)
    #Create Principal Components
    pca = PCA()
    X_pca = pca.fit_transform(X)
    #Convert to DataFrame
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns = component_names)
    #Create Loadings
    loadings = pd.DataFrame(
        pca.components_.T, columns = component_names, index = X.columns
    )
    return pca, X_pca, loadings

def plot_variance(pca, width = 8, dpi = 100):
    #Create Figure
    fig, axs = plt.subplots(2, 1)
    n = pca.n_components_
    grid = np.arange(1, n+1)
    #% Explained Variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel = "Principal Components", title = "% Explained Variance", ylim = (0.0, 0.15)
    )
    #%Cumulative Variance
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-" )
    axs[1].set(
        xlabel = "Principal Components", title = "% Cumulative Variance", ylim = (0.0, 1.0)
    )
    #Set up figure
    fig.set(figwidth = width, dpi = dpi)
    return axs

In [18]:
#X = label_encode(X)
#X["Utilities"]

In [19]:
pca_features = ["GarageArea", "YearRemodAdd", "TotalBsmtSF", "GrLivArea"] #Select only numeric features

def pca_components(df, features):
    X = df.loc[:, features]
    _, X_pca, _ = apply_pca(X)
    return X_pca

def pca_inspired(df):
    X = pd.DataFrame()
    X["Feature 1"] = df.GrLivArea + df.TotalBsmtSF
    X["Feature 2"] = df.YearRemodAdd * df.TotalBsmtSF
    X["Feature 3"] = df.GarageArea * df.YearRemodAdd
    return X

In [20]:
#Outliers Indicator
def indicate_outliers(df):
    X_new = pd.DataFrame()
    X_new["Outlier"] = (df.Neighborhood == "Edwards") & (df.SaleCondition == "Partial")
    return X_new

In [21]:
#Correlation Matrix Plot
def corrplot(df, method = "pearson", annot=True, **kwargs):
    sns.clustermap(
        df.corr(method, numeric_only=True),
        vmin = 0.0,
        vmax = 1.0,
        cmap = "icefire",
        method = "complete",
        annot = annot, 
        **kwargs
    )
#corrplot(X, annot=None)

In [22]:
#Target Encoding(strategy like cross-validation to use 100% data)
class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs
        self.cv_ = KFold(n_splits = 5)
        
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            #print(f"Encoding indices: {idx_encode} and {idx_train}")
            fitted_encoder = self.encoder_(cols = cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode]
            )
            transformed_data = fitted_encoder.transform(X.iloc[idx_train, :])[cols]
            #print(f"Transformed data columns: {transformed_data.columns}")

            X_encoded.append(transformed_data)
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    #for test data - averaging out all the transformations learned through training data
    def transform(self, X):
        from functools import reduce
        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value = 0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

In [23]:
def create_features(df, df_test = None):
    X = df.copy()
    y = X.pop("SalePrice")
    mi_scores = make_mi_scores(X, y)
    #print(len(X))
    #print(len(df_test))
    
    #Combine test and training data
    if df_test is not None:
        X_test = df_test.copy()
        X_test.pop("SalePrice")
        X = pd.concat([X_test, X])

    #print(len(X))
    #Based on MI Scores
    X = drop_uninformative(X, mi_scores)

    #Transformations
    # 1.Mathematical Transformations
    #X = X.join(Mathematical_transforms(X))
    # 2.Interactions
    #X = X.join(interactions(X))
    """
    X1 = pd.get_dummies(df.BldgType, prefix = "Bldg")
    X = X1.mul(X.GrLivArea, axis = 0)
    """
    #X = X.join(more_interactions(X))
    """
    X["OverallQual"],_ = X["OverallQual"].factorize()
    X["OverallCond"],_ = X["OverallCond"].factorize()
    X["BsmtQual"],_ = X["BsmtQual"].factorize()
    
    X["ResultCond"] = (X.OverallQual * X.OverallCond)
    X["BsmtWeightage"] = (X.BsmtQual * X.TotalBsmtSF)
    """
    # 3.Linearize Area by Taking Square Root
    #X = X.join(linearize_area(X))
    
    area_features = ["WoodDeckSF","OpenPorchSF","EnclosedPorch",
                     "FirstFlrSF","SecondFlrSF","ScreenPorch","PoolArea","LotArea","GarageArea",
                     "GrLivArea","MasVnrArea","BsmtUnfSF","TotalBsmtSF"]
    for colname in area_features:
        X[colname] = X[colname].fillna(0)
        new_column_name = colname + "_squareroot(in ft)"
        X[new_column_name] = np.sqrt(X[colname].clip(lower=0))
    
    # 4.Counts
    #X = X.join(counts(X))
    X["Counts_"] = X[["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "ScreenPorch"]].gt(0.0).sum(axis=1)
    # 5.Break Down
    #X = X.join(break_down(X))
    X['MSSubClass'] = X['MSSubClass'].astype(str)
    X["MSClass"] = X.MSSubClass.str.split("_", n=1, expand = True)[0]
    X['MSSubClass'] = X['MSSubClass'].astype("object")
    # 6.Group Transformations
    #X = X.join(group_transforms(X))
    """
    X["MedNhbdArea"] = X.groupby("Neighborhood")["GrLivArea"].transform("median")
    X["VarFromMed"] = (X.GrLivArea - X.MedNhbdArea)
    """
    #Principal Component Analysis (PCA)
    #X = X.join(indicate_outliers(X))
    #X["Outlier"] = (X.Neighborhood == "Edwards") & (X.SaleCondition == "Partial")
    #X = X.join(pca_inspired(X))
    """
    X["Feature 1"] = X.GrLivArea + X.TotalBsmtSF
    X["Feature 2"] = X.YearRemodAdd * X.TotalBsmtSF
    X["Feature 3"] = X.GarageArea * X.YearRemodAdd
    """
    #Clustering
    #X = X.join(cluster_labels(X, cluster_features))
    #X = X.join(cluster_distance(X, cluster_features))

    #Label Encoding
    X = label_encode(X)

    #print(len(X))
    #Reform Splits
    #print(df_test.index.isin(X.index).sum())
    if df_test is not None:
        X.reset_index(drop=True, inplace=True) #It is very very important step otherwise loc may not work
        df_test.reset_index(drop=True, inplace=True)

        X_test = X.loc[df_test.index, :]
        X.drop(df_test.index, inplace=True)

    #print(len(X))
    #print(len(X_test))
    """
    #Removing duplicate indexes by resetting
    X = X.reset_index(drop=True) #It is very very important step
    y = y.reset_index(drop=True)

    #Aligning training and target dataframe for mismatched indexes or NaNs
    X, y = X.align(y, join='inner', axis=0) #It is very very important step so that no. of rows remain
                                            #same in X and y
    #Target Encoding
    encoder = CrossFoldEncoder(MEstimateEncoder, m=1)
    X = X.join(encoder.fit_transform(X, y, cols = ["MSSubClass"]))
    X = X.join(encoder.fit_transform(X, y, cols = ["Neighborhood"]))
    if df_test is not None:
        X_test = X_test.join(encoder.transform(X_test))
    """
    if df_test is not None:
        return X, y, X_test
    else:
        return X, y

In [24]:
X_train, y_train, X_test = create_features(df_train, df_test)

score_dataset(X_train, y_train)

0.17453565383024972

In [25]:
"""
#Optuna Visualization for hyperparamter tuning
import optuna

def objective(trial):
    xgb_params = dict(
        max_depth = trial.suggest_int("max_depth", 2, 10),
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        n_estimators = trial.suggest_int("n_estimators", 1000, 8000),
        min_child_weight = trial.suggest_int("min_child_weight", 1, 10),
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.2, 1.0),
        subsample = trial.suggest_float("subsample", 0.2, 1.0),
        reg_alpha = trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        reg_lambda = trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True)
    )
    xgb = XGBRegressor(**xgb_params)
    return score_dataset(X_train, y_train, xgb)

study = optuna.create_study(direction = "minimize")
study.optimize(objective, n_trials = 20)
xgb_params = study.best_params
"""

'\n#Optuna Visualization for hyperparamter tuning\nimport optuna\n\ndef objective(trial):\n    xgb_params = dict(\n        max_depth = trial.suggest_int("max_depth", 2, 10),\n        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),\n        n_estimators = trial.suggest_int("n_estimators", 1000, 8000),\n        min_child_weight = trial.suggest_int("min_child_weight", 1, 10),\n        colsample_bytree = trial.suggest_float("colsample_bytree", 0.2, 1.0),\n        subsample = trial.suggest_float("subsample", 0.2, 1.0),\n        reg_alpha = trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),\n        reg_lambda = trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True)\n    )\n    xgb = XGBRegressor(**xgb_params)\n    return score_dataset(X_train, y_train, xgb)\n\nstudy = optuna.create_study(direction = "minimize")\nstudy.optimize(objective, n_trials = 20)\nxgb_params = study.best_params\n'

In [26]:
#since trial_9 has minimum error as obtained from optuna visualization
xgb_params = dict(
    max_depth = 8,
    learning_rate = 0.0012352725932045963,
    n_estimators = 7121,
    min_child_weight = 2,
    colsample_bytree = 0.49085760982707777,
    subsample = 0.44421169619647927,
    reg_alpha = 0.00021184484530038537,
    reg_lambda = 0.07172122643270866,
    num_parallel_tree = 1
)
xgb = XGBRegressor(**xgb_params)

In [27]:
xgb.fit(X_train, np.log(y_train))
predictions = np.exp(xgb.predict(X_test))

id_list = []
for i in range(1461, 2920):
    id_list.append(i)
output = pd.DataFrame({"Id":id_list, "SalePrice":predictions})
output.to_csv("my_submission.csv", index = False)
print("Submission saved successfully_RR")
see = pd.read_csv("my_submission.csv")
see.head()

Submission saved successfully_RR


Unnamed: 0,Id,SalePrice
0,1461,127792.78
1,1462,158114.86
2,1463,189654.42
3,1464,188545.52
4,1465,185565.92
