In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
%matplotlib inline

In [2]:
test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")

In [3]:
train_df.head() 

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


<h2>Dealing with outliers</h2>

In [4]:
# outlier notes:
# TotalBsmtSF > 3000 price < 200000 BsmtQual = Ex
# GrLivArea > 4000 price < 200000
# take a look at PoolArea > 550 price > 700000

In [5]:
outlier_indexes = train_df[(train_df["SalePrice"]<200000) & (train_df["TotalBsmtSF"]>3000)].index

In [6]:
outlier_indexes

Index([523, 1298], dtype='int64')

In [7]:
train_df.drop(outlier_indexes,inplace=True)

In [8]:
# everything about this house is perfect it`s a novelty here
train_df[(train_df["PoolArea"]>550) & (train_df["SalePrice"]>700000)]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1182,1183,60,RL,160.0,15623,Pave,,IR1,Lvl,AllPub,...,555,Ex,MnPrv,,0,7,2007,WD,Abnorml,745000


<h2>Handling missing values
</h2>

In [9]:
def percent_missing(df):
    percent_nan = df.isna().sum() / len(df)
    percent_nan = percent_nan[percent_nan > 0].sort_values()
    return percent_nan

In [10]:
percent_missing(train_df)

Electrical      0.000686
MasVnrArea      0.005487
BsmtQual        0.025377
BsmtCond        0.025377
BsmtFinType1    0.025377
BsmtExposure    0.026063
BsmtFinType2    0.026063
GarageCond      0.055556
GarageQual      0.055556
GarageFinish    0.055556
GarageYrBlt     0.055556
GarageType      0.055556
LotFrontage     0.177641
FireplaceQu     0.473251
MasVnrType      0.598080
Fence           0.807270
Alley           0.937586
MiscFeature     0.962963
PoolQC          0.995885
dtype: float64

In [11]:
train_df.drop("Utilities", axis=1, inplace=True)

In [12]:
train_df["LotFrontage"] = train_df.groupby(["Neighborhood","MSZoning"])["LotFrontage"].transform(lambda x:x.fillna(x.mean()))
train_df["LotFrontage"] = train_df.groupby("Neighborhood")["LotFrontage"].transform(lambda x:x.fillna(x.mean()))
train_df.dropna(subset="Electrical",axis=0,inplace=True)

<h2>preprocessing data</h2>

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [14]:
X = train_df.drop("SalePrice",axis=1)
y = np.log(train_df["SalePrice"])
# y = train_df["SalePrice"]
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.2,random_state=42)

In [15]:
categorical_columns = X_train.select_dtypes(include=["object","category"]).columns
numerical_columns = X_train.select_dtypes(exclude=["object","category"]).columns

In [16]:
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant",fill_value=0)),
    ("scaler", StandardScaler()),
])
categorical_transformer = Pipeline([
    ("imputer",SimpleImputer(strategy="constant",fill_value="None")),
    ("onehot", OneHotEncoder(handle_unknown="ignore",sparse_output =False)),
])

In [17]:
preprocessor = ColumnTransformer([
    ("num",numerical_transformer,numerical_columns),
    ("cat",categorical_transformer,categorical_columns)
],remainder="passthrough")

In [18]:
pipeline = Pipeline([
    ("preprocessor",preprocessor)
])

In [19]:
X_train_preprocessed = pipeline.fit_transform(X_train)
X_val_preprocessed = pipeline.transform(X_val)

<h3>Base features pred</h3>

In [20]:
from sklearn.svm import LinearSVR,SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,StackingRegressor,AdaBoostRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error,root_mean_squared_error,accuracy_score
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from xgboost import XGBRegressor

In [21]:
models = {
    "ElasticNet": ElasticNet(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42,n_jobs=-2),
    "GradientBoosting":GradientBoostingRegressor(random_state=42),
    "SVR":SVR(),
    "Ada":AdaBoostRegressor(random_state=42),
    "XGBoost":XGBRegressor(random_state=42,n_jobs=-2),
}
param_grids = {
    "ElasticNet":{
        "alpha": np.logspace(0,100,10),
        "l1_ratio":np.linspace(.5,1,10)
    },
    "RandomForest":{
        "n_estimators":[200,300,350,400],
        # "min_samples_split":[2,3,4],
        "min_samples_split":[2],
        "min_samples_leaf":[1,2],
        "max_depth":[None,10],
    },
    "GradientBoosting":{
        # "learning_rate":[0.01,0.08,0.09,0.1,0.2],
        "learning_rate":[0.08,0.09],
        "n_estimators":[300,350,400],
        # "min_samples_split":[2,10,15],
        "min_samples_split":[2,5],
        "max_depth":[3,4],
    },
    "SVR":{
        # "kernel":["linear", "poly", "rbf",],
        "kernel":["rbf"],
        # "gamma":["scale","auto"],
        "gamma":["auto"],
        # "C":[.1,1.5,10,80,100],
        "C":[0.1,0.7,1,1.5],
        "epsilon":[0.07,0.08,0.1],
    },
    "Ada":{
        "loss" : ['linear','exponential'],
        "n_estimators":[500,600],
        "learning_rate":[1.0,4,5,8],
    },
    "XGBoost":{
        "n_estimators": [100,200,400,600,700,800,900,1000,1100,1200],
        "objective": ["reg:squarederror"],
        "max_depth": [2,3,4,5,6,8,10],
        "learning_rate": [0.01,0.03,0.04,0.05,0.6,0.1],
        "subsample":[0.4,0.6,0.8,1],
    },
}

In [22]:
cv = KFold(n_splits=5,shuffle=True,random_state=42)

In [23]:
grids = {}
for model_name, model in models.items():
    grids[model_name] = GridSearchCV(model,param_grids[model_name],cv=cv,scoring='neg_mean_squared_error',n_jobs=-2)
    grids[model_name].fit(X_train_preprocessed,y_train)
    best_params = grids[model_name].best_params_
    best_score = np.sqrt(-1 * grids[model_name].best_score_)
    print(f"Best parameters for {model_name}: {best_params}")
    print(f"Best RMSE for {model_name}: {best_score}\n")

Best parameters for ElasticNet: {'alpha': 1.0, 'l1_ratio': 0.5}
Best RMSE for ElasticNet: 0.3956761919827401

Best parameters for RandomForest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 400}
Best RMSE for RandomForest: 0.13974646761602755

Best parameters for GradientBoosting: {'learning_rate': 0.08, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 400}
Best RMSE for GradientBoosting: 0.12106129775384859

Best parameters for SVR: {'C': 1, 'epsilon': 0.07, 'gamma': 'auto', 'kernel': 'rbf'}
Best RMSE for SVR: 0.11752158667808911

Best parameters for Ada: {'learning_rate': 4, 'loss': 'exponential', 'n_estimators': 600}
Best RMSE for Ada: 0.15748570296100853

Best parameters for XGBoost: {'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 1200, 'objective': 'reg:squarederror', 'subsample': 0.4}
Best RMSE for XGBoost: 0.1160657492238416



<h2>Creating featured columns</h2>

In [23]:
def get_target_mean_columns(X1,X2):
    
    df_train = pd.concat([X1,y_train],axis=1)
    df_valid = X2.copy()
    
    df_train["YrMoSoldMeanEncoded"] = df_train["YrSold"].apply(str) + "-" + df_train["MoSold"].apply(str)
    df_valid["YrMoSoldMeanEncoded"] = df_valid["YrSold"].apply(str) + "-" + df_valid["MoSold"].apply(str)
    dt = df_train.groupby(["YrMoSoldMeanEncoded"])["SalePrice"].agg(["mean"]).reset_index(drop=False)
    df_train["YrMoSoldMeanEncoded"] = df_train.merge(dt, on="YrMoSoldMeanEncoded",how="left")["mean"].values
    df_valid["YrMoSoldMeanEncoded"] = df_valid.merge(dt, on="YrMoSoldMeanEncoded",how="left")["mean"].values
    
    df_train["ContourSlopeMeanEncoded"] = df_train["LandContour"].apply(str) + "-" + df_train["LandSlope"].apply(str)
    df_valid["ContourSlopeMeanEncoded"] = df_valid["LandContour"].apply(str) + "-" + df_valid["LandSlope"].apply(str)
    dt = df_train.groupby(["ContourSlopeMeanEncoded"])["SalePrice"].agg(["mean"])
    df_train["ContourSlopeMeanEncoded"] = df_train.merge(dt,on=["ContourSlopeMeanEncoded"],how="left")["mean"].values
    df_valid["ContourSlopeMeanEncoded"] = df_valid.merge(dt,on=["ContourSlopeMeanEncoded"],how="left")["mean"].values
    
    df_train["ExterQualCondMeanEncoded"] = df_train["ExterQual"].apply(str) + "-" + df_train["ExterCond"].apply(str)
    df_valid["ExterQualCondMeanEncoded"] = df_valid["ExterQual"].apply(str) + "-" + df_valid["ExterCond"].apply(str)
    dt = df_train.groupby(["ExterQualCondMeanEncoded"])["SalePrice"].agg(["mean"])
    df_train["ExterQualCondMeanEncoded"] = df_train.merge(dt,on=["ExterQualCondMeanEncoded"],how="left")["mean"].values
    df_valid["ExterQualCondMeanEncoded"] = df_valid.merge(dt,on=["ExterQualCondMeanEncoded"],how="left")["mean"].values
    
    df_train["HeatingHeatQCMeanEncoded"] = df_train["Heating"].apply(str) + "-" + df_train["HeatingQC"].apply(str)
    df_valid["HeatingHeatQCMeanEncoded"] = df_valid["Heating"].apply(str) + "-" + df_valid["HeatingQC"].apply(str)
    dt = df_train.groupby(["HeatingHeatQCMeanEncoded"])["SalePrice"].agg(["mean"])
    df_train["HeatingHeatQCMeanEncoded"] = df_train.merge(dt,on=["HeatingHeatQCMeanEncoded"],how="left")["mean"].values
    df_valid["HeatingHeatQCMeanEncoded"] = df_valid.merge(dt,on=["HeatingHeatQCMeanEncoded"],how="left")["mean"].values
    
    df_train["FirePlacesFQuMeanEncoded"] = df_train["Fireplaces"].apply(str) + "-" + df_train["FireplaceQu"].apply(str)
    df_valid["FirePlacesFQuMeanEncoded"] = df_valid["Fireplaces"].apply(str) + "-" + df_valid["FireplaceQu"].apply(str)
    dt = df_train.groupby(["FirePlacesFQuMeanEncoded"])["SalePrice"].agg(["mean"])
    df_train["FirePlacesFQuMeanEncoded"] = df_train.merge(dt,on=["FirePlacesFQuMeanEncoded"],how="left")["mean"].values
    df_valid["FirePlacesFQuMeanEncoded"] = df_valid.merge(dt,on=["FirePlacesFQuMeanEncoded"],how="left")["mean"].values
    
    dt = df_train.groupby("Neighborhood")["SalePrice"].agg(["mean"])
    df_train["NeighborhoodMeanEncoded"] = df_train.merge(dt,on=["Neighborhood"],how="left")["mean"].values
    df_valid["NeighborhoodMeanEncoded"] = df_valid.merge(dt,on=["Neighborhood"],how="left")["mean"].values
    X_train_mean_encoded = df_train.drop("SalePrice",axis=1)
    X_valid_mean_encoded = df_valid
    return X_train_mean_encoded, X_valid_mean_encoded

In [24]:
X_train_mean_encoded,X_val_mean_encoded = get_target_mean_columns(X_train,X_val)

In [25]:
def custom_features(df):
    df_out = df.copy()
    remod_cat = []
    shed_val = []
    for i,row in df_out.iterrows():
        if row["YearRemodAdd"] - row["YearBuilt"] == 0:
            remod_cat.append(0)
        elif row["YearRemodAdd"] - row["YearBuilt"] <= 5:
            remod_cat.append(1)
        elif row["YearRemodAdd"] - row["YearBuilt"] <= 15:
            remod_cat.append(2)
        elif row["YearRemodAdd"] - row["YearBuilt"] <= 30:
            remod_cat.append(3)
        else:
            remod_cat.append(4)
        if row["3SsnPorch"] > 0:
            row["ScreenPorch"] = row["3SsnPorch"]
            
        if row["MiscFeature"] == "Shed":
            shed_val.append(row["MiscVal"])
        else:
            shed_val.append(0)
        
    df_out["Age"] = df_out["YrSold"] - df_out["YearBuilt"]
    df_out["HasBsmtBath"] = ((df_out["BsmtFullBath"] > 0) | (df_out["BsmtHalfBath"] > 0)).astype(object)
    df_out["NonBedroomRoomsAbvGrd"] = df_out["TotRmsAbvGrd"] - df_out["BedroomAbvGr"]
    df_out["HasGarage"] = (df_out["GarageArea"] > 0).astype(object)
    df_out["HasBsmt"] = (df_out["TotalBsmtSF"] > 0).astype(object)
    df_out["Has2ndFloor"] = (df_out["2ndFlrSF"] > 0).astype(object)
    df_out["YrSold_cat"] = df_out["YrSold"].astype(object)
    df_out["MoSold_cat"] = df_out["MoSold"].astype(object)
    df_out["YearBuilt_cat"] = df_out["YearBuilt"].astype(object)
    df_out["MSSubClass_cat"] = df_out["MSSubClass"].astype(object)
    df_out["RemodCat"] = remod_cat
    df_out["RemodCat"] = df_out["RemodCat"].astype(object)
    df_out["ShedVal"] = shed_val
    df_out.drop("3SsnPorch",axis=1,inplace=True)
    df_out.drop("MiscVal",axis=1,inplace=True)
    return df_out
feature_engineering_transformer = FunctionTransformer(custom_features)

In [26]:
new_cols_categorical = pd.Index(["HasBsmtBath","YrSold_cat","MoSold_cat","YearBuilt_cat","MSSubClass_cat","RemodCat","Has2ndFloor","HasBsmt","HasGarage",])
new_cols_numerical = pd.Index(["Age","ShedVal","NonBedroomRoomsAbvGrd"])
categorical_columns = X_train_mean_encoded.drop(["MiscVal","3SsnPorch"],axis=1).select_dtypes(include=["object","category"]).columns.append(new_cols_categorical)
numerical_columns = X_train_mean_encoded.drop(["3SsnPorch","MiscVal"],axis=1).select_dtypes(exclude=["object","category"]).columns.append(new_cols_numerical)

In [27]:
preprocessor = ColumnTransformer([
    ("num",numerical_transformer,numerical_columns),
    ("cat",categorical_transformer,categorical_columns)
],remainder="passthrough")

In [28]:
pipeline_fe = Pipeline([
    ("fe",feature_engineering_transformer),
    ("preprocessor",preprocessor),
])

In [29]:
X_train_fe_preprocessed = pipeline_fe.fit_transform(X_train_mean_encoded)
X_val_fe_preprocessed = pipeline_fe.transform(X_val_mean_encoded)

<h3>finding hyperparameters</h3>

In [31]:
grids = {}
for model_name, model in models.items():
    grids[model_name] = GridSearchCV(model,param_grids[model_name],cv=cv,scoring='neg_mean_squared_error',n_jobs=-2)
    grids[model_name].fit(X_train_fe_preprocessed,y_train)
    best_params = grids[model_name].best_params_
    best_score = np.sqrt(-1 * grids[model_name].best_score_)
    print(f"Best parameters for {model_name}: {best_params}")
    print(f"Best RMSE for {model_name}: {best_score}\n")

Best parameters for ElasticNet: {'alpha': 1.0, 'l1_ratio': 0.5}
Best RMSE for ElasticNet: 0.3956761919827401

Best parameters for RandomForest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 400}
Best RMSE for RandomForest: 0.13730683746497369

Best parameters for GradientBoosting: {'learning_rate': 0.09, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 350}
Best RMSE for GradientBoosting: 0.12151664619314677

Best parameters for SVR: {'C': 1.5, 'epsilon': 0.07, 'gamma': 'auto', 'kernel': 'rbf'}
Best RMSE for SVR: 0.11410775154143743

Best parameters for Ada: {'learning_rate': 5, 'loss': 'exponential', 'n_estimators': 600}
Best RMSE for Ada: 0.15143689522894868

Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 1200, 'objective': 'reg:squarederror', 'subsample': 0.4}
Best RMSE for XGBoost: 0.11657105557808814



<h3>testing with pca</h3>

In [32]:
pipeline_fe_pca = Pipeline([
    ("fe",feature_engineering_transformer),
    ("preprocessor",preprocessor),
    ("pca",PCA(n_components=4,random_state=42))
])

In [33]:
X_train_fe_pca_preprocessed = pipeline_fe_pca.fit_transform(X_train_mean_encoded)
X_val_fe_pca_preprocessed = pipeline_fe_pca.transform(X_val_mean_encoded)

In [34]:
grids = {}
for model_name, model in models.items():
    grids[model_name] = GridSearchCV(model,param_grids[model_name],cv=cv,scoring='neg_mean_squared_error',n_jobs=-2)
    grids[model_name].fit(X_train_fe_pca_preprocessed,y_train)
    best_params = grids[model_name].best_params_
    best_score = np.sqrt(-1 * grids[model_name].best_score_)
    print(f"Best parameters for {model_name}: {best_params}")
    print(f"Best RMSE for {model_name}: {best_score}\n")

Best parameters for ElasticNet: {'alpha': 1.0, 'l1_ratio': 0.5}
Best RMSE for ElasticNet: 0.2233227494258543

Best parameters for RandomForest: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 350}
Best RMSE for RandomForest: 0.14611938064591923

Best parameters for GradientBoosting: {'learning_rate': 0.08, 'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 300}
Best RMSE for GradientBoosting: 0.14842168729080868

Best parameters for SVR: {'C': 1, 'epsilon': 0.07, 'gamma': 'auto', 'kernel': 'rbf'}
Best RMSE for SVR: 0.17277036636337503

Best parameters for Ada: {'learning_rate': 5, 'loss': 'exponential', 'n_estimators': 500}
Best RMSE for Ada: 0.1565437869349402

Best parameters for XGBoost: {'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 200, 'objective': 'reg:squarederror', 'subsample': 0.6}
Best RMSE for XGBoost: 0.14092388068456058



<h3>validating models</h3>

In [36]:
tuned_models = {
    "RandomForest":RandomForestRegressor(min_samples_leaf = 2,n_estimators=400,random_state=42,n_jobs=-2),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=350,learning_rate=.09,max_depth=3,min_samples_split=2,random_state=42),
    "SVR":SVR(C=1.5,epsilon=0.07,gamma="auto"),
    "XGBoost":XGBRegressor(learning_rate=0.01,n_estimators=1200,subsample=0.4,max_depth=3,random_state=42,n_jobs=-2,objective='reg:squarederror'),
    # "KNN":KNeighborsRegressor(n_neighbors=11),
}

In [37]:
grids = {}
for model_name,model in tuned_models.items():
    grids[model_name] = model
    grids[model_name].fit(X_train_fe_preprocessed,y_train)
    pred = np.exp(grids[model_name].predict(X_val_fe_preprocessed))
    print(f"RMSE for {model_name} is {root_mean_squared_error(np.exp(y_val),pred)}")

RMSE for RandomForest is 24733.76261688844
RMSE for GradientBoosting is 21009.198249106594
RMSE for SVR is 22697.09249409938
RMSE for XGBoost is 20812.369113423763


<h3>creating a stacked regressor</h3>

In [38]:
stacked_model = StackingRegressor([(model_name,model) for model_name,model in tuned_models.items()],n_jobs=-2)
stacked_model.fit(X_train_fe_preprocessed,y_train)
pred = np.exp(stacked_model.predict(X_val_fe_preprocessed))
print(f"RMSE: {root_mean_squared_error(np.exp(y_val),pred)}")

RMSE: 20206.32141045509


<h3>predicting final test set</h3>

In [40]:
df_test = pd.read_csv("test.csv")
X_full_mean_encoded, df_test_mean_encoded = get_target_mean_columns(X,df_test)
X_full_fe_preprocessed = pipeline_fe.fit_transform(X_full_mean_encoded)
df_test_fe_preprocessed = pipeline_fe.transform(df_test_mean_encoded)

In [41]:
# stacked_model.fit(X_full_fe_preprocessed,y)
# final_pred = np.exp(stacked_model.predict(df_test_fe_preprocessed))

In [43]:
grids["XGBoost"].fit(X_full_fe_preprocessed,y)
final_pred = np.exp(grids["XGBoost"].predict(df_test_fe_preprocessed))

In [44]:
df_xgboost_out = df_test[["Id"]].copy()
df_xgboost_out["SalePrice"] = final_pred

In [74]:
# df_xgboost_out.to_csv("submission_stacked_normalized_full_data_no_leakage.csv",index=False)

In [45]:
df_xgboost_out.to_csv("XGBRegressor_2_full_no_leakage.csv",index=False)

<h3>final score on kaggle: 0.13017</h3>