In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
import optuna
from sklearn.preprocessing import StandardScaler

In [43]:
from warnings import filterwarnings
filterwarnings("ignore")

# Loading Data

In [5]:
# setting a base path variable for easy access
BASE_PATH = Path("/kaggle/input/playground-series-s3e6")
train = pd.read_csv(BASE_PATH / "train.csv").drop(columns=["id"])

test = pd.read_csv(BASE_PATH / "test.csv")
# we need the test id column to make the submission
test_idx = test.id
test = test.drop(columns=["id"])

# we also load original dataset and will try to gauge if adding it to the competition datasets boosts our score or not
original = pd.read_csv("/kaggle/input/paris-housing-price-prediction/ParisHousing.csv")

### Features Presence Check
Checkingg if the competition dataset and original dataset contain the same features in same order, so we can concatenate them easily. Otherwise we'll have to manually make them consistent

In [13]:
all(original.columns == train.columns)

True

#### Result:
Well both dataset indeed contain the same features in the same order. No need for any manual work!

# Analyzing Data
Here I won't go in detail about the features distribution similarities and contrasts between the competition and original datasets because that would make this notebook quite complicated for newcomers to this platform and field.

I'll most probably make a separate notebook for this comparison b/w competition and original dataset.

## Checking for missing values

In [14]:
pd.concat([train.isnull().sum().rename("Missing In Train"),
          test.isnull().sum().rename("Missing in Test"),
          original.isnull().sum().rename("Missing in Original")], axis=1)

Unnamed: 0,Missing In Train,Missing in Test,Missing in Original
squareMeters,0,0.0,0
numberOfRooms,0,0.0,0
hasYard,0,0.0,0
hasPool,0,0.0,0
floors,0,0.0,0
cityCode,0,0.0,0
cityPartRange,0,0.0,0
numPrevOwners,0,0.0,0
made,0,0.0,0
isNewBuilt,0,0.0,0


### INSIGHTS:
We're in luck, none of the datasets contain any missing value.

Also test dataframe has **NaN** for price because it doesn't contain price i.e. target feature.

## Checking for Categorical Values
We now check if our dataset contains any categorical features, if so we'll encode them before feeding to our model

In [19]:
pd.concat([train.dtypes.rename("Data Type"),
          train.nunique().rename("Unique Values")], axis=1).sort_values(by="Unique Values")

Unnamed: 0,Data Type,Unique Values
hasYard,int64,2
hasPool,int64,2
hasStorageRoom,int64,2
isNewBuilt,int64,2
hasStormProtector,int64,2
cityPartRange,int64,10
numPrevOwners,int64,10
hasGuestRoom,int64,11
made,int64,33
numberOfRooms,int64,100


### INSIGHTS:

These features hasYard, hasPool, hasStorageRoom, isNewBuilt, hasStormProtector, are already binary encoded, as they contain either 0 or 1. So though they are categorical, we don't need to encode them.

cityPartRange, numPreviousOwners, hasGuestRoom, made are good candidates for categorical features.

In [26]:
features_to_encode = [col for col in train.columns 
                      if train[col].nunique() <= 33 and train[col].nunique() > 2]
features_to_encode

['cityPartRange', 'numPrevOwners', 'made', 'hasGuestRoom']

## Deciding which encoding technique to use
We could use simple one hot encoding, but that would add 64 new features, which for a dataset of this size seems quite high.
There are many alternative and better encoding techniques available, but we'll use OridinalEncoder here which makes intuitive sense for these features (other than cityPartRange, no idea what that means).

In [31]:
# # counting the number of new columns we'll add if we do one hot encoding
# total_new_possible_cols = 0
# for col in features_to_encode:
#     total_new_possible_cols += train[col].nunique()
    
# total_new_possible_cols

64

In [34]:
oe = OrdinalEncoder()

train[features_to_encode] = oe.fit_transform(train[features_to_encode])
test[features_to_encode] = oe.transform(test[features_to_encode])
original[features_to_encode] = oe.transform(original[features_to_encode])

# Preprocessing

In [39]:
X = train.drop(columns=["price"])
y = train.price

X_org = original.drop(columns=["price"])
y_org = original.price

In [81]:
X_combined = pd.concat([X, X_org], axis=0)
y_combined = pd.concat([y, y_org], axis=0)

# Modelling
Before we train our models, it's a good idea to set up cross validation

## Setting up K-Fold Cross Validation

In [59]:
def cross_validate(X, y, X_org, y_org, model, model_verbose):
    N_FOLDS = 5
    cv_scores = np.zeros(N_FOLDS)
    
    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=1337)
    
    for fold_num, (train_idx, val_idx) in enumerate(kf.split(X)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # As has been shown in the previous competitions that
        # train on the combined(competition + original) dataset
        # but evaluate only on competition dataset
        
        X_train = pd.concat([X_train, X_org], axis=0)
        y_train = pd.concat([y_train, y_org], axis=0)
        
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=50,
                 verbose=model_verbose)
        
        y_preds = model.predict(X_val)
        
        # to calculate rmse instead of mse, we set squared=False
        rmse = mean_squared_error(y_val, y_preds, squared=False)
        cv_scores[fold_num] = rmse
        
        print(f"Fold {fold_num} \t RMSE: {rmse}")
        
    avg_rmse = np.mean(cv_scores)
    print(f"AVG RMSE: {avg_rmse}")

## XGBoost

In [82]:
xgb_reg = xgb.XGBRegressor()
# cross_validate(X, y, X_org, y_org, xgb_reg, model_verbose=False)
xgb_reg.fit(X_combined, y_combined)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [93]:
y_pred_xgb = xgb_reg.predict(test)

## LightGBM

In [64]:
# lgbm_reg = lgbm.LGBMRegressor()
# cross_validate(X, y, X_org, y_org, lgbm_reg, model_verbose=-1)

Fold 0 	 RMSE: 169525.18276428097
Fold 1 	 RMSE: 130547.74933439208
Fold 2 	 RMSE: 125402.25802604215
Fold 3 	 RMSE: 258143.4245789291
Fold 4 	 RMSE: 147464.31278337684
AVG RMSE: 166216.58549740422


In [83]:
# slightly tuned params - make sure to tune yours properly
lgbm_params = {'n_estimators': 667,
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}

lgbm_reg = lgbm.LGBMRegressor(**lgbm_params)
# cross_validate(X, y, X_org, y_org, lgbm_reg, model_verbose=-1)
lgbm_reg.fit(X_combined, y_combined)



LGBMRegressor(bagging_fraction=0.6, feature_fraction=0.9, lambda_l1=0.96,
              lambda_l2=0.01, learning_rate=0.19, max_depth=8,
              min_data_in_leaf=36, min_gain_to_split=11.32, n_estimators=667,
              num_leaves=17, num_rounds=404)

In [94]:
y_pred_lgbm = lgbm_reg.predict(test)

## CatBoost

In [91]:
catboost_reg = catboost.CatBoostRegressor()

# cross_validate(X, y, X_org, y_org, catboost_reg, model_verbose=False)

catboost_reg.fit(X_combined, y_combined, verbose=False)

<catboost.core.CatBoostRegressor at 0x7fe04c0c9ad0>

In [95]:
y_pred_cat = catboost_reg.predict(test)

# Ensembling
Though there are more advanced and better ensembling techniques available out there, we'll just use the simple average here.

In [98]:
test.shape

(15154, 16)

In [101]:
y_pred_final = np.array([y_pred_xgb, y_pred_lgbm, y_pred_cat]).mean(axis=0)

In [102]:
submission = pd.DataFrame({'id': test_idx, 'price': y_pred_final})
submission.head()

Unnamed: 0,id,price
0,22730,4755007.0
1,22731,6186019.0
2,22732,9055223.0
3,22733,1607585.0
4,22734,6750943.0


In [104]:
submission.to_csv("submission.csv", index=False)