# XGBoost

In [18]:
import pandas as pd

## Data Import

In [19]:
# Read the raw training and test splits from disk.
df_train, df_test = (
    pd.read_csv('raw_data/train.csv'),
    pd.read_csv('raw_data/test.csv'),
)

Import auxiliary data and select useful columns

In [20]:
# Only these auxiliary columns are carried forward into the model.
useful_columns = ['nearest_mrt_line']

# index_col=0 makes the first CSV column the row index of each auxiliary
# frame; the column selection is applied straight after reading.
df_train_aux = pd.read_csv('raw_data/train_auxiliary_data.csv', index_col=0)[useful_columns]
df_test_aux = pd.read_csv('raw_data/test_auxiliary_data.csv', index_col=0)[useful_columns]

Join the auxiliary data to the dataset

In [21]:
# Index-aligned left join: every row of the main frame is kept and the
# auxiliary column is attached wherever the index labels match.
df_train, df_test = (
    df_train.join(df_train_aux, how='left'),
    df_test.join(df_test_aux, how='left'),
)

## Data pre-processing

Clean dirty values and drop outliers identified in EDA.

In [22]:
from src.preprocessing import clean_property_type, fill_missing_values, update_data, drop_outliers

def data_cleaning(df):
    """Return a cleaned version of ``df`` built from a copy of it.

    The copy is passed, in order, through ``clean_property_type``,
    ``fill_missing_values`` and ``update_data``; each step receives the
    previous step's return value.

    Args:
        df: DataFrame to clean. The steps operate on ``df.copy()``.

    Returns:
        The value returned by the final step, ``update_data``.
    """
    return (
        df.copy()
        .pipe(clean_property_type)
        .pipe(fill_missing_values)
        .pipe(update_data)
    )

In [23]:
# Outlier removal is applied to the training split only; the test split is
# cleaned but keeps every row.
df_train = drop_outliers(data_cleaning(df_train))
df_test = data_cleaning(df_test)

Drop unused columns from the original dataset.

In [24]:
from src.preprocessing import drop_columns
# Apply the same column pruning to both splits, training first.
df_train, df_test = (drop_columns(frame) for frame in (df_train, df_test))

### Separate into X and y DataFrames

In [25]:
# 'price' is the regression target; all remaining columns are features.
X_train = df_train.drop(columns='price')
y_train = df_train['price']
# The test frame is used as the feature matrix as-is (same object, no copy).
X_test = df_test

## Encoding

Check the number of unique values for each categorical feature.

In [26]:
# Summarise only the object-dtype columns (count / unique / top / freq) to
# see how many distinct categories each one has before choosing an encoder.
X_train.describe(include='object')

Unnamed: 0,property_type,tenure,planning_area,nearest_mrt_line
count,20242,20242,20242,20242
unique,15,11,43,8
top,condo,99-year leasehold,bukit timah,dt
freq,9642,12842,1323,4494


The numerical features span very different orders of magnitude (compare `lat` with `size_sqft` below), so they are min-max scaled to a common range.

In [27]:
# Summary statistics for the numeric columns, to compare their value ranges.
X_train.describe()

Unnamed: 0,built_year,num_beds,num_baths,size_sqft,lat,lng
count,20242.0,20242.0,20242.0,20242.0,20242.0,20242.0
mean,2010.78678,3.118615,2.631311,1711.291374,1.339994,103.843159
std,15.66283,1.280566,1.461078,1850.382364,0.046302,0.054085
min,1963.0,1.0,1.0,65.0,1.239621,103.685206
25%,2000.0,2.0,2.0,807.0,1.307313,103.806671
50%,2017.0,3.0,2.0,1119.0,1.328225,103.841572
75%,2023.0,4.0,3.0,1528.0,1.371107,103.880155
max,2028.0,10.0,10.0,30000.0,1.461582,103.99751


In [28]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, minmax_scale
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import make_column_transformer

# Every numeric column goes through the min-max scaler.
num_vars = list(X_train.select_dtypes('number'))

ct = make_column_transformer(
    (MinMaxScaler(), num_vars),
    # planning_area has dozens of distinct values, so it is target-encoded
    # into a single numeric column instead of being one-hot expanded.
    (TargetEncoder(), ['planning_area']),
    # tenure is capped at 3 output columns (rarer values share one
    # "infrequent" column). A tenure value never seen during fit is routed to
    # that same column instead of raising at transform time.
    (OneHotEncoder(max_categories=3, handle_unknown='infrequent_if_exist'), ['tenure']),
    # The default handle_unknown='error' would make transform() fail as soon as
    # the test split contains a category absent from training; 'ignore' encodes
    # such rows as all zeros for that feature.
    (OneHotEncoder(handle_unknown='ignore'), ['property_type', 'nearest_mrt_line']),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Fit the column transformer. The target is min-max scaled only for this fit,
# so the target-encoded planning_area values are on the scaled price range;
# y_train itself is left unchanged.
# NOTE(review): the target encoder is fitted on the full training set here,
# before the GridSearchCV cell, so every validation fold has already
# contributed to the encoding and the CV scores may be optimistic. Putting
# `ct` and the regressor in one sklearn Pipeline would fit the encoder per fold.
ct.fit(X_train, minmax_scale(y_train))



In [29]:
# Apply the already-fitted transformer to both splits. X_train / X_test are
# rebound from the raw frames to the encoded output, so re-running this cell
# would feed encoded data back into `ct`; run it once after the fit cell.
X_train, X_test = ct.transform(X_train), ct.transform(X_test)

## Fitting model with Cross Validation

In [30]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Search space: 3 tree depths x 3 ensemble sizes = 9 candidates.
param_grid = dict(
    max_depth=[2, 4, 6],
    n_estimators=[50, 100, 200],
)

# Scores are negated RMSE, so the search maximises them (closer to 0 is better).
model = GridSearchCV(
    estimator=xgb.XGBRegressor(),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
)
model.fit(X_train, y_train)

In [31]:
# Report the winning grid point and its cross-validated score (negated RMSE).
print(
    f'Best parameters: {model.best_params_}',
    f'Best score: {model.best_score_}',
    sep='\n',
)

Best parameters: {'max_depth': 4, 'n_estimators': 200}
Best score: -2002239.6168093786


In [32]:
# Transposed so that each column is one grid candidate and each row is one
# recorded metric (timings, parameters, per-split scores).
cv_results = pd.DataFrame(model.cv_results_).transpose()
cv_results

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.099135,0.163033,0.301007,0.137438,0.271983,0.53808,0.207549,0.383489,0.779743
std_fit_time,0.005592,0.004737,0.004639,0.000871,0.009672,0.010341,0.006757,0.004886,0.026103
mean_score_time,0.003391,0.003202,0.003093,0.002992,0.00359,0.003989,0.002992,0.003391,0.004787
std_score_time,0.000798,0.000406,0.000201,0.0,0.000489,0.0,0.0,0.000489,0.000399
param_max_depth,2,2,2,4,4,4,6,6,6
param_n_estimators,50,100,200,50,100,200,50,100,200
params,"{'max_depth': 2, 'n_estimators': 50}","{'max_depth': 2, 'n_estimators': 100}","{'max_depth': 2, 'n_estimators': 200}","{'max_depth': 4, 'n_estimators': 50}","{'max_depth': 4, 'n_estimators': 100}","{'max_depth': 4, 'n_estimators': 200}","{'max_depth': 6, 'n_estimators': 50}","{'max_depth': 6, 'n_estimators': 100}","{'max_depth': 6, 'n_estimators': 200}"
split0_test_score,-2209881.698343,-2012397.279344,-1933960.628124,-2305233.107946,-2317314.213599,-2353199.90314,-2475544.05553,-2466397.305862,-2461569.619399
split1_test_score,-3618204.626466,-3479456.286007,-3357259.187588,-3067535.097435,-2947431.601678,-2780314.296117,-2730563.274457,-2689128.634615,-2685790.659181
split2_test_score,-1754617.626286,-1720789.998773,-1717236.003431,-1609881.722326,-1616575.864326,-1582554.810423,-1557670.966761,-1541865.725865,-1544985.113561


## Predictions

In [33]:
from pathlib import Path

import joblib

# Save the fitted grid-search object. The output folder is created first so a
# fresh checkout without a models/ directory does not fail inside dump().
Path('models').mkdir(parents=True, exist_ok=True)
joblib.dump(model, 'models/xgboost_model_with_aux_data.joblib')

['models/xgboost_model_with_aux_data.joblib']

In [34]:
# Numeric summary of the cleaned test frame, for comparison with the
# training-set summary shown earlier.
df_test.describe()

Unnamed: 0,built_year,num_beds,num_baths,size_sqft,lat,lng
count,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0
mean,2010.754571,3.085857,2.614714,1709.027,1.339711,103.843177
std,15.537008,1.277779,1.456611,1860.113894,0.04602,0.054518
min,1963.0,1.0,1.0,68.0,1.239621,103.686082
25%,2000.0,2.0,2.0,797.0,1.307069,103.806577
50%,2016.0,3.0,2.0,1119.0,1.328282,103.842259
75%,2023.0,4.0,3.0,1528.0,1.369892,103.87906
max,2028.0,10.0,10.0,27500.0,1.461582,103.99751


In [35]:
print("Generate predictions")
# `model` is the fitted GridSearchCV object; X_test is the encoded test matrix
# produced by the column transformer.
pred = model.predict(X_test)
# Sanity check: one prediction per test row is expected.
print("predictions shape:", pred.shape)

Generate predictions
predictions shape: (7000,)


In [36]:
# Use the example submission as a template (it supplies the Id column) and
# write the model predictions into its "Predicted" column.
df_submission = pd.read_csv("raw_data/example-submission.csv").assign(Predicted=pred)
df_submission

Unnamed: 0,Id,Predicted
0,0,1.153526e+06
1,1,1.418528e+06
2,2,1.143806e+06
3,3,7.396819e+05
4,4,5.575652e+05
...,...,...
6995,6995,1.874818e+07
6996,6996,1.242276e+07
6997,6997,3.462699e+06
6998,6998,4.405036e+05


In [37]:
# index=False: do not write the DataFrame's row index as an extra column.
# (The parameter is documented as a bool; the previous 0 only worked by being falsy.)
df_submission.to_csv("xgboost_submission.csv", index=False)