# **Preprocessing & Pipeline**


#### **Import Libraries**


In [None]:
import pandas as pd
import numpy as np
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
# Import Models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor
import warnings
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

#### **Reading Data**


In [22]:
df = pd.read_csv(
    r'G:\Projects\Melborn_House_Price_Prediction\cleaned_data.csv')
df

Unnamed: 0,suburb,rooms,type,price,method,sellerg,distance,bedroom2,bathroom,car,landsize,yearbuilt,councilarea,regionname,year,month,day,season,street_name
0,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra,Northern Metropolitan,2016,3,12,Spring,Turner
1,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,1900.0,Yarra,Northern Metropolitan,2016,4,2,Spring,Bloomburg
2,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,1900.0,Yarra,Northern Metropolitan,2017,4,3,Spring,Charles
3,Abbotsford,3,h,850000.0,PI,Biggin,2.5,3.0,2.0,1.0,94.0,,Yarra,Northern Metropolitan,2017,4,3,Spring,Federation
4,Abbotsford,4,h,1600000.0,VB,Nelson,2.5,3.0,1.0,2.0,120.0,2014.0,Yarra,Northern Metropolitan,2016,4,6,Spring,Park
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,4,h,1245000.0,S,Barry,16.7,4.0,2.0,2.0,652.0,1981.0,,South-Eastern Metropolitan,2017,8,26,Summer,Strada
13576,Williamstown,3,h,1031000.0,SP,Williams,6.8,3.0,2.0,2.0,333.0,1995.0,,Western Metropolitan,2017,8,26,Summer,Merrett
13577,Williamstown,3,h,1170000.0,S,Raine,6.8,3.0,2.0,4.0,436.0,1997.0,,Western Metropolitan,2017,8,26,Summer,Power
13578,Williamstown,4,h,2500000.0,PI,Sweeney,6.8,4.0,1.0,5.0,866.0,1920.0,,Western Metropolitan,2017,8,26,Summer,Verdon


- **Select Numerical Columns**


In [23]:
df.select_dtypes(include="number").head()

Unnamed: 0,rooms,price,distance,bedroom2,bathroom,car,landsize,yearbuilt,year,month,day
0,2,1480000.0,2.5,2.0,1.0,1.0,202.0,,2016,3,12
1,2,1035000.0,2.5,2.0,1.0,0.0,156.0,1900.0,2016,4,2
2,3,1465000.0,2.5,3.0,2.0,0.0,134.0,1900.0,2017,4,3
3,3,850000.0,2.5,3.0,2.0,1.0,94.0,,2017,4,3
4,4,1600000.0,2.5,3.0,1.0,2.0,120.0,2014.0,2016,4,6


In [24]:
df.select_dtypes(include='number').isna().sum()

rooms           0
price           0
distance        0
bedroom2        0
bathroom        0
car            62
landsize        0
yearbuilt    5375
year            0
month           0
day             0
dtype: int64

## **Numerical Pipeline**


- **Pipeline 1 -> ( car , yearbuilt ) -> impute missing by Median -> Scaling By Standard Scaling**


In [25]:
num_pipeline1 = Pipeline(steps=[('imputer', SimpleImputer(
    strategy='most_frequent')), ('scaler', StandardScaler())])

num_pipeline1

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


- **Pipeline 2 --> ( rooms , distance , bedroom2 , bathroom , landsize , year , month , day ) --> Scaling By Standard Scaling**


In [26]:
num_pipeline2 = Pipeline(steps=[('scaler', StandardScaler())])
num_pipeline2

0,1,2
,steps,"[('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


- **Select Categorical Columns**


In [27]:
cat = df.select_dtypes(include="object").head()
cat

Unnamed: 0,suburb,type,method,sellerg,councilarea,regionname,season,street_name
0,Abbotsford,h,S,Biggin,Yarra,Northern Metropolitan,Spring,Turner
1,Abbotsford,h,S,Biggin,Yarra,Northern Metropolitan,Spring,Bloomburg
2,Abbotsford,h,SP,Biggin,Yarra,Northern Metropolitan,Spring,Charles
3,Abbotsford,h,PI,Biggin,Yarra,Northern Metropolitan,Spring,Federation
4,Abbotsford,h,VB,Nelson,Yarra,Northern Metropolitan,Spring,Park


In [28]:
df.select_dtypes(include='object').isna().sum()

suburb            0
type              0
method            0
sellerg           0
councilarea    1369
regionname        0
season            0
street_name       0
dtype: int64

In [29]:
for col in cat:
    print(f'Col : {col}')
    print(df[col].nunique())
    print('-' * 20)

Col : suburb
314
--------------------
Col : type
3
--------------------
Col : method
5
--------------------
Col : sellerg
268
--------------------
Col : councilarea
33
--------------------
Col : regionname
8
--------------------
Col : season
4
--------------------
Col : street_name
4132
--------------------


In [30]:
df = df.drop(columns=['street_name'])

## **Categorical Pipeline**


- **Pipeline 1 ( Councilarea ) --> Impute Using Simple Imputer --> Encoding Using Target Encoding**


In [None]:
cat_pipeline1 = Pipeline(steps=[('imputer', SimpleImputer(
    strategy='most_frequent')), ('encoder', TargetEncoder())])
cat_pipeline1

0,1,2
,steps,"[('imputer', ...), ('encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,


- **Pipeline 2 ( suburb - sellerg ) --> Encoding Using Target Encoder**


In [32]:
cat_pipeline2 = Pipeline(steps=[('encoder', TargetEncoder())])
cat_pipeline2

0,1,2
,steps,"[('encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,


- **Pipeline 3 ( type - method - regionname - season ) --> Encoding Using One Hot Encoder**


In [None]:
cat_pipeline3 = Pipeline(
    steps=[('OneHotEncoder', OneHotEncoder(drop='first', sparse_output=False))])
cat_pipeline3

0,1,2
,steps,"[('OneHotEncoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


## **Column Transformer**


In [34]:
preprocessing = ColumnTransformer(transformers=[
    ('num_pipeline1', num_pipeline1, ['car', 'yearbuilt']),
    ('num_pipeline2', num_pipeline2, [
     'rooms', 'distance', 'bedroom2', 'bathroom', 'landsize', 'year', 'month', 'day']),
    ('cat_pipeline1', cat_pipeline1, ['councilarea']),
    ('cat_pipeline2', cat_pipeline2, ['suburb', 'sellerg']),
    ('cat_pipeline3', cat_pipeline3, ['type', 'method', 'regionname', 'season'])],
    remainder='passthrough')

preprocessing

0,1,2
,transformers,"[('num_pipeline1', ...), ('num_pipeline2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


### **Splitting Data Into X,y**


In [35]:
X = df.drop('price', axis=1)
y = df['price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

models = [
    ('Linear Regression', LinearRegression(n_jobs=-1)),
    ('Knn', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42, n_jobs=-1)),
    ('Xgboost', XGBRegressor()),
    ('CatBoost', CatBoostRegressor(verbose=0)),
    ('LightGBM', LGBMRegressor(force_col_wise=True))
]

for name, reg in models:

    model_pipeline = Pipeline(
        steps=[('Preprocessing', preprocessing), ('Model', reg)])

    # 2) train on train set
    model_pipeline.fit(X_train, y_train)

    # 3) predict on train and test
    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)

    # 4) compute R2
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(name)
    print('Train R2 Score :', round(train_r2 * 100, 2))
    print('Test  R2 Score :', round(test_r2 * 100, 2))
    print('-' * 50)

Linear Regression
Train R2 Score : 65.99
Test  R2 Score : 67.14
--------------------------------------------------
Knn
Train R2 Score : 66.0
Test  R2 Score : 52.5
--------------------------------------------------
Decision Tree
Train R2 Score : 100.0
Test  R2 Score : 63.76
--------------------------------------------------
Random Forest
Train R2 Score : 96.92
Test  R2 Score : 80.85
--------------------------------------------------
Xgboost
Train R2 Score : 95.43
Test  R2 Score : 78.84
--------------------------------------------------
CatBoost
Train R2 Score : 92.02
Test  R2 Score : 83.08
--------------------------------------------------
[LightGBM] [Info] Total Bins 1083
[LightGBM] [Info] Number of data points in the train set: 10864, number of used features: 29
[LightGBM] [Info] Start training from score 1074964.928203
LightGBM
Train R2 Score : 89.15
Test  R2 Score : 83.02
--------------------------------------------------


##### **TransformedTargetRegressor**

### I applied Target Transformation because the price is highly skewed. Using log1p makes the target more normally distributed, reduces the impact of extreme values, and allows the regression models to learn more effectively, resulting in better accuracy and less overfitting


In [None]:
# 1) split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

models = [
    ('Linear Regression', LinearRegression(n_jobs=-1)),
    ('Knn', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42, n_jobs=-1)),
    ('Xgboost', XGBRegressor()),
    ('CatBoost', CatBoostRegressor(verbose=0)),
    ('LightGBM', LGBMRegressor(force_col_wise=True))
]

for name, reg in models:

    model_pipeline = Pipeline(
        steps=[('Preprocessing', preprocessing), ('Model', reg)])
    model_pipeline_scaled_target = TransformedTargetRegressor(
        regressor=model_pipeline, func=np.log1p, inverse_func=np.expm1)

    # 2) train on train set
    model_pipeline.fit(X_train, y_train)

    # 3) predict on train and test
    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)

    # 4) compute R2
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(name)
    print('Train R2 Score :', round(train_r2 * 100, 2))
    print('Test  R2 Score :', round(test_r2 * 100, 2))
    print('-' * 50)

Linear Regression
Train R2 Score : 65.99
Test  R2 Score : 67.14
--------------------------------------------------
Knn
Train R2 Score : 66.0
Test  R2 Score : 52.5
--------------------------------------------------
Decision Tree
Train R2 Score : 100.0
Test  R2 Score : 63.76
--------------------------------------------------
Random Forest
Train R2 Score : 96.92
Test  R2 Score : 80.85
--------------------------------------------------
Xgboost
Train R2 Score : 95.43
Test  R2 Score : 78.84
--------------------------------------------------
CatBoost
Train R2 Score : 92.02
Test  R2 Score : 83.08
--------------------------------------------------
[LightGBM] [Info] Total Bins 1083
[LightGBM] [Info] Number of data points in the train set: 10864, number of used features: 29
[LightGBM] [Info] Start training from score 1074964.928203
LightGBM
Train R2 Score : 89.15
Test  R2 Score : 83.02
--------------------------------------------------


- **Best Model Is CatBoost**


In [38]:
# 1) Pipeline: preprocessing + CatBoost
catboost_pipeline = Pipeline(
    steps=[('Preprocessing', preprocessing), ('Model', CatBoostRegressor(verbose=0))])

# 2) Wrap with TransformedTargetRegressor (log1p on y)
catboost_scaled_target = TransformedTargetRegressor(
    regressor=catboost_pipeline, func=np.log1p, inverse_func=np.expm1)

# 3) Train on training set
catboost_scaled_target.fit(X_train, y_train)

# 4) Predict on train and test
y_train_pred = catboost_scaled_target.predict(X_train)
y_test_pred = catboost_scaled_target.predict(X_test)

# 5) R2 scores
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("CatBoost with Target Transformation")
print("Train R2 Score :", round(train_r2 * 100, 2))
print("Test  R2 Score :", round(test_r2 * 100, 2))
print("-" * 50)

CatBoost with Target Transformation
Train R2 Score : 89.21
Test  R2 Score : 83.91
--------------------------------------------------


- **HyperParameterTuning**


In [39]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'regressor__Model__depth': [4, 6, 8, 10],
    'regressor__Model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__Model__l2_leaf_reg': [1, 3, 5, 7, 9]
}

result = RandomizedSearchCV(catboost_scaled_target, param_grid,
                            cv=5, scoring='r2', return_train_score=True, n_jobs=-1)

result.fit(X, y)

0,1,2
,estimator,TransformedTa...531FA1BD0>)]))
,param_distributions,"{'regressor__Model__depth': [4, 6, ...], 'regressor__Model__l2_leaf_reg': [1, 3, ...], 'regressor__Model__learning_rate': [0.01, 0.05, ...]}"
,n_iter,10
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,transformers,"[('num_pipeline1', ...), ('num_pipeline2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [42]:
result.cv_results_['mean_train_score'].mean().round(2) * 100

np.float64(90.0)

In [43]:
result.cv_results_['mean_test_score'].mean().round(2) * 100

np.float64(78.0)

In [40]:
print("Best R2:", result.best_score_)
print("Best Params:", result.best_params_)

Best R2: 0.7974274242040508
Best Params: {'regressor__Model__learning_rate': 0.1, 'regressor__Model__l2_leaf_reg': 9, 'regressor__Model__depth': 6}


### **Final Model**


In [44]:
catboost_pipeline = Pipeline(steps=[('Preprocessing', preprocessing), (
    'Model', CatBoostRegressor(learning_rate=0.1, depth=6, l2_leaf_reg=9))])

catboost_model_final = TransformedTargetRegressor(
    regressor=catboost_pipeline, func=np.log1p, inverse_func=np.expm1)
catboost_model_final.fit(X, y)

0:	learn: 0.4943453	total: 21.7ms	remaining: 21.7s
1:	learn: 0.4640151	total: 29.1ms	remaining: 14.5s
2:	learn: 0.4374447	total: 33.3ms	remaining: 11.1s
3:	learn: 0.4145592	total: 37.7ms	remaining: 9.4s
4:	learn: 0.3943554	total: 40.6ms	remaining: 8.08s
5:	learn: 0.3754690	total: 45ms	remaining: 7.45s
6:	learn: 0.3590452	total: 48ms	remaining: 6.81s
7:	learn: 0.3443041	total: 52.5ms	remaining: 6.51s
8:	learn: 0.3316255	total: 55.4ms	remaining: 6.1s
9:	learn: 0.3200020	total: 59.3ms	remaining: 5.88s
10:	learn: 0.3105335	total: 62.1ms	remaining: 5.58s
11:	learn: 0.3024280	total: 64.9ms	remaining: 5.35s
12:	learn: 0.2935281	total: 80.5ms	remaining: 6.11s
13:	learn: 0.2853420	total: 88.2ms	remaining: 6.21s
14:	learn: 0.2786323	total: 94.1ms	remaining: 6.18s
15:	learn: 0.2728954	total: 99.5ms	remaining: 6.12s
16:	learn: 0.2678433	total: 103ms	remaining: 5.98s
17:	learn: 0.2632395	total: 108ms	remaining: 5.88s
18:	learn: 0.2586759	total: 111ms	remaining: 5.75s
19:	learn: 0.2539659	total: 115

0,1,2
,regressor,Pipeline(step...4531F8A1D0>)])
,transformer,
,func,<ufunc 'log1p'>
,inverse_func,<ufunc 'expm1'>
,check_inverse,True

0,1,2
,transformers,"[('num_pipeline1', ...), ('num_pipeline2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


### **Save Model**


In [46]:
import joblib
joblib.dump(catboost_model_final, 'model.pkl', compress=3)

['model.pkl']

In [47]:
model = joblib.load('model.pkl')

In [50]:
model.predict(X.head(5))

array([1200449.35633809,  996208.09803097, 1280126.37733963,
       1126334.04749861, 1318467.3159377 ])

In [49]:
print(model.regressor_)

Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num_pipeline1',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['car', 'yearbuilt']),
                                                 ('num_pipeline2',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['rooms', 'distance',
                                                   'bedroom2', 'bathroom',
                                   