# **Preprocessing & Pipeline**

#### **Read Data**

In [33]:
import pandas as pd

data = pd.read_csv('cleaned_data.csv' , index_col= 0)
data

Unnamed: 0,suburb,rooms,type,price,method,sellerg,distance,bedroom2,bathroom,car,landsize,yearbuilt,councilarea,regionname,year,month,day,season
0,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra,Northern Metropolitan,2016,3,12,Spring
1,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,1900.0,Yarra,Northern Metropolitan,2016,4,2,Spring
2,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,1900.0,Yarra,Northern Metropolitan,2017,4,3,Spring
3,Abbotsford,3,h,850000.0,PI,Biggin,2.5,3.0,2.0,1.0,94.0,,Yarra,Northern Metropolitan,2017,4,3,Spring
4,Abbotsford,4,h,1600000.0,VB,Nelson,2.5,3.0,1.0,2.0,120.0,2014.0,Yarra,Northern Metropolitan,2016,4,6,Spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,4,h,1245000.0,S,Barry,16.7,4.0,2.0,2.0,652.0,1981.0,,South-Eastern Metropolitan,2017,8,26,Summer
13576,Williamstown,3,h,1031000.0,SP,Williams,6.8,3.0,2.0,2.0,333.0,1995.0,,Western Metropolitan,2017,8,26,Summer
13577,Williamstown,3,h,1170000.0,S,Raine,6.8,3.0,2.0,4.0,436.0,1997.0,,Western Metropolitan,2017,8,26,Summer
13578,Williamstown,4,h,2500000.0,PI,Sweeney,6.8,4.0,1.0,5.0,866.0,1920.0,,Western Metropolitan,2017,8,26,Summer


#### **Spilt Data To ( x , y )**

In [34]:
x = data.drop(columns=['price'])
y = data['price']

## **Numerical Pipeline**

- **Select Numerical Columns**

In [35]:
num_col = x.select_dtypes(include= 'number').columns
num_col

Index(['rooms', 'distance', 'bedroom2', 'bathroom', 'car', 'landsize',
       'yearbuilt', 'year', 'month', 'day'],
      dtype='object')

In [36]:
x.select_dtypes(include= 'number').head()

Unnamed: 0,rooms,distance,bedroom2,bathroom,car,landsize,yearbuilt,year,month,day
0,2,2.5,2.0,1.0,1.0,202.0,,2016,3,12
1,2,2.5,2.0,1.0,0.0,156.0,1900.0,2016,4,2
2,3,2.5,3.0,2.0,0.0,134.0,1900.0,2017,4,3
3,3,2.5,3.0,2.0,1.0,94.0,,2017,4,3
4,4,2.5,3.0,1.0,2.0,120.0,2014.0,2016,4,6


In [37]:
x.select_dtypes(include= 'number').isna().sum()

rooms           0
distance        0
bedroom2        0
bathroom        0
car            62
landsize        0
yearbuilt    5375
year            0
month           0
day             0
dtype: int64

### **Select Pipelines**

- **Pipeline 1 -> ( car , yearbuilt ) -> impute missing by  Median ->  Scaling By Standard Scaling**

In [38]:
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

imputer = SimpleImputer(strategy='most_frequent')
scaler = StandardScaler()

num_pipeline1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

num_pipeline1

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


- **Pipeline 2 --> (  rooms , distance ,  bedroom2 ,  bathroom  ,  landsize ,  year ,  month ,  day ) --> Scaling By Standard Scaling**

In [39]:
scaler2 = StandardScaler()

num_pipeline2 = Pipeline(steps=[
    ('scaler', StandardScaler())
])

num_pipeline2

0,1,2
,steps,"[('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


## **Categorical Pipeline**

- **Select Categorical Columns**

In [40]:
cat_col =x.select_dtypes(include= 'object').columns
cat_col

Index(['suburb', 'type', 'method', 'sellerg', 'councilarea', 'regionname',
       'season'],
      dtype='object')

In [41]:
x.select_dtypes(include= 'object').head()

Unnamed: 0,suburb,type,method,sellerg,councilarea,regionname,season
0,Abbotsford,h,S,Biggin,Yarra,Northern Metropolitan,Spring
1,Abbotsford,h,S,Biggin,Yarra,Northern Metropolitan,Spring
2,Abbotsford,h,SP,Biggin,Yarra,Northern Metropolitan,Spring
3,Abbotsford,h,PI,Biggin,Yarra,Northern Metropolitan,Spring
4,Abbotsford,h,VB,Nelson,Yarra,Northern Metropolitan,Spring


In [42]:
for col in cat_col:
    print(f'Col : {col}')
    print(x[col].nunique())
    print('-' * 20)

Col : suburb
314
--------------------
Col : type
3
--------------------
Col : method
5
--------------------
Col : sellerg
268
--------------------
Col : councilarea
33
--------------------
Col : regionname
8
--------------------
Col : season
4
--------------------


In [43]:
x.select_dtypes(include= 'object').isna().sum()

suburb            0
type              0
method            0
sellerg           0
councilarea    1369
regionname        0
season            0
dtype: int64

### **Select Pipelines**

- **Pipeline 1 ( Councilarea ) --> Impute Using Simple Imputer --> Encoding Using Target Encoding**

In [44]:
from category_encoders import TargetEncoder

imputer_cat = SimpleImputer(strategy='most_frequent')
encoder1 = TargetEncoder()

cat_pipeline1 = Pipeline(steps=[
    ('imputer', imputer_cat),
    ('encoder', encoder1)
])

cat_pipeline1

0,1,2
,steps,"[('imputer', ...), ('encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,


- **Pipeline 2 ( suburb - sellerg ) --> Encoding Using Target Encoder**

In [45]:
encoder2 = TargetEncoder()

cat_pipeline2 = Pipeline(steps=[
    ('encoder', encoder2)
])

cat_pipeline2

0,1,2
,steps,"[('encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,


- **Pipeline 3 ( type - method - regionname - season ) --> Encoding Using One Hot Encoder**

In [46]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop= 'first', sparse_output=False)

cat_pipeline3 = Pipeline(steps=[
    ('OneHotEncoder', ohe)
])

cat_pipeline3

0,1,2
,steps,"[('OneHotEncoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


## **Column Transformer**

In [47]:
from sklearn.compose import ColumnTransformer

preprocessing = ColumnTransformer(transformers=[
    ('num_pipeline1', num_pipeline1, ['car', 'yearbuilt']),
    ('num_pipeline2', num_pipeline2, ['rooms' , 'distance' , 'bedroom2' , 'bathroom'  , 'landsize' , 'year' , 'month' , 'day']),
    ('cat_pipeline1', cat_pipeline1, ['councilarea']),  
    ('cat_pipeline2', cat_pipeline2, ['suburb', 'sellerg']),  
    ('cat_pipeline3', cat_pipeline3, ['type', 'method', 'regionname', 'season'])

],
    remainder='passthrough'
)

preprocessing

0,1,2
,transformers,"[('num_pipeline1', ...), ('num_pipeline2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


### **Select Best Model**

In [48]:
# Import cross validation
from sklearn.model_selection import cross_validate
# Import Models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

models = [
    ('Linear Regression', LinearRegression(n_jobs= -1)),
    ('Knn', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor(random_state= 42)),
    ('Random Forest', RandomForestRegressor(random_state= 42, n_jobs= -1)),
    ('Xgboost', XGBRegressor()),
    ('CatBoost', CatBoostRegressor()),
    ('LightGBM', LGBMRegressor())
]

for model in models:

    model_pipeline = Pipeline(steps= [ ('Preprocessing', preprocessing), ('Model', model[1]) ])

    result = cross_validate(model_pipeline, x, y, cv= 5, scoring= 'r2', return_train_score= True, n_jobs= -1)

    print(model[0])
    print('Train R2 Score :', result['train_score'].mean().round(2) * 100)
    print('Test R2 Score :', result['test_score'].mean().round(2) * 100)
    # print('Total Training Time :', result['fit_time'].sum().round(2))
    print('-' * 50)

Linear Regression
Train R2 Score : 67.0
Test R2 Score : 64.0
--------------------------------------------------
Knn
Train R2 Score : 67.0
Test R2 Score : 40.0
--------------------------------------------------
Decision Tree
Train R2 Score : 100.0
Test R2 Score : 51.0
--------------------------------------------------
Random Forest
Train R2 Score : 97.0
Test R2 Score : 76.0
--------------------------------------------------
Xgboost
Train R2 Score : 95.0
Test R2 Score : 76.0
--------------------------------------------------
CatBoost
Train R2 Score : 92.0
Test R2 Score : 79.0
--------------------------------------------------
LightGBM
Train R2 Score : 89.0
Test R2 Score : 78.0
--------------------------------------------------


In [49]:
import numpy as np
from sklearn.compose import TransformedTargetRegressor
# Import cross validation
from sklearn.model_selection import cross_validate
# Import Models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

models = [
    ('Linear Regression', LinearRegression(n_jobs= -1)),
    ('Knn', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor(random_state= 42)),
    ('Random Forest', RandomForestRegressor(random_state= 42, n_jobs= -1)),
    ('Xgboost', XGBRegressor()),
    ('CatBoost', CatBoostRegressor()),
    ('LightGBM', LGBMRegressor())
]

for model in models:

    model_pipeline = Pipeline(steps= [ ('Preprocessing', preprocessing), ('Model', model[1]) ])
    model_pipeline_scaled_target = TransformedTargetRegressor(regressor= model_pipeline, func= np.log1p, inverse_func= np.expm1)
    result = cross_validate(model_pipeline_scaled_target, x, y, cv= 5, scoring= 'r2', return_train_score= True, n_jobs= -1)

    print(model[0])
    print('Train R2 Score :', result['train_score'].mean().round(2) * 100)
    print('Test R2 Score :', result['test_score'].mean().round(2) * 100)
    # print('Total Training Time :', result['fit_time'].sum().round(2))
    print('-' * 50)

Linear Regression
Train R2 Score : 69.0
Test R2 Score : -5.0
--------------------------------------------------
Knn
Train R2 Score : 72.0
Test R2 Score : 57.99999999999999
--------------------------------------------------
Decision Tree
Train R2 Score : 100.0
Test R2 Score : 53.0
--------------------------------------------------
Random Forest
Train R2 Score : 96.0
Test R2 Score : 75.0
--------------------------------------------------
Xgboost
Train R2 Score : 94.0
Test R2 Score : 77.0
--------------------------------------------------
CatBoost
Train R2 Score : 90.0
Test R2 Score : 80.0
--------------------------------------------------
LightGBM
Train R2 Score : 87.0
Test R2 Score : 78.0
--------------------------------------------------


- **Best Model Is CatBoost**

In [50]:
catboost = Pipeline(steps= [ ('Preprocessing', preprocessing), ('Model', CatBoostRegressor()) ])
catboost_scaled_target = TransformedTargetRegressor(regressor= catboost, func= np.log1p, inverse_func= np.expm1)
result = cross_validate(catboost_scaled_target, x, y, cv= 5, scoring= 'r2', return_train_score= True, n_jobs= -1)

In [51]:
result['train_score'].mean().round(2) * 100

np.float64(90.0)

In [52]:
result['test_score'].mean().round(2) * 100

np.float64(80.0)

In [53]:
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_validate
import numpy as np

# إنشاء البايبلاين
lightgbm = Pipeline(steps=[
    ('Preprocessing', preprocessing),   # خطوة المعالجة المسبقة
    ('Model', LGBMRegressor(            # موديل LightGBM
        n_estimators=1000,              # عدد الأشجار (iterations)
        learning_rate=0.05,             # معدل التعلم
        num_leaves=31,                  # عدد الأوراق في كل شجرة
        max_depth=6,                    # أقصى عمق للشجرة
        feature_fraction=0.8,           # نسبة الخصائص المستخدمة في كل تكرار
        bagging_fraction=0.8,           # نسبة البيانات المستخدمة في كل تكرار
        bagging_freq=5,                 # عدد مرات تفعيل bagging
        lambda_l1=0.1,                  # Regularization L1
        lambda_l2=0.1,                  # Regularization L2
        min_child_samples=30,          # أقل عدد بيانات في الورقة
        random_state=42
    ))
])

# تحويل الهدف (Target Scaling)
lightgbm_scaled_target = TransformedTargetRegressor(
    regressor=lightgbm,
    func=np.log1p,       # تحويل الهدف باستخدام log(1+x)
    inverse_func=np.expm1  # إعادة الهدف للشكل الأصلي باستخدام exp(x)-1
)

# عمل Cross Validation لتقييم النموذج
result = cross_validate(
    lightgbm_scaled_target,
    x, y,
    cv=5,                    # 5-fold cross validation
    scoring='r2',            # مقياس R²
    return_train_score=True, # إرجاع نتائج التدريب والاختبار
    n_jobs=-1                # استخدام كل الأنوية لتسريع العملية
)

print("Train R² Scores:", result['train_score'])
print("Test R² Scores:", result['test_score'])
print("Average Train R²:", result['train_score'].mean())
print("Average Test R²:", result['test_score'].mean())


Train R² Scores: [0.91966728 0.91798156 0.91605374 0.91871997 0.92654146]
Test R² Scores: [0.805779   0.81120081 0.77510968 0.81677434 0.75814824]
Average Train R²: 0.9197928039612796
Average Test R²: 0.793402413901327


- **HyperParameterTuning**

In [54]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'regressor__Model__depth': [4, 6, 8, 10],
    'regressor__Model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__Model__l2_leaf_reg': [1, 3, 5, 7, 9]
}   

result = RandomizedSearchCV(catboost_scaled_target, param_grid, cv= 5, scoring= 'r2', return_train_score= True, n_jobs= -1)

result.fit(x, y)

0:	learn: 0.4669229	total: 8.51ms	remaining: 8.5s
1:	learn: 0.4261855	total: 12.9ms	remaining: 6.45s
2:	learn: 0.3908735	total: 18.2ms	remaining: 6.06s
3:	learn: 0.3629170	total: 22.7ms	remaining: 5.65s
4:	learn: 0.3406693	total: 26.4ms	remaining: 5.26s
5:	learn: 0.3241608	total: 30ms	remaining: 4.97s
6:	learn: 0.3078272	total: 35.1ms	remaining: 4.98s
7:	learn: 0.2954988	total: 38.8ms	remaining: 4.82s
8:	learn: 0.2854385	total: 42.6ms	remaining: 4.69s
9:	learn: 0.2759400	total: 46.2ms	remaining: 4.57s
10:	learn: 0.2680109	total: 56.1ms	remaining: 5.04s
11:	learn: 0.2625193	total: 60.5ms	remaining: 4.98s
12:	learn: 0.2575562	total: 67.1ms	remaining: 5.1s
13:	learn: 0.2528637	total: 71.3ms	remaining: 5.02s
14:	learn: 0.2490673	total: 74.9ms	remaining: 4.92s
15:	learn: 0.2452678	total: 78.7ms	remaining: 4.84s
16:	learn: 0.2422316	total: 83.3ms	remaining: 4.82s
17:	learn: 0.2398219	total: 87.2ms	remaining: 4.76s
18:	learn: 0.2372661	total: 91.5ms	remaining: 4.72s
19:	learn: 0.2354764	total

0,1,2
,estimator,TransformedTa...5B71FAB10>)]))
,param_distributions,"{'regressor__Model__depth': [4, 6, ...], 'regressor__Model__l2_leaf_reg': [1, 3, ...], 'regressor__Model__learning_rate': [0.01, 0.05, ...]}"
,n_iter,10
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,transformers,"[('num_pipeline1', ...), ('num_pipeline2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [55]:
result.best_params_

{'regressor__Model__learning_rate': 0.2,
 'regressor__Model__l2_leaf_reg': 3,
 'regressor__Model__depth': 4}

In [56]:
result.cv_results_['mean_train_score'].mean().round(2) * 100

np.float64(88.0)

In [57]:
result.cv_results_['mean_test_score'].mean().round(2) * 100

np.float64(77.0)

- **Random Forest**

In [58]:
random_forest = Pipeline(steps= [ ('Preprocessing', preprocessing), ('Model', RandomForestRegressor()) ])
random_scaled_target = TransformedTargetRegressor(regressor= random_forest, func= np.log1p, inverse_func= np.expm1)
result = cross_validate(random_scaled_target, x, y, cv= 5, scoring= 'r2', return_train_score= True, n_jobs= -1)

In [59]:
result['train_score'].mean().round(2) * 100

np.float64(96.0)

In [60]:
result['test_score'].mean().round(2) * 100

np.float64(75.0)

In [61]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    
    'regressor__Model__n_estimators': [100, 200, 300, 500],
    'regressor__Model__max_depth': [4, 6, 8, 10, None],
    'regressor__Model__min_samples_split': [2, 5, 10],
    'regressor__Model__min_samples_leaf': [1, 2, 4],
    'regressor__Model__max_features': ['sqrt', 'log2', None]

}


result = RandomizedSearchCV(
    estimator=random_scaled_target,     
    param_distributions=param_grid, 
    cv=5,
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    random_state=42
)

result.fit(x, y)

0,1,2
,estimator,TransformedTa...egressor())]))
,param_distributions,"{'regressor__Model__max_depth': [4, 6, ...], 'regressor__Model__max_features': ['sqrt', 'log2', ...], 'regressor__Model__min_samples_leaf': [1, 2, ...], 'regressor__Model__min_samples_split': [2, 5, ...], ...}"
,n_iter,10
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('num_pipeline1', ...), ('num_pipeline2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [62]:
result.best_params_

{'regressor__Model__n_estimators': 200,
 'regressor__Model__min_samples_split': 5,
 'regressor__Model__min_samples_leaf': 2,
 'regressor__Model__max_features': None,
 'regressor__Model__max_depth': None}

In [63]:
result.cv_results_['mean_train_score']

array([0.72660721, 0.56949779, 0.93009508, 0.57065229, 0.95484611,
       0.57034224, 0.79141652, 0.84581752, 0.78782307, 0.76080016])

### **Final Model**

In [64]:
catboost_pipeline = Pipeline(steps= [ ('Preprocessing', preprocessing),
                                      ('Model', CatBoostRegressor(learning_rate = 0.002, depth = 4 , l2_leaf_reg = 3)) ])

catboost_model_final = TransformedTargetRegressor(regressor= catboost_pipeline, func= np.log1p, inverse_func= np.expm1)
catboost_model_final.fit(x, y)

0:	learn: 0.5261398	total: 12.7ms	remaining: 12.7s
1:	learn: 0.5255231	total: 19.2ms	remaining: 9.56s
2:	learn: 0.5249165	total: 24.5ms	remaining: 8.13s
3:	learn: 0.5242915	total: 28.9ms	remaining: 7.19s
4:	learn: 0.5236811	total: 32.9ms	remaining: 6.55s
5:	learn: 0.5230798	total: 39.8ms	remaining: 6.6s
6:	learn: 0.5224729	total: 43.7ms	remaining: 6.19s
7:	learn: 0.5218550	total: 47.5ms	remaining: 5.89s
8:	learn: 0.5212623	total: 52.9ms	remaining: 5.83s
9:	learn: 0.5206653	total: 56.9ms	remaining: 5.63s
10:	learn: 0.5200529	total: 60.7ms	remaining: 5.45s
11:	learn: 0.5194474	total: 64.5ms	remaining: 5.31s
12:	learn: 0.5188598	total: 70.3ms	remaining: 5.34s
13:	learn: 0.5182536	total: 74.4ms	remaining: 5.24s
14:	learn: 0.5176704	total: 78.2ms	remaining: 5.13s
15:	learn: 0.5170869	total: 82.5ms	remaining: 5.07s
16:	learn: 0.5165130	total: 89.5ms	remaining: 5.17s
17:	learn: 0.5159203	total: 93.2ms	remaining: 5.08s
18:	learn: 0.5153155	total: 97.5ms	remaining: 5.03s
19:	learn: 0.5147173	to

0,1,2
,regressor,Pipeline(step...55B71EB530>)])
,transformer,
,func,<ufunc 'log1p'>
,inverse_func,<ufunc 'expm1'>
,check_inverse,True

0,1,2
,transformers,"[('num_pipeline1', ...), ('num_pipeline2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


### **Save Model**

In [73]:
import joblib
joblib.dump(model, 'model.pkl', compress=3)

['model.pkl']

In [74]:
model = joblib.load('model.pkl')

In [75]:
model.predict(x.head(1))

array([973504.88831593])

In [76]:
import joblib

model = joblib.load('model.pkl')
print(model.regressor_)   # دا الموديل الداخلي اللي جوه TransformedTargetRegressor


Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num_pipeline1',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['car', 'yearbuilt']),
                                                 ('num_pipeline2',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['rooms', 'distance',
                                                   'bedroom2', 'bathroom',
                                   