<a href="https://www.kaggle.com/code/habchiabdennour/prediction-with-decision-tree-regression?scriptVersionId=296761819" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import  Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error


In [2]:
df = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')

In [3]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


# Quick info

In [4]:
df.shape

(20640, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [6]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [7]:
df["total_bedrooms"]

0         129.0
1        1106.0
2         190.0
3         235.0
4         280.0
          ...  
20635     374.0
20636     150.0
20637     485.0
20638     409.0
20639     616.0
Name: total_bedrooms, Length: 20640, dtype: float64

#  Split features and target

In [8]:
y=df['median_house_value']
X=df.drop('median_house_value',axis=1)

# Define columns

In [9]:
num_cols=X.select_dtypes('number').columns
cat_cols=['ocean_proximity']


# Preprocessing pipelines

In [10]:
num_pipeline =Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median'))
])

cat_pipeline=Pipeline(steps=[
    ("onehot",OneHotEncoder(drop='first',sparse_output=False))
])

preprocessor=ColumnTransformer(
    transformers=[
        ('num',num_pipeline,num_cols),
        ('cat',cat_pipeline,cat_cols)
    ],
)

# Train / Test split

In [11]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=47)

# Full Pipeline  ( preprocessing + model )

In [12]:
model= Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('tree',DecisionTreeRegressor(criterion='squared_error',max_depth=6))
])

# Train model

In [13]:
model.fit(X_train,y_train)

# Predictions

In [14]:
y_train_pred=model.predict(X_train)
y_test_pred=model.predict(X_test)


# Metrics

In [15]:
r2_train=model.score(X_train,y_train)
r2_test=model.score(X_test,y_test)
mse=mean_squared_error(y_test,y_test_pred)

rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f'r2_train : {r2_train:.4f}')
print(f'r2_test : {r2_test:.4f}')
print(f'mse : {mse:.2f}')
print(f'rmse : {rmse:.2f}')

r2_train : 0.6775
r2_test : 0.6590
mse : 4647632497.58
rmse : 68173.55


# Hyperparameter Tuning with RandomizedSearchCV


In [16]:
param_dist = {
    'tree__max_depth': [None, 3, 5, 8, 12, 20],
    'tree__min_samples_leaf': [1, 5, 10, 20, 50],
    'tree__min_samples_split': [2, 10, 20, 50],
    'tree__max_features': [None, 'sqrt', 'log2', 0.5]
}


In [17]:

tree_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('tree', DecisionTreeRegressor(random_state=42))
])

search = RandomizedSearchCV(
    tree_pipeline,
    param_distributions=param_dist,
    n_iter=40,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)


In [18]:
print("Best params:", search.best_params_)
print("Best CV R2:", search.best_score_)


Best params: {'tree__min_samples_split': 50, 'tree__min_samples_leaf': 10, 'tree__max_features': None, 'tree__max_depth': None}
Best CV R2: 0.737595937613378


In [19]:
best_tree = search.best_estimator_
r2_test = best_tree.score(X_test, y_test)


In [20]:
print("best_tree : ,",best_tree)


best_tree : , Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(drop='first',
                                                                                 sparse_output=False))]),
                                                  ['ocean_proximity'])])),
                ('tree',
                 DecisionTreeRegressor(min_samples_leaf=10,
     

In [21]:
print("r2_test : ,",r2_test)


r2_test : , 0.7599091752852362


In [22]:
search.best_score_


np.float64(0.737595937613378)