In [1]:
import pandas as pd
import numpy as np
from matplotlib.pyplot import plot
import seaborn as sns
%matplotlib inline

In [3]:
df=pd.read_csv('../Data_Set/housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
df['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,20640.0,-119.569704,2.003532,-124.35,-121.8,-118.49,-118.01,-114.31
latitude,20640.0,35.631861,2.135952,32.54,33.93,34.26,37.71,41.95
housing_median_age,20640.0,28.639486,12.585558,1.0,18.0,29.0,37.0,52.0
total_rooms,20640.0,2635.763081,2181.615252,2.0,1447.75,2127.0,3148.0,39320.0
total_bedrooms,20433.0,537.870553,421.38507,1.0,296.0,435.0,647.0,6445.0
population,20640.0,1425.476744,1132.462122,3.0,787.0,1166.0,1725.0,35682.0
households,20640.0,499.53968,382.329753,1.0,280.0,409.0,605.0,6082.0
median_income,20640.0,3.870671,1.899822,0.4999,2.5634,3.5348,4.74325,15.0001
median_house_value,20640.0,206855.816909,115395.615874,14999.0,119600.0,179700.0,264725.0,500001.0


In [8]:
df1= df.copy()

# convert categorical to numerical 
df1['ocean_proximity']=df1['ocean_proximity'].map({
    '<1H OCEAN':0,
    'INLAND':1,
    'NEAR OCEAN':2,
    'NEAR BAY':3,
    'ISLAND':4
})

# Repalce nan value with mean
df1['total_bedrooms'].fillna(df1['total_bedrooms'].mean(), inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['total_bedrooms'].fillna(df1['total_bedrooms'].mean(), inplace= True)


In [9]:
df1.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,3
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,3
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,3
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,3


In [10]:
# Normalize
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
num_cols = df1.drop(columns='ocean_proximity')  # numeric columns
df1[num_cols.columns] = scaler.fit_transform(num_cols)

In [11]:
# train test split
from sklearn.model_selection import train_test_split

X = df1.drop('median_house_value', axis=1)
y = df1['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [12]:
# Shapes
X_train.shape,y_train.shape,X_test.shape, y_test.shape

((16512, 9), (16512,), (4128, 9), (4128,))

**Gradiant Boosting**

In [13]:
# gradiant bosting and xgboosting
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    random_state=42
)

# Fit the model
gbr.fit(X_train,y_train)


In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# accuracy
y_pred_gbr= gbr.predict(X_test)

# Gradient Boosting
mse_gbr = mean_squared_error(y_test, y_pred_gbr)*100
# rmse_gbr = mean_squared_error(y_test, y_pred_gbr, squared=False)  # Root MSE
mae_gbr = mean_absolute_error(y_test, y_pred_gbr)*100
r2_gbr  = r2_score(y_test, y_pred_gbr)*100

print("Gradient Boosting:")
print("  MSE :", mse_gbr)
# print("  RMSE:", rmse_gbr)
print("  MAE :", mae_gbr)
print("  R²  :", r2_gbr)

Gradient Boosting:
  MSE : 0.9837432910751408
  MAE : 6.5992709522537565
  R²  : 82.34116768493863


**XG-Boost**

In [17]:
import xgboost
from xgboost import XGBRegressor
xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# train the model
xgb.fit(X_train,y_train)

In [19]:
# predict
y_pred_xgb = xgb.predict(X_test)

mse_xgb = mean_squared_error(y_test, y_pred_xgb)*100
# rmse_xgb = mean_squared_error(y_test, y_pred_xgb, squared=False)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)*100
r2_xgb  = r2_score(y_test, y_pred_xgb)*100

print("XGBoost Results:")
print("  MSE :", mse_xgb)
# print("  RMSE:", rmse_xgb)
print("  MAE :", mae_xgb)
print("  R²  :", r2_xgb)

XGBoost Results:
  MSE : 0.9605916779730925
  MAE : 6.5502830648307935
  R²  : 82.7567542076639


In [4]:
import pandas as pd

df = pd.read_csv("../artifacts/train_processed.csv")
print(df['price_class'].value_counts())


KeyError: 'price_class'