In [118]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, StratifiedShuffleSplit, cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

In [119]:
# Load the housing CSV into a DataFrame; the path is relative, not absolute.
path = '../../dataset/housing.csv'
house_rent_df = pd.read_csv(filepath_or_buffer=path)

In [120]:
# Discard every row that has a missing value in any column.
house_rent_df = house_rent_df.dropna(axis=0, how='any')

In [121]:
# Preview the first five rows of the cleaned frame.
house_rent_df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [122]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
house_rent_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [123]:
# One-hot encode every object-dtype (categorical) column.
categorical_cols = house_rent_df.select_dtypes(include=['object']).columns

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
one_hot_values = encoder.fit_transform(house_rent_df[categorical_cols])
encoded_cols = pd.DataFrame(one_hot_values,
                            columns=encoder.get_feature_names_out(categorical_cols))

# Replace the raw categorical columns with their indicator columns. Both frames
# get a fresh 0..n-1 index first so the column-wise concat lines rows up by position.
house_rent_df = house_rent_df.drop(columns=categorical_cols)
house_rent_df = pd.concat(
    [house_rent_df.reset_index(drop=True), encoded_cols.reset_index(drop=True)],
    axis=1,
)

In [124]:
# Feature engineering: ratio features built from the raw count columns.

households = house_rent_df['households']
total_rooms = house_rent_df['total_rooms']

house_rent_df['rooms_per_household'] = total_rooms.div(households)
house_rent_df['bedrooms_per_room'] = house_rent_df['total_bedrooms'].div(total_rooms)
house_rent_df['population_per_household'] = house_rent_df['population'].div(households)

# Let's create median_income_cat for stratified sampling: median_income is
# bucketed into five labelled bins (1..5), the last one open-ended.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
house_rent_df['median_income_cat'] = pd.cut(house_rent_df['median_income'],
                                            bins=income_bin_edges,
                                            labels=income_bin_labels)

In [125]:
# Preview the first ten rows, now including the encoded and engineered columns.
house_rent_df.head(n=10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,rooms_per_household,bedrooms_per_room,population_per_household,median_income_cat
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0.0,0.0,0.0,1.0,0.0,6.984127,0.146591,2.555556,5
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0.0,0.0,0.0,1.0,0.0,6.238137,0.155797,2.109842,5
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0.0,0.0,0.0,1.0,0.0,8.288136,0.129516,2.80226,5
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0.0,0.0,0.0,1.0,0.0,5.817352,0.184458,2.547945,4
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0.0,0.0,0.0,1.0,0.0,6.281853,0.172096,2.181467,3
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,0.0,0.0,0.0,1.0,0.0,4.761658,0.231774,2.139896,3
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,0.0,0.0,0.0,1.0,0.0,4.931907,0.192899,2.128405,3
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,0.0,0.0,0.0,1.0,0.0,4.797527,0.221327,1.788253,3
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,0.0,0.0,0.0,1.0,0.0,4.294118,0.260274,2.026891,2
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,0.0,0.0,0.0,1.0,0.0,4.970588,0.199211,2.172269,3


In [126]:
# Rank every column by its correlation with the target, strongest positive first.
corr_matrix = house_rent_df.corr()
target_corr = corr_matrix.loc[:, 'median_house_value']
target_corr.sort_values(ascending=False)

median_house_value            1.000000
median_income                 0.688355
median_income_cat             0.643941
ocean_proximity_<1H OCEAN     0.257614
ocean_proximity_NEAR BAY      0.160526
rooms_per_household           0.151344
ocean_proximity_NEAR OCEAN    0.140378
total_rooms                   0.133294
housing_median_age            0.106432
households                    0.064894
total_bedrooms                0.049686
ocean_proximity_ISLAND        0.023525
population_per_household     -0.023639
population                   -0.025300
longitude                    -0.045398
latitude                     -0.144638
bedrooms_per_room            -0.255880
ocean_proximity_INLAND       -0.484787
Name: median_house_value, dtype: float64

In [127]:
# 80/20 train/test split, stratified on the income category so that both parts
# keep the same income distribution.
features = house_rent_df.drop(columns=['median_house_value'])
target = house_rent_df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=house_rent_df['median_income_cat'],
)

In [128]:
# Share of each income category in the training split. normalize=True makes
# pandas compute the fractions itself and label the result as proportions
# instead of leaving the misleading "count" name on a Series of fractions.
X_train['median_income_cat'].value_counts(normalize=True)

median_income_cat
3    0.350239
2    0.318855
4    0.176740
5    0.114340
1    0.039826
Name: count, dtype: float64

In [129]:
# Share of each income category in the test split. normalize=True makes
# pandas compute the fractions itself and label the result as proportions
# instead of leaving the misleading "count" name on a Series of fractions.
X_test['median_income_cat'].value_counts(normalize=True)

median_income_cat
3    0.350135
2    0.318816
4    0.176658
5    0.114509
1    0.039883
Name: count, dtype: float64

In [130]:
# Remove the helper income-category column from both splits before fitting models.
X_train, X_test = (
    split.drop(columns='median_income_cat') for split in (X_train, X_test)
)

In [131]:
def check_scores(model, X_train, X_test, y_train, y_test, cv=5):
    """Print RMSE diagnostics for an already fitted regressor.

    Printed, in order: RMSE on the training data, RMSE on the test data, the
    mean cross-validated RMSE on the training data, and the prediction for the
    first test row next to its true label.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``. It is also passed to
        ``cross_val_score`` for the cross-validated estimate.
    X_train, X_test : DataFrame-like
        Feature tables; ``X_test`` must support ``.iloc`` slicing.
    y_train, y_test : Series-like
        Targets matching the feature tables; ``y_test`` must support ``.iloc``.
    cv : optional
        Forwarded unchanged to ``cross_val_score``. Defaults to 5, the value
        that used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    None
        Every result is written to stdout.
    """
    def _rmse(y_true, y_pred):
        # Root of the mean squared error.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    print(f'RMSE train: {_rmse(y_train, model.predict(X_train))}')
    print(f'RMSE test: {_rmse(y_test, model.predict(X_test))}')

    # The scorer reports *negative* RMSE per fold, so flip the sign of the mean.
    fold_scores = cross_val_score(model, X_train, y_train,
                                  scoring='neg_root_mean_squared_error', cv=cv)
    print(f'Cross-validated Train RMSE: {-fold_scores.mean()}')

    # Predict one row
    first_row = X_test.iloc[:1]
    first_label = y_test.iloc[:1]
    print("Predicted:", model.predict(first_row))
    print("Label:", list(first_label))

In [132]:
# Baseline model: linear regression fitted on the training split.
model = LinearRegression().fit(X_train, y_train)
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [133]:
# Report the baseline's train / test / cross-validated RMSE.
check_scores(model=model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

RMSE train: 68240.43200842815
RMSE test: 66538.24009857781
Cross-validated Train RMSE: 68573.71459386975
Predicted: [356075.73339247]
Label: [451400.0]


In [134]:
# Let's use DecisionTreeRegressor and compare results.
# (DecisionTreeRegressor is imported in the notebook's top import cell.)
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)

# A training RMSE of 0.0 next to a far larger test RMSE means the tree has
# memorised the training set: the model is overfitting.
check_scores(tree_model, X_train, X_test, y_train, y_test)

RMSE train: 0.0
RMSE test: 69800.98207334532
Cross-validated Train RMSE: 70888.27485666268
Predicted: [338200.]
Label: [451400.0]


In [137]:
# Let's use RandomForestRegressor (100 trees) and compare results.
# (RandomForestRegressor is imported in the notebook's top import cell.)
forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
forest_model.fit(X_train, y_train)

# Lowest test and cross-validated RMSE of the three models so far, although the
# training RMSE is still far below the test RMSE, so some overfitting remains.
check_scores(forest_model, X_train, X_test, y_train, y_test)

RMSE train: 18698.36506440432
RMSE test: 48769.218719756245
Cross-validated Train RMSE: 50547.104969649095
Predicted: [402679.15]
Label: [451400.0]


In [140]:
# Tune RandomForestRegressor using RandomizedSearchCV.
# (RandomizedSearchCV is imported in the notebook's top import cell.)

# Candidate values per hyper-parameter; the search samples n_iter combinations
# from this space instead of trying all of them.
param_grid = {
    'n_estimators': [10, 30, 50],
    'max_features': [2, 4, 8],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
}

search = RandomizedSearchCV(forest_model,
                            param_grid,
                            n_iter=10,
                            cv=3,
                            scoring='neg_root_mean_squared_error',
                            random_state=42,
                            verbose=0,
                            return_train_score=False)
search.fit(X_train, y_train)

# Evaluate the best estimator found by the search with the same report as the other models.
best_model = search.best_estimator_
check_scores(best_model, X_train, X_test, y_train, y_test)

RMSE train: 22486.895797380428
RMSE test: 47865.50434516497
Cross-validated Train RMSE: 49702.79326523064
Predicted: [388929.98959524]
Label: [451400.0]


In [135]:
# # Let's predict the whole test set, compare the predictions with the actual values, and store the error % in a separate column
# y_pred = model.predict(X_test)
# results_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test})
# results_df['Error'] = results_df['Predicted'] - results_df['Actual']
# results_df['Error %'] = (results_df['Error'] / results_df['Actual']).abs() * 100
# results_df

In [136]:
# results_df.to_csv('house_rent_predictions.csv', index=False)