In [1]:
import os
import tarfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from six.moves import urllib
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import category_encoders as ce
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [5]:

housing=pd.read_csv('housing.csv')

In [6]:
# Create new features
housing['rooms_per_household'] = housing['total_rooms'] / housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms'] / housing['total_rooms']
housing['population_per_household'] = housing['population'] / housing['households']

In [7]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.802260
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,5.045455,0.224625,2.560606
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,6.114035,0.215208,3.122807
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,5.205543,0.215173,2.325635
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,5.329513,0.219892,2.123209


In [8]:
# Log transformation for skewed features
housing['log_median_income'] = np.log1p(housing['median_income'])
housing['log_median_house_value'] = np.log1p(housing['median_house_value'])

In [9]:
# Drop NaN values created by divisions
housing = housing.dropna()

In [10]:
null_values = housing.isnull().sum()
print(null_values)

longitude                   0
latitude                    0
housing_median_age          0
total_rooms                 0
total_bedrooms              0
population                  0
households                  0
median_income               0
median_house_value          0
ocean_proximity             0
rooms_per_household         0
bedrooms_per_room           0
population_per_household    0
log_median_income           0
log_median_house_value      0
dtype: int64


In [11]:
# Prepare the features and target variable
x = housing.drop(columns=['median_house_value', 'log_median_house_value'])
y = housing['median_house_value']

In [12]:
x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,log_median_income
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY,6.984127,0.146591,2.555556,2.232720
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY,6.238137,0.155797,2.109842,2.230165
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY,8.288136,0.129516,2.802260,2.111110
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY,5.817352,0.184458,2.547945,1.893579
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY,6.281853,0.172096,2.181467,1.578195
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,INLAND,5.045455,0.224625,2.560606,0.940124
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,INLAND,6.114035,0.215208,3.122807,1.268861
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND,5.205543,0.215173,2.325635,0.993252
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND,5.329513,0.219892,2.123209,1.053336


In [13]:
import category_encoders as ce
# Target Encoding for 'ocean_proximity'
encoder = ce.TargetEncoder(cols=['ocean_proximity'])
x = encoder.fit_transform(x, y)

In [14]:
# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [15]:
# Initialize and train the RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

In [16]:
# Predict on the test set
y_pred = rf.predict(x_test)

# Evaluate the model
print(f"R2 score: {r2_score(y_test, y_pred):.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")

R2 score: 0.81
MAE: 32934.06
MSE: 2582190300.99
RMSE: 50815.26


In [17]:
some_data = x_train[:10]
some_labels = y_train.iloc[:10]
my_pred = rf.predict(some_data).tolist()

for i in range(10):
    print(f'real price: {some_labels.tolist() [i]:8.2f}\t\tpredicted price: {my_pred[i]:8.2f}')

real price: 227600.00		predicted price: 231064.00
real price: 110400.00		predicted price: 104211.00
real price: 248100.00		predicted price: 268063.00
real price: 305600.00		predicted price: 275838.01
real price: 214600.00		predicted price: 201209.00
real price: 227300.00		predicted price: 219136.00
real price: 177200.00		predicted price: 188114.04
real price: 84700.00		predicted price: 77318.00
real price: 451400.00		predicted price: 444487.11
real price: 500001.00		predicted price: 457274.68


In [18]:
joblib.dump(rf, 'random_forest_regressor_model.pkl')

['random_forest_regressor_model.pkl']

In [19]:
# Save the model column names to a file
model_columns = x_train.columns.tolist()
joblib.dump(model_columns, 'model_columnsCOPY.pkl')


['model_columnsCOPY.pkl']

In [20]:
joblib.dump(encoder, 'target_encoder.pkl') 

['target_encoder.pkl']

In [None]:
def predict_house_price():
    try:
        longitude = float(input("Enter longitude: "))
        latitude = float(input("Enter latitude: "))
        housing_median_age = float(input("Enter housing median age: "))
        total_rooms = float(input("Enter total rooms: "))
        total_bedrooms = float(input("Enter total bedrooms: "))
        population = float(input("Enter population: "))
        households = float(input("Enter households: "))
        median_income = float(input("Enter median income: "))
        ocean_proximity = input("Enter ocean proximity (NEAR BAY, <1H OCEAN, INLAND, NEAR OCEAN, ISLAND): ")
        
        rooms_per_household = total_rooms / households
        bedrooms_per_room = total_bedrooms / total_rooms
        population_per_household = population / households
        log_median_income = np.log(median_income)
        
        user_input = pd.DataFrame({
            'longitude': [longitude],
            'latitude': [latitude],
            'housing_median_age': [housing_median_age],
            'total_rooms': [total_rooms],
            'total_bedrooms': [total_bedrooms],
            'population': [population],
            'households': [households],
            'median_income': [median_income],
            'ocean_proximity': [ocean_proximity],
            'rooms_per_household': [rooms_per_household],
            'bedrooms_per_room': [bedrooms_per_room],
            'population_per_household': [population_per_household],
            'log_median_income': [log_median_income]
        })
        
        encoder = joblib.load('target_encoder.pkl')
        user_input = encoder.transform(user_input)

        user_input = user_input[model_columns]

        predicted_price = rf.predict(user_input)

        print(f"The predicted house price is: ${predicted_price[0]:,.2f}")

    except ValueError as ve:
        print(f"Invalid input: {ve}")
    except Exception as e:
        print(f"An error occurred: {e}")

predict_house_price()

The predicted house price is: $306,734.13
