# In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# --- Load the raw data and take a first look --------------------------------
housing_original = pd.read_csv("housing.csv")

housing_original.info()
# Preview the head in two column slices (the slices share column index 5).
housing_original.iloc[:, :6].head()
housing_original.iloc[:, 5:].head()

# Inspect the category counts, then keep every row whose category is not
# 'ISLAND' and re-check the counts.
housing_original['ocean_proximity'].value_counts()
housing = housing_original.loc[housing_original['ocean_proximity'].ne('ISLAND')]
housing['ocean_proximity'].value_counts()

# One-hot encode the categorical column into integer 'dmy_*' indicator columns.
housing = pd.get_dummies(
    housing,
    columns=['ocean_proximity'],
    prefix='dmy',
    dtype=int,
)
# Spot-check a handful of rows, showing the columns from position 9 onwards.
housing.iloc[[1, 200, 1000, 1850, 5000], 9:]

# --- Train / validation / test split -----------------------------------------
# Hold out 20% of the rows for testing, then carve 25% of the remainder off as
# a validation set (0.25 * 0.8 = 20% of all rows, before NaN rows are dropped).
train_full, test = train_test_split(housing, random_state=40, test_size=0.2)
train, val = train_test_split(train_full, random_state=36, test_size=0.25)

# Compare the non-null counts before and after removing rows with missing values.
train.info()
train = train.dropna(axis=0, how='any')
train.info()

# Apply the same row-wise NaN removal to the evaluation splits.
val, test = val.dropna(axis=0, how='any'), test.dropna(axis=0, how='any')

from sklearn.impute import SimpleImputer

# --- Small stand-alone demo of mean imputation --------------------------------
# Toy frame with a single missing value in column 'B'.
imputer_demo_df = pd.DataFrame(
    {
        'A': [7, 4, 10],
        'B': [2, np.nan, 5],
        'C': [3, 6, 9],
    }
)
print(imputer_demo_df)

# Learn the per-column means, then fill the gap with them.
imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit(imputer_demo_df).transform(imputer_demo_df)
print(df_imputed)

# Show the container type going in and coming out of the imputer.
print("Before imputation, type:", type(imputer_demo_df))
print("After imputation, type:", type(df_imputed))

import matplotlib.image as mpimg

# --- Geographic scatter of the training districts over a map image ------------
california_img = mpimg.imread('california.png')

# Marker size is population / 100; marker colour is the median house value.
# The built-in colorbar is disabled because one is added by hand below.
ax = train.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    figsize=(10, 7),
    s=train['population'] / 100,
    label="Population",
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=False,
    alpha=0.4,
)
# Stretch the map image over the given longitude/latitude extent.
plt.imshow(
    california_img,
    extent=[-124.55, -113.80, 32.45, 42.05],
    alpha=0.5,
    cmap=plt.get_cmap("jet"),
)
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

# Hand-built colorbar: 11 evenly spaced prices between the training min and
# max, placed at price / max(price) and labelled in thousands of dollars.
# NOTE(review): plt.colorbar() is called without an explicit mappable, so it
# attaches to whatever matplotlib considers the current image - confirm this
# is the intended one.
prices = train["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar(ticks=tick_values / prices.max())
cbar.ax.set_yticklabels(
    ["$%dk" % round(v / 1000) for v in tick_values],
    fontsize=14,
)
cbar.set_label('Median House Value', fontsize=16)
plt.legend(fontsize=16)

# --- Exploratory analysis (training split only) -------------------------------
# Correlation of every column with the target, strongest positive first.
corr_matrix = train.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

# Pairwise scatter plots for the target and a few selected attributes.
# This is drawn from `train`, like the correlations above and the plots below,
# so the validation/test rows contained in `housing` are not inspected during
# exploration.
attributes = ["median_house_value", "median_income", "total_bedrooms", "dmy_INLAND"]
pd.plotting.scatter_matrix(train[attributes], figsize=(13, 10))
plt.show()

# Closer look at median income against the target.
plt.scatter(train['median_income'], train['median_house_value'], alpha=0.7, edgecolor='k')
plt.xlabel('Median Income')
plt.ylabel('Median House Value')
plt.show()

# One boxplot of the target per ocean-proximity indicator, laid out on a
# 2x2 grid in row-major order.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
proximity_dummies = ['dmy_<1H OCEAN', 'dmy_INLAND', 'dmy_NEAR BAY', 'dmy_NEAR OCEAN']
for panel, dummy_column in zip(axes.flat, proximity_dummies):
    sns.boxplot(data=train, x=dummy_column, y='median_house_value', ax=panel)

# --- Separate features (X) from the target (y) for every split ----------------
TARGET_COLUMN = 'median_house_value'

# `train_full` was created before the NaN rows were dropped from train/val/test,
# so it still contains rows with missing values. Drop them here as well so the
# combined train+validation data is cleaned the same way as the splits that
# were used to tune and evaluate the models.
train_full_clean = train_full.dropna()
X_train_full = train_full_clean.drop(columns=[TARGET_COLUMN])
y_train_full = train_full_clean[TARGET_COLUMN]

X_train, y_train = train.drop(columns=[TARGET_COLUMN]), train[TARGET_COLUMN]
X_val, y_val = val.drop(columns=[TARGET_COLUMN]), val[TARGET_COLUMN]
X_test, y_test = test.drop(columns=[TARGET_COLUMN]), test[TARGET_COLUMN]

# Sanity check: feature matrix and target vector must have the same row count.
print(X_train.shape)
print(y_train.shape)

from sklearn.linear_model import LinearRegression

# --- Baseline: ordinary least-squares regression on the training split --------
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)

import time
from sklearn.ensemble import RandomForestRegressor

# --- Random forest tuned with a 5-fold cross-validated grid search ------------
# The timer brackets both the estimator construction and the whole search.
start_time = time.time()
rf = RandomForestRegressor()

# 4 depths x 3 forest sizes = 12 candidate settings.
hyperparam_grid = {
    'max_depth': [5, 10, 15, 50],
    'n_estimators': [1, 5, 10],
}

grid_search = GridSearchCV(
    rf,
    hyperparam_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)
end_time = time.time()

execution_time = end_time - start_time
print(f"GridSearchCV fitting took {execution_time:.4f} seconds.")
print("Best Hyperparameters from GridSearchCV:", grid_search.best_params_)
# To inspect the full table of results: pd.DataFrame(grid_search.cv_results_)

# --- Compare both models on the validation split ------------------------------
lr_pred_val = lin_reg.predict(X_val)
rf_pred_val = grid_search.predict(X_val)

# Compute the forest's RMSE once and reuse it for the relative error below.
rf_rmse_val = root_mean_squared_error(y_val, rf_pred_val)

print('RMSE Linear Regression:', root_mean_squared_error(y_val, lr_pred_val))
print('RMSE Random Forest Regression:', rf_rmse_val)

# Put the error in context: mean target value and RMSE relative to that mean.
print(np.mean(y_val))
print(rf_rmse_val / np.mean(y_val))

# Distribution of the signed prediction errors (actual minus predicted).
errors = y_val - rf_pred_val
plt.figure(figsize=(4, 3))
plt.boxplot(errors)
# The title previously had all of its spaces stripped out.
plt.title('Boxplot of Prediction Errors (y_val - rf_pred_val)')
plt.grid(True)

# --- Refit with the winning hyperparameters and score on the test split -------
best_params = grid_search.best_params_
best_params

# A fresh forest using the tuned settings, trained on the train_full features.
best_rf = RandomForestRegressor(**best_params)
best_rf.fit(X=X_train_full, y=y_train_full)

best_rf.get_params()

rf_pred_test = best_rf.predict(X_test)
rmse_test = root_mean_squared_error(y_test, rf_pred_test)
print('RMSE Random Forest on Test data:', rmse_test)
# RMSE relative to the mean target value of the test split.
print(rmse_test / np.mean(y_test))

# Distribution of the signed test errors (actual minus predicted).
errors = y_test - rf_pred_test
plt.figure(figsize=(4, 3))
plt.boxplot(errors)
plt.title('Boxplot of Prediction Errors (y_test- rf_pred_test)')
plt.grid(True)

# joblib is used below to persist the model but was never imported, which made
# the dump call fail with a NameError.
import joblib

# --- Train the final model on all available data and persist it ---------------
# `housing` still contains the rows with missing values that were dropped from
# every split used for tuning and evaluation; drop them here too so the saved
# model is trained on data cleaned in the same way.
housing_clean = housing.dropna()
X_full = housing_clean.drop(columns=['median_house_value'])
y_full = housing_clean['median_house_value']

saved_model = RandomForestRegressor(**best_params)
saved_model.fit(X_full, y_full)

# Write the fitted estimator to disk.
joblib.dump(saved_model, 'rf_saved_model.pkl')

# --- Predict for two hand-written example districts ---------------------------
# One record per district; the keys use the same names and order as the feature
# columns the model was fitted on (raw attributes, then the dmy_* indicators).
new_districts = pd.DataFrame(
    [
        {
            'longitude': -118.30,
            'latitude': 34.20,
            'housing_median_age': 35.0,
            'total_rooms': 880.0,
            'total_bedrooms': 200.0,
            'population': 500.0,
            'households': 220.0,
            'median_income': 4.2,
            'dmy_<1H OCEAN': 0,
            'dmy_INLAND': 1,
            'dmy_NEAR BAY': 0,
            'dmy_NEAR OCEAN': 0,
        },
        {
            'longitude': -117.85,
            'latitude': 33.90,
            'housing_median_age': 20.0,
            'total_rooms': 1200.0,
            'total_bedrooms': 300.0,
            'population': 750.0,
            'households': 280.0,
            'median_income': 5.1,
            'dmy_<1H OCEAN': 1,
            'dmy_INLAND': 0,
            'dmy_NEAR BAY': 0,
            'dmy_NEAR OCEAN': 0,
        },
    ]
)

predicted_values = saved_model.predict(new_districts)

# Report each prediction with a 1-based district number.
for position, value in enumerate(predicted_values):
    print(f"Predicted median house value for district {position + 1}: ${value:,.2f}")