# California Housing Data Regression

#### Apply several regression techniques with Median House Value as the target variable, then evaluate and compare the performance and results of each.

## Import Libraries

In [None]:
from sklearn.datasets import fetch_california_housing # Brings in Dataset

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import plotly.express as px
#import geopandas
from pandas_profiling import ProfileReport
from urllib.request import urlopen
import json
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.eval_measures import rmse
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, plot_importance


### Set plotting parameters

In [None]:
# Default size for matplotlib figures that do not set their own figsize
matplotlib.rcParams['figure.figsize'] = [12, 8]
# Seaborn appearance: dark grid style, fonts scaled by 1.2
sns.set_style('darkgrid')
sns.set(font_scale=1.2)
# Render matplotlib figures inline in the notebook output
%matplotlib inline

### Defining Classes/Functions

In [None]:
def regression(method, x_dat, y_dat, regression_type, **params):
    """Fit a regressor and report in-sample metrics and diagnostic plots.

    The model is fitted and evaluated on the same data, so every metric and
    plot produced here describes the training fit, not performance on unseen
    data.

    Parameters
    ----------
    method : callable
        Estimator class (or factory); it is called as ``method(**params)`` and
        the result must provide ``fit`` and ``predict``.
    x_dat : DataFrame
        Feature table. Its ``Longitude`` and ``Latitude`` columns are used for
        the location plots.
    y_dat : Series
        Target values aligned with ``x_dat``.
    regression_type : str
        ``'Tree'`` additionally plots ``feature_importances_``; ``'Linear'``
        additionally prints the coefficients via ``coef_results``. Any other
        value skips both extras.
    **params
        Keyword arguments forwarded to ``method``.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    # Fit the model and predict on the very data it was fitted on
    mod = method(**params)
    mod.fit(x_dat, y_dat)
    y_pred = mod.predict(x_dat)

    regression_results(y_dat, y_pred)

    if regression_type == 'Tree':
        print('Feature Importance Plot')
        sns.barplot(y=x_dat.columns, x=mod.feature_importances_)
        # Gini impurity is a classification criterion; regressors do not use
        # it, so the axis gets a neutral label instead
        plt.xlabel('Feature Importance')
        plt.show()

    if regression_type == 'Linear':
        coef_results(x_dat, mod)

    # Both maps share the colour scale of the actual values so that the
    # predicted map can be compared with the actual one directly
    value_range = (y_dat.min(), y_dat.max())

    print('Predicted Vs. Actual By Location')
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 12))
    sns.scatterplot(x=x_dat.Longitude, y=x_dat.Latitude, hue=y_dat, ax=ax1,
                    palette='viridis', hue_norm=value_range)
    sns.scatterplot(x=x_dat.Longitude, y=x_dat.Latitude, hue=y_pred, ax=ax2,
                    palette='viridis', hue_norm=value_range)
    ax1.set_title('Actual Housing values')
    ax2.set_title('Predicted Housing Values')
    plt.show()

    print('Predicted Vs. Actual Values')
    plt.figure(figsize=(12, 12))
    sns.scatterplot(x=y_dat, y=y_pred)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.show()

In [None]:
def regression_results(y_true, y_pred):
    """Print R2, MAE, MSE and RMSE for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are printed, each rounded to 5 decimal places.
    """
    print(color.UNDERLINE+'EVALUATION METRICS'+color.END)
    # Regression metrics (local names deliberately differ from the metric
    # function names so nothing is shadowed)
    r2 = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)

    print(color.BOLD + 'R2:  ' + color.END, round(r2,5))
    print(color.BOLD + 'MAE: ' + color.END, round(mae,5))
    print(color.BOLD + 'MSE: ' + color.END, round(mse,5))
    # RMSE is derived from the MSE rather than computed separately
    print(color.BOLD + 'RMSE:' + color.END, round(np.sqrt(mse),5))

In [None]:
def coef_results(x_train,model):
    """Print the intercept and the per-feature coefficients of a fitted model.

    Parameters
    ----------
    x_train : DataFrame
        Feature table the model was fitted on. Only its column labels are
        used, to name each coefficient.
    model : fitted estimator
        Must expose a scalar ``intercept_`` and a one-dimensional ``coef_``
        ordered like the columns of ``x_train``.

    Returns
    -------
    None
        The intercept (4 decimals) and coefficients (6 decimals) are printed.
    """
    print(color.UNDERLINE+'COEFFICIENTS'+color.END)
    print(color.BOLD +"Intercept:" + color.END,
          round(model.intercept_,4))
    # Take the labels from the frame that was passed in, so the output never
    # depends on a variable that happens to exist in the notebook namespace
    for name, coef in zip(x_train.columns, model.coef_):
        print(color.BOLD + str(name)+':'+ color.END,
              round(coef,6))

In [None]:
class color:
    """ANSI escape sequences used to style text printed by this notebook."""

    # Text attributes, and the reset code that ends any styling
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

    # Foreground colours
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'

## Load Dataset, Explore and Display Features

In [None]:
# Fetch the dataset, then assemble the features and the target into one frame
# with the target as the last column
housing = fetch_california_housing()
housing_df = pd.DataFrame(housing['data'], columns=housing['feature_names'])
housing_df['MedHouseVal'] = housing['target']

In [None]:
# Peek at three randomly chosen rows
housing_df.sample(3)

In [None]:
# Summary statistics of the numeric columns
housing_df.describe()

In [None]:
# Build a profiling report for the full frame; the bare name on the last line
# displays it as the cell output
profile = ProfileReport(housing_df)
profile

# Basic Linear Regression With Statsmodels

In [None]:
# OLS of MedHouseVal on the eight listed features, via the statsmodels formula API
model = smf.ols(data=housing_df, formula='MedHouseVal ~ MedInc + AveRooms + Latitude + HouseAge + AveBedrms + Longitude + Population + AveOccup')
results = model.fit()
# Display the fitted model's summary as the cell output
results.summary()

### Geographic plot of  Median House Value

In [None]:
## Spatial map of the housing data: each point is placed by longitude and
# latitude, with Median House Value encoded by both marker size and colour

fig_dims = (6, 7)  # tall, narrow canvas to roughly mimic California's shape
fig, ax = plt.subplots(figsize=fig_dims)
sns.scatterplot(x="Longitude", y="Latitude",
                hue="MedHouseVal", size="MedHouseVal",
                data=housing_df, palette="viridis", alpha=0.5,
                ax=ax)

# Anchor the legend just outside the right edge of the axes
ax.legend(title="MedHouseVal", loc="upper left",
          bbox_to_anchor=(1.05, 0.95))
_ = ax.set_title("Median house value by spatial location")

### Pairplot of predictive attributes

In [None]:
## Creating pairplot of predictive attributes and Median House Value (target)
# This is ignoring Latitude and Longitude

housing_df_noGeo = housing_df.drop(columns=['Latitude','Longitude'])
# Bin the target into 6 quantile-based intervals to use as the plot's hue
housing_df_noGeo["target"] = pd.qcut(housing_df_noGeo["MedHouseVal"],
                                     6, retbins=False)
# Replace each interval with its midpoint
housing_df_noGeo["target"] = housing_df_noGeo["target"].apply(lambda x: x.mid)

# `_ =` discards the returned grid object so only the figure is shown
_ = sns.pairplot(data=housing_df_noGeo, hue="target", palette="viridis")

### Note: Can someone find a better way to display this? It feels a bit hard to read.

### Variables by County Location

I could not get the code below to run, so it is commented out for the time being.

In [None]:
# # Create a geopandas dataframe with the latitude/longitude values
# housing_gdf = geopandas.GeoDataFrame(
#     housing_df, geometry=geopandas.points_from_xy(housing_df.Longitude, housing_df.Latitude))

# print(housing_gdf.head())

# # County information from the US 2018 census
# counties = geopandas.read_file('CA_Counties_TIGER2016.shp')
# print(counties.head())


# # Below are all some iterations of what I've tried; I'm getting confused trying to merge the 
# # county lines and the latitude/longitude values from the dataset; I'm having a hard time
# # figuring out how to make them comparable 

# ax = counties.boundary.plot(color='black', figsize=(18, 12))

# ax.plot()

# fig = px.scatter_geo(housing_gdf)

# fig.show()

# housing_gdf.plot(ax=ax, color='red')

# plt.show()

# with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
#     all_counties = json.load(response)

# fig = px.choropleth(housing_gdf, geojson=counties, locations='geometry', color='target',
#                            range_color=(0, 12),
#                            scope="usa"
#                           )
# fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
# fig.show()    

### Ordinary Least Squares Regression

In [None]:
def OLS_Regression(df):
    """Fit and print a separate one-feature OLS model for each of the first six columns.

    Each model regresses ``MedHouseVal`` on a constant plus a single column of
    ``df``. For every column the fitted parameters, their t-values and one
    t-test are printed.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``MedHouseVal`` column; the first six columns are used
        as the candidate features.
    """
    target = df['MedHouseVal']
    for feature in df.columns[0:6]:
        print(color.BOLD + feature + color.END)
        design = sm.add_constant(df[feature])
        fit = sm.OLS(target, design).fit()
        print(color.BOLD +"Results: " + color.END + str(fit.params))
        print(color.BOLD +"T-values: "+ color.END + str(fit.tvalues))
        # NOTE(review): the contrast [1, 0] selects the first parameter, which
        # appears to be the added constant rather than the feature's slope -
        # confirm this is the intended hypothesis
        print(color.BOLD +"T-Test: " + color.END + str(fit.t_test([1, 0])))
        print('')


OLS_Regression(housing_df)

### Standardize the data set 

In [None]:
# Alias of the full frame (features and target)
x = housing_df
# Standardize every column, features and target alike; the result is a plain array
scaled_array = StandardScaler().fit_transform(x)
# Reuse the source frame's own labels and index so the standardized frame
# cannot drift out of sync with the data it was computed from
housing_standardized = pd.DataFrame(scaled_array,
                                    columns=housing_df.columns,
                                    index=housing_df.index)
# View standardized data frame
housing_standardized.head()

In [None]:
# Separate features and target into x and y 
## for both unchanged and standardized Dataframes
# (x_* holds every column except MedHouseVal; y_* holds MedHouseVal only)

x_housing = housing_df.drop(columns='MedHouseVal')
y_housing = housing_df['MedHouseVal']
x_housing_scaled = housing_standardized.drop(columns='MedHouseVal')
y_housing_scaled = housing_standardized['MedHouseVal']

### Multiple Linear Regression Model

In [None]:
# Multiple linear regression on the unscaled features and target
regression(LinearRegression, x_housing, y_housing, 'Linear')

In [None]:
# Standardized Data

# Define the multiple linear regression model
linear_regress_scaled = LinearRegression()

# Fit the multiple linear regression model
linear_regress_scaled.fit(x_housing_scaled,y_housing_scaled)

# Predict y hat with the data
y_pred_mlr_scaled = linear_regress_scaled.predict(x_housing_scaled)

# Print the coefficients (passing the same scaled frame the model was fitted
# on), then R-squared, MAE, MSE and RMSE
coef_results(x_housing_scaled,linear_regress_scaled)
regression_results(y_housing_scaled,y_pred_mlr_scaled)

In [None]:
# Linear regression on the standardized features and target, via the shared helper
regression(LinearRegression, x_housing_scaled, y_housing_scaled, 'Linear')

### RidgeCV Model

In [None]:
# RidgeCV with no extra parameters on the unscaled data
regression(RidgeCV, x_housing, y_housing, 'Linear')

In [None]:
# RidgeCV with no extra parameters on the standardized data
regression(RidgeCV, x_housing_scaled, y_housing_scaled, 'Linear')

### Lasso Model

In [None]:
# Lasso with alpha=0.1 on the unscaled data
regression(Lasso, x_housing, y_housing, 'Linear', alpha=.1)

In [None]:
# Lasso with alpha=0.1 on the standardized data
regression(Lasso, x_housing_scaled, y_housing_scaled, 'Linear', alpha=.1)

### Random Forest

In [None]:
# Random forest on the unscaled data; 'Tree' adds the feature-importance plot.
# The seed is fixed so the forest, and therefore the reported metrics and
# plots, are the same on every run of the notebook.
regression(RandomForestRegressor, x_housing, y_housing, 'Tree', random_state=42)

### XGBoost

In [None]:
# XGBoost regressor with no extra parameters on the unscaled data; 'Tree' adds
# the feature-importance plot
regression(XGBRegressor, x_housing, y_housing, 'Tree')

The XGBoost model performs well, although it is noticeably worse than the Random Forest. It performs much better if it is given more estimators and depth, but this algorithm can also be prone to overfitting, and the feature importance starts looking odd if the number of estimators is increased too much, so I decided to stick with the default parameters for the moment.

## Conclusions

Among the linear models, before tuning hyperparameters, the best-performing ones are Linear Regression and RidgeCV. The feature MedInc seems to have the largest impact on the predicted value.

The decision-tree-based models perform much better than the linear models. The linear models tend to miss the high-value areas and assign less importance to latitude and longitude, whereas the more sophisticated tree-based models do a better job of predicting the high-value areas and seem to assign more weight to latitude and longitude in their decisions.