# California Housing Data Regression

#### Apply several regression techniques with Median House Value as the target variable, then evaluate and compare the performance and results of each.

#### Note techniques used include:
1. Linear Regression
2. RidgeCV
3. Lasso
4. Random Forest
5. XGBoost

## Import Libraries

In [None]:
from sklearn.datasets import fetch_california_housing # Brings in Dataset

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import plotly.express as px
#import geopandas
from pandas_profiling import ProfileReport
from urllib.request import urlopen
import json
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.eval_measures import rmse
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, plot_importance

### Set plotting parameters

In [None]:
# Default size, in inches, for matplotlib figures that do not set their own.
matplotlib.rcParams['figure.figsize'] = [12, 8]
# Seaborn axes style: grey background with grid lines.
sns.set_style('darkgrid')
# Scale seaborn's font sizes by 1.2.
# NOTE(review): sns.set() re-applies seaborn's whole default theme, which
# presumably already uses 'darkgrid'; the set_style call above may be
# redundant -- confirm against the installed seaborn version.
sns.set(font_scale=1.2)
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

### Defining Classes/Functions

In [None]:
def regression(method, x_dat, y_dat, regression_type, **params):
    """Fit a regressor, print in-sample metrics and plot its predictions.

    The estimator is created as ``method(**params)``, fitted on
    ``(x_dat, y_dat)`` and then used to predict ``x_dat`` again. Every metric
    and plot therefore describes the fit on the training data, not the
    performance on unseen data.

    Parameters
    ----------
    method : callable
        Estimator class (or factory) whose instances expose ``fit`` and
        ``predict``.
    x_dat : pandas.DataFrame
        Feature matrix. It must have ``Longitude`` and ``Latitude`` columns,
        which are used as the axes of the location plots.
    y_dat : array-like
        Target values aligned with ``x_dat``; must provide ``min()`` and
        ``max()`` (e.g. a pandas Series).
    regression_type : str
        ``'Tree'`` draws a bar plot of ``feature_importances_``;
        ``'Linear'`` prints the intercept and coefficients through
        ``coef_results``. Any other value skips both.
    **params
        Keyword arguments forwarded unchanged to ``method``.

    Returns
    -------
    None
        Results are printed and plotted only.
    """
    # Fit the model and predict on the very same data it was trained on.
    mod = method(**params)
    mod.fit(x_dat, y_dat)
    y_pred = mod.predict(x_dat)

    regression_results(y_dat, y_pred)

    if regression_type == 'Tree':
        print('Feature Importance Plot')
        sns.barplot(y=x_dat.columns, x=mod.feature_importances_)
        # Neutral label: the estimators passed in are regressors, so the
        # importances are not derived from the Gini criterion.
        plt.xlabel('Relative Importance')
        plt.show()
    elif regression_type == 'Linear':
        coef_results(x_dat, mod)

    print('Predicted Vs. Actual By Location')
    # Both maps share one colour scale (the range of the actual values) so
    # the actual and predicted panels are directly comparable.
    value_range = (y_dat.min(), y_dat.max())
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 12))
    sns.scatterplot(x=x_dat.Longitude, y=x_dat.Latitude, hue=y_dat, ax=ax1,
                    palette='viridis', hue_norm=value_range)
    sns.scatterplot(x=x_dat.Longitude, y=x_dat.Latitude, hue=y_pred, ax=ax2,
                    palette='viridis', hue_norm=value_range)
    ax1.set_title('Actual Housing values')
    ax2.set_title('Predicted Housing Values')
    plt.show()

    print('Predicted Vs. Actual Values')
    plt.figure(figsize=(12, 12))
    sns.scatterplot(x=y_dat, y=y_pred)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.show()


In [None]:
def regression_results(y_true, y_pred):
    """Print R2, MAE, MSE and RMSE for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are printed, each rounded to 5 decimal places.
    """
    print(color.UNDERLINE + 'EVALUATION METRICS' + color.END)

    # Local names are kept distinct from the sklearn function names so the
    # metric functions are never shadowed. The median absolute error that
    # used to be computed here was never reported, so it is no longer
    # calculated.
    r2 = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)

    print(color.BOLD + 'R2:  ' + color.END, round(r2, 5))
    print(color.BOLD + 'MAE: ' + color.END, round(mae, 5))
    print(color.BOLD + 'MSE: ' + color.END, round(mse, 5))
    # RMSE is derived from the MSE rather than computed separately.
    print(color.BOLD + 'RMSE:' + color.END, round(np.sqrt(mse), 5))

In [None]:
def coef_results(x_train, model):
    """Print the intercept and per-feature coefficients of a fitted linear model.

    Parameters
    ----------
    x_train : pandas.DataFrame
        The feature frame the model was fitted on; its column labels are
        used to name the coefficients.
    model : fitted estimator
        Must expose ``intercept_`` and a one-dimensional ``coef_`` whose
        order matches ``x_train.columns``.

    Returns
    -------
    None
        Values are printed: the intercept rounded to 4 decimals and the
        coefficients to 6.
    """
    print(color.UNDERLINE + 'COEFFICIENTS' + color.END)
    print(color.BOLD + "Intercept:" + color.END,
          round(model.intercept_, 4))
    # Take the labels from the frame that was passed in, not from a
    # notebook-level global, so the names always match the fitted data.
    for name, coefficient in zip(x_train.columns, model.coef_):
        print(color.BOLD + str(name) + ':' + color.END,
              round(coefficient, 6))

In [None]:
class color:
    """ANSI escape sequences for colouring and emphasising printed text.

    Wrap text as ``color.BOLD + text + color.END``; ``END`` resets the
    styling switched on by any of the other codes.
    """

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset all attributes
    END = '\033[0m'

## Load Dataset, Explore and Display Features

In [None]:
# Fetch the California housing data and assemble a single DataFrame: the
# feature columns first, followed by the target as the 'MedHouseVal' column.
housing = fetch_california_housing()
housing_df = pd.DataFrame(
    data=np.column_stack((housing['data'], housing['target'])),
    columns=[*housing['feature_names'], 'MedHouseVal'],
)

In [None]:
# Show three randomly selected rows as a quick look at the data.
housing_df.sample(3)

In [None]:
# Summary statistics for every column of the DataFrame.
housing_df.describe()

In [None]:
# Build a pandas-profiling report over the whole DataFrame. The bare
# `profile` expression on the last line makes the notebook display it.
profile = ProfileReport(housing_df)
profile

### Pairplot of predictive attributes

In [None]:
# Bin the median house values into four quantile-based categories with pd.qcut (q=4). The bin edges
# match the quartiles reported by describe() above, and a categorical column is easier to use as a
# pairplot hue than the continuous target.
housing_df_pairplots = housing_df.copy()  # DataFrame.copy() is deep by default

housing_df_pairplots['MedHouseValQuartiles'] = pd.qcut(housing_df['MedHouseVal'], q=4)

# Number of rows in each quartile bin.
housing_df_pairplots['MedHouseValQuartiles'].value_counts()

In [None]:
## Pairplot of the predictive attributes and Median House Value (target), coloured by house-value quartile

bin_labels = ['min-25%', '25%-50%', '50%-75%', '75%-max']

# Recompute the quartile column from pd.qcut, this time with readable labels in place of the interval bins.
housing_df_pairplots["MedHouseValQuartiles"] = pd.qcut(
    housing_df_pairplots["MedHouseVal"],
    q=4,
    labels=bin_labels,
)

# corner=True draws only the lower triangle; the upper triangle would repeat the same information.
sns.pairplot(
    data=housing_df_pairplots,
    hue='MedHouseValQuartiles',
    palette="viridis",
    corner=True,
);

### Standardize the data set 

In [None]:
# Standardize every column of housing_df (features and target alike).
x = housing_df  # alias kept because other cells in the notebook may read `x`
scaled_array = StandardScaler().fit_transform(x)  # array of standardized values, same column order as x
# Reuse the source frame's own column labels instead of a hand-typed copy,
# so the labels cannot drift out of sync with the scaled columns.
housing_standardized = pd.DataFrame(data=scaled_array, columns=housing_df.columns)
# View standardized data frame
housing_standardized.head()

In [None]:
# Split each frame into a feature matrix (x_*) and the target column (y_*),
# once for the original data and once for the standardized copy.

# Original values
x_housing = housing_df.drop('MedHouseVal', axis='columns')
y_housing = housing_df.loc[:, 'MedHouseVal']

# Standardized values
x_housing_scaled = housing_standardized.drop('MedHouseVal', axis='columns')
y_housing_scaled = housing_standardized.loc[:, 'MedHouseVal']

### Multiple Linear Regression Model

In [None]:
# Linear regression on the original (unscaled) data; 'Linear' makes
# regression() print the intercept and coefficients as well.
regression(LinearRegression, x_housing, y_housing, 'Linear')

In [None]:
# Same model fitted on the standardized features and standardized target.
regression(LinearRegression, x_housing_scaled, y_housing_scaled, 'Linear')

### RidgeCV Model

In [None]:
# RidgeCV with its default settings on the original (unscaled) data.
regression(RidgeCV, x_housing, y_housing, 'Linear')

In [None]:
# RidgeCV with its default settings on the standardized data.
regression(RidgeCV, x_housing_scaled, y_housing_scaled, 'Linear')

### Lasso Model

In [None]:
# Lasso on the original (unscaled) data; alpha=.1 is forwarded to the
# Lasso constructor through regression()'s **params.
regression(Lasso, x_housing, y_housing, 'Linear', alpha=.1)

In [None]:
# Lasso with the same alpha on the standardized data.
regression(Lasso, x_housing_scaled, y_housing_scaled, 'Linear', alpha=.1)

### Random Forest

In [None]:
# Random forest on the original data; 'Tree' adds the feature-importance plot.
# A fixed random_state (forwarded to RandomForestRegressor) makes the forest,
# and therefore the reported metrics and importances, reproducible across runs.
regression(RandomForestRegressor, x_housing, y_housing, 'Tree', random_state=42)

### XGBoost

In [None]:
# XGBoost regressor with no extra parameters on the original data;
# 'Tree' adds the feature-importance plot.
regression(XGBRegressor, x_housing, y_housing, 'Tree')

The XGBoost model performs well, although it is noticeably worse than the Random Forest. It performs much better if it is given more estimators and depth, but this algorithm can also be prone to overfitting, and the feature importances start to look odd if the number of estimators is increased too much, so I decided to stick with the default parameters for the moment.

## Conclusions

Before tuning hyperparameters, the best performing models are Linear Regression and CV Ridge Regression. The feature MedInc seems to have the largest impact on the predictions.

The decision-tree-based models perform much better than the linear models. The linear models tend to miss the high-value areas and assign less importance to latitude and longitude, whereas the more sophisticated decision-tree-based models do a better job of identifying the high-value areas and seem to assign more weight to latitude and longitude in their decisions.