# California Housing Data Regression

#### Apply several regression techniques with Median House Value as the target variable, then evaluate and compare the performance and results of each.

## Import Libraries

In [None]:
from sklearn.datasets import fetch_california_housing # Brings in Dataset

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import plotly.express as px
import geopandas
from pandas_profiling import ProfileReport
from urllib.request import urlopen
import json
import plotly.graph_objects as go
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso

## Load Dataset, Explore and Display Features

In [None]:
# Fetch the California housing dataset and flatten it into a single DataFrame:
# the feature columns come first and the target is appended as 'MedHouseVal'.
housing = fetch_california_housing()
column_names = housing['feature_names'] + ['MedHouseVal']
housing_df = pd.DataFrame(
    data=np.column_stack((housing['data'], housing['target'])),
    columns=column_names,
)

In [None]:
# Preview three random rows. A fixed random_state makes the preview identical
# on every Restart & Run All instead of changing with each execution.
housing_df.sample(3, random_state=42)

In [None]:
# Summary statistics (pandas `describe`) for housing_df, shown as the cell output
housing_df.describe()

In [None]:
# Build a pandas-profiling report for the housing frame; the bare `profile`
# expression on the last line makes the report this cell's output.
profile = ProfileReport(housing_df)
profile

### Geographic plot of Median House Value

In [None]:
# Spatial scatter of the housing data: every row is drawn at its
# Longitude/Latitude, with both marker size and colour encoding MedHouseVal.

# Figure is slightly taller than wide to roughly mimic California's outline
fig, ax = plt.subplots(figsize=(6, 6.5))
sns.scatterplot(
    x="Longitude",
    y="Latitude",
    hue="MedHouseVal",
    size="MedHouseVal",
    data=housing_df,
    palette="viridis",
    alpha=0.5,
    ax=ax,
)

# Anchor the legend outside the axes so it does not sit on top of the points
ax.legend(title="MedHouseVal", loc="upper left", bbox_to_anchor=(1.05, 0.95))
_ = ax.set_title("Median house value by spatial location")

### Pairplot of predictive attributes

In [None]:
# Pairplot of the predictive attributes against each other, coloured by the
# target. Latitude and Longitude are left out.
#
# MedHouseVal is cut into six quantile bins with pd.qcut and each row is
# labelled with the midpoint of its bin, giving a small set of hue levels.

value_bins = pd.qcut(housing_df["MedHouseVal"], 6, retbins=False)
housing_df_noGeo = (
    housing_df
    .drop(columns=['Latitude', 'Longitude'])
    .assign(target=value_bins.apply(lambda interval: interval.mid))
)

_ = sns.pairplot(data=housing_df_noGeo, hue="target", palette="viridis")




### Note (TODO): Can someone find a better way to display the pairplot above? It feels a bit hard to read.

### Variables by County Location

In [None]:
# Turn every row into a point geometry. The dataset stores plain
# longitude/latitude in degrees, so the points are tagged as EPSG:4326 (WGS84).
# A copy is used so the geometry column never ends up on housing_df itself,
# which later cells treat as a purely numeric frame.
housing_gdf = geopandas.GeoDataFrame(
    housing_df.copy(),
    geometry=geopandas.points_from_xy(housing_df.Longitude, housing_df.Latitude),
    crs="EPSG:4326",
)
print(housing_gdf.head())

# County boundaries (Census TIGER 2016 shapefile, going by the file name).
counties = geopandas.read_file('CA_Counties_TIGER2016.shp')

# Two layers are only comparable when they share a coordinate reference system,
# so reproject the county polygons into the CRS of the points (lon/lat degrees).
# NOTE(review): the shapefile is presumably stored in a projected CRS rather
# than lon/lat, which would explain why the raw overlay never lined up --
# confirm by inspecting `counties.crs` before this line.
counties = counties.to_crs(housing_gdf.crs)
print(counties.head())

# Overlay: county outlines with the housing points drawn on the same axes.
fig, ax = plt.subplots(figsize=(18, 12))
counties.boundary.plot(ax=ax, color='black', linewidth=0.5)
housing_gdf.plot(ax=ax, color='red', markersize=2, alpha=0.3)
ax.set_title("Housing locations over California county boundaries")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
plt.show()

# Spatial join: attach to each point the county polygon it falls in. Only the
# geometry of `counties` is passed, so the result keeps the housing columns and
# gains 'index_right', the row label of the matching county (NaN if none).
housing_by_county = geopandas.sjoin(housing_gdf, counties[['geometry']], how="left")

# Median of MedHouseVal per county, aligned back onto the county polygons by
# row label. Counties without any matched point end up as NaN.
county_median = housing_by_county.groupby('index_right')['MedHouseVal'].median()
counties_plot = counties.assign(MedHouseVal=county_median)

# Choropleth of the per-county median. The polygons themselves are supplied as
# the geojson and matched to rows through the frame's index.
fig = px.choropleth(
    counties_plot,
    geojson=counties_plot.geometry,
    locations=counties_plot.index,
    color='MedHouseVal',
)
# Zoom to the plotted polygons and hide the base map
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

### Ordinary Least Squares Regression

In [None]:
def OLS_Regression(df, target='MedHouseVal', predictors=None):
    """Fit and report one simple OLS regression per predictor column.

    Each predictor is regressed on its own (plus an intercept added with
    ``sm.add_constant``) against ``target``. For every fit the column name,
    the estimated parameters, their t-values and a t-test are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the predictor columns.
    target : str, default 'MedHouseVal'
        Name of the response column.
    predictors : iterable of str, optional
        Columns to regress on, one model per column. Defaults to the first
        five columns of ``df``.

    Returns
    -------
    dict
        Maps each predictor name to its fitted statsmodels results object, so
        the fits can be inspected after the printed report.
    """
    if predictors is None:
        predictors = df.columns[:5]

    Y = df[target]
    fitted = {}
    for col in predictors:
        print(col)
        X = sm.add_constant(df[col])
        results = sm.OLS(Y, X).fit()
        print("Results" + str(results.params))
        print("T-values" + str(results.tvalues))
        # NOTE(review): add_constant puts the intercept first by default, so
        # [1, 0] tests the intercept; use [0, 1] if the slope was intended.
        print("T-Test" + str(results.t_test([1, 0])))
        fitted[col] = results
    return fitted


# Call the function directly: wrapping it in print() would emit a stray line
# after the report. Keeping the returned fits allows later inspection.
ols_results = OLS_Regression(housing_df)

### Standardize the data set 

In [None]:
# Columns to standardize: the eight predictors plus the target. Selecting them
# by name keeps any other column (e.g. a non-numeric one) out of the scaler,
# and reusing the same list for the labels guarantees that every scaled column
# is named after the column it was computed from.
standardize_cols = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population',
                    'AveOccup', 'Latitude', 'Longitude', 'MedHouseVal']
x = housing_df[standardize_cols]

# Array of the standardized values of the selected columns. MedHouseVal is
# scaled along with the predictors.
scaled_array = StandardScaler().fit_transform(x)
housing_standardized = pd.DataFrame(scaled_array, columns=standardize_cols, index=x.index)

# View standardized data frame
housing_standardized.head()

### Multiple Linear Regression Model

In [None]:
# Predictors (the eight standardized feature columns) and response
# (the standardized median house value)
feature_cols = ['MedInc','HouseAge','AveRooms','AveBedrms','Population','AveOccup','Latitude','Longitude']
x = housing_standardized[feature_cols]
y = housing_standardized['MedHouseVal']

# Multiple linear regression; fit() returns the estimator itself, so the model
# is constructed and fitted in one statement.
linear_regress = LinearRegression().fit(x, y)

# Fitted values for the same rows the model was trained on
y_pred = linear_regress.predict(x)
y_pred

### RidgeCV Model

In [None]:
# Predictors (the eight standardized feature columns) and response
# (the standardized median house value)
feature_cols = ['MedInc','HouseAge','AveRooms','AveBedrms','Population','AveOccup','Latitude','Longitude']
x = housing_standardized[feature_cols]
y = housing_standardized['MedHouseVal']

# Ridge regression with RidgeCV's default alpha candidates. A finer grid can be
# passed instead, e.g.
# RidgeCV(alphas=[1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]).
# fit() returns the estimator itself, so construction and fitting are chained.
ridge_cv = RidgeCV().fit(x, y)

# Fitted values for the same rows the model was trained on
y_pred = ridge_cv.predict(x)
y_pred

### Lasso Model

In [None]:
# Predictors (the eight standardized feature columns) and response
# (the standardized median house value)
feature_cols = ['MedInc','HouseAge','AveRooms','AveBedrms','Population','AveOccup','Latitude','Longitude']
x = housing_standardized[feature_cols]
y = housing_standardized['MedHouseVal']

# Lasso regression with a fixed penalty strength of alpha=0.1; fit() returns
# the estimator itself, so construction and fitting are chained.
lasso = Lasso(alpha=0.1).fit(x, y)

# Fitted values for the same rows the model was trained on
y_pred = lasso.predict(x)
y_pred