In [32]:
import numpy as np
import pandas as pd

# Problem Understanding

Your Real Estate partner in California needs your help with pricing homes at the optimal level<br>

Help them to predict the expected sale value of properties in their State and you will get slice of their additional sales commission 💸

# Data Understanding

In [33]:
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
print(data['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [34]:
data['target']

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [35]:
X = pd.DataFrame(data['data'],columns=data['feature_names'])
y = data['target']
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Data preparation

### Split your X data in train and test datasets
Here is the documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [36]:
from sklearn.model_selection import train_test_split

train_x, test_x = train_test_split(X, train_size=0.8)
train_y, test_y = train_test_split(y, train_size=0.8)

### Split your train data in train and validation datasets

In [37]:
from sklearn.model_selection import train_test_split

train_x, valid_x = train_test_split(train_x, train_size=0.8)
train_y, valid_y = train_test_split(train_y, train_size=0.8)

### Scale the 3 datasets using StandardScaler

In [38]:
from sklearn.preprocessing import StandardScaler 

scalar = StandardScaler()
scalar.fit(train_x)
scalar.transform(train_x)

scalar.fit(valid_x)
scalar.transform(valid_x)

scalar.fit(test_x)
scalar.transform(test_x)

array([[-0.02863908,  0.58533582,  0.08840453, ..., -0.03709304,
         0.97685932, -1.28144244],
       [ 0.36646782,  1.68292595, -0.13017601, ...,  0.03192824,
         0.99565226, -1.45061634],
       [ 1.37173441,  1.36932877,  0.44115916, ..., -0.0364227 ,
        -0.65342781,  0.69888741],
       ...,
       [ 2.33683508, -0.27705642,  0.7228051 , ..., -0.01583396,
        -0.76618542,  1.08699225],
       [ 0.24437195,  1.83972454, -0.39212643, ..., -0.01113072,
         1.00974696, -1.43568924],
       [ 2.62332354, -1.13944866,  0.80064167, ..., -0.03375527,
         1.00974696, -0.73411509]])

# Modelling and Model Evaluation

### Train a linear regression model

In [39]:
data['target']

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [40]:
from sklearn.linear_model import LinearRegression
rgs = LinearRegression()
model1 = rgs.fit(train_x, train_y)

### Measure the R-squared, MSE and MAE of your model
Here is the documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

In [57]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

y_predict1 = model1.predict(train_x)
mod1_r2 = r2_score(train_y, y_predict)

mod1_mse = mean_squared_error(train_y, y_predict)

mod1_mae = mean_absolute_error(train_y, y_predict)

print(f'R2: {mod1_r2}', '\n', 
      f'MSE: {mod1_mse}', '\n',
      f'MAE: {mod1_mae}')

R2: 0.0004658830293496896 
 MSE: 1.3060459371693622 
 MAE: 0.9011570200322631


### Train a LASSO model

In [52]:
from sklearn.linear_model import Lasso
model2 = Lasso().fit(train_x, train_y)

### Measure the R-squared, MSE and MAE of your model

In [58]:
y_predict2 = model2.predict(train_x)
mod2_r2 = r2_score(train_y, y_predict2)

mod2_mse = mean_squared_error(train_y, y_predict2)

mod2_mae = mean_absolute_error(train_y, y_predict2)

print(f'R2: {mod2_r2}', '\n', 
      f'MSE: {mod2_mse}', '\n',
      f'MAE: {mod2_mae}')

R2: 4.991235943097028e-05 
 MSE: 1.3065894671942058 
 MAE: 0.9015487611237487


# Interprete your winning model

### What can you tell your business partner by looking at the coefficients?

In [None]:
# The Lasso regression model had much lower R2 and explains less of the variance in the training data
# This coencides with the higher values for both MSE and MAE as there are greater magnitudes of error about the lasso 
# regression predicted values and the true values