# Modeling
By Joshua Mayes 08/07/2022

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn import metrics


import wrangle
import my_toolkit

SEED = 8

In [2]:
pd.options.display.max_columns = None

## About this Notebook

In this notebook we will be building and evaluating the performance of machine learning models and selecting the best performer.

### Notebook plan
- Prepare data
    - Scale
    - Cluster
    - Dummy columns
- Models
    - LR on only clusters
    - LR on each cluster only
    - RFR on entire dataset

## Prepare data

### Config variables and helper functions

During exploration I was able to identify some useful clusters.  These variable will help cleanly create those clusters.

In [3]:
## Import the data
train, test, validate = wrangle.wrangle_zillow()
## Make a backup in case I mess something up later
orig_samples = {
    'train': train.copy(),
    'test': test.copy(),
    'validate': validate.copy()
}

In [4]:
def build_kmeans_clusterer(df, cols, k, seed=SEED):
    from sklearn.cluster import KMeans
    clusterer = KMeans(n_clusters=k, random_state=seed)
    clusterer.fit(df[cols])
    return clusterer

In [5]:
def get_kmeans_clusters(df, cols, k, clusterer=None):
    if clusterer == None:
        from sklearn.cluster import KMeans
        clusterer = KMeans(n_clusters=k)
        clusterer.fit(df[cols])
    s = clusterer.predict(df[cols])
    return s

In [6]:
def make_minmax_scaler(df, cols):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(df[cols])
    return scaler

In [7]:
def df_scale_cols(df, cols, scaler, reverse=False):
    out = df.copy()
    if reverse:
        out[cols] = scaler.inverse_transform(out[cols])
    else:
        out[cols] = scaler.transform(out[cols])
    return out


### Scaling

In [8]:
scale_cols = [
#  'logerror',
 'bedroomcnt',
 'calc_bath',
 'structure_sqft',
 'fullbathcnt',
 'latitude',
 'longitude',
 'lot_sqft',
 'roomcnt',
 'tax_structure',
 'tax',
 'tax_land',
 'years_tax_delinquent',
 'bathroom_sum',
 'age',
 ]

In [9]:
scaler1 = make_minmax_scaler(train, scale_cols)


In [10]:
train = df_scale_cols(train, scale_cols, scaler1)
test = df_scale_cols(test, scale_cols, scaler1)
validate = df_scale_cols(validate, scale_cols, scaler1)

### Clustering

In [11]:
cluster1_cols = ['age','tax']
clusterer1 = build_kmeans_clusterer(train, cluster1_cols, k=6)

cluster2_cols = ['latitude','longitude','age', 'tax']
clusterer2 = build_kmeans_clusterer(train, cluster2_cols, k=6)

cluster3_cols = ['latitude','longitude','structure_sqft','lot_sqft']
clusterer3 = build_kmeans_clusterer(train, cluster3_cols, k=5)

all_clusterers = [(clusterer1, cluster1_cols), (clusterer2, cluster2_cols), (clusterer3, cluster3_cols)]

In [12]:
def df_add_clusters(df: pd.DataFrame, clusterer_col_tuples: list):
    
    out = df.copy()
    for i, (clusterer, cols) in enumerate(clusterer_col_tuples):
        out['cluster'+str(i+1)] = clusterer.predict(out[cols])
    return out

In [13]:
train = df_add_clusters(train, all_clusterers)
test = df_add_clusters(test, all_clusterers)
validate = df_add_clusters(validate, all_clusterers)

In [33]:
ytrain = train.logerror
ytest = test.logerror
yval = validate.logerror

In [14]:
train.head().iloc[:,-3:]

Unnamed: 0,cluster1,cluster2,cluster3
10842,5,1,1
46666,4,5,2
29083,5,1,4
44486,4,5,2
49011,1,1,3


## Modeling

### Baseline Model

In [15]:
baseline_val = train.logerror.mean()
baseline_val

0.016146390665833673

In [16]:
class BaselineRegressor:
    """ A simple class meant to mimic sklearn's modeling methods so that I can standardize my workflow.
    Assumes that you are fitting a single predictor.  
    For multiple predictors you will need multiple instances of this class.
    
    TODO: Handle multi-dimensional predictors
    TODO: Handle saving feature names
    """
    def __init__(self):
        """This isn't needed, but I'm leaving this here to remind myself that it's a thing."""
        pass


    def fit(self, y):
        """Calculates the mean for the target variable and assigns it to this instance."""
        if len(y.shape) == 1:
            self.baseline = y.mean()
        else:
             raise ValueError('Expected a 1 dimensional array.')

    def predict(self, x):
        """Always predicts the mean value."""
        n_predictions = len(x)
        return np.full((n_predictions), self.baseline)

In [17]:
baseline = BaselineRegressor()
baseline.fit(train.logerror)
print(baseline.baseline)

0.016146390665833673


In [18]:
pred_baseline_train = baseline.predict(train)
pred_baseline_train.shape, pred_baseline_train

((24603,),
 array([0.01614639, 0.01614639, 0.01614639, ..., 0.01614639, 0.01614639,
        0.01614639]))

In [19]:
train['baseline'] = pred_baseline_train

In [20]:
train.head().iloc[:, -4:]

Unnamed: 0,cluster1,cluster2,cluster3,baseline
10842,5,1,1,0.016146
46666,4,5,2,0.016146
29083,5,1,4,0.016146
44486,4,5,2,0.016146
49011,1,1,3,0.016146


In [21]:
y_train = train.logerror

In [22]:
def regression_metrics(actual: pd.Series, predicted: pd.Series) -> dict:

    import math
    from sklearn import metrics
    
    y = actual
    yhat = predicted
    resid_p = y - yhat
    sum_of_squared_errors = (resid_p**2).sum()

    error_metrics = {
        'max_error': metrics.max_error(actual, predicted),
        'sum_squared_error' : sum_of_squared_errors,
        'mean_squared_error' : metrics.mean_squared_error(actual, predicted),
        'root_mean_squared_error' : metrics.mean_squared_error(actual, predicted, squared=False),
        'mean_aboslute_error' : metrics.mean_absolute_error(actual, predicted),
        'r2_score' : metrics.r2_score(actual, predicted, force_finite=False)
    }

    return error_metrics

In [23]:
regression_metrics(y_train, pred_baseline_train)

{'max_error': 4.6715667683058335,
 'sum_squared_error': 624.5884000681382,
 'mean_squared_error': 0.025386676424344112,
 'root_mean_squared_error': 0.15933196924768148,
 'mean_aboslute_error': 0.0674653209139151,
 'r2_score': 0.0}

In [35]:
model_metrics = {}
model_metrics['train_baseline'] = regression_metrics(ytrain, baseline.predict(ytrain))
model_metrics['test_baseline'] = regression_metrics(ytest, baseline.predict(ytest))
model_metrics['validate_baseline'] = regression_metrics(yval, baseline.predict(yval))
pd.DataFrame.from_dict(model_metrics, orient='index')

Unnamed: 0,max_error,sum_squared_error,mean_squared_error,root_mean_squared_error,mean_aboslute_error,r2_score
train_baseline,4.671567,624.5884,0.025387,0.159332,0.067465,0.0
test_baseline,3.159542,305.544762,0.029876,0.172848,0.070545,-0.000251
validate_baseline,3.378398,295.619116,0.028906,0.170017,0.071659,-2.1e-05


In [39]:
def display_metrics(metrics=model_metrics):
    return pd.DataFrame.from_dict(metrics, orient='index')

### Model 1 - Clusters only (all clusters)

In [25]:
def plot_residuals(actual, predicted):
    yhat = predicted
    resid_p = actual - yhat

    fig, ax1 = plt.subplots(1, 1, constrained_layout=True, sharey=True, figsize=(7,4))
    ax1.set_title('Predicted Residuals')
    ax1.set_ylabel('Error')
    ax1.set_xlabel('Predicted Value')
    ax1.ticklabel_format(useOffset=False, style='plain')
    ax1.scatter(x=yhat, y=resid_p)
    plt.show()

In [26]:
train_c1_dummies = pd.get_dummies(train.cluster1, prefix='cluster1')
train_c2_dummies = pd.get_dummies(train.cluster2, prefix='cluster2')
train_c3_dummies = pd.get_dummies(train.cluster3, prefix='cluster3')
x1 = pd.concat([train_c1_dummies, train_c2_dummies, train_c3_dummies], axis=1)
x1.head()

Unnamed: 0,cluster1_0,cluster1_1,cluster1_2,cluster1_3,cluster1_4,cluster1_5,cluster2_0,cluster2_1,cluster2_2,cluster2_3,cluster2_4,cluster2_5,cluster3_0,cluster3_1,cluster3_2,cluster3_3,cluster3_4
10842,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0
46666,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
29083,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1
44486,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
49011,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


In [37]:
test_c1_dummies = pd.get_dummies(test.cluster1, prefix='cluster1')
test_c2_dummies = pd.get_dummies(test.cluster2, prefix='cluster2')
test_c3_dummies = pd.get_dummies(test.cluster3, prefix='cluster3')
t1 = pd.concat([test_c1_dummies, test_c2_dummies, test_c3_dummies], axis=1)

validate_c1_dummies = pd.get_dummies(validate.cluster1, prefix='cluster1')
validate_c2_dummies = pd.get_dummies(validate.cluster2, prefix='cluster2')
validate_c3_dummies = pd.get_dummies(validate.cluster3, prefix='cluster3')
v1 = pd.concat([validate_c1_dummies, validate_c2_dummies, validate_c3_dummies], axis=1)

In [28]:
y = train.logerror

In [29]:
from sklearn import linear_model
m1 = linear_model.LinearRegression()
m1.fit(x1, y)
p1 = m1.predict(x1)


In [40]:

model_metrics['train_linear_all_clusters'] = regression_metrics(p1, y)
display_metrics()

Unnamed: 0,max_error,sum_squared_error,mean_squared_error,root_mean_squared_error,mean_aboslute_error,r2_score
train_baseline,4.671567,624.5884,0.025387,0.159332,0.067465,0.0
test_baseline,3.159542,305.544762,0.029876,0.172848,0.070545,-0.000251
validate_baseline,3.378398,295.619116,0.028906,0.170017,0.071659,-2.1e-05
train_linear_all_clusters,4.665447,622.056489,0.025284,0.159009,0.067918,-234.266145


Unsurprisingly.  This model doesn't work well.  Though I am curious if other algorithms can perform any better.

In [45]:
m1_a = linear_model.LassoLars(alpha=0.1, normalize=False)
m1_a.fit(x1, ytrain)
p1_a = m1_a.predict(x1)
model_metrics['train_lassolars_all_clusters'] = regression_metrics(p1_a, ytrain)
display_metrics()

  output_scores = 1 - (numerator / denominator)


Unnamed: 0,max_error,sum_squared_error,mean_squared_error,root_mean_squared_error,mean_aboslute_error,r2_score
train_baseline,4.671567,624.5884,0.025387,0.159332,0.067465,0.0
test_baseline,3.159542,305.544762,0.029876,0.172848,0.070545,-0.000251
validate_baseline,3.378398,295.619116,0.028906,0.170017,0.071659,-2.1e-05
train_linear_all_clusters,4.665447,622.056489,0.025284,0.159009,0.067918,-234.266145
train_lassolars_all_clusters,4.671567,624.5884,0.025387,0.159332,0.067465,-inf


It looks like I don't understand LassoLARS very well and fed it an impossible task, causing it to revert to always guess the mean.   

I don't think I'll get much more out of trying to model on clusters alone.  

Moving on.

### Model group 2:
structure and lot size + location/age/tax clusters (cluster2)