# California Housing Prices

This notebook uses the California Housing Prices dataset to create a sample dataset for the NannyML library.

## Enriching data

We want to enrich the data with context that is relevant to our use case. This includes:

- A time aspect
- Partitioning the data
- Specifying a target to make the problem a classification problem

In [None]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Download the California Housing data as pandas objects and glue the feature
# columns and the target column together into a single frame.
cali = fetch_california_housing(as_frame=True)
feature_and_target_parts = [cali.data, cali.target]
df = pd.concat(feature_and_target_parts, axis='columns')
# Preview the first two rows.
df.head(2)

In [None]:
# Add an artificial timestamp: each row is placed `index` hours after 2020-01-01.
timestamps = [dt.datetime(2020, 1, 1) + dt.timedelta(hours=hour) for hour in df.index]
df['timestamp'] = timestamps

# Partition boundaries (half-open intervals: start included, end excluded).
train_beg, train_end = dt.datetime(2020, 1, 1), dt.datetime(2020, 10, 1)
test_beg, test_end = dt.datetime(2020, 10, 1), dt.datetime(2021, 6, 1)

# Every row starts out as 'production'; rows whose timestamp falls inside the
# train or test window are then relabelled.
df['partition'] = 'production'
df.loc[(df['timestamp'] >= train_beg) & (df['timestamp'] < train_end), 'partition'] = 'train'
df.loc[(df['timestamp'] >= test_beg) & (df['timestamp'] < test_end), 'partition'] = 'test'

In [None]:
# Create the binary classification target: 1 when the house value is higher
# than the median house value of the train partition, 0 otherwise.
# The threshold is computed from the train rows only.
df_train = df.loc[df['partition'] == 'train']

df['clf_target'] = (df['MedHouseVal'] > df_train['MedHouseVal'].median()).astype(int)
# The continuous target is no longer needed once the binary one exists.
df = df.drop(columns='MedHouseVal')

del df_train

In [None]:
# Show how many rows ended up in each partition.
df['partition'].value_counts()

In [None]:
# Show the latest timestamp within each partition.
df.groupby('partition')['timestamp'].max()

In [None]:
# Preview the last five rows of the enriched frame.
df.tail(5)

## Adding a model

This is the model that we will want to monitor.

In [None]:
# Column roles used when fitting and scoring the classifier.
target = 'clf_target'
meta = 'partition'
features = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
]

# Only the train partition is used for fitting.
df_train = df.loc[df[meta] == 'train']

# Fixed random_state keeps the fitted forest reproducible between runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(df_train[features], df_train[target])

# Score every row (all partitions); column 1 of predict_proba is the
# probability of the positive class.
positive_class_proba = clf.predict_proba(df[features])[:, 1]
df['y_pred_proba'] = positive_class_proba

In [None]:
# Print the ROC AUC of the predicted probabilities for each partition,
# in order of first appearance in the frame (sort=False).
for name, rows in df.groupby('partition', sort=False):
    auc = roc_auc_score(rows[target], rows['y_pred_proba'])
    print(name, auc)

In [None]:
# Kernel density estimate of the predicted probabilities over all rows.
df['y_pred_proba'].plot(kind='kde')

## Prepare data for NannyML

In [None]:
# Binarise the predicted probability: 1 when it is at least 0.8, otherwise 0.
df['y_pred'] = (df['y_pred_proba'] >= 0.8).astype('int64')


In [None]:
# Keep everything except the train rows, renumber the index from zero, rename
# the partitions to 'reference' / 'analysis', and expose the new index as an
# explicit 'identifier' column.
df_for_nanny = (
    df.loc[df['partition'] != 'train']
    .reset_index(drop=True)
    .assign(
        partition=lambda frame: frame['partition'].map(
            {'test': 'reference', 'production': 'analysis'}
        ),
        identifier=lambda frame: frame.index,
    )
)

In [None]:
# Split into independent copies: reference rows, and analysis rows.
df_ref = df_for_nanny.loc[lambda frame: frame['partition'] == 'reference'].copy()
df_ana = df_for_nanny.loc[lambda frame: frame['partition'] == 'analysis'].copy()

# The analysis targets go into their own frame, keyed by 'identifier',
# and are removed from the analysis frame itself.
df_gt = df_ana.loc[:, ['identifier', 'clf_target']].copy()
df_ana = df_ana.drop(columns='clf_target')

In [None]:
# Display the reference frame.
df_ref