In [1]:
>>> # Import required libraries
>>> import pandas as pd
>>> import numpy as np
>>> import datetime as dt

>>> from sklearn.datasets import fetch_california_housing
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.metrics import roc_auc_score

>>> cali = fetch_california_housing(as_frame=True)
>>> df = pd.concat([cali.data, cali.target], axis=1)
>>> df.head(2)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585


In [2]:
>>> # add artificial Timestamp
>>> Timestamps = [dt.datetime(2020,1,1) + dt.timedelta(hours=x/2) for x in df.index]
>>> df['Timestamp'] = Timestamps

>>> # add periods/partitions
>>> train_beg = dt.datetime(2020,1,1)
>>> train_end = dt.datetime(2020,5,1)
>>> test_beg = dt.datetime(2020,5,1)
>>> test_end = dt.datetime(2020,9,1)
>>> df.loc[df['Timestamp'].between(train_beg, train_end, inclusive='left'), 'partition'] = 'train'
>>> df.loc[df['Timestamp'].between(test_beg, test_end, inclusive='left'), 'partition'] = 'test'
>>> df['partition'] = df['partition'].fillna('production')

>>> # create new classification Target - house value higher than mean
>>> df_train = df[df['partition']=='train']
>>> df['Target'] = np.where(df['MedHouseVal'] > df_train['MedHouseVal'].median(), 1, 0)
>>> df = df.drop('MedHouseVal', axis=1)
>>> del df_train


In [3]:
>>> # fit classifier
>>> Target = 'Target'
>>> meta = 'partition'
>>> features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


>>> df_train = df[df['partition']=='train']

>>> clf = RandomForestClassifier(random_state=42)
>>> clf.fit(df_train[features], df_train[Target])
>>> df['y_pred_proba'] = clf.predict_proba(df[features])[:,1]
>>> df['y_pred'] = df['y_pred_proba'].map(lambda p: int(p >= 0.8))

>>> # Check roc auc score
>>> for partition_name, partition_data in df.groupby('partition', sort=False):
...     print(partition_name, roc_auc_score(partition_data[Target], partition_data['y_pred_proba']))

train 1.0
test 0.8737681614409617
production 0.8224322932364313


In [4]:
# Columns to export: the label followed by every feature the classifier was trained on.
# Selecting by name keeps this independent of column order. The earlier positional
# slice df.columns.tolist()[1:-5] started at index 1 and therefore left out the first
# column, MedInc, even though it is part of `features`.
cols = [Target, *features]
cols

['Target',
 'MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [5]:
# Write the train partition, restricted to `cols`, to CSV without the index column
train = df.query("partition == 'train'")[cols]
train.to_csv(path_or_buf='data/california-housing-dataset/train.csv', index=False)

In [6]:
# Write the test partition, restricted to `cols`, to CSV without the index column
test = df.query("partition == 'test'")[cols]
test.to_csv(path_or_buf='data/california-housing-dataset/test.csv', index=False)

In [7]:
# Write the production partition, restricted to `cols`, to CSV without the index column
prod = df.query("partition == 'production'")[cols]
prod.to_csv(path_or_buf='data/california-housing-dataset/prod.csv', index=False)