In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import config
import pickle

In [2]:
# Load the google.cloud.bigquery IPython extension; it provides the %%bigquery cell magic used further down.
%load_ext google.cloud.bigquery

In [3]:
# Export the path to the Google credentials file. The value is read from the local
# `config` module, so the path itself is not written into the notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=config.GOOGLE_APPLICATION_CREDENTIALS

In [None]:
%%bigquery --use_rest_api ZRI_MF
-- Fetch every column of the Multi_Family table; the magic saves the result to the variable ZRI_MF.
SELECT *
FROM `high-empire-220313.ZRI.Multi_Family`

In [4]:
# Load the object stored in pickles/ZRI_filtered.p.
# The context manager guarantees the file handle is closed once loading finishes
# (a bare open() leaves it open for the lifetime of the kernel).
# NOTE(review): pickle.load can execute arbitrary code - only load pickles that
# were produced by this project.
with open('pickles/ZRI_filtered.p', 'rb') as file:
    ZRI_filtered = pickle.load(file)

In [5]:
from data_setup import ZRI_format

In [34]:
%%time
time_unit = 'Month'
window_size = 9
future_time = 1

ZRI_diff = ZRI_format(ZRI_filtered, time_unit = time_unit, window_size = window_size, future_time = future_time, percent_change=True)
ZRI_actual = ZRI_format(ZRI_filtered, time_unit = time_unit, window_size = window_size, future_time = future_time)

Wall time: 2min 1s


In [35]:
# Goal: add the real ZRI values as features next to the %-difference ones
# (either averaged over the past n time units, or the past n as separate features).
# The lag columns are the ones whose name contains 'minus'.
feature_columns = [column for column in ZRI_diff.columns if 'minus' in column]
# Suffix the lag columns of the %-difference frame so they stay distinguishable
# from the same-named columns of ZRI_actual.
ZRI_diff = ZRI_diff.rename(
    columns={column: f'{column}_%difference' for column in feature_columns}
)

In [36]:
# Left-join the un-suffixed lag columns of ZRI_actual onto the %-difference frame, keyed by Target_index.
ZRI_new = pd.merge(
    ZRI_diff,
    ZRI_actual[[*feature_columns, 'Target_index']],
    how='left',
    on='Target_index',
)

In [37]:
# Preview the merged frame: it should hold both the *_%difference columns and the plain ZRI_minus_* columns.
ZRI_new.head()

Unnamed: 0,Target_index,Missing_Months,Target_ZRI,Month,Year,ZRI_minus_1M_%difference,ZRI_minus_2M_%difference,ZRI_minus_3M_%difference,ZRI_minus_4M_%difference,ZRI_minus_5M_%difference,...,ZipCode,ZRI_minus_1M,ZRI_minus_2M,ZRI_minus_3M,ZRI_minus_4M,ZRI_minus_5M,ZRI_minus_6M,ZRI_minus_7M,ZRI_minus_8M,ZRI_minus_9M
0,01013M10Y2014,0,0.0,10,2014,0.006383,0.010753,0.001076,0.0,0.0,...,Y2014,946.0,940.0,930.0,929.0,929.0,929.0,934.0,,
1,01013M10Y2015,0,0.010373,10,2015,0.015806,0.001055,-0.002105,-0.005236,-0.003132,...,Y2015,964.0,949.0,948.0,950.0,955.0,958.0,958.0,950.0,946.0
2,01013M10Y2016,0,-0.002904,10,2016,-0.002896,0.001934,0.003883,0.008815,-0.001955,...,Y2016,1033.0,1036.0,1034.0,1030.0,1021.0,1023.0,1032.0,1027.0,1018.0
3,01013M10Y2017,0,0.018674,10,2017,0.005634,0.002825,0.006635,0.014423,-0.000961,...,Y2017,1071.0,1065.0,1062.0,1055.0,1040.0,1041.0,1039.0,1042.0,1042.0
4,01013M10Y2018,0,-0.012302,10,2018,-0.008711,-0.003472,-0.00346,0.004344,0.001741,...,Y2018,1138.0,1148.0,1152.0,1156.0,1151.0,1149.0,1146.0,1144.0,1141.0


In [46]:
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [47]:
# Model inputs: every column of the merged frame whose name contains 'minus'
# (this picks up both the *_%difference lags and the plain ZRI_minus_* lags).
feature_columns = [column for column in ZRI_new.columns if 'minus' in column]

In [48]:
# Keep only complete rows: any row with at least one NaN (e.g. lags that could
# not be computed) is discarded.
ZRI_new = ZRI_new.dropna(axis=0, how='any')

In [49]:
# Time-based hold-out: rows before test_year are used for fitting, rows from
# test_year onwards are kept aside as the final test set.
test_year = 2019
training_data = ZRI_new.loc[ZRI_new['Year'].lt(test_year)]
final_test_data = ZRI_new.loc[ZRI_new['Year'].ge(test_year)]

In [50]:
# Fit on every pre-test_year row; no random validation split is carved out here
# (a train_test_split variant with test_size=.1 is disabled).
X_train = training_data[feature_columns]
y_train = training_data['Target_ZRI']

In [51]:
# Ordinary least-squares regressor with scikit-learn's default settings.
lr = LinearRegression()

In [52]:
# Fit on the pre-test_year rows: features are the 'minus' columns, target is Target_ZRI.
lr.fit(X_train,y_train)

LinearRegression()

In [53]:
# In-sample R^2 (LinearRegression.score returns the coefficient of determination).
# This is measured on the training rows themselves, so it is not an out-of-sample estimate.
lr.score(X_train, y_train)

0.1804315358700106

In [19]:
# Residuals on the held-out rows (Year >= test_year): actual Target_ZRI minus the model's prediction.
prediction_error = final_test_data['Target_ZRI'].sub(
    lr.predict(final_test_data[feature_columns])
)

In [20]:
# Summary statistics of the hold-out residuals (count, mean, std, quartiles, extremes).
prediction_error.describe()

count    5124.000000
mean        0.002671
std         0.088197
min        -0.412530
25%        -0.018224
50%        -0.006806
75%         0.006563
max         2.807729
Name: Target_ZRI, dtype: float64

In [21]:
# Fitted coefficients, one per entry of feature_columns (same order).
# NOTE(review): the saved output below lists 6 coefficients, while the merged frame has
# 9 '%difference' + 9 plain lag columns - the output looks stale (execution counts
# 19-21 precede 46-53); re-run the notebook top to bottom to refresh it.
lr.coef_

array([-1.95114340e-01, -2.63277159e-01, -3.88209189e-01, -3.09924337e-04,
        1.13770734e-04,  1.96026387e-04])

### Classification Problem
To make this easier, we can reframe the problem as a classification problem: does the rent go up or down? (Rent that stays the same is counted as going down.)

In [54]:
# Binary target: +1 when Target_ZRI is strictly positive, -1 otherwise
# (so an unchanged rent falls into the -1 class). Incomplete rows are dropped afterwards.
ZRI_new = ZRI_new.assign(
    ZRI_class=ZRI_new['Target_ZRI'].gt(0).map({True: 1, False: -1})
).dropna()
# Re-derive the list of lag-feature columns from the updated frame.
feature_columns = [column for column in ZRI_new.columns if 'minus' in column]

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix

In [56]:
# class_weight='balanced' re-weights the two classes inversely to their frequency.
logistic_regression = LogisticRegression(class_weight='balanced')
# Standardise the features before the classifier; the pipeline applies the scaler
# fitted on the training data to whatever is passed to predict/score.
binary_model = make_pipeline(StandardScaler(),logistic_regression)

In [57]:
# Time-based hold-out for the classifier: fit on rows before test_year,
# evaluate on rows from test_year onwards. The target is the ZRI_class label.
test_year = 2019
training_data = ZRI_new.loc[ZRI_new['Year'].lt(test_year)]
final_test_data = ZRI_new.loc[ZRI_new['Year'].ge(test_year)]

X_train = training_data[feature_columns]
y_train = training_data['ZRI_class']
X_test = final_test_data[feature_columns]
y_test = final_test_data['ZRI_class']

In [58]:
# Fit scaler + logistic regression on the pre-test_year rows only.
binary_model.fit(X_train,y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced'))])

In [59]:
# Mean accuracy as a (train, test) pair; the test part uses the held-out test_year+ rows.
binary_model.score(X_train,y_train), binary_model.score(X_test,y_test)

(0.6333540441937899, 0.5856752537080406)

In [60]:
# Confusion matrix on the held-out rows: rows are true classes, columns are predicted
# classes, both in sorted label order (-1, then 1).
confusion_matrix(y_test, binary_model.predict(X_test))

array([[2929, 3979],
       [2390, 6074]], dtype=int64)

### Model Tuning
See the effect of window size and future time on the predictive power of the model:

In [None]:
# Sweep the look-back window (1-12 time units) at a fixed horizon. For each window:
# rebuild the features, fit a linear regression, and record the row count, the
# (validation, train) R^2 scores, the fitted coefficients and the relative errors
# on the held-out years.
# NOTE(review): this cell reads ZRI_MF (the %%bigquery result) while the cells above
# work from ZRI_filtered - confirm which frame is intended.
window_sizes = list(range(1,13))
future_time = 24
time_unit = 'Month'
test_year = 2019   # rows with Year >= test_year are held out
split_seed = 0     # fixed seed so the random 90/10 split, and hence the scores, are reproducible

# Results keyed by window_size. Plain dicts: a defaultdict without a factory
# behaves exactly like one.
num_obs = {}
errors = {}
scores = {}
coefficients = {}

# Column used to turn absolute errors into relative ones; it does not depend on
# the window size, so it is built once.
# presumably the most recent lag ZRI_format emits for this horizon - verify against ZRI_format
most_recent_feature = f'ZRI_minus_{future_time}{time_unit[0]}'

for window_size in window_sizes:
    ZRI_new = ZRI_format(ZRI_MF, time_unit = time_unit,
                         window_size = window_size,
                         future_time = future_time)
    ZRI_new = ZRI_new.dropna()
    num_obs[window_size] = ZRI_new.shape[0]
    feature_columns = [x for x in ZRI_new.columns if 'minus' in x]
    training_data = ZRI_new[ZRI_new.Year < test_year]
    final_test_data = ZRI_new[ZRI_new.Year >= test_year]
    # Random 10% validation split inside the pre-test_year data.
    X_train, X_test, y_train, y_test = train_test_split(training_data[feature_columns],
                                                     training_data['Target_ZRI'],
                                                     test_size = .1,
                                                     random_state = split_seed
                                                    )
    lr = LinearRegression()
    lr.fit(X_train,y_train)
    # Keep the fitted coefficients per feature (previously an empty container was
    # stored here and the coefficients were lost).
    coefficients[window_size] = dict(zip(feature_columns, lr.coef_))
    scores[window_size] = (lr.score(X_test,y_test), lr.score(X_train, y_train))
    # Hold-out residuals divided by the most_recent_feature column, i.e. relative errors.
    errors[window_size] = (final_test_data['Target_ZRI'] -
                           lr.predict(final_test_data[feature_columns])).div(final_test_data[most_recent_feature])
    

In [None]:
# Coefficients of the model fitted in the last loop iteration (largest window size);
# `lr` is overwritten on every pass of the sweep above.
lr.coef_

In [None]:
# One box per window size, in the insertion order of the `errors` mapping.
plt.boxplot(list(errors.values()))

In [None]:
# Same boxes on a log10 scale after shifting every error by +1250.
# NOTE(review): the 1250 offset is a magic number - presumably chosen to keep the
# argument of log10 positive; confirm it still suits the relative errors.
plt.boxplot([np.log10(window_errors + 1250) for window_errors in errors.values()])

In [None]:
# Summary statistics of the errors, one column per window size.
pd.DataFrame(errors).describe()