This is the code for the Machine Learning Model for the Vancouver Housing Affordability project.
To do a basic test you can simply run all the cells in order. For more detailed testing instructions refer
to the README file.

Please note that this code requires two files "ML-Model-Cleaned.csv" and "static_params.txt" to be saved in the same folder, and won't work otherwise.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import os

  from pandas.core import (


In [7]:
#Loading the initial data from the file.
filename = "ML-Model-Cleaned.csv"
data = pd.read_csv(filename).dropna()

In [3]:
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#CHANGABLE MODEL PARAMETERS
#These are the only values in the notebook that should be
#changed. Each time you change one of these you need to
#re-run all the cells below in order (including re-training
#the ML model) for the change to take effect.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#LOOKBACK_YEARS Parameter determines how many previous
#of property values get included in each feature vector.
#For example, if LOOKBACK_YEARS = 5 then the feature vector
#for predicting 2025 prices will include property valeus
#from 2020-2024. Should never be higher than 19. See
#next parameter for additional restrictions.
#Can be set to 0 to predict without using property values.
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
LOOKBACK_YEARS = 5

#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#TRAINING_YEARS parameter lists which years of data 
#data the model will try to predict as part of training.
#For any given year, the model will try to predict every
#property in that year (except those held out for testing).
#Adding more years means more training data, but using
#only 2024 is sufficient to get decent accuracy.
#Only years in the range 2006-2024 should ever be used, but
#if using the early end of the range, you need to also make
#sure that you're not looking back beyond it. For example, if
#LOOKBACK_YEARS = 5, then you can't train on any year earlier
#than 2010 (since the 2009 df would try to incorporate 2005)
#property values, which we don't have.
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
TRAINING_YEARS = [2013, 2017, 2024]

In [4]:
#GLOBAL CONSTANTS AND FILE INPUTS
#This cell defines global constants plus a line that reads a file to
#determine which 'static columns' (columns that don't depend on year)
#to use. Needs to have correctly-named file in the same folder to run
#without errors.


LATEST_DATA_YEAR = 2024
FIRST_DATA_YEAR = 2006
IMPROVEMENT_PREFIX = "CURRENT_IMPROVEMENT_VALUE_"
IMPROVEMENT_SUFFIX = "_YEAR_AGO_IMPROVEMENT_VALUE"
LAND_PREFIX = "CURRENT_LAND_VALUE_"
LAND_SUFFIX = "_YEAR_AGO_LAND_VALUE"
OCCUPANCY_PREFIX = "Count_Per_Year_"
Y_COLS = ["IMPROVEMENT_VALUE", "LAND_VALUE"]

pd.options.mode.copy_on_write = True

f = open("static_params.txt", "r") 
f_params = f.read() 
stat_cols = f_params.replace('\n', ',').split(',')
f.close()
num_cols = len(stat_cols) - 1
stat_cols = stat_cols[:num_cols] #remove empty string in last position

In [5]:
#DATA LOADING AND SHAPING FUNCTIONS
#Function in this cell are used to filter and format the raw data
#from the CSV file into the correct format and shape to be used
#by the ML model.

#Input a year (among the years we have data for) and selects all relevant
#data for that year into a dataframe, altering the column names to a
#consistent standard that doesn't depend on the year.
#If you pass in a year that's too near the start (as determined by global)
#constant LOOKBACK_YEARS, you'll get an error
def get_year_data(pred_year):
    year_cols = [] #generate column names for all the year-dependent stuff
    new_col_names = {}
    
    #We need to change our column names from having absolute years (e.g. 2017)
    #to relative years (7 years ago) so we make a dictionary relating the two
    for year in range(pred_year - LOOKBACK_YEARS, pred_year):
        imp, land = IMPROVEMENT_PREFIX + str(year), LAND_PREFIX + str(year)
        year_cols += [imp, land]
        new_col_names.update({imp:str(pred_year - year) + IMPROVEMENT_SUFFIX})
        new_col_names.update({land:str(pred_year - year) + LAND_SUFFIX})
    
    #only save one year of occupancy data
    occup = OCCUPANCY_PREFIX + str(pred_year - 1)
    year_cols.append(occup)
    new_col_names.update({occup:"OCCUPANCY"})
    
    #Most recent columns of property data get special names, since they are 
    #the "current" values and thus our training labels/prediction targets
    X_cols = stat_cols + year_cols
    yimp, yland = IMPROVEMENT_PREFIX + str(pred_year), LAND_PREFIX + str(pred_year)
    y_cols_temp = [yimp, yland]
    
    #Copy the columns we want from the original df and then rename them
    prepped_df = data[X_cols + y_cols_temp]
    prepped_df.rename(columns = new_col_names, inplace=True)
    prepped_df.rename(columns={yimp:Y_COLS[0], yland:Y_COLS[1]},inplace=True)
    prepped_df.insert(0,'YEAR',pred_year)
    
    return prepped_df

#Splits a prepped dataframe into training and testing sets
#and into X and y components
def split_data(df):
    train, test = train_test_split(full_df,test_size=0.2)
    y_train = train[Y_COLS]
    y_test = test[Y_COLS]
    X_train = train.drop(Y_COLS, axis=1)
    X_test = test.drop(Y_COLS, axis=1)
    
    return X_train, y_train, X_test, y_test

#Takes in a list of years, calls get_year_data for each
#of those years and combines them
def combine_years(years):
    empty = True
    for year in years:
        if(empty):
            df = get_year_data(year)
            empty = False
        else:
            more_data = get_year_data(year)
            df = pd.concat([df,more_data],axis=0)
            
    return df
        



In [8]:
#TRAINING AND TESTING ML MODEL
#This cell gathers the data to feed into the model
#Calls the function to split it into train and test
#sets and then trains and scores the model

full_df = combine_years(TRAINING_YEARS)

X_train, y_train, X_test, y_test = split_data(full_df)

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

score = rf.score(X_test, y_test)

print(score)

0.9309219365451689


In [11]:
#FEATURE IMPORTANCE
#Displays feature importances in a readable form
print("Feature Importances for RandomForest Model:")
for i in range(len(X_test.columns)):
    print(X_test.columns[i], ": ", 100*rf.feature_importances_[i], "%")

Feature Importances for RandomForest Model:
YEAR :  1.018662166848989 %
YEAR_BUILT :  0.12616905217861357 %
BIG_IMPROVEMENT_YEAR :  0.1268696158539702 %
LATITUDE :  0.12445921489866726 %
LONGITUDE :  0.08703232503087077 %
MIN_DISTANCE_LIBRARY_METERS :  0.1144828857413671 %
COMMUNITY_CENTRE_DISTANCE_METERS :  0.15360999935857522 %
CLOSEST_DOG_PARK_METERS :  0.15755858286893354 %
CLOSEST_HOMELESS_SHELTER :  0.12009768599015663 %
CLOSEST_PUBLIC_ART :  0.10566161926047017 %
CLOSEST_RAPID_TRANSIST :  0.11977398901935578 %
CLOSEST_BUS_STOPS :  0.1427194780835355 %
SCHOOL_DISTANCE :  0.0770066283649808 %
PARK_DISTANCE :  0.11785963441839033 %
PARK_AREA_HECTARES :  0.11372202229247715 %
CULTURAL_SPACE_YEAR :  0.0510801682103243 %
CULTURAL_SPACE_DISTANCE :  0.08585634929082944 %
5_YEAR_AGO_IMPROVEMENT_VALUE :  0.8287342672472003 %
5_YEAR_AGO_LAND_VALUE :  0.17615736832954676 %
4_YEAR_AGO_IMPROVEMENT_VALUE :  0.7480694989834866 %
4_YEAR_AGO_LAND_VALUE :  0.2991769203096788 %
3_YEAR_AGO_IMPROVEME

In [25]:
#FUNCTIONS FOR PREDICTING FUTURE YEARS
#The two function in this cell are used
#to allow the model to make predictions on
#future years

#Takes in an X dataframe and a y numpy array and updates
#the dataframe to include the new values, pushing out the
#oldest year of data to make room
#Seems to have some sort of error that causes some small
#number of props not to be correctly updated and getting NAN 
#values instead. We didn't have time to chase down the
#error, so it's patched by simply dropping the affected 
#rows in the predict_future_year rows function
def update_prop_values(old_df, new_prop_data):
    updata = old_df
    
    for i in range(LOOKBACK_YEARS - 1):
        updata[str(LOOKBACK_YEARS-i) + IMPROVEMENT_SUFFIX]\
        = updata[str(LOOKBACK_YEARS-i-1) + IMPROVEMENT_SUFFIX]
        
        updata[str(LOOKBACK_YEARS-i) + LAND_SUFFIX]\
        = updata[str(LOOKBACK_YEARS-i-1) + LAND_SUFFIX]
        
    pred_df = pd.DataFrame(data=new_prop_data, columns=Y_COLS)
    updata['1'+IMPROVEMENT_SUFFIX] = pred_df[Y_COLS[0]]
    updata['1'+LAND_SUFFIX] = pred_df[Y_COLS[1]]
    
    return updata

#Makes predictions for some future year. If the year is more
#than one year in advance, it will recursively predict each
#year between that year and 2024. Returns both the predictions
#and the updata dataframe (the latter being necessary so it)
#can feed it forward to predict further-future years
def predict_future_year(model, year):
    
    #If we're predicting just one year in the future we can
    #just use values from our existing dataframe
    if(year == LATEST_DATA_YEAR + 1):
        last_year = get_year_data(year-1)
        prev_data = last_year.drop(Y_COLS, axis=1)
        prev_predictions = last_year[Y_COLS]
    
    #If we're predicting farther in the future, we need to fill in all
    #years between our latest data and the year we're predicting 
    elif(year > LATEST_DATA_YEAR + 1):
        prev_data,prev_predictions  = predict_future_year(model,year-1)
    
    #Update property data (if necessary), advance the year and
    #Make next year's predictions
    if(LOOKBACK_YEARS > 0):
        prev_data = update_prop_values(prev_data,prev_predictions)
    prev_data['YEAR'] = prev_data['YEAR'].apply(lambda x: x+1)    
    predictions = model.predict(prev_data.dropna())
    
    return prev_data, predictions


In [28]:
#TESTING THE MODEL AND MAKING PREDICTIONS
#This is the cell where the model can actually be
#used to predict future property values. You can
#change future_prediction_year to any year after
#2024 to predict future property values

#To inspect the results in Jupyter Notebook,
#uncomment one of the three lines at the end to 
#see either just the predictions, just the updated
#dataframe or both together

future_prediction_year = 2027

updata, preds = predict_future_year(rf, future_prediction_year)
pred_df = pd.DataFrame(data=preds, columns=Y_COLS)

#pred_df
#updata.dropna()
#pd.concat([updata, pred_df], axis=1).dropna()

Unnamed: 0,YEAR,YEAR_BUILT,BIG_IMPROVEMENT_YEAR,LATITUDE,LONGITUDE,MIN_DISTANCE_LIBRARY_METERS,COMMUNITY_CENTRE_DISTANCE_METERS,CLOSEST_DOG_PARK_METERS,CLOSEST_HOMELESS_SHELTER,CLOSEST_PUBLIC_ART,...,4_YEAR_AGO_LAND_VALUE,3_YEAR_AGO_IMPROVEMENT_VALUE,3_YEAR_AGO_LAND_VALUE,2_YEAR_AGO_IMPROVEMENT_VALUE,2_YEAR_AGO_LAND_VALUE,1_YEAR_AGO_IMPROVEMENT_VALUE,1_YEAR_AGO_LAND_VALUE,OCCUPANCY,IMPROVEMENT_VALUE,LAND_VALUE
0,2027.0,1980.0,1990.0,49.221822,-123.026260,1056.697046,927.910113,278.831824,6107.240949,1410.046201,...,14251000.0,1520700.0,14443000.0,1643817.00,1.489339e+07,1779216.02,15168410.02,11.0,1861799.00,15732330.00
1,2027.0,2014.0,2014.0,49.249115,-123.165541,1496.660110,1496.892710,847.954438,4335.685847,70.033544,...,31855000.0,8467000.0,32761000.0,8295586.01,3.325834e+07,8094836.01,33872800.01,11.0,8057142.00,34006470.00
2,2027.0,2013.0,2013.0,49.246082,-123.187859,164.232976,366.343676,940.805926,5731.903505,1156.385507,...,35320000.0,4061000.0,34076000.0,4331049.00,3.425293e+07,4371243.01,34504390.01,14.0,4448107.00,34644850.00
3,2027.0,2016.0,2017.0,49.240163,-123.090320,1185.674067,1091.697614,1168.451346,2711.526929,402.441706,...,31443000.0,13821300.0,30791000.0,13891100.00,3.172146e+07,14274146.00,31981680.00,17.0,14460370.00,32719917.00
4,2027.0,2020.0,2020.0,49.254574,-123.193153,1099.769387,1355.772429,1236.550064,5496.055699,1871.006051,...,37544000.0,9939900.0,38831000.0,10102606.00,3.959046e+07,10229004.00,40350200.00,13.0,10405291.07,42069520.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14666,2027.0,1965.0,1965.0,49.270783,-123.151725,1348.610531,1140.826734,837.427137,1991.396011,446.957018,...,6577000.0,438000.0,6577000.0,1900694.00,1.032487e+08,3302703.00,15588640.00,1.0,8220780.01,67293451.01
14667,2027.0,1997.0,2010.0,49.255306,-123.150535,1262.611972,1025.938535,1372.988140,3054.640381,425.276816,...,22069000.0,6747700.0,22794000.0,33979590.19,7.604385e+07,9674210.10,29000127.10,4.0,2543484.11,16183070.11
14668,2027.0,2019.0,2019.0,49.228852,-123.083792,478.035367,1230.203759,1012.482865,4053.492129,478.582331,...,21665000.0,3969100.0,22367000.0,8065720.00,4.337063e+07,1583917.05,12789635.05,11.0,2356161.97,22924822.02
14669,2027.0,2014.0,2014.0,49.280953,-123.087111,343.218708,220.217271,579.900577,295.483881,52.462039,...,3601000.0,698500.0,3704000.0,2401273.00,1.826968e+07,1333300.05,11184190.05,4.0,1978224.00,15085842.00
