# Group 5
## Team members:
- Jianting Liu(8950907)
- David (8999846) 
- Marieth (9016702)


### Use case
The analysis examines the relationship between housing prices, income, and population in Canada over a 20-year period. By incorporating population data alongside existing housing prices and income metrics, we can better understand the demographic pressures on housing affordability.

## load data

In [5]:
import pandas as pd

df = pd.read_csv("./data/housing-supply-price-rental.csv")
df

Unnamed: 0.1,Unnamed: 0,year,total_dwelling,single_detached,multiple,semi_detached,row,apartment,total_dwelling_market,homeownership_freehold,...,one_bedroom,two_bedroom,three_bedroom,population,labour_participation_rate,employment_change,unemployment_rate,disposable_income_change,migration,region
0,0,1990.0,3297,2847,450,57,6,387,2274,2004.0,...,418,524,608,1105.421,67.300,0.219444,7.4,-2.669133,-5108,manitoba
1,1,1991.0,1950,1589,361,14,0,347,1438,1085.0,...,424,531,630,1109.604,67.025,-1.364064,8.6,-2.669133,-5108,manitoba
2,2,1992.0,2310,1683,627,64,10,553,1821,1383.0,...,434,545,644,1112.689,66.475,-1.351685,9.3,-0.326989,-5108,manitoba
3,3,1993.0,2425,1874,551,52,83,416,1702,1338.0,...,435,550,652,1117.618,66.850,0.766782,9.3,-0.453243,-2520,manitoba
4,4,1994.0,3197,2441,756,72,410,274,1664,1289.0,...,440,551,645,1123.230,66.750,0.774181,8.8,0.161821,-1996,manitoba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,22,2012.0,896,449,447,12,91,344,896,552.0,...,850,1005,1230,165.914,64.300,1.639344,6.7,,1239,kingston
940,23,2013.0,856,325,531,8,103,420,856,436.0,...,859,1054,1463,166.950,64.900,2.481390,6.3,,978,kingston
941,24,2014.0,672,338,334,26,102,206,672,462.0,...,888,1070,1411,168.164,63.600,-1.452785,6.8,,1185,kingston
942,25,2015.0,655,275,380,14,53,313,655,332.0,...,915,1096,1469,169.420,64.300,1.842752,6.6,,1259,kingston


## Converting factor variables to numeric


In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy import stats
from scipy.special import boxcox1p

def convert_factors_to_numeric(df):
    le = LabelEncoder()
    factor_columns = df.select_dtypes(include=['object']).columns
    for col in factor_columns:
        df[col] = le.fit_transform(df[col])
    return df
after_numeric_df= convert_factors_to_numeric(df)
after_numeric_df

Unnamed: 0.1,Unnamed: 0,year,total_dwelling,single_detached,multiple,semi_detached,row,apartment,total_dwelling_market,homeownership_freehold,...,one_bedroom,two_bedroom,three_bedroom,population,labour_participation_rate,employment_change,unemployment_rate,disposable_income_change,migration,region
0,0,1990.0,3297,2847,450,57,6,387,2274,2004.0,...,418,524,608,1105.421,67.300,0.219444,7.4,-2.669133,-5108,11
1,1,1991.0,1950,1589,361,14,0,347,1438,1085.0,...,424,531,630,1109.604,67.025,-1.364064,8.6,-2.669133,-5108,11
2,2,1992.0,2310,1683,627,64,10,553,1821,1383.0,...,434,545,644,1112.689,66.475,-1.351685,9.3,-0.326989,-5108,11
3,3,1993.0,2425,1874,551,52,83,416,1702,1338.0,...,435,550,652,1117.618,66.850,0.766782,9.3,-0.453243,-2520,11
4,4,1994.0,3197,2441,756,72,410,274,1664,1289.0,...,440,551,645,1123.230,66.750,0.774181,8.8,0.161821,-1996,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,22,2012.0,896,449,447,12,91,344,896,552.0,...,850,1005,1230,165.914,64.300,1.639344,6.7,,1239,10
940,23,2013.0,856,325,531,8,103,420,856,436.0,...,859,1054,1463,166.950,64.900,2.481390,6.3,,978,10
941,24,2014.0,672,338,334,26,102,206,672,462.0,...,888,1070,1411,168.164,63.600,-1.452785,6.8,,1185,10
942,25,2015.0,655,275,380,14,53,313,655,332.0,...,915,1096,1469,169.420,64.300,1.842752,6.6,,1259,10


## Converting calendar dates to Julian


In [8]:
def convert_dates_to_julian(df, date_column):
    df[f'{date_column}_julian'] = pd.to_datetime(df[date_column]).map(lambda x: x.to_julian_date())
    return df
after_convert_date=convert_dates_to_julian(after_numeric_df,'year')
after_convert_date

Unnamed: 0.1,Unnamed: 0,year,total_dwelling,single_detached,multiple,semi_detached,row,apartment,total_dwelling_market,homeownership_freehold,...,two_bedroom,three_bedroom,population,labour_participation_rate,employment_change,unemployment_rate,disposable_income_change,migration,region,year_julian
0,0,1990.0,3297,2847,450,57,6,387,2274,2004.0,...,524,608,1105.421,67.300,0.219444,7.4,-2.669133,-5108,11,2440587.5
1,1,1991.0,1950,1589,361,14,0,347,1438,1085.0,...,531,630,1109.604,67.025,-1.364064,8.6,-2.669133,-5108,11,2440587.5
2,2,1992.0,2310,1683,627,64,10,553,1821,1383.0,...,545,644,1112.689,66.475,-1.351685,9.3,-0.326989,-5108,11,2440587.5
3,3,1993.0,2425,1874,551,52,83,416,1702,1338.0,...,550,652,1117.618,66.850,0.766782,9.3,-0.453243,-2520,11,2440587.5
4,4,1994.0,3197,2441,756,72,410,274,1664,1289.0,...,551,645,1123.230,66.750,0.774181,8.8,0.161821,-1996,11,2440587.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,22,2012.0,896,449,447,12,91,344,896,552.0,...,1005,1230,165.914,64.300,1.639344,6.7,,1239,10,2440587.5
940,23,2013.0,856,325,531,8,103,420,856,436.0,...,1054,1463,166.950,64.900,2.481390,6.3,,978,10,2440587.5
941,24,2014.0,672,338,334,26,102,206,672,462.0,...,1070,1411,168.164,63.600,-1.452785,6.8,,1185,10,2440587.5
942,25,2015.0,655,275,380,14,53,313,655,332.0,...,1096,1469,169.420,64.300,1.842752,6.6,,1259,10,2440587.5


## Converting categorical values to dummies
 

In [15]:
def create_dummies(df):
    categorical_columns = ['region']
    df_dummies = pd.get_dummies(df, columns=categorical_columns)
    return df_dummies
after_onehot_df = create_dummies(df)
after_onehot_df.head(10)

Unnamed: 0.1,Unnamed: 0,year,total_dwelling,single_detached,multiple,semi_detached,row,apartment,total_dwelling_market,homeownership_freehold,...,region_25,region_26,region_27,region_28,region_29,region_30,region_31,region_32,region_33,region_34
0,0,1990.0,3297,2847,450,57,6,387,2274,2004.0,...,False,False,False,False,False,False,False,False,False,False
1,1,1991.0,1950,1589,361,14,0,347,1438,1085.0,...,False,False,False,False,False,False,False,False,False,False
2,2,1992.0,2310,1683,627,64,10,553,1821,1383.0,...,False,False,False,False,False,False,False,False,False,False
3,3,1993.0,2425,1874,551,52,83,416,1702,1338.0,...,False,False,False,False,False,False,False,False,False,False
4,4,1994.0,3197,2441,756,72,410,274,1664,1289.0,...,False,False,False,False,False,False,False,False,False,False
5,5,1995.0,1963,1564,399,45,121,233,1215,919.0,...,False,False,False,False,False,False,False,False,False,False
6,6,1996.0,2318,1875,443,126,113,204,1243,927.0,...,False,False,False,False,False,False,False,False,False,False
7,7,1997.0,2612,2019,593,143,96,354,1664,1266.0,...,False,False,False,False,False,False,False,False,False,False
8,8,1998.0,2895,2368,527,131,81,315,1856,1312.0,...,False,False,False,False,False,False,False,False,False,False
9,9,1999.0,3133,2231,902,90,151,661,2094,1286.0,...,False,False,False,False,False,False,False,False,False,False


## Performing Box-Cox transformations


In [21]:
def apply_box_cox(df, numeric_columns):
    for col in numeric_columns:
        # positive
        offset = 0
        if min(df[col]) <= 0:
            offset = abs(min(df[col])) + 1
        df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
    return df
numeric_columns = [col for col in df.select_dtypes(include=['number']).columns if col not in ['year', 'region']]
after_transform = apply_box_cox(after_convert_date,numeric_columns)
after_transform.head(10)

  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1p(df[col] + offset, 0.25)
  df[f'{col}_boxcox'] = boxcox1

Unnamed: 0.1,Unnamed: 0,year,total_dwelling,single_detached,multiple,semi_detached,row,apartment,total_dwelling_market,homeownership_freehold,...,two_bedroom_boxcox_boxcox_boxcox_boxcox,three_bedroom_boxcox_boxcox_boxcox_boxcox,population_boxcox_boxcox_boxcox_boxcox,labour_participation_rate_boxcox_boxcox_boxcox_boxcox,employment_change_boxcox_boxcox_boxcox_boxcox,unemployment_rate_boxcox_boxcox_boxcox_boxcox,disposable_income_change_boxcox_boxcox_boxcox_boxcox,migration_boxcox_boxcox_boxcox_boxcox,region_boxcox_boxcox_boxcox_boxcox,year_julian_boxcox_boxcox_boxcox_boxcox
0,0,1990.0,3297,2847,450,57,6,387,2274,2004.0,...,1.258523,1.270012,1.31456,1.077178,0.834213,0.799996,0.590625,1.374653,0.871636,1.761327
1,1,1991.0,1950,1589,361,14,0,347,1438,1085.0,...,1.259556,1.272733,1.314834,1.076761,0.805781,0.823064,0.590625,1.374653,0.871636,1.761327
2,2,1992.0,2310,1683,627,64,10,553,1821,1383.0,...,1.261577,1.274411,1.315035,1.075923,0.80603,0.834786,0.721698,1.374653,0.871636,1.761327
3,3,1993.0,2425,1874,551,52,83,416,1702,1338.0,...,1.262284,1.275352,1.315355,1.076495,0.842695,0.834786,0.716884,1.420038,0.871636,1.761327
4,4,1994.0,3197,2441,756,72,410,274,1664,1289.0,...,1.262425,1.274529,1.315718,1.076343,0.842805,0.826528,0.738942,1.426196,0.871636,1.761327
5,5,1995.0,1963,1564,399,45,121,233,1215,919.0,...,1.262846,1.275701,1.316098,1.076076,0.856553,0.797871,0.721715,1.434233,0.871636,1.761327
6,6,1996.0,2318,1875,443,126,113,204,1243,927.0,...,1.263541,1.274529,1.316421,1.075872,0.834213,0.79571,0.731635,1.43287,0.871636,1.761327
7,7,1997.0,2612,2019,593,143,96,354,1664,1266.0,...,1.263817,1.275817,1.316544,1.076127,0.851825,0.77949,0.69958,1.404011,0.871636,1.761327
8,8,1998.0,2895,2368,527,131,81,315,1856,1312.0,...,1.264503,1.27651,1.31663,1.076673,0.858355,0.752235,0.802823,1.403542,0.871636,1.761327
9,9,1999.0,3133,2231,902,90,151,661,2094,1286.0,...,1.265586,1.277196,1.316945,1.077479,0.850459,0.75522,0.823236,1.446784,0.871636,1.761327


## Applying Tukey's Ladder


In [22]:
def apply_tukey_ladder(df, numeric_columns):
    for col in numeric_columns:
        df[f'{col}_tukey'] = stats.yeojohnson(df[col])[0]
    return df
after_tukey = apply_tukey_ladder(after_transform,numeric_columns)
after_tukey

def apply_tukey_ladder(df, numeric_columns):
    for col in numeric_columns:
        df[f'{col}_tukey'] = stats.yeojohnson(df[col])[0]
    return df

  df[f'{col}_tukey'] = stats.yeojohnson(df[col])[0]
  df[f'{col}_tukey'] = stats.yeojohnson(df[col])[0]
  df[f'{col}_tukey'] = stats.yeojohnson(df[col])[0]
  df[f'{col}_tukey'] = stats.yeojohnson(df[col])[0]
  df[f'{col}_tukey'] = stats.yeojohnson(df[col])[0]
  df[f'{col}_tukey'] = stats.yeojohnson(df[col])[0]
  df[f'{col}_tukey'] = stats.yeojohnson(df[col])[0]
  df[f'{col}_tukey'] = stats.yeojohnson(df[col])[0]


ValueError: Yeo-Johnson input must be finite.

## Perform SLR and MLR methods and analyze the results