In [49]:
# Importing necessary libraries and importing CSV files into respective dataframes

import pandas as pd

# Directory that holds every CSV of the housing-affordability dataset.
# Kept in one place so the notebook can be pointed at another checkout by
# editing a single line instead of nine.
DATA_DIR = "/workspaces/MSCI466_Project/data/housing-affordability-in-canada"

# Each raw CSV is read into its own dataframe; the cleaning cells below
# reshape them one at a time.
df_CPI = pd.read_csv(f"{DATA_DIR}/CPI-inflation-by-region-1914-202.csv")
df_Housing = pd.read_csv(f"{DATA_DIR}/housing-supply-price-rental.csv")
df_HPIRegions = pd.read_csv(f"{DATA_DIR}/HPI 1981-2022 by regions.csv")
df_HPIFederalandProvincial = pd.read_csv(f"{DATA_DIR}/HPI 1981-2022 federal and provincial.csv")
df_Income = pd.read_csv(f"{DATA_DIR}/income-distribution-2012-2020.csv")
df_Interest = pd.read_csv(f"{DATA_DIR}/Interest and mortgage rates 1951-2022.csv")
df_PopulationDwellings = pd.read_csv(f"{DATA_DIR}/population_dwellings_count.csv")
df_PopulationRegion = pd.read_csv(f"{DATA_DIR}/population-by-region-1946-2022.csv")
df_Structural = pd.read_csv(f"{DATA_DIR}/Structural-dwellings-household-size.csv")

In [50]:
# Keeps only the rows whose GEO is 'Canada' and whose product group is
# 'All-items'. The explicit .copy() makes the result an independent frame, so
# the 'Date' assignment below does not write into a slice of the raw frame
# (which is what raises pandas' SettingWithCopyWarning).
df_CPI = df_CPI[
    (df_CPI['GEO'] == 'Canada')
    & (df_CPI['Products and product groups'] == 'All-items')
].copy()

# Standardizes "Date" name and format: the string form of REF_DATE with '01'
# appended, so it can be used as the merge key shared by the other frames.
df_CPI['Date'] = df_CPI['REF_DATE'].astype(str) + '01'

# Removes the two filter columns (constant after the filter above), the
# metadata columns that are not used, and the original REF_DATE column.
df_CPI = df_CPI.drop(columns=[
    'GEO',
    'Products and product groups',
    'DGUID',
    'UOM',
    'UOM_ID',
    'VECTOR',
    'COORDINATE',
    'REF_DATE',
])

In [51]:
# Standardizes "Date" format and name: the string form of 'year' with a
# '.0' / '.1' fragment removed and '01' appended.
# regex=False is required: when the pattern is treated as a regular expression
# (the default before pandas 2.0) the '.' is a wildcard, so '.0' would also
# match digit pairs such as the "20" in "2010.0" and corrupt the year.
df_Housing['Date'] = (
    df_Housing['year']
    .astype(str)
    .str.replace('.0', '', regex=False)
    .str.replace('.1', '', regex=False)
    + '01'
)

# Keeps the merge key plus the indicator columns used downstream.
df_Housing = df_Housing[['Date', 'total_dwelling', 'total_dwelling_market', 'labour_participation_rate', 'unemployment_rate', 'disposable_income_change',
                         'rental_vacancy_rate', 'rental_avilability_rate', 'owned_accommodation_costs_change', 'rental_accommodation_costs_change']]

# Averages every indicator over the rows that share the same Date, leaving one
# row per Date.
df_Housing = df_Housing.groupby('Date').mean().reset_index()

In [52]:
# Keeps only the columns up to and including "Canada". The .copy() detaches
# the result from the raw frame so the column assignment further down does not
# write into a slice (SettingWithCopyWarning).
canada_Index = df_HPIRegions.columns.get_loc("Canada")
df_HPIRegions = df_HPIRegions.iloc[:, :canada_Index + 1].copy()

# Removes unnecessary columns
# NOTE(review): 'Canada' is dropped right after being used as the slice
# boundary, so only the columns positioned before it survive — confirm this is
# the intended selection.
df_HPIRegions = df_HPIRegions.drop(columns=['Unnamed: 0', 'Canada'])


def adjust_year(date):
    """Move a timestamp whose year is 2025-2099 back by exactly one century.

    The '%b-%y' format only carries a two-digit year, so values meant for the
    1900s can come back from parsing as 20xx; any year from 2025 to 2099 is
    treated as such a case. All other values are returned unchanged.
    """
    if 25 <= date.year - 2000 <= 99:
        return date.replace(year=date.year - 100)

    return date


# Converts "Month-year" to the standard 'YYYYMM' string and renames it "Date"
df_HPIRegions['Month-year'] = (
    pd.to_datetime(df_HPIRegions['Month-year'], format='%b-%y')
    .apply(adjust_year)
    .dt.strftime('%Y%m')
)
df_HPIRegions = df_HPIRegions.rename(columns={'Month-year': 'Date'})

In [53]:
# Standardizes date format and changes title to "Date": the string form of
# 'year' with the '.0' fragment removed and '01' appended.
# regex=False keeps '.0' a literal; treated as a regular expression (the
# default before pandas 2.0) the '.' would match any character and also strip
# digit pairs such as the "20" in "2012.0".
df_Income['Date'] = df_Income['year'].astype(str).str.replace('.0', '', regex=False) + '01'
df_Income = df_Income.drop(columns=['year'])

# Removes all the "nan01" row values from "Date" column (rows whose 'year'
# was missing, which stringify to 'nan' before the '01' suffix is added)
df_Income = df_Income[df_Income['Date'] != 'nan01']

In [54]:
# Brings "Date" into the shared key format by deleting every '-' from its
# string form
interest_dates = df_Interest['Date'].astype(str)
df_Interest['Date'] = interest_dates.str.replace('-', '')

#df_Interest.head(857)

In [55]:
# Deletes region rows other than Canada by keeping only COORDINATE == 1.
# NOTE(review): assumes COORDINATE 1 is the Canada-wide series — confirm
# against the CSV.
# The .copy() makes the filtered result an independent frame so the column
# assignment further down does not write into a slice (SettingWithCopyWarning).
df_PopulationRegion = df_PopulationRegion[df_PopulationRegion['COORDINATE'] == 1].copy()
df_PopulationRegion = df_PopulationRegion.drop(columns=['COORDINATE', 'GEO', 'DGUID', 'VECTOR'])


def adjust_year(date):
    """Move a timestamp whose year is 2025-2099 back by exactly one century.

    The '%b-%y' format only carries a two-digit year, so values meant for the
    1900s can come back from parsing as 20xx; any year from 2025 to 2099 is
    treated as such a case. All other values are returned unchanged.
    """
    if 25 <= date.year - 2000 <= 99:
        return date.replace(year=date.year - 100)

    return date


# Converts REF_DATE to the standard 'YYYYMM' string and renames it "Date"
df_PopulationRegion['REF_DATE'] = (
    pd.to_datetime(df_PopulationRegion['REF_DATE'], format='%b-%y')
    .apply(adjust_year)
    .dt.strftime('%Y%m')
)
df_PopulationRegion = df_PopulationRegion.rename(columns={'REF_DATE': 'Date'})

In [56]:
# Outer-joins every cleaned frame onto the CPI frame, one after another, using
# the shared 'Date' key; rows are kept even when only one frame has that Date.
merged_df = df_CPI
for frame in (df_Housing, df_HPIRegions, df_Income, df_Interest, df_PopulationRegion):
    merged_df = merged_df.merge(frame, on='Date', how='outer')

merged_df.head(10000)

Unnamed: 0,CPI,Date,total_dwelling,total_dwelling_market,labour_participation_rate,unemployment_rate,disposable_income_change,rental_vacancy_rate,rental_avilability_rate,owned_accommodation_costs_change,...,20000,30000,40000,50000,60000,80000,100000,Mortgage Rate,Interest Rate,Population estimate
0,6.0,191401,,,,,,,,,...,,,,,,,,,,
1,6.1,191501,,,,,,,,,...,,,,,,,,,,
2,6.7,191601,,,,,,,,,...,,,,,,,,,,
3,7.9,191701,,,,,,,,,...,,,,,,,,,,
4,8.9,191801,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1901,,202205,,,,,,,,,...,,,,,,,,4.63,1.25,
1902,,202205,,,,,,,,,...,,,,,,,,4.63,1.25,
1903,,202206,,,,,,,,,...,,,,,,,,,,
1904,,202206,,,,,,,,,...,,,,,,,,,,
