In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def compare(dataset1, dataset2, region_type, year_moved_start, year_moved_end,
            data_path='../data/clean/acs_zillow_2015.csv'):
    """Load the combined rent table and compare two sources on one region type.

    Reads the CSV at ``data_path`` (first column used as the index), builds the
    per-region comparison table with ``make_comparison_df`` and hands it to
    ``plot_results``, which displays, plots and prints the error metrics.

    Parameters
    ----------
    dataset1, dataset2 : str
        Values matched against the ``source`` column; ``dataset1`` ends up on
        the x axis and ``dataset2`` on the y axis of the plot.
    region_type : str
        Value matched against the ``region_type`` column.
    year_moved_start, year_moved_end
        Bounds of the move-in window passed through to ``make_comparison_df``.
    data_path : str, optional
        Location of the CSV to read. Defaults to the path that was previously
        hard-coded, so existing calls behave exactly as before.

    Returns
    -------
    None
    """
    df = pd.read_csv(data_path, index_col=0)
    comparison_df, rent_columns, MoE_columns, suffixes = make_comparison_df(
        df, dataset1, dataset2, region_type, year_moved_start, year_moved_end
    )
    plot_results(comparison_df, rent_columns, MoE_columns, suffixes,
                 dataset1, dataset2, region_type)
    
def plot_results(comparison_df, rent_columns, MoE_columns, suffixes,
                 dataset1, dataset2, region_type):
    """Display, plot and summarise a two-source rent comparison.

    Shows the head of ``comparison_df``, draws an error-bar scatter of
    ``rent_columns[0]`` (x) against ``rent_columns[1]`` (y) with a red
    ``y = x`` reference line, then prints the number of regions that have both
    rents together with MAE, RMSE and mean ratio and their margins of error.

    Parameters
    ----------
    comparison_df : pandas.DataFrame
        Table holding the columns named in ``rent_columns`` and ``MoE_columns``.
    rent_columns, MoE_columns : list of str
        Two column names each; index 0 is plotted on x, index 1 on y.
    suffixes : list of str
        Accepted for call compatibility; not used in this function.
    dataset1, dataset2, region_type : str
        Only used to build the plot title.

    Returns
    -------
    None
    """
    display(comparison_df.head())
    line = np.linspace(
        comparison_df[rent_columns[0]].min(),
        comparison_df[rent_columns[0]].max()
    )
    # Series.max() skips NaN. The builtin max() compares element by element,
    # so its result depends on where NaNs sit in the column (and it raises on
    # an empty column), which could silently skip the axis clamp below.
    if comparison_df[rent_columns[1]].max() > 6000:
        plt.xlim(0,4500)
        plt.ylim(0,6000)
    xerr = comparison_df[MoE_columns[0]]
    yerr = comparison_df[MoE_columns[1]]
    plt.errorbar(x = rent_columns[0], y = rent_columns[1], 
                 xerr = xerr, yerr=yerr,
                 data= comparison_df, fmt = 'o',alpha = .2)
    plt.plot(line, line, color = 'r')
    plt.xlabel(rent_columns[0])
    plt.ylabel(rent_columns[1])
    plt.title(f"comparison of {dataset1} and {dataset2} by {region_type}")
    plt.show()
    # Number of regions for which both sources report a rent
    n = (len(comparison_df[
            ~comparison_df[rent_columns[0]].isna() 
            & ~comparison_df[rent_columns[1]].isna()])
        )
    print(f"n={n}")
    
    # Mean Absolute Error
    MoE_mae = margin_of_error_mae(comparison_df, rent_columns, MoE_columns)
    mae_point = mae(comparison_df, rent_columns)
    print(f'MAE: {mae_point}')
    print(f'Margin of Error on MAE: {MoE_mae}\n')
    
    # Root Mean Squared Error
    rmse_point = rmse(comparison_df, rent_columns)
    MoE_rmse = margin_of_error_rmse(comparison_df, rent_columns, MoE_columns)
    print(f'RMSE: {rmse_point}')
    print(f'Margin of Error on RMSE: {MoE_rmse}\n')
    
    # Mean Relative Error (computed by mre() as the mean of column[1] / column[0])
    mre_point = mre(comparison_df, rent_columns)
    MoE_mre = margin_of_error_mre(comparison_df, rent_columns, MoE_columns)
    print(f'Mean Relative Error (MRE): {mre_point}')
    print(f'Margin of Error on MRE: {MoE_mre}')

def _region_means(df, dataset, region_type, year_moved_start, year_moved_end):
    """Average the numeric columns of one source per ``region_id``."""
    within_window = (
        (df.year_moved_start >= year_moved_start)
        & (df.year_moved_end <= year_moved_end)
    )
    exact_window = (
        (df.year_moved_start == year_moved_start)
        & (df.year_moved_end == year_moved_end)
    )
    # Sources whose name contains 'zillow' keep every row inside the window;
    # sources whose name contains 'acs' must match the window exactly.
    # A name containing neither selects no rows.
    year_mask = (
        (('zillow' in dataset) & within_window)
        | (('acs' in dataset) & exact_window)
    )
    selected = df[
        (df.region_type == region_type)
        & (df.source == dataset)
        & year_mask
    ]
    # numeric_only=True: the frame also carries text columns (region_name,
    # region_type, source). Recent pandas raises a TypeError when asked to
    # average them instead of silently dropping them.
    return (
        selected
        .groupby('region_id')
        .mean(numeric_only=True)
        .reset_index()
    )


def make_comparison_df(df, dataset1, dataset2, region_type, year_moved_start, year_moved_end):
    """Build a one-row-per-region table comparing two rent sources.

    Each source is filtered to ``region_type`` and the requested move-in
    window, averaged per ``region_id``, and the two results are inner-merged
    on ``region_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with at least ``region_id``, ``region_type``,
        ``source``, ``year_moved_start``, ``year_moved_end``, ``median_rent``
        and ``MoE`` columns.
    dataset1, dataset2 : str
        Values of ``source`` to compare. They also become the column suffixes.
    region_type : str
        Value of ``region_type`` to keep.
    year_moved_start, year_moved_end
        Bounds of the move-in window (see ``_region_means`` for how they are
        applied to each kind of source).

    Returns
    -------
    comparison_df : pandas.DataFrame
        Columns ``region_id``, the two rent columns, then the two MoE columns.
    rent_columns : list of str
        ``['median_rent_<dataset1>', 'median_rent_<dataset2>']``.
    MoE_columns : list of str
        ``['MoE_<dataset1>', 'MoE_<dataset2>']``.
    suffixes : list of str
        ``['_<dataset1>', '_<dataset2>']`` as passed to ``merge``.
    """
    df1 = _region_means(df, dataset1, region_type, year_moved_start, year_moved_end)
    df2 = _region_means(df, dataset2, region_type, year_moved_start, year_moved_end)

    suffixes = ['_' + df_name for df_name in [dataset1, dataset2]]
    rent_columns = ['median_rent_' + df_name for df_name in [dataset1, dataset2]]
    MoE_columns = ['MoE_' + df_name for df_name in [dataset1, dataset2]]
    comparison_df = (
        df1
        .merge(df2, on = 'region_id', suffixes=suffixes)
        [['region_id'] + rent_columns + MoE_columns]
    )
    return comparison_df, rent_columns, MoE_columns, suffixes

def mae(df, rent_columns):
    """Mean absolute difference between the two rent columns.

    Rows where either of ``rent_columns[0]`` / ``rent_columns[1]`` is missing
    are ignored.
    """
    first, second = rent_columns[0], rent_columns[1]
    paired = df[df[first].notna() & df[second].notna()]
    gaps = paired[first] - paired[second]
    return gaps.abs().mean()

def rmse(df, rent_columns):
    """Root mean squared difference between the two rent columns.

    Rows where either of ``rent_columns[0]`` / ``rent_columns[1]`` is missing
    are ignored.
    """
    first, second = rent_columns[0], rent_columns[1]
    paired = df[df[first].notna() & df[second].notna()]
    squared_gaps = (paired[first] - paired[second]) ** 2
    return np.sqrt(squared_gaps.mean())

def mre(df, rent_columns):
    """Mean of the row-wise ratio ``rent_columns[1] / rent_columns[0]``.

    Rows where either rent is missing are ignored. Note that the value
    returned is the mean ratio itself (1.0 means the two sources agree on
    average), not the ratio minus one.
    """
    denominator_col, numerator_col = rent_columns[0], rent_columns[1]
    paired = df[df[denominator_col].notna() & df[numerator_col].notna()]
    ratios = paired[numerator_col] / paired[denominator_col]
    return ratios.mean()

def margin_of_error_rmse(df, rent_columns, MoE_columns):
    """Approximate margin of error of ``rmse(df, rent_columns)``.

    Uses first-order (delta-method) error propagation. With
    ``d_i = rent0_i - rent1_i`` and ``s_i**2 = MoE0_i**2 + MoE1_i**2``::

        RMSE      = sqrt(sum(d_i**2) / n)
        dRMSE/dd_i = d_i / (n * RMSE)
        MoE(RMSE) ~= sqrt(sum(d_i**2 * s_i**2)) / (n * RMSE)

    The previous implementation took the square root of the margin of the
    mean squared error, which is not how a margin propagates through a
    square root.

    Assumptions: errors are independent across rows and between the two
    sources, and all margins are expressed at the same confidence level.
    Missing margins are treated as 0, matching the other margin helpers.

    Parameters
    ----------
    df : pandas.DataFrame
    rent_columns, MoE_columns : list of str
        Two column names each, in matching order.

    Returns
    -------
    float
        NaN when no row has both rents; 0.0 when the RMSE itself is 0 (the
        first-order term vanishes there).
    """
    df = df[~df[rent_columns[0]].isna() & ~df[rent_columns[1]].isna()]
    n = len(df)
    if n == 0:
        # Nothing to compare; mirror the NaN that rmse() yields for no rows.
        return float('nan')
    diff = df[rent_columns[0]] - df[rent_columns[1]]
    squared_MoE = df[MoE_columns[0]].fillna(0)**2 + df[MoE_columns[1]].fillna(0)**2
    rmse_value = np.sqrt((diff**2).mean())
    if rmse_value == 0:
        # Every d_i is 0, so the numerator is 0 too; avoid 0/0.
        return 0.0
    return np.sqrt(np.sum(diff**2 * squared_MoE)) / (n * rmse_value)

def margin_of_error_mae(df, rent_columns, MoE_columns):
    """Margin of error of ``mae(df, rent_columns)``.

    Each row's difference gets the combined margin
    ``sqrt(MoE0**2 + MoE1**2)``; the margins of the n rows are combined by
    root-sum-of-squares and divided by n. Missing margins count as 0.

    Parameters
    ----------
    df : pandas.DataFrame
    rent_columns, MoE_columns : list of str
        Two column names each, in matching order.

    Returns
    -------
    float
        NaN when no row has both rents (previously a ZeroDivisionError),
        which matches what ``mae`` returns for the same input.
    """
    df = df[~df[rent_columns[0]].isna() & ~df[rent_columns[1]].isna()]
    n = len(df)
    if n == 0:
        return float('nan')
    squared_MoE = (
        (df[MoE_columns[0]].fillna(0) ** 2)
        + (df[MoE_columns[1]].fillna(0) ** 2)
    )
    return 1/n*np.sqrt(np.sum(squared_MoE))

def margin_of_error_mre(df, rent_columns, MoE_columns):
    """Margin of error of ``mre(df, rent_columns)`` (the mean ratio).

    Per row, the margin of the ratio ``R = rent1 / rent0`` is

        (1 / rent0) * sqrt(MoE1**2 + (R * MoE0)**2)

    The row margins are then combined by root-sum-of-squares and divided by
    n, the same way ``margin_of_error_mae`` combines its row margins. The
    previous version summed the row margins without squaring them before the
    square root. Missing margins count as 0.

    Parameters
    ----------
    df : pandas.DataFrame
    rent_columns, MoE_columns : list of str
        Two column names each; index 0 is the denominator of the ratio.

    Returns
    -------
    float
        NaN when no row has both rents (previously a ZeroDivisionError).
    """
    df = df[~df[rent_columns[0]].isna() & ~df[rent_columns[1]].isna()]
    n = len(df)
    if n == 0:
        return float('nan')
    MoE_fractions = 1 / df[rent_columns[0]] * np.sqrt(
        (df[MoE_columns[1]].fillna(0) ** 2)
        + (df[rent_columns[1]]*df[MoE_columns[0]].fillna(0)/df[rent_columns[0]]) ** 2
    )
    # Square each row's margin before summing: margins add in quadrature.
    return 1/n*np.sqrt(np.sum(MoE_fractions ** 2))

# Sanity check of the margin-of-error (MoE) calculation

In [3]:
# Hand-checkable sample: the first five zipcodes for which both the ACS and
# the Zillow rent are present, followed by the MAE margin of error on them.
df = pd.read_csv('../data/clean/acs_zillow_2015.csv', index_col=0)
comparison_df, rent_columns, MoE_columns, suffixes = make_comparison_df(
    df, 'acs5year_recent', 'zillow', 'zipcode', 2015, 2015
)
comparison_df = comparison_df.dropna(subset=rent_columns).head(5)
display(comparison_df)
margin_of_error_mae(comparison_df, rent_columns, MoE_columns)

Unnamed: 0,region_id,median_rent_acs5year_recent,median_rent_zillow,MoE_acs5year_recent,MoE_zillow
9,2114,2186.0,2715.75,154.0,
10,2115,1901.0,2756.25,332.0,
16,2127,2167.0,2815.0,1122.0,
18,2129,2385.0,2686.625,604.0,
19,2130,1979.0,2560.416667,501.0,


283.45306489787686

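These numbers look plausible: Zillow reports no margin of error for these rows, so the result reduces to √(154² + 332² + 1122² + 604² + 501²) / 5 ≈ 283.45, which matches the value returned above.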

In [26]:
# Load the combined table (the first CSV column becomes the index) and preview
# its first rows.
df = pd.read_csv('../data/clean/acs_zillow_2015.csv', index_col=0)
df.head()

Unnamed: 0,MoE,median_rent,month,region_id,region_name,region_type,source,year_moved_end,year_moved_start
0,,,1.0,6037,Los Angeles County,county,zillow,2010,2010.0
1,,,1.0,17031,Cook County,county,zillow,2010,2010.0
2,,,1.0,48201,Harris County,county,zillow,2010,2010.0
3,,,1.0,4013,Maricopa County,county,zillow,2010,2010.0
4,,,1.0,6073,San Diego County,county,zillow,2010,2010.0


In [25]:
# Yearly averages of the Zillow rows for zipcode 20001, then a quick line plot.
# numeric_only=True keeps the text columns (region_name, region_type, source)
# out of the mean; recent pandas raises a TypeError on them instead of
# dropping them silently.
zillow_df = df[
    (df.region_type == 'zipcode') 
    & (df.region_id == 20001)
    & (df.source == 'zillow')
].groupby('year_moved_end').mean(numeric_only=True)
zillow_df.plot()

Unnamed: 0_level_0,MoE,median_rent,month,region_id,year_moved_start
year_moved_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010,,,7.0,20001,2010.0
2011,,,6.5,20001,2011.0
2012,,,6.5,20001,2012.0
2013,,2815.0,6.5,20001,2013.0
2014,,2735.0,6.5,20001,2014.0
2015,,2812.916667,6.5,20001,2015.0
2016,,2872.916667,6.5,20001,2016.0
2017,,2810.875,6.5,20001,2017.0
2018,,2849.75,6.5,20001,2018.0
2019,,2770.0,2.5,20001,2019.0


In [24]:
# zillow_df comes from a groupby('year_moved_end'), so that name is presumably
# resolved against its index here; with one row per year the table is unchanged.
zillow_df.groupby('year_moved_end').mean()


Unnamed: 0_level_0,MoE,median_rent,month,region_id,year_moved_start
year_moved_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010,,,7.0,20001,2010.0
2011,,,6.5,20001,2011.0
2012,,,6.5,20001,2012.0
2013,,2815.0,6.5,20001,2013.0
2014,,2735.0,6.5,20001,2014.0
2015,,2812.916667,6.5,20001,2015.0
2016,,2872.916667,6.5,20001,2016.0
2017,,2810.875,6.5,20001,2017.0
2018,,2849.75,6.5,20001,2018.0
2019,,2770.0,2.5,20001,2019.0
