In [1]:
import pandas as pd
import numpy as np

In [31]:
import pandas as pd
import numpy as np

def time_series_gaps(df, ts_start_col):
    '''
    Locate runs of consecutive missing values in row-wise time series.

    Returns a dictionary containing:
        keys: the positional (0-based) row numbers of any row with missing time series data
        values: list of (start, end) tuples holding the inclusive positional column
                indices of each run of missing data, ordered left to right

    time series data must be running row-wise for the function to work

    ts_start_col is the non-negative positional index of the column in the dataframe at which
    the time series begins; the function assumes the time series runs from ts_start_col all
    the way to the right side of the dataframe. Missing values in columns to the left of
    ts_start_col are ignored.

    Cells are always addressed by position, so the result does not depend on the
    row or column labels of the dataframe.
    '''
    # Boolean (n_rows, n_ts_cols) mask of missing cells, time series columns only
    missing = df.iloc[:, ts_start_col:].isnull().to_numpy()
    gaps = {}
    for row_pos, row_missing in enumerate(missing):
        if not row_missing.any():
            continue
        # Pad with False on both sides so that runs touching either edge still
        # produce a rising (+1) and a falling (-1) edge in the difference below
        padded = np.concatenate(([False], row_missing, [False])).astype(np.int8)
        edges = np.diff(padded)
        run_starts = np.flatnonzero(edges == 1)
        run_ends = np.flatnonzero(edges == -1) - 1  # falling edge sits one past the run
        gaps[row_pos] = [
            (int(start) + ts_start_col, int(end) + ts_start_col)
            for start, end in zip(run_starts, run_ends)
        ]
    return gaps

def get_rejected_gaps(gaps, gap_limit):
    '''
    takes gaps dictionary generated by time_series_gaps, returns list of row indices that contain gaps >= gap_limit

    gaps maps a row index to a list of (start, end) tuples with inclusive bounds, so
    the width of a gap is end - start + 1.

    Each qualifying row index appears once, in the iteration order of the gaps
    dictionary. Rows whose gap list is empty are never rejected.
    '''
    return [
        row
        for row, spans in gaps.items()
        # any() stops at the first over-long gap and is False for an empty list
        if any(end - start + 1 >= gap_limit for start, end in spans)
    ]

In [32]:
# Load the raw ZIP-level multifamily rental (ZRI) table; the path is relative
# to the working directory the notebook is run from.
multi = pd.read_csv('./../data/Zip_Zri_MultiFamilyResidenceRental.csv')

In [33]:
# Check how pandas parsed the ZIP codes: the output shows numpy.int64, so any
# leading zeros were lost on load.
type(multi['RegionName'][0])

numpy.int64

In [34]:
# Inspect the raw frame: region metadata columns followed by one column per month
multi

Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,2010-09,2010-10,2010-11,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,61639,10025,New York,NY,New York-Newark-Jersey City,New York County,1,2930.0,2952.0,2926.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1447.0,1465.0,1469.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,3,2797.0,2811.0,2813.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1437.0,1469.0,1490.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,90755,75202,Dallas,TX,Dallas-Fort Worth-Arlington,Dallas County,1857,997.0,1019.0,1042.0,...,1615.0,1619.0,1628.0,,,1729.0,1697.0,1739.0,,
1857,94610,84627,Ephraim,UT,,Sanpete County,1858,,,,...,,,,,,,,,1298.0,
1858,84452,60301,Oak Park,IL,Chicago-Naperville-Elgin,Cook County,1859,,,,...,1447.0,1483.0,1495.0,1511.0,1537.0,1570.0,1580.0,1625.0,,
1859,94629,84647,Mount Pleasant,UT,,Sanpete County,1860,,,,...,,,,,,,,,,1221.0


In [35]:
# Convert the integer ZIP codes to strings so they can be zero-padded
multi['RegionName'] = multi['RegionName'].map(str)

In [36]:
# Count ZIP strings by length to see how many lost a leading zero
multi['RegionName'].map(len).value_counts()

5    1621
4     240
Name: RegionName, dtype: int64

In [37]:
# Restore the leading zeros lost when the ZIP codes were parsed as integers.
# zfill pads to five characters however many digits are missing, whereas
# prepending a single '0' only repairs four-character values.
multi['RegionName'] = multi['RegionName'].astype(str).str.zfill(5)

In [38]:
# Rename by reassignment instead of inplace=True so the cell yields a new frame
multi = multi.rename(columns={'RegionName': 'zip'})

In [39]:
# Drop the RegionID column by name (it is just a unique identifier). Unlike a
# positional slice, re-running this cell raises a KeyError instead of silently
# removing the next column.
multi = multi.drop(columns=['RegionID'])

In [40]:
# Inspect the frame after renaming RegionName to zip and dropping RegionID
multi

Unnamed: 0,zip,City,State,Metro,CountyName,SizeRank,2010-09,2010-10,2010-11,2010-12,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,2930.0,2952.0,2926.0,2904.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1447.0,1465.0,1469.0,1467.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,2797.0,2811.0,2813.0,2784.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,,,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1437.0,1469.0,1490.0,1503.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,75202,Dallas,TX,Dallas-Fort Worth-Arlington,Dallas County,1857,997.0,1019.0,1042.0,1063.0,...,1615.0,1619.0,1628.0,,,1729.0,1697.0,1739.0,,
1857,84627,Ephraim,UT,,Sanpete County,1858,,,,,...,,,,,,,,,1298.0,
1858,60301,Oak Park,IL,Chicago-Naperville-Elgin,Cook County,1859,,,,,...,1447.0,1483.0,1495.0,1511.0,1537.0,1570.0,1580.0,1625.0,,
1859,84647,Mount Pleasant,UT,,Sanpete County,1860,,,,,...,,,,,,,,,,1221.0


In [41]:
# Keep the six metadata columns (zip ... SizeRank) plus the monthly columns from
# 2014-01 onward. Slicing by label instead of a hard-coded position keeps the
# cell correct when it is re-run on the already trimmed frame.
multi = pd.concat([multi.iloc[:, :6], multi.loc[:, '2014-01':]], axis=1)

In [42]:
# Inspect the trimmed frame: the monthly columns now start at 2014-01
multi

Unnamed: 0,zip,City,State,Metro,CountyName,SizeRank,2014-01,2014-02,2014-03,2014-04,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,3333.0,3338.0,3340.0,3323.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1647.0,1645.0,1635.0,1650.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,3135.0,3138.0,3136.0,3131.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,1081.0,1093.0,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1756.0,1759.0,1754.0,1774.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,75202,Dallas,TX,Dallas-Fort Worth-Arlington,Dallas County,1857,1297.0,1284.0,1295.0,1297.0,...,1615.0,1619.0,1628.0,,,1729.0,1697.0,1739.0,,
1857,84627,Ephraim,UT,,Sanpete County,1858,,,,,...,,,,,,,,,1298.0,
1858,60301,Oak Park,IL,Chicago-Naperville-Elgin,Cook County,1859,,,,,...,1447.0,1483.0,1495.0,1511.0,1537.0,1570.0,1580.0,1625.0,,
1859,84647,Mount Pleasant,UT,,Sanpete County,1860,,,,,...,,,,,,,,,,1221.0


In [43]:
# The time series starts at the first monthly column, '2014-01', which sits at
# position 6 (positions 0-5 are zip ... SizeRank). Passing 7 skipped that month,
# so gaps beginning in 2014-01 were measured one month too short.
gaps = time_series_gaps(multi, multi.columns.get_loc('2014-01'))

In [44]:
# Rows that contain at least one run of 6 or more consecutive missing months
rej_gaps = get_rejected_gaps(gaps, 6)

In [45]:
# time_series_gaps reports positional row numbers, while drop() matches index
# labels; translate positions to labels so the right rows are removed even if
# the frame does not carry the default 0..n-1 index.
multi2 = multi.drop(index=multi.index[rej_gaps])

In [46]:
# Inspect the frame after removing the rows with over-long gaps
multi2

Unnamed: 0,zip,City,State,Metro,CountyName,SizeRank,2014-01,2014-02,2014-03,2014-04,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,3333.0,3338.0,3340.0,3323.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1647.0,1645.0,1635.0,1650.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,3135.0,3138.0,3136.0,3131.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,1081.0,1093.0,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1756.0,1759.0,1754.0,1774.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1849,04937,Fairfield,ME,,Somerset County,1850,794.0,793.0,790.0,792.0,...,1092.0,1111.0,1108.0,1105.0,1113.0,1113.0,1146.0,,,
1850,60606,Chicago,IL,Chicago-Naperville-Elgin,Cook County,1851,2004.0,2001.0,2016.0,2041.0,...,2145.0,2155.0,2166.0,2183.0,2206.0,2228.0,2217.0,2230.0,2200.0,2192.0
1852,12307,Schenectady,NY,Albany-Schenectady-Troy,Schenectady County,1853,,1044.0,,1028.0,...,937.0,949.0,964.0,974.0,978.0,978.0,,,1039.0,1047.0
1855,18232,Lansford,PA,Allentown-Bethlehem-Easton,Carbon County,1856,775.0,778.0,780.0,783.0,...,781.0,773.0,769.0,762.0,758.0,758.0,,,,


In [47]:
# NOTE(review): per the output below, position 7 is '2014-02'; the first monthly
# column, '2014-01', is at position 6 (positions 0-5 hold zip ... SizeRank).
multi2.columns[7:]

Index(['2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07',
       '2014-08', '2014-09', '2014-10', '2014-11', '2014-12', '2015-01',
       '2015-02', '2015-03', '2015-04', '2015-05', '2015-06', '2015-07',
       '2015-08', '2015-09', '2015-10', '2015-11', '2015-12', '2016-01',
       '2016-02', '2016-03', '2016-04', '2016-05', '2016-06', '2016-07',
       '2016-08', '2016-09', '2016-10', '2016-11', '2016-12', '2017-01',
       '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', '2017-07',
       '2017-08', '2017-09', '2017-10', '2017-11', '2017-12', '2018-01',
       '2018-02', '2018-03', '2018-04', '2018-05', '2018-06', '2018-07',
       '2018-08', '2018-09', '2018-10', '2018-11', '2018-12', '2019-01',
       '2019-02', '2019-03', '2019-04', '2019-05', '2019-06', '2019-07',
       '2019-08', '2019-09', '2019-10', '2019-11', '2019-12', '2020-01'],
      dtype='object')

In [49]:
# Save the wide-format frame (rows with long gaps removed, not yet melted).
# NOTE(review): this writes to '../../data/', while the input was read from
# './../data/' — confirm the extra directory level is intended.
multi2.to_csv('../../data/HIER_zri_multifamily_v2.csv', index = False)

In [19]:
# Reshape wide -> long: one row per (zip, month), with the value in 'zri'
id_columns = multi2.columns[0:6]
month_columns = multi2.columns[6:]
multi2 = pd.melt(
    multi2,
    id_vars=id_columns,
    value_vars=month_columns,
    var_name='year-month',
    value_name='zri',
)

In [20]:
# Distinct zip codes that have at least one missing zri value
has_missing_zri = multi2['zri'].isnull()
missing_zri = multi2.loc[has_missing_zri, 'zip'].unique()

In [21]:
# Show the zip codes that have at least one missing zri value
missing_zri

array(['77494', '10002', '78660', '94109', '77584', '28269', '94565',
       '37042', '77429', '30044', '30043', '30024', '77077', '78666',
       '78521', '75243', '94110', '44035', '07302', '37075', '19143',
       '97229', '75056', '89108', '78240', '87111', '14850', '33024',
       '98012', '92677', '20906', '10468', '60634', '22191', '07002',
       '89052', '89117', '30080', '21117', '60628', '02360', '95123',
       '27703', '23454', '85705', '20147', '90025', '20878', '27713',
       '29464', '20019', '33313', '98115', '94568', '19124', '95051',
       '32225', '76116', '23452', '60615', '78249', '77057', '27707',
       '46143', '98133', '19120', '37040', '27705', '93309', '85282',
       '28216', '94538', '33157', '33511', '85204', '59901', '21061',
       '33414', '95014', '53704', '28210', '32771', '92130', '33433',
       '92122', '19148', '76137', '78759', '94087', '97124', '33436',
       '76051', '98034', '54601', '19131', '92117', '85201', '78209',
       '94086', '948

In [22]:
# Inspect the full zri series of a single zip (78212) before any filling
list(multi2.loc[multi2['zip']=='78212','zri'])

[nan,
 nan,
 848.0,
 853.0,
 862.0,
 866.0,
 871.0,
 874.0,
 870.0,
 869.0,
 866.0,
 856.0,
 851.0,
 852.0,
 851.0,
 855.0,
 865.0,
 874.0,
 878.0,
 879.0,
 879.0,
 880.0,
 883.0,
 882.0,
 886.0,
 884.0,
 878.0,
 876.0,
 879.0,
 886.0,
 893.0,
 904.0,
 908.0,
 905.0,
 901.0,
 905.0,
 913.0,
 917.0,
 919.0,
 929.0,
 936.0,
 938.0,
 939.0,
 931.0,
 932.0,
 932.0,
 930.0,
 925.0,
 924.0,
 932.0,
 934.0,
 925.0,
 924.0,
 916.0,
 905.0,
 902.0,
 896.0,
 892.0,
 896.0,
 894.0,
 909.0,
 913.0,
 914.0,
 918.0,
 921.0,
 922.0,
 nan,
 970.0,
 964.0,
 943.0,
 951.0,
 942.0,
 933.0]

In [23]:
# Preview forward-fill then back-fill on one zip. Series.ffill()/bfill() replace
# interpolate(method='ffill'/'bfill'), which pandas has deprecated.
multi2.loc[multi2['zip']=='78212','zri'].ffill().bfill()

895      848.0
2197     848.0
3499     848.0
4801     853.0
6103     862.0
         ...  
89431    964.0
90733    943.0
92035    951.0
93337    942.0
94639    933.0
Name: zri, Length: 73, dtype: float64

In [24]:
# Fill the missing zri values separately for every zip that has any.
for item in missing_zri:
    zip_rows = multi2['zip'] == item  # computed once, used for both read and write
    temp = multi2.loc[zip_rows, 'zri']
    # Linear interpolation first, then forward/backward fill for anything still
    # missing. ffill()/bfill() replace the deprecated
    # interpolate(method='ffill'/'bfill') spelling.
    multi2.loc[zip_rows, 'zri'] = temp.interpolate(method='linear').ffill().bfill()

In [25]:
# Verify that no missing zri values remain after filling
multi2['zri'].isnull().value_counts()

False    95046
Name: zri, dtype: int64

In [26]:
# Keep every month except 2020-01
not_jan_2020 = multi2['year-month'].ne('2020-01')
multi2 = multi2.loc[not_jan_2020]

In [27]:
# Reassign rather than drop in place: multi2 is the result of a boolean filter,
# and mutating such a frame in place can raise SettingWithCopyWarning.
multi2 = multi2.drop(columns=['SizeRank'])

In [28]:
# Sanity check: count the rows per month (the output shows the same count for each)
multi2['year-month'].value_counts()

2014-09    1302
2014-02    1302
2017-09    1302
2016-03    1302
2016-04    1302
           ... 
2019-02    1302
2018-01    1302
2015-07    1302
2015-04    1302
2018-09    1302
Name: year-month, Length: 72, dtype: int64

In [29]:
# Order rows by zip, then by month; both are strings, so the sort is
# lexicographic (for 'YYYY-MM' values that matches chronological order).
multi2 = multi2.sort_values(['zip', 'year-month'])

In [None]:
# multi2.to_csv('./../data/zri_multifamily_v2.csv', index = False)

In [30]:
# Inspect the final long-format frame
multi2

Unnamed: 0,zip,City,State,Metro,CountyName,year-month,zri
1074,01013,Chicopee,MA,Springfield,Hampden County,2014-01,928.0
2376,01013,Chicopee,MA,Springfield,Hampden County,2014-02,931.0
3678,01013,Chicopee,MA,Springfield,Hampden County,2014-03,934.0
4980,01013,Chicopee,MA,Springfield,Hampden County,2014-04,929.0
6282,01013,Chicopee,MA,Springfield,Hampden County,2014-05,929.0
...,...,...,...,...,...,...,...
88204,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-08,1277.0
89506,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-09,1271.0
90808,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-10,1299.0
92110,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-11,1261.5
