In [1]:
import pandas as pd
import numpy as np

In [2]:
import pandas as pd
import numpy as np

def time_series_gaps(df, ts_start_col):
    '''
    Returns a dictionary containing:
        keys: the row indices of any row with missing data
        values: numeric horizontal (column) start and end indices of any periods of missing data
    
    time series data must be running row-wise for the function to work
    
    ts_start_col is the numeric index of the column in the dataframe at which the time series begins
    the function assumes the time series runs from ts_start_col all the way to the right side of the dataframe
    '''
    gaps = {}
    for i in range(df.shape[0]):
        curr_gaps = []
        curr_ts = df.iloc[i,:]
        if curr_ts.isnull().any():
            j = ts_start_col
            while j < len(curr_ts):
                if pd.isnull(curr_ts[j]):
                    gap_start = int(j)
                    gap_end = None
                    k = int(j)
                    while gap_end is None:
                        if k == len(curr_ts) - 1:
                            if pd.isnull(curr_ts[k]):
                                gap_end = int(k)
                                curr_gaps.append((gap_start, gap_end))
                                j = int(k)+1
                            else:   
                                gap_end = int(k) - 1
                                curr_gaps.append((gap_start, gap_end))
                                j = int(k)+1
                        elif pd.isnull(curr_ts[k]):
                            k+=1
                        else:
                            gap_end = int(k) - 1
                            curr_gaps.append((gap_start, gap_end))
                            j = int(k)
                else:
                    j += 1
        if curr_gaps != []:
            gaps[i] = curr_gaps
    return gaps

def get_rejected_gaps(gaps, gap_limit):
    '''
    takes gaps dictionary generated by time_series_gaps, returns list of row indices that contain gaps >= gap_limit
    '''
    over_limit = []
    for key, val in gaps.items():
        if len(val) > 1:
            for i in range(len(val)):
                width = val[i][1] - val[i][0] + 1
                if width >= gap_limit:
                    over_limit.append(key)
        else:
            width = val[0][1] - val[0][0] + 1
            if width >= gap_limit:
                over_limit.append(key)
    return list(set(over_limit))

In [3]:
multi = pd.read_csv('./../data/Zip_Zri_MultiFamilyResidenceRental.csv')

In [4]:
type(multi['RegionName'][0])

numpy.int64

In [5]:
multi

Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,2010-09,2010-10,2010-11,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,61639,10025,New York,NY,New York-Newark-Jersey City,New York County,1,2930.0,2952.0,2926.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1447.0,1465.0,1469.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,3,2797.0,2811.0,2813.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1437.0,1469.0,1490.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,90755,75202,Dallas,TX,Dallas-Fort Worth-Arlington,Dallas County,1857,997.0,1019.0,1042.0,...,1615.0,1619.0,1628.0,,,1729.0,1697.0,1739.0,,
1857,94610,84627,Ephraim,UT,,Sanpete County,1858,,,,...,,,,,,,,,1298.0,
1858,84452,60301,Oak Park,IL,Chicago-Naperville-Elgin,Cook County,1859,,,,...,1447.0,1483.0,1495.0,1511.0,1537.0,1570.0,1580.0,1625.0,,
1859,94629,84647,Mount Pleasant,UT,,Sanpete County,1860,,,,...,,,,,,,,,,1221.0


In [6]:
multi['RegionName'] = multi['RegionName'].map(lambda x: str(x))

In [7]:
multi['RegionName'].map(lambda x: len(x)).value_counts()

5    1621
4     240
Name: RegionName, dtype: int64

In [8]:
multi['RegionName'] = multi['RegionName'].map(lambda x: '0'+x if len(x)<5 else x)

In [9]:
multi.rename(columns = {'RegionName':'zip'}, inplace = True)

In [10]:
multi = multi.iloc[:,1:] # drop RegionID column, just a unique identifier

In [11]:
multi

Unnamed: 0,zip,City,State,Metro,CountyName,SizeRank,2010-09,2010-10,2010-11,2010-12,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,2930.0,2952.0,2926.0,2904.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1447.0,1465.0,1469.0,1467.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,2797.0,2811.0,2813.0,2784.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,,,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1437.0,1469.0,1490.0,1503.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,75202,Dallas,TX,Dallas-Fort Worth-Arlington,Dallas County,1857,997.0,1019.0,1042.0,1063.0,...,1615.0,1619.0,1628.0,,,1729.0,1697.0,1739.0,,
1857,84627,Ephraim,UT,,Sanpete County,1858,,,,,...,,,,,,,,,1298.0,
1858,60301,Oak Park,IL,Chicago-Naperville-Elgin,Cook County,1859,,,,,...,1447.0,1483.0,1495.0,1511.0,1537.0,1570.0,1580.0,1625.0,,
1859,84647,Mount Pleasant,UT,,Sanpete County,1860,,,,,...,,,,,,,,,,1221.0


In [12]:
multi = pd.concat([multi.iloc[:,0:6], multi.iloc[:,46:]], axis = 1)

In [13]:
multi

Unnamed: 0,zip,City,State,Metro,CountyName,SizeRank,2014-01,2014-02,2014-03,2014-04,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,3333.0,3338.0,3340.0,3323.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1647.0,1645.0,1635.0,1650.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,3135.0,3138.0,3136.0,3131.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,1081.0,1093.0,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1756.0,1759.0,1754.0,1774.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,75202,Dallas,TX,Dallas-Fort Worth-Arlington,Dallas County,1857,1297.0,1284.0,1295.0,1297.0,...,1615.0,1619.0,1628.0,,,1729.0,1697.0,1739.0,,
1857,84627,Ephraim,UT,,Sanpete County,1858,,,,,...,,,,,,,,,1298.0,
1858,60301,Oak Park,IL,Chicago-Naperville-Elgin,Cook County,1859,,,,,...,1447.0,1483.0,1495.0,1511.0,1537.0,1570.0,1580.0,1625.0,,
1859,84647,Mount Pleasant,UT,,Sanpete County,1860,,,,,...,,,,,,,,,,1221.0


In [14]:
gaps = time_series_gaps(multi, 7)

In [15]:
rej_gaps = get_rejected_gaps(gaps, 6)

In [16]:
multi2 = multi.drop(rej_gaps, axis = 0)

In [17]:
multi2

Unnamed: 0,zip,City,State,Metro,CountyName,SizeRank,2014-01,2014-02,2014-03,2014-04,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,3333.0,3338.0,3340.0,3323.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1647.0,1645.0,1635.0,1650.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,3135.0,3138.0,3136.0,3131.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,1081.0,1093.0,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1756.0,1759.0,1754.0,1774.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1849,04937,Fairfield,ME,,Somerset County,1850,794.0,793.0,790.0,792.0,...,1092.0,1111.0,1108.0,1105.0,1113.0,1113.0,1146.0,,,
1850,60606,Chicago,IL,Chicago-Naperville-Elgin,Cook County,1851,2004.0,2001.0,2016.0,2041.0,...,2145.0,2155.0,2166.0,2183.0,2206.0,2228.0,2217.0,2230.0,2200.0,2192.0
1852,12307,Schenectady,NY,Albany-Schenectady-Troy,Schenectady County,1853,,1044.0,,1028.0,...,937.0,949.0,964.0,974.0,978.0,978.0,,,1039.0,1047.0
1855,18232,Lansford,PA,Allentown-Bethlehem-Easton,Carbon County,1856,775.0,778.0,780.0,783.0,...,781.0,773.0,769.0,762.0,758.0,758.0,,,,


In [18]:
multi2 = pd.melt(multi2, id_vars = multi2.columns[0:6], value_vars = multi2.columns[7:], var_name = 'year-month', value_name = 'zri')

In [19]:
missing_zri = multi2[multi2['zri'].isnull()]['zip'].unique()

In [20]:
missing_zri

array(['77494', '78660', '94109', '77584', '28269', '77429', '30044',
       '30043', '98052', '30024', '78704', '77077', '94110', '19143',
       '78240', '87111', '98012', '92677', '20906', '06902', '89117',
       '30080', '95123', '22304', '27703', '20147', '20878', '27713',
       '60622', '29464', '94568', '19124', '95051', '76116', '60615',
       '23451', '78249', '27707', '98133', '75070', '19120', '37040',
       '27705', '85282', '94538', '78748', '59901', '60608', '21061',
       '19104', '92620', '95014', '53704', '95112', '28210', '32771',
       '92130', '92122', '02135', '90036', '76137', '78759', '94087',
       '97124', '33436', '76051', '98034', '54601', '19131', '78209',
       '94086', '19144', '92128', '92805', '29414', '95008', '92656',
       '29607', '30606', '27613', '77042', '90028', '94115', '72764',
       '28262', '27615', '32837', '85224', '37221', '80211', '27612',
       '14150', '85283', '91367', '19139', '30328', '30093', '85018',
       '85260', '236

In [21]:
list(multi2.loc[multi2['zip']=='78212','zri'])

[nan,
 848.0,
 853.0,
 862.0,
 866.0,
 871.0,
 874.0,
 870.0,
 869.0,
 866.0,
 856.0,
 851.0,
 852.0,
 851.0,
 855.0,
 865.0,
 874.0,
 878.0,
 879.0,
 879.0,
 880.0,
 883.0,
 882.0,
 886.0,
 884.0,
 878.0,
 876.0,
 879.0,
 886.0,
 893.0,
 904.0,
 908.0,
 905.0,
 901.0,
 905.0,
 913.0,
 917.0,
 919.0,
 929.0,
 936.0,
 938.0,
 939.0,
 931.0,
 932.0,
 932.0,
 930.0,
 925.0,
 924.0,
 932.0,
 934.0,
 925.0,
 924.0,
 916.0,
 905.0,
 902.0,
 896.0,
 892.0,
 896.0,
 894.0,
 909.0,
 913.0,
 914.0,
 918.0,
 921.0,
 922.0,
 nan,
 970.0,
 964.0,
 943.0,
 951.0,
 942.0,
 933.0]

In [22]:
multi2.loc[multi2['zip']=='78212','zri'].interpolate(method = 'ffill').interpolate(method = 'bfill')

895      848.0
2197     848.0
3499     853.0
4801     862.0
6103     866.0
         ...  
88129    964.0
89431    943.0
90733    951.0
92035    942.0
93337    933.0
Name: zri, Length: 72, dtype: float64

In [23]:
for item in missing_zri:
    temp = multi2.loc[multi2['zip']==item,'zri']
    multi2.loc[multi2['zip']==item,'zri'] = temp.interpolate(method = 'linear').interpolate(method = 'ffill').interpolate(method = 'bfill')

In [24]:
multi2['zri'].isnull().value_counts()

False    93744
Name: zri, dtype: int64

In [27]:
multi2 = multi2[multi2['year-month']!='2020-01']

In [30]:
multi2.drop(columns = ['SizeRank'], inplace = True)

KeyError: "['SizeRank'] not found in axis"

In [32]:
multi2 = multi2.sort_values(['zip', 'year-month'])

In [35]:
multi2.to_csv('./../data/zri_multifamily_v2.csv', index = False)