In [1]:
import pandas as pd
import numpy as np

In [2]:
import pandas as pd
import numpy as np

def time_series_gaps(df, ts_start_col):
    '''
    Locates every run of consecutive missing values in row-wise time series data.

    Returns a dictionary containing:
        keys: the positional (0-based, df.iloc style) row indices of any row with missing
              data in its time series columns
        values: list of (start, end) tuples holding the numeric horizontal (column) start and
                end indices of each period of missing data; both ends are inclusive and the
                tuples are ordered left to right

    Rows without missing time series values do not appear in the dictionary.

    time series data must be running row-wise for the function to work

    ts_start_col is the numeric index of the column in the dataframe at which the time series begins
    the function assumes the time series runs from ts_start_col all the way to the right side of the dataframe
    '''
    # Work on a purely positional boolean mask so the result does not depend on
    # how the columns are labelled (integer labels would otherwise be looked up by label).
    null_mask = df.iloc[:, ts_start_col:].isnull().to_numpy()

    gaps = {}
    for i, row_mask in enumerate(null_mask):
        if not row_mask.any():
            continue
        # Pad with False on both sides so runs touching either edge still
        # produce a rising (+1) and a falling (-1) transition.
        padded = np.concatenate(([False], row_mask, [False])).astype(np.int8)
        edges = np.diff(padded)
        starts = np.flatnonzero(edges == 1)
        ends = np.flatnonzero(edges == -1) - 1
        # Shift back to column positions of the full dataframe.
        gaps[i] = [
            (int(start) + ts_start_col, int(end) + ts_start_col)
            for start, end in zip(starts, ends)
        ]
    return gaps

def get_rejected_gaps(gaps, gap_limit):
    '''
    takes gaps dictionary generated by time_series_gaps, returns list of row indices that contain gaps >= gap_limit

    gaps: dictionary mapping a row index to a list of (start, end) column index tuples, both ends inclusive
    gap_limit: smallest gap width (count of consecutive missing values) that gets a row rejected

    each rejected row index appears exactly once, in the order its key occurs in gaps
    a row whose gap list is empty is never rejected
    '''
    return [
        row
        for row, spans in gaps.items()
        # bounds are inclusive, so a gap covering a single column has width 1
        if any(end - start + 1 >= gap_limit for start, end in spans)
    ]

In [3]:
# load the Zip_Zri_MultiFamilyResidenceRental CSV; the path is relative to the working directory
multi = pd.read_csv('./../data/Zip_Zri_MultiFamilyResidenceRental.csv')

In [4]:
# check how the first RegionName value (row label 0) was parsed
type(multi['RegionName'].loc[0])

numpy.int64

In [5]:
# display the frame as loaded (pandas truncates the middle rows and columns)
multi

Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,2010-09,2010-10,2010-11,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,61639,10025,New York,NY,New York-Newark-Jersey City,New York County,1,2930.0,2952.0,2926.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1447.0,1465.0,1469.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,3,2797.0,2811.0,2813.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1437.0,1469.0,1490.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,90755,75202,Dallas,TX,Dallas-Fort Worth-Arlington,Dallas County,1857,997.0,1019.0,1042.0,...,1615.0,1619.0,1628.0,,,1729.0,1697.0,1739.0,,
1857,94610,84627,Ephraim,UT,,Sanpete County,1858,,,,...,,,,,,,,,1298.0,
1858,84452,60301,Oak Park,IL,Chicago-Naperville-Elgin,Cook County,1859,,,,...,1447.0,1483.0,1495.0,1511.0,1537.0,1570.0,1580.0,1625.0,,
1859,94629,84647,Mount Pleasant,UT,,Sanpete County,1860,,,,...,,,,,,,,,,1221.0


In [6]:
# convert every RegionName value to its string form
multi['RegionName'] = multi['RegionName'].map(str)

In [7]:
# how many RegionName strings there are of each length
multi['RegionName'].map(len).value_counts()

5    1621
4     240
Name: RegionName, dtype: int64

In [8]:
# left-pad with zeros to a fixed width of 5 characters; unlike prepending a single '0'
# this also repairs values that lost more than one leading zero, and it is safe to re-run
multi['RegionName'] = multi['RegionName'].str.zfill(5)

In [9]:
# the RegionName column is referred to as 'zip' from here on
multi = multi.rename(columns={'RegionName': 'zip'})

In [10]:
# drop RegionID column, just a unique identifier
# (dropped by name: a positional iloc[:, 1:] would remove 'zip' if this cell were run twice)
multi = multi.drop(columns=['RegionID'])

In [11]:
# display the frame after the zip formatting and the RegionID drop
multi

Unnamed: 0,zip,City,State,Metro,CountyName,SizeRank,2010-09,2010-10,2010-11,2010-12,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,2930.0,2952.0,2926.0,2904.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1447.0,1465.0,1469.0,1467.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,2797.0,2811.0,2813.0,2784.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,,,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1437.0,1469.0,1490.0,1503.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,75202,Dallas,TX,Dallas-Fort Worth-Arlington,Dallas County,1857,997.0,1019.0,1042.0,1063.0,...,1615.0,1619.0,1628.0,,,1729.0,1697.0,1739.0,,
1857,84627,Ephraim,UT,,Sanpete County,1858,,,,,...,,,,,,,,,1298.0,
1858,60301,Oak Park,IL,Chicago-Naperville-Elgin,Cook County,1859,,,,,...,1447.0,1483.0,1495.0,1511.0,1537.0,1570.0,1580.0,1625.0,,
1859,84647,Mount Pleasant,UT,,Sanpete County,1860,,,,,...,,,,,,,,,,1221.0


In [12]:
# keep the 6 leading metadata columns plus every monthly column from '2014-01' onward;
# slicing the months by label (instead of the positional offset 46) makes the cell safe to re-run
multi = pd.concat([multi.iloc[:, :6], multi.loc[:, '2014-01':]], axis=1)

In [13]:
# display the frame after trimming the monthly columns
multi

Unnamed: 0,zip,City,State,Metro,CountyName,SizeRank,2014-01,2014-02,2014-03,2014-04,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,3333.0,3338.0,3340.0,3323.0,...,3484.0,3523.0,3573.0,3622.0,3664.0,3698.0,3704.0,3692.0,3715.0,3676.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1647.0,1645.0,1635.0,1650.0,...,1731.0,1749.0,1768.0,1787.0,1801.0,1806.0,1789.0,1761.0,1747.0,1731.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,3135.0,3138.0,3136.0,3131.0,...,3449.0,3459.0,3479.0,3491.0,3510.0,3521.0,3574.0,3608.0,3561.0,
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,1081.0,1093.0,...,1226.0,1250.0,1264.0,1251.0,1246.0,1252.0,1280.0,1310.0,1294.0,1286.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1756.0,1759.0,1754.0,1774.0,...,1888.0,1911.0,1934.0,1950.0,1953.0,1952.0,1921.0,1958.0,,1916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,75202,Dallas,TX,Dallas-Fort Worth-Arlington,Dallas County,1857,1297.0,1284.0,1295.0,1297.0,...,1615.0,1619.0,1628.0,,,1729.0,1697.0,1739.0,,
1857,84627,Ephraim,UT,,Sanpete County,1858,,,,,...,,,,,,,,,1298.0,
1858,60301,Oak Park,IL,Chicago-Naperville-Elgin,Cook County,1859,,,,,...,1447.0,1483.0,1495.0,1511.0,1537.0,1570.0,1580.0,1625.0,,
1859,84647,Mount Pleasant,UT,,Sanpete County,1860,,,,,...,,,,,,,,,,1221.0


In [None]:
# columns 0-5 are metadata (zip ... SizeRank), so the time series starts at position 6;
# starting at 7 would leave the first month (2014-01) out of the gap scan
gaps = time_series_gaps(multi, 6)

In [None]:
# row indices whose time series contain at least one gap of 6 or more consecutive missing values
rej_gaps = get_rejected_gaps(gaps, 6)

In [None]:
# remove the rejected rows; rej_gaps is matched against the row labels of multi
multi2 = multi.drop(index=rej_gaps)

In [None]:
# display the frame after the rejected rows were dropped
multi2

In [None]:
# list the time series columns; they start at position 6, the same split the melt cell uses
multi2.columns[6:]

In [None]:
# reshape wide -> long: the first 6 columns stay as identifiers, each remaining column
# becomes rows with its label in 'year-month' and its value in 'zri'
multi2 = multi2.melt(
    id_vars=multi2.columns[:6],
    value_vars=multi2.columns[6:],
    var_name='year-month',
    value_name='zri',
)

In [None]:
# distinct zip values that have at least one missing zri entry
missing_zri = multi2.loc[multi2['zri'].isna(), 'zip'].unique()

In [None]:
# display the zip values that have at least one missing zri entry
missing_zri

In [None]:
# all zri values recorded for zip '78212', as a plain list
multi2.loc[multi2['zip'] == '78212', 'zri'].tolist()

In [None]:
# forward-fill, then back-fill whatever is still missing, for zip '78212';
# Series.ffill()/bfill() replace the deprecated interpolate(method='ffill'/'bfill') calls
multi2.loc[multi2['zip'] == '78212', 'zri'].ffill().bfill()

In [None]:
# fill the missing zri values one zip at a time so values never leak between zips
for item in missing_zri:
    zip_rows = multi2['zip'] == item  # build the row mask once per zip
    zri_values = multi2.loc[zip_rows, 'zri']
    # linear interpolation first, then forward-/back-fill anything still missing;
    # ffill()/bfill() replace the deprecated interpolate(method='ffill'/'bfill')
    multi2.loc[zip_rows, 'zri'] = zri_values.interpolate(method='linear').ffill().bfill()

In [None]:
# count how many zri entries are missing (True) versus present (False)
multi2['zri'].isna().value_counts()

In [None]:
# keep every row except those belonging to the '2020-01' month
multi2 = multi2.loc[multi2['year-month'] != '2020-01']

In [None]:
# rebind instead of mutating in place: multi2 comes from a boolean filter, and an
# in-place drop on such a frame can raise SettingWithCopyWarning
multi2 = multi2.drop(columns=['SizeRank'])

In [None]:
# number of rows for each year-month value
multi2['year-month'].value_counts()

In [None]:
# order the rows by zip, then by year-month within each zip
multi2 = multi2.sort_values(by=['zip', 'year-month'])

In [None]:
# multi2.to_csv('./../data/zri_multifamily_v2.csv', index = False)