In [1]:
import pandas as pd
import numpy as np

def time_series_gaps(df, ts_start_col):
    '''
    Locate runs of consecutive missing values in row-wise time series.

    Each row of ``df`` is treated as one time series that runs from the
    column at position ``ts_start_col`` through the last column of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are the time series. Columns to the left of
        ``ts_start_col`` (identifiers, etc.) are ignored.
    ts_start_col : int
        Zero-based *positional* index of the first time-series column.

    Returns
    -------
    dict
        keys: positional row numbers (0-based, not index labels) of every row
              that has at least one missing value in its time series
        values: list of ``(start, end)`` tuples, in left-to-right order, giving
                the inclusive positional column indices of each run of
                missing values
    '''
    # Work on a purely positional boolean mask. Indexing a row Series with an
    # integer key is label-based whenever the column labels are integers and
    # only falls back to positions (a deprecated behaviour) otherwise, so it
    # cannot be relied on here.
    null_mask = df.isnull().to_numpy()
    n_cols = null_mask.shape[1]

    gaps = {}
    for i, row_mask in enumerate(null_mask):
        curr_gaps = []
        gap_start = None  # column position where the currently open gap began
        for j in range(ts_start_col, n_cols):
            if row_mask[j]:
                if gap_start is None:
                    gap_start = j
            elif gap_start is not None:
                # First non-missing value after a gap: the gap ended one column back
                curr_gaps.append((gap_start, j - 1))
                gap_start = None
        if gap_start is not None:
            # The gap runs through the final column of the frame
            curr_gaps.append((gap_start, n_cols - 1))
        if curr_gaps:
            gaps[i] = curr_gaps
    return gaps

def get_rejected_gaps(gaps, gap_limit):
    '''
    Select the rows that contain at least one gap of width >= gap_limit.

    Parameters
    ----------
    gaps : dict
        Mapping of row index -> list of inclusive (start, end) column index
        tuples, as produced by time_series_gaps.
    gap_limit : int
        Minimum gap width (number of consecutive missing columns) that causes
        a row to be rejected.

    Returns
    -------
    list
        Row indices, in the iteration order of ``gaps``, each listed at most
        once even when several of its gaps reach the limit. A row whose gap
        list is empty is never rejected.
    '''
    return [
        row
        for row, spans in gaps.items()
        # Both ends are inclusive, hence the +1 when measuring the width
        if any(end - start + 1 >= gap_limit for start, end in spans)
    ]

In [2]:
# Toy frame: an identifier column followed by ten yearly observations per row,
# with NaNs marking the missing periods used to exercise the gap functions.
asdf = pd.DataFrame(
    data=[
        ['10001', 1, 2, 3, 4, np.nan, np.nan, 5, 6, np.nan, np.nan],
        ['10002', 1, np.nan, np.nan, np.nan, np.nan, 9, 5, 6, np.nan, 10],
    ],
    columns=[
        'zip_code',
        'year1', 'year2', 'year3', 'year4', 'year5',
        'year6', 'year7', 'year8', 'year9', 'year10',
    ],
)
asdf

Unnamed: 0,zip_code,year1,year2,year3,year4,year5,year6,year7,year8,year9,year10
0,10001,1,2.0,3.0,4.0,,,5,6,,
1,10002,1,,,,,9.0,5,6,,10.0


In [3]:
# Column 0 of asdf is zip_code, so the yearly series starts at column position 1.
gaps = time_series_gaps(asdf, 1) # 1 because the time series begins in column 1

In [4]:
# Print each row number alongside its list of inclusive (start, end) gap column positions.
for key, val in gaps.items():
    print(key, ':', val)

0 : [(5, 6), (9, 10)]
1 : [(2, 5), (9, 9)]


In [5]:
# Rows having at least one gap that spans 4 or more consecutive columns.
get_rejected_gaps(gaps, 4)

[1]