In [116]:
import pandas as pd
import datetime
from datetime import timedelta
import numpy as np
import re
import logging

# Helper Functions

In [2]:
def to_dates_list(dates_dictionary):
    '''
    Expand anchor dates into every calendar day they cover.

    to_dates_list takes a dictionary with the following format:
    { 'date_as_string': number of days as integer,...}
    A positive number of days extends forward from the date, a negative number
    extends backward, and 0 keeps only the date itself. The anchor date is
    always part of the result.

    returns a list of 'YYYY-MM-DD' strings covering each date plus its range of
    days, de-duplicated and sorted in ascending order (an empty dictionary
    gives an empty list)

    raises TypeError if a number of days is not an integer
    '''
    unique_days = set()
    for date, date_range in dates_dictionary.items():
        # Parse once per entry; every emitted day (including the anchor itself)
        # goes through the same formatter so overlapping ranges de-duplicate.
        anchor = pd.to_datetime(date)
        step = 1 if date_range > 0 else -1
        # range() stops one step past the last wanted offset, so adding `step`
        # makes the far end inclusive; offset 0 is the anchor date.
        for offset in range(0, date_range + step, step):
            unique_days.add((anchor + timedelta(days=offset)).strftime("%Y-%m-%d"))

    # ISO-formatted strings sort chronologically, giving a deterministic order.
    return sorted(unique_days)



# Example: expand 2020-03-27 by one day backward and 2020-02-14 by one day forward,
# then display the resulting list of date strings.
date_list = to_dates_list({'2020-03-27':-1,'2020-02-14':1})
date_list

['2020-03-26', '2020-02-15', '2020-02-14', '2020-03-27']

# Data Parsing Function

In [77]:
# Inputs for DataParser: it reads '.xlsx' files with pandas.read_excel and
# '.csv' files with pandas.read_csv.
file_path = 'sample_data.xlsx'
file_path2 = 'latest/all_daily.csv'
# Maps an anchor date string to a number of days (see to_dates_list for the format).
date_range_dict = {'2020-01-22':3}
# Passing None to DataParser instead aggregates over all dates at once.
#date_range_dict = None


In [184]:


def DataParser(file_path, date_range_dict, incubation_duration = 5, regions = {}, \
               attr_extract = None, _attr_date = 'date_ymd', _attr_region = 'Country_Region', \
               _attr_province = 'Province_State'):
    '''
    Load a data file, optionally filter it by region/province, and aggregate
    the requested attributes per region.

    file_path: path to a '.xlsx' or '.csv' file. For '.csv' files the
        _attr_date column is parsed as dates while reading.
    date_range_dict: None, or a dictionary in the format accepted by
        to_dates_list ({'date_as_string': number of days as integer, ...}).
        When given, each resulting date gets its own aggregation over the
        rows whose date lies in [date - incubation_duration days, date].
        When None, all selected rows are aggregated together and the date
        column is filled with 'ALL'.
    incubation_duration: number of days to look back from each date.
    regions: {region: [province, ...]} selection. An empty or None province
        list selects the whole region. An empty dictionary (or None) keeps
        every row. The default dictionary is never mutated.
    attr_extract: aggregation spec passed to groupby(...).agg(), e.g.
        {'Confirmed': ['max', 'min']}. Required.
    _attr_date, _attr_region, _attr_province: column names for the date,
        region and province columns.

    returns a DataFrame with one row per (date, region). When the aggregation
    yields two-level columns they are flattened with '_' (e.g. 'Confirmed_max';
    single-level entries keep a trailing underscore, e.g. 'Country_Region_').

    raises TypeError if attr_extract is None, ValueError if file_path has an
    unsupported extension.

    Side effect: (re)creates 'dataparsing.log' in the current working
    directory and attaches a handler for it to the 'sample' logger.
    '''
    import os  # local import: only used to recognise this function's own log handler

    # Fail fast, before touching the log file or the data file.
    if attr_extract is None:
        raise TypeError("attr_extract is required, e.g. {'Confirmed': ['max', 'min']}")
    if not file_path.endswith(('.xlsx', '.csv')):
        raise ValueError("Unsupported file type (expected .xlsx or .csv): {}".format(file_path))

    # logger initiation
    log_file = 'dataparsing.log'
    logger = logging.getLogger('sample')
    logger.setLevel(logging.INFO)
    logger.propagate = False
    # Drop the handler left by a previous call; otherwise every call adds one
    # more open file handle to the same log file.
    log_path = os.path.abspath(log_file)
    for stale in list(logger.handlers):
        if isinstance(stale, logging.FileHandler) and stale.baseFilename == log_path:
            logger.removeHandler(stale)
            stale.close()
    fh = logging.FileHandler(log_file, mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(fh)
    logger.info('Incubation date: {}'.format(incubation_duration))

    # FileOpen - according to file extension
    if file_path.endswith('.xlsx'):
        df = pd.read_excel(file_path)
    else:
        df = pd.read_csv(file_path, parse_dates=[_attr_date])

    # Region and Province Selection - OR together one boolean mask per region
    if regions:
        selected = pd.Series(False, index=df.index)
        for region, provinces in regions.items():
            in_region = df[_attr_region] == region
            if isinstance(provinces, str):
                # A bare string means a single province, not a sequence of characters.
                provinces = [provinces]
            if provinces is None or len(provinces) == 0:
                selected = selected | in_region
            else:
                selected = selected | (in_region & df[_attr_province].isin(list(provinces)))
        df_extract = df.loc[selected]
    else:
        df_extract = df

    # DateParsing - aggregate the incubation window that ends on each requested date
    if date_range_dict is not None:
        window = timedelta(days=incubation_duration)
        per_date_frames = []
        # Iterating in chronological order keeps the concatenated result sorted by date.
        for date in sorted(to_dates_list(date_range_dict), key=pd.to_datetime):
            day = pd.to_datetime(date)
            in_window = (df_extract[_attr_date] >= day - window) & (df_extract[_attr_date] <= day)
            summary = df_extract.loc[in_window].groupby([_attr_region], as_index=False).agg(attr_extract)
            summary.insert(0, _attr_date, date, True)
            per_date_frames.append(summary)
        # pd.concat raises on an empty list, so fall back to an empty frame.
        if per_date_frames:
            df_final = pd.concat(per_date_frames, ignore_index=False)
        else:
            df_final = pd.DataFrame()
    else:
        df_final = df_extract.groupby(by=[_attr_region], as_index=False).agg(attr_extract)
        df_final.insert(0, _attr_date, 'ALL', True)

    # Flatten two-level (column, aggregation) labels; leave plain labels untouched
    # ('_'.join on a plain string would interleave underscores between its characters).
    df_final.columns = [
        '_'.join(map(str, label)) if isinstance(label, tuple) else label
        for label in df_final.columns
    ]

    return df_final

In [186]:
# Example run: no date ranges (None), so all rows for Argentina (whole region,
# empty province list) are aggregated together into the max and min of 'Confirmed'.
df = DataParser(file_path2, None, _attr_date = 'date',regions={'Argentina':[]}, \
                 attr_extract={'Confirmed':['max','min']})
df

# Another possible call, using the regions={region: [provinces]} format:
# attr_extract={'confirmed_cases':['mean','min'],'mean_temp': ['max']}, regions={'Mainland China': ['Anhui']}

['date_', 'Country_Region_', 'Confirmed_max', 'Confirmed_min']