In [1]:
import pandas as pd
import datetime
data_path = './data/'

In [2]:
# Hourly timestamps from timeFrom to timeTo (both endpoints included),
# rendered back to 'YYYY-MM-DD HH:MM:SS' strings.
timeFrom = datetime.datetime.strptime('2017-01-01 14:00:00', "%Y-%m-%d %H:%M:%S")
timeTo = datetime.datetime.strptime('2018-05-02 23:00:00', "%Y-%m-%d %H:%M:%S")
# 'h' is the hourly frequency alias; the uppercase 'H' spelling is deprecated
# since pandas 2.2, while the lowercase form is also accepted by older releases.
Hours_Delta = pd.date_range(timeFrom, timeTo, freq='h').strftime('%Y-%m-%d %H:%M:%S')

In [3]:
# One row per hourly timestamp; the 'time' column holds the formatted strings.
time_info = pd.DataFrame({'time': Hours_Delta})

In [4]:
# Keep the text timestamps and add a parsed datetime column right after them.
time_info = time_info.assign(time_dt=pd.to_datetime(time_info['time']))

**For weekday, 0 means Monday and 6 means Sunday.**

In [5]:
# Derive the calendar fields from the parsed datetime column with the
# vectorised .dt accessor. The column is addressed by name, so the result no
# longer depends on 'time_dt' being the second column (the previous row-wise
# version read it positionally as row[1], which is deprecated for
# label-indexed Series) and no Python function is called per row.
time_info['year'] = time_info['time_dt'].dt.year
time_info['month'] = time_info['time_dt'].dt.month
time_info['day'] = time_info['time_dt'].dt.day
# Monday is 0 and Sunday is 6, the same convention as datetime.weekday().
time_info['weekday'] = time_info['time_dt'].dt.weekday
time_info['hour'] = time_info['time_dt'].dt.hour

In [8]:
# Save the hourly table under data_path; the row index is written as well,
# which is why the later read uses index_col=0.
time_info.to_csv(f"{data_path}time_info.csv")

**Build features about holidays, workdays, etc.**

In [None]:
# Names of the per-day feature columns, in the order they are listed here.
columns = [
    "date",
    "is_weekend",
    "is_holiday",
    "is_holiday_first",
    "is_holiday_last",
    "is_working",
    "is_working_first",
    "is_working_last",
]

In [None]:
# Dates that find_holiday flags as holidays, written as 'YYYY-MM-DD' strings.
# Each line below is one run of consecutive dates; the element order of the
# list is unchanged (2017-04-29/30 sit after the October run).
holiday = [
    "2017-01-01", "2017-01-02",
    "2017-01-27", "2017-01-28", "2017-01-29", "2017-01-30", "2017-01-31", "2017-02-01", "2017-02-02",
    "2017-04-02", "2017-04-03", "2017-04-04",
    "2017-05-01",
    "2017-05-28", "2017-05-29", "2017-05-30",
    "2017-10-01", "2017-10-02", "2017-10-03", "2017-10-04", "2017-10-05", "2017-10-06", "2017-10-07", "2017-10-08",
    "2017-04-29", "2017-04-30",
    "2017-12-30", "2017-12-31", "2018-01-01",
    "2018-02-15", "2018-02-16", "2018-02-17", "2018-02-18", "2018-02-19", "2018-02-20", "2018-02-21",
    "2018-04-05", "2018-04-06", "2018-04-07",
    "2018-04-29", "2018-04-30", "2018-05-01",
]

# Dates that find_working always marks as working days, even when the weekend
# or holiday flag is set for them.
compliment = [
    "2017-01-22",
    "2017-02-04",
    "2017-04-01",
    "2017-05-27",
    "2017-09-30",
    "2018-02-11",
    "2018-02-24",
    "2018-04-08",
    "2018-04-28",
]

In [None]:
# Hand-written opening and closing dates of the holiday runs.
# NOTE(review): no later cell visible here reads these two module-level
# lists; find_first_last builds local lists with the same names instead.
is_holiday_first = [
    "2017-01-27",
    "2017-04-02",
    "2017-04-29",
    "2017-05-28",
    "2017-10-01",
    "2017-12-30",
    "2018-02-15",
    "2018-04-05",
    "2018-04-29",
]
is_holiday_last = [
    "2017-01-02",
    "2017-02-02",
    "2017-04-04",
    "2017-05-01",
    "2017-05-30",
    "2017-10-08",
    "2018-01-01",
    "2018-02-21",
    "2018-04-07",
    "2018-05-01",
]

In [None]:
# Daily calendar from 2017-01-01 through 2018-04-30 (both ends included),
# formatted as 'YYYY-MM-DD' strings.
# The notebook imports the `datetime` module itself, so strptime has to be
# reached through the datetime.datetime class; calling it on the module
# raises AttributeError.
timeFrom = datetime.datetime.strptime('2017-01-01', "%Y-%m-%d")
timeTo = datetime.datetime.strptime('2018-04-30', "%Y-%m-%d")
Days_Delta = pd.date_range(timeFrom, timeTo, freq='D').strftime("%Y-%m-%d")
Days_Delta

In [None]:
# One row per calendar day. The day-level taggers compare "date" against
# 'YYYY-MM-DD' strings and the run-edge lists are sized per day, so this frame
# has to be built from the daily Days_Delta, not from the hourly Hours_Delta
# that feeds time_info.
holiday_info = pd.DataFrame(data=Days_Delta,columns=["date"])
holiday_info.info()

In [None]:
# Number of days in the daily calendar. Taken from Days_Delta so it cannot
# drift from the date range (485 for 2017-01-01 through 2018-04-30).
df_length = len(Days_Delta)

def find_weekend(df):
    """Flag one calendar row as falling on a Saturday or Sunday.

    Parameters
    ----------
    df : pandas.Series
        A single row, as handed over by ``DataFrame.apply(..., axis=1)``.
        It must carry a ``"date"`` entry that ``pd.Timestamp`` can parse,
        e.g. ``"2017-01-01"``.

    Returns
    -------
    pandas.Series
        The same row with ``"is_weekend"`` set to 1 for Saturday/Sunday and
        to 0 for every other day.
    """
    # Timestamp.weekday() numbers Monday as 0, so 5 and 6 are Saturday and
    # Sunday. Asking the date directly replaces the earlier approach of
    # counting days from a 2017-01-01 anchor: it needs no per-row date_range,
    # does not call strptime on the `datetime` module (AttributeError under
    # `import datetime`), and stays correct for dates before the anchor.
    df["is_weekend"] = 1 if pd.Timestamp(df["date"]).weekday() >= 5 else 0
    return df

def find_holiday(df):
    """Record whether the row's date is listed in the module-level ``holiday`` list.

    ``df`` is one calendar row holding a ``"date"`` entry. The row gets an
    ``"is_holiday"`` entry (1 when the date is listed, 0 otherwise) and is
    returned.
    """
    df["is_holiday"] = 1 if df["date"] in holiday else 0
    return df

def find_working(df):
    """Decide whether the row's date is a working day.

    ``df`` is one calendar row that already carries ``"is_weekend"`` and
    ``"is_holiday"`` flags plus its ``"date"``. A day is non-working when
    either flag equals 1, unless the date is listed in the module-level
    ``compliment`` list, which always wins. The row is returned with
    ``"is_working"`` set to 1 or 0.
    """
    day_off = df["is_weekend"] == 1 or df["is_holiday"] == 1
    forced_working = df["date"] in compliment
    df["is_working"] = 1 if (forced_working or not day_off) else 0
    return df

# still need to deal with the first day and last day situation manually!
def find_first_last(is_working):
    """Mark the first and last day of each holiday run and each working run.

    Parameters
    ----------
    is_working : sequence of int
        One flag per consecutive day: 0 for a day off, anything else is
        treated as a working day.

    Returns
    -------
    list of list of int
        ``[is_holiday_first, is_holiday_last, is_working_first, is_working_last]``,
        each a 0/1 list with the same length as ``is_working``.

    Notes
    -----
    The output lists are sized from the input itself rather than from a
    module-level length constant, so a calendar of any length works without
    an IndexError or stray padding.

    Only interior positions are examined because each decision needs both
    neighbours; index 0 and the final index are always left at 0. A run that
    is exactly one day long is marked as neither first nor last, because
    every rule requires the two neighbours to differ from each other.
    """
    n_days = len(is_working)
    is_holiday_first = [0] * n_days
    is_holiday_last = [0] * n_days
    is_working_first = [0] * n_days
    is_working_last = [0] * n_days

    for i in range(1, n_days - 1):
        before, today, after = is_working[i-1], is_working[i], is_working[i+1]

        if today == 0:
            # day off: a run starts after a working day, ends before one
            if before == 1 and after == 0:
                is_holiday_first[i] = 1
            if before == 0 and after == 1:
                is_holiday_last[i] = 1
        else:
            # working day: a run starts after a day off, ends before one
            if before == 0 and after == 1:
                is_working_first[i] = 1
            if before == 1 and after == 0:
                is_working_last[i] = 1

    return [is_holiday_first,is_holiday_last,is_working_first,is_working_last]

In [None]:
# Tag every day row by row. The order matters: find_working reads the
# is_weekend and is_holiday flags written by the two earlier steps.
holiday_info = (
    holiday_info
    .apply(find_weekend, axis=1)
    .apply(find_holiday, axis=1)
    .apply(find_working, axis=1)
)
# Run-edge detection works on the plain sequence of working flags.
is_working = list(holiday_info["is_working"])
first_last = find_first_last(is_working)

In [None]:
# Put the four run-edge flag lists side by side, then attach the matching
# day strings so the frame can be joined on "date".
first_last_df = pd.DataFrame({
    "is_holiday_first": first_last[0],
    "is_holiday_last": first_last[1],
    "is_working_first": first_last[2],
    "is_working_last": first_last[3],
})
first_last_df["date"] = Days_Delta

# deal with first and last
# no need to deal with the first and last in this case

In [None]:
# Attach the run-edge flags to the per-day flags, matching rows on "date"
# (left join: every row of holiday_info is kept).
date_info = holiday_info.join(first_last_df.set_index("date"), on="date", how="left")

In [None]:
# Read time_info.csv back so the hourly rows can be joined with the per-day
# holiday information. The first CSV column is the saved row index.
# The path is built from data_path, the same prefix used when the file was
# written, so the read location follows the write location.
time_info_df = pd.read_csv(data_path + "time_info.csv", index_col=0)

# The join key is the day, so cut it out of the full timestamp text.
def split_time(df):
    """Add a ``"date"`` entry holding the text before the first space of ``"time"``.

    ``df`` is one row whose ``"time"`` value is a string such as
    ``"2017-01-01 14:00:00"``. The row is returned with ``"date"`` set
    (``"2017-01-01"`` in that example); a value without a space is copied
    whole.
    """
    day_text, _sep, _clock = df["time"].partition(" ")
    df["date"] = day_text
    return df

# Give every hourly row its day string, then pull in that day's flags
# (left join on "date": all hourly rows are kept).
# NOTE(review): time_info runs to 2018-05-02 while the daily range stops at
# 2018-04-30, so the last two days appear to get no match - confirm intended.
time_info_df = time_info_df.apply(split_time, axis=1)
whole_df = time_info_df.join(date_info.set_index("date"), on="date", how="left")

In [None]:
# The "date" join key is kept in the output; uncomment to drop it.
# whole_df = whole_df.drop("date",axis=1)
# Write next to the other generated files via data_path, without the row
# index (index=False is the documented boolean form of the earlier index=None).
whole_df.to_csv(data_path + "holiday_info.csv", index=False)