In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
def get_datetime_from_str(string):
    """Parse a timestamp string laid out as '%Y-%m-%d %H:%M:%S'.

    Parameters:
    string: text such as '2017-03-17 08:30:00'.

    Returns:
    datetime built from the parsed fields.

    Raises:
    ValueError: if the text does not match the expected layout.

    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(string, timestamp_format)
    return parsed

def get_str_from_datetime(date):
    """Render the time-of-day part of a datetime as text.

    Parameters:
    date: datetime (any object offering a compatible strftime method).

    Returns:
    string laid out as '%H:%M:%S'; the calendar date is dropped.

    """
    time_format = '%H:%M:%S'
    return date.strftime(time_format)

def get_day_of_datetime(date):
    """Extract the day of the month from a datetime.

    Parameters:
    date: datetime object (anything exposing a .day attribute).

    Returns:
    the value of date.day, an int for datetime objects.

    """
    day_of_month = date.day
    return day_of_month

def make_validation_set(test_data, peak_hours=((6, 11), (16, 21))):
    """Blank out peak-hour speeds so the frame can serve as a validation input.

    Rows whose time of day falls inside a peak window get their speed set to
    np.nan, and the datetime column is rewritten as '%H:%M:%S' strings.
    Windows are matched on each row's hour instead of on a date read from one
    fixed row, so the frame may be short, carry any index, or span several
    dates. For a single-day frame the masked rows are the same as before.

    Parameters:
    test_data: dataframe with a 'datetime' column of datetime values and a 'speed' column. It is modified in place.
    peak_hours: iterable of (start_hour, end_hour) pairs; a row is masked when start_hour <= hour < end_hour. Defaults to the am peak (6 to 11) and the pm peak (16 to 21).

    Returns:
    test_data: the same dataframe object, with speed converted to np.nan in the peak windows and datetime converted to strings.

    """
    # Read the hour element by element so this works whether the column is
    # stored as datetime64 or as an object column of datetime values.
    hours = np.array([stamp.hour for stamp in test_data['datetime']], dtype=float)
    in_peak = np.zeros(len(hours), dtype=bool)
    for start_hour, end_hour in peak_hours:
        in_peak |= (hours >= start_hour) & (hours < end_hour)
    # A boolean mask selects by position, so the frame's index labels do not matter.
    test_data.loc[in_peak, 'speed'] = np.nan
    test_data['datetime'] = test_data['datetime'].transform(get_str_from_datetime)
    return test_data

def get_selected_data_by_day(days):
    """Collect the rows of the module-level dataframe df that fall on the given days.

    Parameters:
    days: list of days of the month; rows come back grouped in this order.

    Returns:
    df_return: concatenated dataframe of chosen days with a fresh 0..n-1 index. When days is empty the result has no rows and the columns 'datetime' and 'speed'.
    """
    # Seed with an empty frame so 'datetime' and 'speed' stay the leading
    # columns and an empty list of days still yields a valid frame.
    frames = [pd.DataFrame(columns=['datetime','speed'])]
    frames.extend(df[df.day == day_of_month] for day_of_month in days)
    # One concat at the end avoids re-copying the accumulated rows per day.
    df_return = pd.concat(frames, ignore_index=True)
    return df_return

In [8]:
# Travel direction whose cleaned speed history this notebook processes.
direction = 'south'
# Relative path, so the CSV is resolved against the current working directory.
file_name = 'to_{}_historical_road_speed_cleaned.csv'.format(direction)
df = pd.read_csv(filepath_or_buffer=file_name)

In [9]:
# Parse the raw timestamp strings into datetime objects, one value at a time.
# NOTE: not idempotent — get_datetime_from_str expects strings, so running this
# cell a second time on the already-parsed column fails; reload df first.
df.datetime = df.datetime.transform(get_datetime_from_str)
# Day of the month for each row; get_selected_data_by_day filters on df.day.
# NOTE(review): if the CSV has no 'day' column, pandas keeps this as a plain
# attribute on df (with a UserWarning) instead of creating a column, so it
# would not appear in frames derived from df — confirm that is intended.
df.day = df.datetime.transform(get_day_of_datetime)

  


#### Use k-fold validation: there are 7 test sets in total, one per held-out day of the month: 17, 11, 26, 20, 21, 22, 9

In [10]:
# Every possible day-of-month value, 1 through 31.
total_days = set(range(1, 31 + 1))
# Held-out days, one per fold, listed in fold order.
test_day_list = [17, 11, 26, 20, 21, 22, 9]

In [11]:
# Build one fold per held-out day. For each fold, three CSVs are written:
# the untouched held-out day (answer key), the same day with peak-hour speeds
# blanked (test input), and all remaining days of the month (training data).
# Folds are numbered from 1 in the order of test_day_list.
for i, day in enumerate(test_day_list, start=1):
    test_day = [day]
    # sorted() makes the order of the training days explicit (ascending).
    train_day_list = sorted(total_days.difference(test_day))
    test_data = get_selected_data_by_day(test_day)
    # Save the ground truth before masking: make_validation_set modifies the
    # frame it is given.
    test_data.to_csv('validation/{}/ground_truth_for_test/test_data_fold{}_answer.csv'.format(direction,i))
    # make_validation_set is the masking helper defined in this notebook; the
    # previous call used the name make_test_set, which is not defined here.
    test_data = make_validation_set(test_data)
    train_data = get_selected_data_by_day(train_day_list)
    test_data.to_csv('validation/{}/test_data_fold{}.csv'.format(direction,i))
    train_data.to_csv('validation/{}/training_data_fold{}.csv'.format(direction,i))