In [59]:
import numpy as np
import pandas as pd
import glob
import os

In [60]:
# pd.set_option('display.max_rows', None)

# Root directories used by every cell below: CSV archives are read from
# input_root_path, generated CSVs and plots are written to output_root_path.
# The commented-out "../data" values are an alternative location
# (presumably for a local run — confirm before re-enabling).
# input_root_path = "../data"
input_root_path = "../input/travel-time"
# output_root_path = "../data"
output_root_path = "../working"


# Pre-processing

In [61]:
# Cut-off applied to CalculatedDelayTime when deriving CalculatedState:
# values strictly below it are labelled 'OK', everything else 'DELAY'.
# NOTE(review): the unit is whatever the archive records travel times in — confirm.
DELAY_THRESH = 20

In [62]:
def load_csv_to_df(files, filename=None):
    """Read several CSV files and stack them into a single DataFrame.

    Parameters
    ----------
    files : iterable of path-like
        CSV files to read, in the order given. Each file is parsed with
        ``skipinitialspace=True`` so blanks after delimiters are dropped.
    filename : str, optional
        When given (and non-empty), the combined frame is also written as CSV
        to ``os.path.join(output_root_path, filename)``.

    Returns
    -------
    pandas.DataFrame
        Rows of all files concatenated, re-indexed 0..n-1.

    Raises
    ------
    ValueError
        If ``files`` is empty, e.g. because a glob pattern matched nothing.
    """
    # Materialise first so generators work and emptiness can be checked.
    files = list(files)
    if not files:
        # pd.concat would raise the opaque "No objects to concatenate";
        # fail with a message that points at the likely cause instead.
        raise ValueError(
            "load_csv_to_df: no CSV files to load; check the input path / glob pattern"
        )

    frames = [pd.read_csv(path, skipinitialspace=True) for path in files]
    df = pd.concat(frames, ignore_index=True)

    if filename:
        df.to_csv(os.path.join(output_root_path, filename))
    return df

In [63]:
# Load data
#
# glob.glob returns matches in a filesystem-dependent order. Sorting the paths
# keeps the concatenated row order (and the AllTasksIndex derived from it
# during cleaning) identical from run to run.

tasks_joined_files = sorted(glob.glob(os.path.join(input_root_path, "Update Travel Time Archive/Travel Time Task/TravelTimeTaskArchive*.csv")))
df_tasks = load_csv_to_df(tasks_joined_files)
# df_tasks = load_csv_to_df(tasks_joined_files, filename="all_TravelTimeTaskArchive.csv")

summary_joined_files = sorted(glob.glob(os.path.join(input_root_path, "Update Travel Time Archive/Travel Time Summary/TravelTimeSummaryTaskArchive*.csv")))
df_summary = load_csv_to_df(summary_joined_files)

df_tasks

In [64]:
def get_unique_assets(filename=None):
    """Collect the distinct (AssetNumber, AssetIdentifier) pairs.

    The pairs are taken from both ``df_tasks`` and ``df_summary``; the task
    rows come first, and the original row labels are kept (no re-indexing).

    Parameters
    ----------
    filename : str, optional
        When given (and non-empty), the result is also written as CSV to
        ``os.path.join(output_root_path, filename)``.

    Returns
    -------
    pandas.DataFrame
        Two-column frame with duplicate pairs removed.
    """
    asset_columns = ['AssetNumber', 'AssetIdentifier']
    stacked = pd.concat([df_tasks[asset_columns], df_summary[asset_columns]], axis=0)
    unique_pairs = stacked.drop_duplicates()

    if not filename:
        return unique_pairs

    unique_pairs.to_csv(os.path.join(output_root_path, filename))
    return unique_pairs

In [65]:
# Distinct (AssetNumber, AssetIdentifier) pairs found in the task and summary
# archives. Use the commented-out call instead to also save them as a CSV.
df_asset = get_unique_assets()
# df_asset = get_unique_assets(filename='unique_assets.csv')
df_asset

# Cleaning

In [66]:
# Keep only tasks where both UnderConstructionTime and BaselineTime are non-zero.
# The row's position in df_tasks is preserved as the 'AllTasksIndex' column,
# and the new 0..n-1 index is named 'ValidTasksIndex'.
df_valid = (
    df_tasks
    .loc[df_tasks['UnderConstructionTime'].ne(0) & df_tasks['BaselineTime'].ne(0)]
    .rename_axis('AllTasksIndex')
    .reset_index()
    .rename_axis('ValidTasksIndex')
)
# df_valid

In [67]:
# Recompute the delay and its state from the raw travel times.
# Delay = UnderConstructionTime - BaselineTime, floored at 0, rounded to 4 decimals.
df_valid['CalculatedDelayTime'] = (
    df_valid['UnderConstructionTime']
    .sub(df_valid['BaselineTime'])
    .clip(lower=0)
    .round(4)
)
# Strictly below the threshold is 'OK'; anything else is 'DELAY'. A NaN delay
# never compares as smaller, so it is labelled 'DELAY' as well.
df_valid['CalculatedState'] = np.where(
    df_valid['CalculatedDelayTime'].lt(DELAY_THRESH),
    'OK',
    'DELAY',
)
df_valid

In [68]:
def find_delay_diff():
    """Return the rows whose recorded DelayTime differs from the recalculated one.

    The comparison is an exact inequality between ``DelayTime`` and
    ``CalculatedDelayTime`` in ``df_valid``; rows where either value is NaN
    therefore always count as different. The selected rows are also written to
    ``wrong_DelayTime.csv`` under ``output_root_path``.

    Returns
    -------
    pandas.DataFrame
        The mismatching rows of ``df_valid``.
    """
    is_mismatch = df_valid['DelayTime'].ne(df_valid['CalculatedDelayTime'])
    mismatched_rows = df_valid.loc[is_mismatch]
    target = os.path.join(output_root_path, 'wrong_DelayTime.csv')
    mismatched_rows.to_csv(target)
    return mismatched_rows

In [69]:
def find_state_diff():
    """Return the rows whose recorded State differs from the recalculated one.

    Compares ``State`` with ``CalculatedState`` in ``df_valid`` and writes the
    differing rows to ``wrong_State.csv`` under ``output_root_path``.

    Returns
    -------
    pandas.DataFrame
        The mismatching rows of ``df_valid``.
    """
    is_mismatch = df_valid['State'].ne(df_valid['CalculatedState'])
    mismatched_rows = df_valid.loc[is_mismatch]
    target = os.path.join(output_root_path, 'wrong_State.csv')
    mismatched_rows.to_csv(target)
    return mismatched_rows

# Exploratory Data Analysis

In [70]:
# Column dtypes and non-null counts of the cleaned task table.
df_valid.info()

In [71]:
# Number of missing values per column.
df_valid.isnull().sum()

## Plot

In [72]:
import seaborn as sns
import matplotlib.pyplot as plt
# from PIL import Image

# Render figures inside the notebook output.
%matplotlib inline
# Two style calls in a row: 'ggplot' is applied last, so for any setting both
# define, the ggplot value is the one in effect.
# NOTE(review): confirm the seaborn darkgrid call is still wanted.
sns.set_style("darkgrid")
plt.style.use('ggplot')

# Y-axis tick spacing; currently referenced only by the commented-out
# plt.yticks(...) lines inside draw_time_plot.
STEP_SIZE = 0.5


In [73]:
# Parse CreatedDate into datetimes, then derive the calendar features used by
# the plots. Further .dt accessors (year, month, day, minute, ...) can be
# added to the second assign() in the same way if they are needed.
df_valid = (
    df_valid
    .assign(CreatedDate=lambda frame: pd.to_datetime(frame['CreatedDate']))
    .assign(**{
        # Day of the week as an integer, where Monday is 0 and Sunday is 6.
        'CreatedDate:Weekday': lambda frame: frame['CreatedDate'].dt.weekday,
        # Day of the week as its English name.
        'CreatedDate:DayName': lambda frame: frame['CreatedDate'].dt.day_name(),
        # Hour of the day, 0-23.
        'CreatedDate:Hour': lambda frame: frame['CreatedDate'].dt.hour,
    })
)

df_valid

In [74]:
# Re-check dtypes now that CreatedDate is parsed and the CreatedDate:* columns exist.
df_valid.info()

In [138]:
def draw_time_plot(loc, loc_id, time_type, plt_name):
    """Plot mean travel time per time bucket for one asset and save it as PNG.

    For every bucket (hour of day or day of week) the mean
    ``UnderConstructionTime`` of the rows in that bucket is drawn as a line.
    The overall mean ``BaselineTime`` of ``loc`` is drawn as a flat reference
    line. Buckets without rows show up as gaps (NaN).

    Parameters
    ----------
    loc : pandas.DataFrame
        Rows of a single asset. Must contain the columns
        ``CreatedDate:<time_type>``, ``UnderConstructionTime`` and
        ``BaselineTime``.
    loc_id :
        Asset identifier; used only in the figure title.
    time_type : {'Hour', 'DayName'}
        Which ``CreatedDate:*`` column to bucket by.
    plt_name : str
        File name (without extension) of the PNG written under
        ``output_root_path``.

    Raises
    ------
    ValueError
        If ``time_type`` is not one of the supported bucket types.
    """
    # Fixed bucket order, so the x axis is always complete and sorted
    # chronologically even when some buckets have no data.
    bucket_order = {
        'Hour': list(range(24)),
        'DayName': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    }
    # Validate before creating a figure: an unknown type used to produce (and
    # save) an empty plot without any error.
    if time_type not in bucket_order:
        raise ValueError(
            f"Unsupported time_type {time_type!r}; expected one of {sorted(bucket_order)}"
        )

    x = bucket_order[time_type]
    # One grouped pass instead of filtering the frame once per bucket;
    # reindex() inserts NaN for buckets that have no rows.
    y = (
        loc.groupby(f'CreatedDate:{time_type}')['UnderConstructionTime']
        .mean()
        .reindex(x)
        .tolist()
    )
    baseline_mean = loc['BaselineTime'].mean()

    fig, ax = plt.subplots(figsize=(14, 10))
    ax.plot(x, y, color='purple', marker='o', label='UnderConstructionTime (After)')
    ax.plot(x, [baseline_mean] * len(x), color='orange', marker='h', label='BaselineTime (Before)')

    if time_type == 'Hour':
        # Show every hour rather than matplotlib's automatic tick selection.
        ax.set_xticks(np.arange(0, 24, step=1))

    ax.legend()
    ax.grid(True)
    ax.set_title(f'Travel Time for AssetID {loc_id} ({time_type})')
    ax.set_xlabel(time_type)
    ax.set_ylabel('TravelTime')
    fig.tight_layout()
    fig.savefig(os.path.join(output_root_path, f'{plt_name}.png'))
    plt.show()
    # Release the figure; this function is called in a loop over many assets.
    plt.close(fig)


In [139]:
# AssetNumber values to plot; each one is matched against df_valid['AssetNumber'].
# NOTE(review): how these ten assets were chosen is not recorded here — confirm.
locations_id = [80002045894, 50253718586, 44853071770, 12738620179, 37528363723, 43805438913, 68881926366, 36674989230, 34592545857, 22685608926]


In [140]:
# Render one figure per (asset, time bucket) combination.
for location_id in locations_id:
    # The asset's rows do not depend on time_type, so select them once per
    # asset instead of once per plot.
    location = df_valid.loc[df_valid['AssetNumber'] == location_id]
    for time_type in ['DayName', 'Hour']:
        draw_time_plot(location, location_id, time_type, plt_name=f'{location_id}_{time_type}')


In [146]:
!zip plots.zip ../working/*.png