In [None]:
# Import libraries
from calendar import monthrange
from datetime import datetime, timedelta
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import re

In [None]:
# global constants
# Root directory of the input data. The loaders in this notebook read
# "trips/{year}{month:02d}.csv", "station_data.csv" and
# "weather_data_daily.csv" relative to this path.
path_to_data = "./bbdata"

In [None]:
# Loads a month of trip data, given year and month number
def load_month_trip_data(year, month):
    """Read one month of trips from ``{path_to_data}/trips/{year}{month:02d}.csv``.

    The station-name, bike, and rider columns are removed, the two timestamp
    columns are converted to datetimes, and the start-station coordinates are
    converted to numbers.

    Parameters
    ----------
    year : int
        Calendar year of the file to read.
    month : int
        Month number; zero-padded to two digits in the file name.

    Returns
    -------
    pandas.DataFrame
        The remaining trip columns, one row per trip.
    """
    csv_path = f"{path_to_data}/trips/{year}{month:02d}.csv"
    unused_columns = [
        "start station name",
        "end station name",
        "bikeid",
        "usertype",
        "birth year",
        "gender",
    ]
    trips = pd.read_csv(csv_path, parse_dates=["starttime", "stoptime"])
    trips = trips.drop(columns=unused_columns)

    # Explicit conversion raises on values that parse_dates would silently
    # have left as plain strings.
    for time_column in ("starttime", "stoptime"):
        trips[time_column] = pd.to_datetime(trips[time_column])

    for coordinate_column in ("start station latitude", "start station longitude"):
        trips[coordinate_column] = pd.to_numeric(trips[coordinate_column])

    return trips

In [None]:
# Loads trip data from start_year/start_month to end_year/end_month
def load_trip_date(start_year, start_month, end_year, end_month):
    """Load and concatenate the monthly trip files for an inclusive month range.

    Parameters
    ----------
    start_year, start_month : int
        First month of the range (inclusive).
    end_year, end_month : int
        Last month of the range (inclusive).

    Returns
    -------
    pandas.DataFrame
        All trips from the requested months in chronological file order, with
        a fresh 0..n-1 index. An empty DataFrame is returned when the range
        contains no months (e.g. the end precedes the start).
    """
    monthly_frames = []
    for year in range(start_year, end_year + 1):
        # Only the first year starts at start_month and only the last year
        # stops at end_month; every year in between is loaded in full.
        first_month = start_month if year == start_year else 1
        last_month = end_month if year == end_year else 12
        for month in range(first_month, last_month + 1):
            monthly_frames.append(load_month_trip_data(year, month))

    if not monthly_frames:
        return DataFrame()
    # Concatenate once at the end; growing a frame inside the loop is quadratic.
    return pd.concat(monthly_frames, ignore_index=True)

In [None]:
# Loads station data
def load_station_data():
    """Read ``{path_to_data}/station_data.csv`` and return it without the "Station" column.

    Returns
    -------
    pandas.DataFrame
        One row per row of the station file, with every column except
        "Station".
    """
    station_csv = f"{path_to_data}/station_data.csv"
    return pd.read_csv(station_csv).drop(columns=["Station"])

In [None]:
# Loads the daily weather table once, at cell execution time.
# The result, `all_weather`, keeps the measurement columns of the CSV plus
# numeric YEAR / MONTH / DAY columns derived from DATE.
all_weather = pd.read_csv(f"{path_to_data}/weather_data_daily.csv")

# Station metadata is not used by the rest of the notebook.
all_weather = all_weather.drop(columns=["STATION", "NAME", "LATITUDE", "LONGITUDE", "ELEVATION"])

# DATE is split on '-' into its three parts (year, month, day in that order).
all_weather[["YEAR", "MONTH", "DAY"]] = all_weather["DATE"].str.split("-", expand=True)
all_weather = all_weather.drop(columns=["DATE"])

for numeric_column in ["YEAR", "DAY", "MONTH", "PRCP", "SNOW", "TMIN", "TMAX", "TAVG"]:
    all_weather[numeric_column] = pd.to_numeric(all_weather[numeric_column])

# Where the average temperature is missing, estimate it as the midpoint of the
# daily minimum and maximum. The result has to be assigned back: a bare
# apply()/fillna() call returns a new object and leaves the frame untouched.
all_weather["TAVG"] = all_weather["TAVG"].fillna((all_weather["TMIN"] + all_weather["TMAX"]) / 2)

# Loads weather data for a single day
def load_weather_data(day, month, year):
    """Return the weather measurements recorded in `all_weather` for one date.

    Parameters
    ----------
    day, month, year : int
        Calendar date to look up (note the argument order: day first).

    Returns
    -------
    dict
        Column name -> value for the first matching row of `all_weather`,
        excluding the YEAR, MONTH and DAY columns.

    Raises
    ------
    IndexError
        If `all_weather` has no row for the requested date.
    """
    on_requested_date = (
        (all_weather['YEAR'] == year)
        & (all_weather['DAY'] == day)
        & (all_weather['MONTH'] == month)
    )
    # drop() returns a new frame, so the module-level table is never modified.
    matches = all_weather.loc[on_requested_date].drop(columns=['YEAR', 'MONTH', 'DAY'])
    if matches.empty:
        # Same exception type an out-of-range iloc[0] would raise, but with a
        # message that names the missing date.
        raise IndexError(f"no weather record for year={year}, month={month}, day={day}")
    return dict(matches.iloc[0])

In [None]:
# Selects the trips that started inside a time window
def get_starting_trips_in_range(trips, start, end):
    """Return the rows of `trips` whose 'starttime' falls in the window (start, end].

    The window excludes `start` itself and includes `end`.
    """
    started_at = trips['starttime']
    inside_window = started_at.le(end) & started_at.gt(start)
    return trips[inside_window]

# Selects the trips that ended inside a time window
def get_ending_trips_in_range(trips, start, end):
    """Return the rows of `trips` whose 'stoptime' falls in the window (start, end].

    The window excludes `start` itself and includes `end`.
    """
    stopped_at = trips['stoptime']
    inside_window = stopped_at.le(end) & stopped_at.gt(start)
    return trips[inside_window]

In [None]:
# Appends buffered time-series rows to a CSV file, writing the header only when the file is empty
def _append_rows_to_csv(rows, export_url):
    """Append `rows` (a list of dicts) to the CSV file at `export_url`.

    Does nothing when `rows` is empty. The header line is written only if the
    file is empty (or did not exist) before this call.
    """
    if not rows:
        return
    # Append mode creates the file if needed and positions at its end, so an
    # offset of zero means nothing has been written yet.
    with open(export_url, "ab") as handle:
        needs_header = handle.tell() == 0
    # Hand pandas the path rather than an already-open text handle so that it
    # manages newline handling itself.
    DataFrame(rows).to_csv(export_url, mode="a", header=needs_header, index=False)


# Process trip data into time series with availability value for each station
def trips_to_availability_time_series(start_year, start_month, 
                                      end_year, end_month, 
                                      time_step_delta, 
                                      export_url,
                                      verbose_level=0):
    """Build a per-station time series of trip counts plus daily weather and append it to a CSV.

    Time advances from 00:00 on the first day of start_year/start_month to
    23:59 on the last day of end_year/end_month in steps of `time_step_delta`
    minutes. For every step and every station one row is produced holding the
    step bounds, the station's attributes, the number of trips that started at
    and ended at the station within (step start, step end], and the weather of
    the day on which the step starts. Rows are buffered in memory and appended
    to `export_url` each time a step crosses into a new calendar month.

    Parameters
    ----------
    start_year, start_month : int
        First month to process.
    end_year, end_month : int
        Last month to process (inclusive).
    time_step_delta : int
        Step length in minutes; must be positive. It should evenly divide the
        minutes in a day (60 * 24 = 1440). Only one monthly trip file is held
        at a time, so a longer step that overlaps several months counts trips
        from the loaded month only.
    export_url : str
        Path of the CSV file to append to. Existing content is kept; the
        header is written only when the file is empty.
    verbose_level : int, default 0
        >= 0 prints start/finish messages, >= 1 month transitions, >= 2 every
        time step, >= 3 weather loads and exports.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `time_step_delta` is not positive (the loop would never advance).
    """
    if time_step_delta <= 0:
        raise ValueError("time_step_delta must be a positive number of minutes")

    if verbose_level >= 0:
        print("Starting time series generation")
    stations = load_station_data()

    step_delta = timedelta(minutes=time_step_delta)
    end_time = datetime(end_year, end_month, monthrange(end_year, end_month)[1], 23, 59)
    current_time = datetime(start_year, start_month, 1, 0, 0)

    current_month_trip_data = load_month_trip_data(current_time.year, current_time.month)
    if verbose_level >= 1:
        print(f"Starting with ({current_time.strftime('%b')} {current_time.year})")

    current_day_weather_data = load_weather_data(current_time.day, current_time.month, current_time.year)
    if verbose_level >= 3:
        print(f"\t\tLoaded Weather data for {current_time.strftime('%b')} {current_time.day} - {current_time.year}")

    time_series = []  # rows produced since the last export

    while current_time <= end_time:
        next_time = current_time + step_delta

        current_starting_trips = get_starting_trips_in_range(current_month_trip_data, current_time, next_time)
        current_ending_trips = get_ending_trips_in_range(current_month_trip_data, current_time, next_time)
        start_latitudes = current_starting_trips["start station latitude"]
        end_latitudes = current_ending_trips["end station latitude"]

        for _, station in stations.iterrows():
            # NOTE(review): trips are attributed to a station by latitude
            # equality alone; stations sharing a latitude would be conflated.
            trips_from_station = int((start_latitudes == station["Latitude"]).sum())
            trips_to_station = int((end_latitudes == station["Latitude"]).sum())

            time_series.append({
                "time_step_start": current_time, 
                "time_step_end": next_time, 
                "station": station["Station ID"], 
                "latitude": station["Latitude"], 
                "longitude": station["Longitude"], 
                "municipality": station["Municipality"], 
                "total_docks": station["# of Docks"], 
                "trips_from_station": trips_from_station,
                "trips_to_station": trips_to_station, 
                "PRCP": current_day_weather_data["PRCP"],
                "SNOW": current_day_weather_data["SNOW"],
                "TMIN": current_day_weather_data["TMIN"],
                "TMAX": current_day_weather_data["TMAX"],
                "TAVG": current_day_weather_data["TAVG"]
            })

        if verbose_level >= 2:
            print(f"\tCompleted time step [{current_time} --- {next_time}]; Current rows: {len(time_series)}")

        # Compare by value (and include the year) rather than by identity.
        month_changed = (current_time.year, current_time.month) != (next_time.year, next_time.month)
        day_changed = current_time.date() != next_time.date()
        # Inputs for the next step are only needed if there is a next step;
        # this avoids reading files that lie beyond the requested range.
        has_next_step = next_time <= end_time

        if month_changed:
            # Export, then start a fresh buffer so rows are written exactly once.
            _append_rows_to_csv(time_series, export_url)
            time_series = []
            if verbose_level >= 3:
                print("Exported to file.")

            if has_next_step:
                current_month_trip_data = load_month_trip_data(next_time.year, next_time.month)
                if verbose_level >= 1:
                    print(f"Moving to next month ({next_time.strftime('%b')} {next_time.year})")

        if day_changed and has_next_step:
            current_day_weather_data = load_weather_data(next_time.day, next_time.month, next_time.year)
            if verbose_level >= 3:
                print(f"\t\tLoaded Weather data for {next_time.strftime('%b')} {next_time.day} - {next_time.year}")

        current_time = next_time

    # Safety net: write anything still buffered when the loop ends.
    _append_rows_to_csv(time_series, export_url)

    if verbose_level >= 0:
        print("Completed full date range")

In [None]:
# Generate every time-series export for Jan 2015 - Dec 2018, coarsest step first.
# Each entry pairs a step length in minutes with the CSV file that receives the output.
export_jobs = [
    (60 * 24 * 30, "./timeseries.monthly.csv"),
    (60 * 24 * 7,  "./timeseries.weekly.csv"),
    (60 * 24,      "./timeseries.daily.csv"),
    (60 * 12,      "./timeseries.12-hour-ly.csv"),
    (60 * 6,       "./timeseries.6-hour-ly.csv"),
    (60,           "./timeseries.hourly.csv"),
    (30,           "./timeseries.half-hourly.csv"),
    (15,           "./timeseries.quarter-hourly.csv"),
    (5,            "./timeseries.5-minute-ly.csv"),
]

for step_minutes, export_path in export_jobs:
    trips_to_availability_time_series(2015, 1, 2018, 12, step_minutes, export_path, verbose_level = 0)