In [1]:
# Import libraries
from calendar import monthrange
from datetime import datetime, timedelta
from pandas import DataFrame, Series
import pandas as pd

In [2]:
# global constants
path_to_data = "./bbdata"

In [3]:
# Loads a month of data, given year and month number
# TODO: document
def load_month_trip_data(year, month):
    trips = pd.read_csv( f"{path_to_data}/trips/{year}{month:02d}.csv",
                      parse_dates=["starttime", "stoptime"])
    
    # Drop unnecessary columns
    trips.drop("start station name", axis=1, inplace=True)
    trips.drop("end station name", axis=1, inplace=True)
    trips.drop("bikeid", axis=1, inplace=True)
    trips.drop("usertype", axis=1, inplace=True)
    trips.drop("birth year", axis=1, inplace=True)
    trips.drop("gender", axis=1, inplace=True)
    
    trips['starttime'] = pd.to_datetime(trips['starttime'])
    trips['stoptime'] = pd.to_datetime(trips['stoptime'])
    
    trips['start station latitude'] = pd.to_numeric(trips['start station latitude'])
    trips['start station longitude'] = pd.to_numeric(trips['start station longitude'])
    return trips

In [4]:
# Loads trip data from start_year/start_month to end_year/end_month
def load_trip_date(start_year, start_month, end_year, end_month):
    trips = DataFrame()
    for year in range(start_year, end_year + 1):
        for month in range(start_month, 13 if start_year is not end_year else end_month + 1):
            trips.append(load_month_trip_data(year, month))
        start_month = 1
    return trips

In [5]:
# Loads station data
# TODO: document
def load_station_data():
    stations = pd.read_csv(f"{path_to_data}/station_data.csv")
    stations.drop("Station", axis=1, inplace=True)
    return stations

In [6]:
# TODO: document
def get_trips_in_range(trips, start, end):
    # "2015-01-01 00:30:47
    mask = (trips['starttime'] > start) & (trips['stoptime'] <= end)
    return trips.loc[mask]

In [7]:
# Process trip data into time series with availability value for each staation
# TODO: document
def trips_to_availability_time_series(start_year, start_month, 
                                      end_year, end_month, 
                                      time_step_delta, 
                                      initial_station_bikes_percent,
                                      verbose_level=0):
    if verbose_level >= 0:
        print("Starting time series generation")
    # TODO: don't take in trips, load on demand --> don't overload memory
    # TODO: offload to file after n steps?
    stations = load_station_data()
    step_delta = timedelta(minutes=time_step_delta)
    end_time = datetime(end_year, end_month, monthrange(end_year, end_month)[1], 23, 59)
    current_time = datetime(start_year, start_month, 1, 0, 0)
    current_month_trip_data = load_month_trip_data(current_time.year, current_time.month)
    if verbose_level >= 1:
        print(f"Starting with ({current_time.strftime('%b')} {current_time.year})")
    time_series = [] # use to compute current available bikes
    last_step_available = {}
    while current_time <= end_time:
        next_time = current_time + step_delta
        current_trips = get_trips_in_range(current_month_trip_data, current_time, next_time)
        for index, station in stations.iterrows():
            mask_from = current_trips["start station latitude"] == station["Latitude"]
            mask_to = current_trips["start station latitude"] == station["Latitude"]
            trips_from_station = current_trips.loc[mask_from].shape[0]
            trips_to_station = current_trips.loc[mask_to].shape[0]
            if station["Station ID"] in last_step_available:
                available = max(0, last_step_available[station["Station ID"]] - trips_from_station + trips_to_station)
            else:
                available = int(initial_station_bikes_percent / 100 * station["# of Docks"]) # TODO: compute
            if available < 0:
                pass # go back on the whole time series, till now, and add 1, available = 0
            station_time_series_entry = {
                "time_step_start": current_time, 
                "time_step_end": next_time, 
                "station": station["Station ID"], 
                "latitude": station["Latitude"], 
                "longitude": station["Longitude"], 
                "municipality": station["Municipality"], 
                "total_docks": station["# of Docks"], 
                "trips_from_station": trips_from_station,
                "trips_to_station": trips_to_station, 
                "available_bikes": available
            }
            last_step_available[station["Station ID"]] = available
            # TODO: compute current available bikes
            time_series.append(station_time_series_entry)
        if verbose_level >= 2:
            print(f"\tCompleted time step [{current_time} --- {next_time}]; Current rows: {len(time_series)}")
        if current_time.month is not next_time.month:
            current_month_trip_data = load_month_trip_data(next_time.year, next_time.month)
            if verbose_level >= 1:
                print(f"Moving to next month ({next_time.strftime('%b')} {next_time.year})")
        current_time = next_time
    if verbose_level >= 0:
        print("Completed full date range")
    availability_time_series = DataFrame(time_series)
    return availability_time_series

In [8]:
# Short test
time_series = trips_to_availability_time_series(2015, 1, 2015, 1, 60, 80, verbose_level=2)

Starting time series generation
Starting with (Jan 2015)
	Completed time step [2015-01-01 00:00:00 --- 2015-01-01 01:00:00]; Current rows: 281
	Completed time step [2015-01-01 01:00:00 --- 2015-01-01 02:00:00]; Current rows: 562
	Completed time step [2015-01-01 02:00:00 --- 2015-01-01 03:00:00]; Current rows: 843
	Completed time step [2015-01-01 03:00:00 --- 2015-01-01 04:00:00]; Current rows: 1124
	Completed time step [2015-01-01 04:00:00 --- 2015-01-01 05:00:00]; Current rows: 1405
	Completed time step [2015-01-01 05:00:00 --- 2015-01-01 06:00:00]; Current rows: 1686
	Completed time step [2015-01-01 06:00:00 --- 2015-01-01 07:00:00]; Current rows: 1967
	Completed time step [2015-01-01 07:00:00 --- 2015-01-01 08:00:00]; Current rows: 2248
	Completed time step [2015-01-01 08:00:00 --- 2015-01-01 09:00:00]; Current rows: 2529
	Completed time step [2015-01-01 09:00:00 --- 2015-01-01 10:00:00]; Current rows: 2810
	Completed time step [2015-01-01 10:00:00 --- 2015-01-01 11:00:00]; Current 

In [18]:
time_series["available_bikes"].value_counts()

12    104904
15     61008
18     11160
13      7440
11      5208
21      3720
14      3720
20      2976
8       2976
26      1488
16      1488
37       744
29       744
28       744
17       744
Name: available_bikes, dtype: int64