# Delta imcoming / outgoing bikes

In [1]:
#general imports
import pandas as pd
import numpy as np

#statsmodels for regression
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

#scipy for testing
from scipy import stats

#for visualization
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

In [4]:
df_bikes = pd.read_csv('../data/philadelphia_2017.csv')
df_weather = pd.read_csv('../data/weather_hourly_philadelphia.csv')

def get_datetime(date_string):
    if date_string.__contains__("/"):
        new_string = ""
        running_string = ""
        for char in date_string:
            if char == "/" or  char == ":" or char == " ":
                if len(running_string) == 1:
                    running_string = "0" + running_string
                new_string += running_string + char
                running_string = ""
            else:
                running_string += char
        new_string += running_string
        if len(new_string) > 16:
            return datetime.strptime(new_string, '%m/%d/%Y %H:%M:%S')
        else:
            return datetime.strptime(new_string, '%m/%d/%Y %H:%M')
    else:
        return datetime.fromisoformat(date_string)

def compare_datetime(start, end):
    start_datetime = get_datetime(start)
    end_datetime = get_datetime(end)
    duration_of_travel = end_datetime - start_datetime
    seconds = duration_of_travel.total_seconds()
    return seconds/60

ride_lengths = []
df_bikes["ride_duration_minutes"] = df_bikes.apply(lambda x: (compare_datetime(x["start_time"],x["end_time"])), axis=1)

for label, content in df_bikes.iterrows():
    ride_lengths.append(content["ride_duration_minutes"])

ride_lengths_np = np.array(ride_lengths)
iqr = stats.iqr(ride_lengths_np)
iqr

q1,q3 = np.percentile(ride_lengths_np, [25,95])

lower_range = q1 - (1.5*iqr)
upper_range = q3 + (1.5*iqr)

#A ride, can't be shorter than 0 Minutes obviously
if lower_range < 0:
    lower_range = 0

df_bikes.drop(df_bikes[ (df_bikes.ride_duration_minutes > upper_range) | (df_bikes.ride_duration_minutes < lower_range) ].index , inplace=True)

In [5]:
df_bikes['start_time_rounded'] = get_datetime(df_bikes["start_time"][0]).replace(minute=0)
df_bikes['end_time_rounded']= get_datetime(df_bikes["end_time"][0]).replace(minute=0)

In [7]:
s = 0
z = 1
for x in range(0,788906):
    if x not in df_bikes.index:
        s = s + 1
        if s == 1000*z:
            print(x)
            z = z + 1
        else:
            pass
    else:
        df_bikes['start_time_rounded'][x] = get_datetime(df_bikes["start_time"][x]).replace(minute=0)
        df_bikes['end_time_rounded'][x] = get_datetime(df_bikes["end_time"][x]).replace(minute=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bikes['start_time_rounded'][x] = get_datetime(df_bikes["start_time"][x]).replace(minute=0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bikes['end_time_rounded'][x] = get_datetime(df_bikes["end_time"][x]).replace(minute=0)


45795
66133
108591
131887
155306
184846
217251
238406
268642
293197
321602
343296
369865
399242
432102
457399
487616
519325
541169
576939
607341
646208
677934
712174
758304


In [8]:
df_bikes['start_time_rounded'] = df_bikes['start_time_rounded'].dt.strftime("%y/%m/%d %H:%M:%S")
df_bikes['end_time_rounded'] = df_bikes['end_time_rounded'].dt.strftime("%y/%m/%d %H:%M:%S")

In [9]:
df_bikes['start_time_rounded_and_station'] = df_bikes['start_time_rounded'] + " " + df_bikes['start_station_name']
df_bikes['end_time_rounded_and_station'] = df_bikes['end_time_rounded'] + " " + df_bikes['end_station_name']

In [10]:
df_bikes['outgoing_bikes'] = df_bikes['start_time_rounded_and_station'].value_counts()

In [11]:
df_bikes['incomming_bikes'] = df_bikes['end_time_rounded_and_station'].value_counts()

In [12]:
df_delta = pd.DataFrame((df_bikes['end_time_rounded_and_station'].value_counts() - df_bikes['start_time_rounded_and_station'].value_counts()), columns = ['hourly delta'])

In [13]:
df_delta[np.isnan(df_delta)] = 0

In [14]:
df_delta.head(10)

Unnamed: 0,hourly delta
17/01/01 00:00:00 10th & Chestnut,0.0
17/01/01 00:00:00 12th & Filbert,0.0
17/01/01 00:00:00 15th & Market,0.0
"17/01/01 00:00:00 18th & Fernon, Aquinas Center",0.0
"17/01/01 00:00:00 21st & Winter, Franklin Institute",0.0
17/01/01 00:00:00 22nd & Federal,0.0
17/01/01 00:00:00 25th & Locust,0.0
17/01/01 00:00:00 2nd & Market,0.0
17/01/01 00:00:00 4th & Bainbridge,0.0
17/01/01 00:00:00 6th & Fairmount,0.0


In [17]:
df_bikes_delta_outgoing = df_bikes.groupby(['start_station_name','start_time_rounded']).count()

In [18]:
df_bikes_delta_incoming = df_bikes.groupby(['end_station_name','end_time_rounded']).count()

In [19]:
df_bikes_delta_outgoing.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,end_station_name,ride_duration_minutes,end_time_rounded,start_time_rounded_and_station,end_time_rounded_and_station,outgoing_bikes,incomming_bikes
start_station_name,start_time_rounded,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10th & Chestnut,17/01/01 00:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/01 10:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/01 15:00:00,2,2,2,2,2,2,2,2,2,2,2,0,0
10th & Chestnut,17/01/01 16:00:00,2,2,2,2,2,2,2,2,2,2,2,0,0
10th & Chestnut,17/01/01 17:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/01 18:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/02 00:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/02 08:00:00,2,2,2,2,2,2,2,2,2,2,2,0,0
10th & Chestnut,17/01/02 09:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/02 18:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0


In [22]:
df_bikes_delta_incoming.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,ride_duration_minutes,start_time_rounded,start_time_rounded_and_station,end_time_rounded_and_station,outgoing_bikes,incomming_bikes
end_station_name,end_time_rounded,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10th & Chestnut,17/01/01 05:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/01 06:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/01 13:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/01 18:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/01 20:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/01 21:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/02 05:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/02 08:00:00,2,2,2,2,2,2,2,2,2,2,2,0,0
10th & Chestnut,17/01/02 11:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
10th & Chestnut,17/01/02 12:00:00,1,1,1,1,1,1,1,1,1,1,1,0,0
