## Imports

Imports every library needed

In [22]:
#general imports
import pandas as pd
import numpy as np

#statsmodels for regression
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

#scipy for testing
from scipy import stats

#for visualization
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

## Importing the Bike Data

Imports the normal Data needed for every KPI

In [24]:
df_bikes = pd.read_csv('../data/philadelphia_2017.csv')

df_bikes["start_time"] = pd.to_datetime(df_bikes["start_time"], dayfirst = True)

df_bikes["end_time"] = pd.to_datetime(df_bikes["end_time"], dayfirst = True)

df_bikes['ride_duration_minutes'] = df_bikes['end_time'] - df_bikes['start_time']

ride_lengths = []

for label, content in df_bikes.iterrows():
    ride_lengths.append(content["ride_duration_minutes"])

ride_lengths_np = np.array(ride_lengths)
iqr = stats.iqr(ride_lengths_np)
iqr

q1,q3 = np.percentile(ride_lengths_np, [25,99.9])

lower_range = q1 - (1.5*iqr)
upper_range = q3 + (1.5*iqr)

df_bikes.drop(df_bikes[ (df_bikes.ride_duration_minutes > upper_range) | (df_bikes.end_time > pd.to_datetime("2017-12-31 23:59:59")) | (df_bikes.start_time > pd.to_datetime("2017-12-31 23:59:59"))].index , inplace=True)

In [25]:
df_weather = pd.read_csv('../data/weather_hourly_philadelphia.csv')

Change type of "date_time" to datetime

In [26]:
df_weather["date_time"] = pd.to_datetime(df_weather["date_time"])

Ordered weather data by date

In [27]:
df_weather = df_weather.sort_values(by=['date_time'])

Deleted weather date which is not needed for Philadelphia 2017 (<2017 or >2017)

In [28]:
start2017 = datetime(2016, 12, 31, 23)
end2017 = datetime(2018, 1, 1)

df_2017weather = df_weather[(df_weather["date_time"] > start2017) & (df_weather["date_time"] < end2017)]
df_2017weather_unique = df_2017weather.drop_duplicates(subset='date_time')
df_2017weather_unique['Order'] = np.arange(len(df_2017weather_unique))
df_2017weather_unique = df_2017weather_unique.set_index('Order')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2017weather_unique['Order'] = np.arange(len(df_2017weather_unique))


Identify missing data and generate by using average data from former and following hour

In [30]:
for i in df_2017weather_unique.index:
    if i == 8231:
        pass
    elif (df_2017weather_unique['date_time'][i] - df_2017weather_unique['date_time'][i+1]) != (df_2017weather_unique['date_time'][1] - df_2017weather_unique['date_time'][2]):
        df_2017weather_support = {'date_time': (df_2017weather_unique['date_time'][i] + pd.DateOffset(hours=1)), 'max_temp': (df_2017weather_unique['max_temp'][i] + df_2017weather_unique['max_temp'][i+1])/2, 'min_temp': (df_2017weather_unique['max_temp'][i] + df_2017weather_unique['max_temp'][i+1])/2, 'precip': df_2017weather_unique['precip'][i]}
        df_2017weather_unique = df_2017weather_unique.append(df_2017weather_support, ignore_index=True)

New order by date, because we added new lines

In [31]:
df_2017weather_unique = df_2017weather_unique.sort_values(by=['date_time'])
df_2017weather_unique['Order'] = np.arange(len(df_2017weather_unique))
df_2017weather_unique = df_2017weather_unique.set_index('Order')

Running the same code again because one line is missing

In [32]:
for i in df_2017weather_unique.index:
    if i == 8758:
        pass
    elif (df_2017weather_unique['date_time'][i] - df_2017weather_unique['date_time'][i+1]) != (df_2017weather_unique['date_time'][1] - df_2017weather_unique['date_time'][2]):
        df_2017weather_support = {'date_time': (df_2017weather_unique['date_time'][i] + pd.DateOffset(hours=1)), 'max_temp': (df_2017weather_unique['max_temp'][i] + df_2017weather_unique['max_temp'][i+1])/2, 'min_temp': (df_2017weather_unique['max_temp'][i] + df_2017weather_unique['max_temp'][i+1])/2, 'precip': df_2017weather_unique['precip'][i]}
        df_2017weather_unique = df_2017weather_unique.append(df_2017weather_support, ignore_index=True)

New order by date, because we added new lines

In [33]:
df_2017weather_unique = df_2017weather_unique.sort_values(by=['date_time'])
df_2017weather_unique['Order'] = np.arange(len(df_2017weather_unique))
df_2017weather_unique = df_2017weather_unique.set_index('Order')

Check if data is complete by checking time difference of following

In [34]:
for i in df_2017weather_unique.index:
    if i == 8759:
        pass
    elif (df_2017weather_unique['date_time'][i] - df_2017weather_unique['date_time'][i+1]) != (df_2017weather_unique['date_time'][1] - df_2017weather_unique['date_time'][2]):
        df_2017weather_support = {'date_time': (df_2017weather_unique['date_time'][i] + pd.DateOffset(hours=1)), 'max_temp': (df_2017weather_unique['max_temp'][i] + df_2017weather_unique['max_temp'][i+1])/2, 'min_temp': (df_2017weather_unique['max_temp'][i] + df_2017weather_unique['max_temp'][i+1])/2, 'precip': df_2017weather_unique['precip'][i]}
        df_2017weather_unique = df_2017weather_unique.append(df_2017weather_support, ignore_index=True)

def would_be_rounded_up(num):
    if round(num) == int(num) + 1:
        return True
    else:
        return False

def get_temperature(start_time, end_time):
    duration_of_travel = end_time-start_time
    average_timestamp = end_time - datetime.fromisoformat("2017-01-01 00:00:00") - duration_of_travel/2
    weather_row = average_timestamp.total_seconds()/3600
    as_int = int(weather_row)
    if(as_int < 8760):
        result = (df_2017weather_unique["min_temp"][as_int] + df_2017weather_unique["max_temp"][as_int])/2
        return result
    else:
        return pd.to_datetime(0)

df_bikes["temperature"] = df_bikes.apply(lambda x: get_temperature(x["start_time"], x["end_time"]), axis=1)

In [41]:
df_bikes.tail(20)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,ride_duration_minutes,temperature
788884,2017-12-31 21:13:00,2017-12-31 21:29:00,3075,3075,11863,Walk-up,Fairmount & Ridge,Fairmount & Ridge,0 days 00:16:00,-9.4
788886,2017-12-31 21:29:00,2017-12-31 21:52:00,3055,3110,3542,Walk-up,8th & Market,Del. River Trail & Penn St.,0 days 00:23:00,-9.4
788887,2017-12-31 21:42:00,2017-12-31 22:05:00,3120,3028,11039,Indego30,31st & Girard,4th & Bainbridge,0 days 00:23:00,-9.4
788888,2017-12-31 21:49:00,2017-12-31 21:50:00,3112,3112,11763,Walk-up,48th & Spruce,48th & Spruce,0 days 00:01:00,-9.4
788889,2017-12-31 21:50:00,2017-12-31 21:51:00,3112,3112,11763,Walk-up,48th & Spruce,48th & Spruce,0 days 00:01:00,-9.4
788890,2017-12-31 21:50:00,2017-12-31 22:01:00,3164,3007,5375,Indego30,12th & Passyunk,"11th & Pine, Kahn Park",0 days 00:11:00,-9.4
788891,2017-12-31 22:13:00,2017-12-31 22:23:00,3069,3069,5229,Indego30,4th & Christian,4th & Christian,0 days 00:10:00,-10.0
788892,2017-12-31 22:13:00,2017-12-31 22:19:00,3010,3012,11908,Indego30,15th & Spruce,21st & Catharine,0 days 00:06:00,-10.0
788893,2017-12-31 22:17:00,2017-12-31 23:17:00,3062,3062,3677,Indego30,24th & Sansom,24th & Sansom,0 days 01:00:00,-10.0
788894,2017-12-31 22:18:00,2017-12-31 22:41:00,3088,3121,11932,Indego30,3rd & Girard,27th & Girard,0 days 00:23:00,-10.0


In [39]:
df_2017weather_unique.tail(5)

Unnamed: 0_level_0,date_time,max_temp,min_temp,precip
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8755,2017-12-31 19:00:00,-8.9,-8.9,0.0
8756,2017-12-31 20:00:00,-8.9,-8.9,0.0
8757,2017-12-31 21:00:00,-9.4,-9.4,0.0
8758,2017-12-31 22:00:00,-10.0,-10.0,0.0
8759,2017-12-31 23:00:00,-10.0,-10.0,0.0
