## Imports

In [185]:
#general imports
import pandas as pd
import numpy as np

#statsmodels for regression
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

#scipy for testing
from scipy import stats

#for visualization
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime


## Bike Data

In [186]:
df_bikes = pd.read_csv('philadelphia_2017.csv')
df_weather = pd.read_csv('weather_hourly_philadelphia.csv')

df_bikes.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name
0,1/1/2017 0:05,1/1/2017 0:16,3046,3041,5347,Indego30,2nd & Market,"Girard Station, MFL"
1,1/1/2017 0:21,1/1/2017 0:57,3110,3054,3364,Walk-up,Del. River Trail & Penn St.,Rodin Museum
2,1/1/2017 0:22,1/1/2017 0:57,3110,3054,2536,Walk-up,Del. River Trail & Penn St.,Rodin Museum
3,1/1/2017 0:27,1/1/2017 0:39,3041,3005,5176,Indego30,"Girard Station, MFL","Welcome Park, NPS"
4,1/1/2017 0:28,1/1/2017 0:36,3047,3124,5370,Walk-up,"Independence Mall, NPS",Race Street Pier


In [187]:
df_bikes.describe()

Unnamed: 0,start_station_id,end_station_id,bike_id
count,788907.0,788907.0,788907.0
mean,3057.400081,3056.431487,5428.980913
std,41.126829,41.108792,3221.972215
min,3000.0,3000.0,0.0
25%,3023.0,3023.0,3350.0
50%,3050.0,3049.0,3703.0
75%,3075.0,3074.0,5370.0
max,3170.0,3170.0,11965.0


In [188]:
unique_station_ids = set(df_bikes['start_station_id'])
id_mapped_to_station = []
for id in unique_station_ids:
    this_station = []
    this_station.append(id)
    this_station.append(df_bikes.loc[df_bikes['start_station_id'] == id].iloc[0]['start_station_name'])
    id_mapped_to_station.append(this_station)
print(id_mapped_to_station)


[[3072, 'Front & Carpenter'], [3073, '9th & Spring Garden'], [3074, '44th & Walnut'], [3075, 'Fairmount & Ridge'], [3077, '17th & Girard'], [3078, '19th & Market'], [3083, '3083'], [3086, 'Broad & Christian'], [3088, '3rd & Girard'], [3093, '33rd & Diamond'], [3095, '29th & Diamond'], [3096, '29th & Dauphin'], [3097, 'Berks Station, MFL'], [3098, '10th & Federal'], [3099, '4th & Washington'], [3100, 'Moyamensing & Tasker'], [3101, '11th & South'], [3102, 'Pennsylvania & Fairmount Perelman Building'], [3103, '27th & Master, Athletic Recreation Center'], [3104, '34th & Mantua'], [3105, 'Penn Treaty Park'], [3106, '33rd & Dauphin'], [3107, '33rd & Reservoir'], [3108, '15th & Market'], [3109, 'Parkside & Girard'], [3110, 'Del. River Trail & Penn St.'], [3111, 'Parkside & Belmont, Case Building'], [3112, '48th & Spruce'], [3113, 'Philadelphia Zoo'], [3114, '22nd & Federal'], [3115, '19th & Girard, PTTI'], [3116, 'Barnes Foundation'], [3117, 'ParkWest Town Center'], [3118, '46th Street Stati

In [189]:
# Zeiten in Daytime umgewandelt und Leihdauer hinzugefügt
df_bikes["start_time"] = pd.to_datetime(df_bikes["start_time"], dayfirst = True)

df_bikes["end_time"] = pd.to_datetime(df_bikes["end_time"], dayfirst = True)

df_bikes['duration'] = df_bikes['end_time'] - df_bikes['start_time']

df_bikes.head()


Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration
0,2017-01-01 00:05:00,2017-01-01 00:16:00,3046,3041,5347,Indego30,2nd & Market,"Girard Station, MFL",0 days 00:11:00
1,2017-01-01 00:21:00,2017-01-01 00:57:00,3110,3054,3364,Walk-up,Del. River Trail & Penn St.,Rodin Museum,0 days 00:36:00
2,2017-01-01 00:22:00,2017-01-01 00:57:00,3110,3054,2536,Walk-up,Del. River Trail & Penn St.,Rodin Museum,0 days 00:35:00
3,2017-01-01 00:27:00,2017-01-01 00:39:00,3041,3005,5176,Indego30,"Girard Station, MFL","Welcome Park, NPS",0 days 00:12:00
4,2017-01-01 00:28:00,2017-01-01 00:36:00,3047,3124,5370,Walk-up,"Independence Mall, NPS",Race Street Pier,0 days 00:08:00


In [190]:
df_bikes.notnull()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration
0,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...
788902,True,True,True,True,True,True,True,True,True
788903,True,True,True,True,True,True,True,True,True
788904,True,True,True,True,True,True,True,True,True
788905,True,True,True,True,True,True,True,True,True


In [191]:
df_bikes.dropna(axis=0,inplace=True)
len(df_bikes)-len(df_bikes.dropna())

0

In [192]:
# da hat sich jemand gegönnt :D
max(df_bikes['duration'])


Timedelta('91 days 16:29:00')

In [193]:
# auch nicht schlecht
min(df_bikes['duration'])


Timedelta('-323 days +00:14:00')

In [194]:
sumIndego30 = sum(1 for i in df_bikes["user_type"] if i == 'Indego30')
sumIndego30


667441

In [195]:
sumWalkUp = sum(1 for i in df_bikes["user_type"] if i == 'Walk-up')
sumWalkUp


100661

In [196]:
# gibt also noch mehr Modelle
sumIndego30 + sumWalkUp

768102

In [197]:
sumIndegoFlex = sum(1 for i in df_bikes["user_type"] if i == 'IndegoFlex')
sumIndegoFlex


6034

In [198]:
sumOneDayPass = sum(1 for i in df_bikes["user_type"] if i == 'One Day Pass')
sumOneDayPass


6328

In [199]:
sumIndego365 = sum(1 for i in df_bikes["user_type"] if i == 'Indego365')
sumIndego365


6840

In [200]:
#mindestends ein Modell fehlt noch
sumIndego30 + sumWalkUp + sumIndegoFlex + sumOneDayPass + sumIndego365


787304

## Weather Data

In [201]:
# Ausgabe der ersten Werte
df_weather.head()

Unnamed: 0,date_time,max_temp,min_temp,precip
0,2015-01-02 01:00:00,3.3,3.3,0.0
1,2015-01-02 02:00:00,2.8,2.8,0.0
2,2015-01-02 03:00:00,2.2,2.2,0.0
3,2015-01-02 04:00:00,0.6,0.6,0.0
4,2015-01-02 05:00:00,1.1,1.1,0.0


In [202]:
# Kurzzusammenfassung der Werte
df_weather.describe()

Unnamed: 0,max_temp,min_temp,precip
count,43785.0,43785.0,43785.0
mean,14.204796,14.1213,0.08777
std,10.272673,10.259344,0.282963
min,-16.7,-16.7,0.0
25%,6.1,6.1,0.0
50%,15.0,14.4,0.0
75%,22.8,22.8,0.0
max,36.7,36.7,1.0


In [203]:
# Datumsdaten in datetime umwandeln
df_weather["date_time"] = pd.to_datetime(df_weather["date_time"])

In [204]:
# Daten in richtige Reihenfolge gebracht
df_weather = df_weather.sort_values(by=['date_time'])

In [381]:
# Wetter für 2017 in Philly; andere Jahre rausgeworfen

start2017 = datetime(2016, 12, 31, 23)
end2017 = datetime(2018, 1, 1)

df_2017weather = df_weather[(df_weather["date_time"] > start2017) & (df_weather["date_time"] < end2017)]
df_2017weather_unique = df_2017weather.drop_duplicates(['date_time'])
df_2017weather_unique['Order'] = np.arange(len(df_2017weather_unique))
df_2017weather_unique = df_2017weather_unique.set_index('Order')

df_2017weather_unique.tail(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2017weather_unique['Order'] = np.arange(len(df_2017weather_unique))


Unnamed: 0_level_0,date_time,max_temp,min_temp,precip
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8212,2017-12-31 04:00:00,-6.1,-6.1,0.0
8213,2017-12-31 05:00:00,-6.7,-6.7,0.0
8214,2017-12-31 06:00:00,-6.7,-6.7,0.0
8215,2017-12-31 07:00:00,-7.2,-7.2,0.0
8216,2017-12-31 08:00:00,-7.8,-7.8,0.0
8217,2017-12-31 09:00:00,-8.3,-8.3,0.0
8218,2017-12-31 10:00:00,-8.3,-8.3,0.0
8219,2017-12-31 11:00:00,-8.9,-8.9,0.0
8220,2017-12-31 12:00:00,-9.4,-9.4,0.0
8221,2017-12-31 13:00:00,-10.0,-10.0,0.0


In [405]:
# Durchschnitt Test
(df_2017weather_unique['max_temp'][0] + df_2017weather_unique['max_temp'][1]) / 2

6.1

In [410]:
# Lücken identifizieren und mit Durchschnitt füllen
j = 8231
for i in df_2017weather_unique.index:
    if i == 8231:
        print("Ende")
    elif (df_2017weather_unique['date_time'][i] - df_2017weather_unique['date_time'][i+1]) != (df_2017weather_unique['date_time'][1] - df_2017weather_unique['date_time'][2]):
        df_2017weather_support = {'date_time': (df_2017weather_unique['date_time'][i] + pd.DateOffset(hours=1)), 'max_temp': (df_2017weather_unique['max_temp'][i] + df_2017weather_unique['max_temp'][i+1])/2, 'min_temp': (df_2017weather_unique['max_temp'][i] + df_2017weather_unique['max_temp'][i+1])/2, 'precip': df_2017weather_unique['precip'][i]}
        print(df_2017weather_support)
        j = j+1
        df_2017weather_unique = df_2017weather_unique.append(df_2017weather_support, ignore_index=True)
print(j)
    

{'date_time': Timestamp('2017-01-02 15:00:00'), 'max_temp': 5.0, 'min_temp': 5.0, 'precip': 1.0}
{'date_time': Timestamp('2017-01-02 17:00:00'), 'max_temp': 5.85, 'min_temp': 5.85, 'precip': 1.0}
{'date_time': Timestamp('2017-01-02 23:00:00'), 'max_temp': 5.3, 'min_temp': 5.3, 'precip': 0.0}
{'date_time': Timestamp('2017-01-03 02:00:00'), 'max_temp': 5.0, 'min_temp': 5.0, 'precip': 0.0}
{'date_time': Timestamp('2017-01-03 05:00:00'), 'max_temp': 5.0, 'min_temp': 5.0, 'precip': 0.0}
{'date_time': Timestamp('2017-01-03 13:00:00'), 'max_temp': 5.3, 'min_temp': 5.3, 'precip': 1.0}
{'date_time': Timestamp('2017-01-03 22:00:00'), 'max_temp': 8.05, 'min_temp': 8.05, 'precip': 0.0}
{'date_time': Timestamp('2017-01-04 03:00:00'), 'max_temp': 6.7, 'min_temp': 6.7, 'precip': 0.0}
{'date_time': Timestamp('2017-01-04 07:00:00'), 'max_temp': 6.7, 'min_temp': 6.7, 'precip': 1.0}
{'date_time': Timestamp('2017-01-04 14:00:00'), 'max_temp': 8.3, 'min_temp': 8.3, 'precip': 0.0}
{'date_time': Timestamp('2

{'date_time': Timestamp('2017-03-01 06:00:00'), 'max_temp': 14.4, 'min_temp': 14.4, 'precip': 1.0}
{'date_time': Timestamp('2017-03-01 09:00:00'), 'max_temp': 13.9, 'min_temp': 13.9, 'precip': 0.0}
{'date_time': Timestamp('2017-03-01 14:00:00'), 'max_temp': 15.55, 'min_temp': 15.55, 'precip': 0.0}
{'date_time': Timestamp('2017-03-01 16:00:00'), 'max_temp': 18.05, 'min_temp': 18.05, 'precip': 1.0}
{'date_time': Timestamp('2017-03-01 21:00:00'), 'max_temp': 20.799999999999997, 'min_temp': 20.799999999999997, 'precip': 1.0}
{'date_time': Timestamp('2017-03-03 23:00:00'), 'max_temp': 1.15, 'min_temp': 1.15, 'precip': 0.0}
{'date_time': Timestamp('2017-03-08 11:00:00'), 'max_temp': 11.4, 'min_temp': 11.4, 'precip': 1.0}
{'date_time': Timestamp('2017-03-10 12:00:00'), 'max_temp': 4.199999999999999, 'min_temp': 4.199999999999999, 'precip': 1.0}
{'date_time': Timestamp('2017-03-10 15:00:00'), 'max_temp': 0.8500000000000001, 'min_temp': 0.8500000000000001, 'precip': 1.0}
{'date_time': Timestamp

{'date_time': Timestamp('2017-04-26 09:00:00'), 'max_temp': 13.3, 'min_temp': 13.3, 'precip': 1.0}
{'date_time': Timestamp('2017-04-26 22:00:00'), 'max_temp': 17.5, 'min_temp': 17.5, 'precip': 0.0}
{'date_time': Timestamp('2017-04-27 03:00:00'), 'max_temp': 16.7, 'min_temp': 16.7, 'precip': 0.0}
{'date_time': Timestamp('2017-04-27 09:00:00'), 'max_temp': 16.1, 'min_temp': 16.1, 'precip': 0.0}
{'date_time': Timestamp('2017-04-27 13:00:00'), 'max_temp': 18.6, 'min_temp': 18.6, 'precip': 0.0}
{'date_time': Timestamp('2017-04-28 07:00:00'), 'max_temp': 19.45, 'min_temp': 19.45, 'precip': 1.0}
{'date_time': Timestamp('2017-04-28 11:00:00'), 'max_temp': 18.85, 'min_temp': 18.85, 'precip': 0.0}
{'date_time': Timestamp('2017-04-29 12:00:00'), 'max_temp': 22.25, 'min_temp': 22.25, 'precip': 0.0}
{'date_time': Timestamp('2017-04-29 14:00:00'), 'max_temp': 24.45, 'min_temp': 24.45, 'precip': 0.0}
{'date_time': Timestamp('2017-05-01 00:00:00'), 'max_temp': 13.350000000000001, 'min_temp': 13.350000

{'date_time': Timestamp('2017-06-19 23:00:00'), 'max_temp': 22.5, 'min_temp': 22.5, 'precip': 0.0}
{'date_time': Timestamp('2017-06-20 07:00:00'), 'max_temp': 23.05, 'min_temp': 23.05, 'precip': 0.0}
{'date_time': Timestamp('2017-06-20 09:00:00'), 'max_temp': 22.5, 'min_temp': 22.5, 'precip': 0.0}
{'date_time': Timestamp('2017-06-20 19:00:00'), 'max_temp': 30.0, 'min_temp': 30.0, 'precip': 0.0}
{'date_time': Timestamp('2017-06-21 23:00:00'), 'max_temp': 23.35, 'min_temp': 23.35, 'precip': 1.0}
{'date_time': Timestamp('2017-06-23 11:00:00'), 'max_temp': 25.0, 'min_temp': 25.0, 'precip': 0.0}
{'date_time': Timestamp('2017-06-23 16:00:00'), 'max_temp': 26.4, 'min_temp': 26.4, 'precip': 1.0}
{'date_time': Timestamp('2017-06-23 18:00:00'), 'max_temp': 29.45, 'min_temp': 29.45, 'precip': 0.0}
{'date_time': Timestamp('2017-06-24 12:00:00'), 'max_temp': 23.6, 'min_temp': 23.6, 'precip': 0.0}
{'date_time': Timestamp('2017-06-27 09:00:00'), 'max_temp': 16.95, 'min_temp': 16.95, 'precip': 0.0}
{'

{'date_time': Timestamp('2017-08-12 07:00:00'), 'max_temp': 21.7, 'min_temp': 21.7, 'precip': 1.0}
{'date_time': Timestamp('2017-08-12 13:00:00'), 'max_temp': 23.3, 'min_temp': 23.3, 'precip': 0.0}
{'date_time': Timestamp('2017-08-12 18:00:00'), 'max_temp': 26.95, 'min_temp': 26.95, 'precip': 0.0}
{'date_time': Timestamp('2017-08-13 02:00:00'), 'max_temp': 24.4, 'min_temp': 24.4, 'precip': 0.0}
{'date_time': Timestamp('2017-08-13 08:00:00'), 'max_temp': 23.6, 'min_temp': 23.6, 'precip': 0.0}
{'date_time': Timestamp('2017-08-15 11:00:00'), 'max_temp': 22.8, 'min_temp': 22.8, 'precip': 0.0}
{'date_time': Timestamp('2017-08-15 13:00:00'), 'max_temp': 23.05, 'min_temp': 23.05, 'precip': 1.0}
{'date_time': Timestamp('2017-08-15 17:00:00'), 'max_temp': 24.4, 'min_temp': 24.4, 'precip': 0.0}
{'date_time': Timestamp('2017-08-15 23:00:00'), 'max_temp': 25.0, 'min_temp': 25.0, 'precip': 0.0}
{'date_time': Timestamp('2017-08-16 04:00:00'), 'max_temp': 23.05, 'min_temp': 23.05, 'precip': 0.0}
{'da

{'date_time': Timestamp('2017-11-02 05:00:00'), 'max_temp': 16.1, 'min_temp': 16.1, 'precip': 0.0}
{'date_time': Timestamp('2017-11-02 07:00:00'), 'max_temp': 15.0, 'min_temp': 15.0, 'precip': 0.0}
{'date_time': Timestamp('2017-11-02 10:00:00'), 'max_temp': 13.350000000000001, 'min_temp': 13.350000000000001, 'precip': 0.0}
{'date_time': Timestamp('2017-11-02 12:00:00'), 'max_temp': 14.75, 'min_temp': 14.75, 'precip': 0.0}
{'date_time': Timestamp('2017-11-03 11:00:00'), 'max_temp': 14.7, 'min_temp': 14.7, 'precip': 0.0}
{'date_time': Timestamp('2017-11-03 14:00:00'), 'max_temp': 18.299999999999997, 'min_temp': 18.299999999999997, 'precip': 0.0}
{'date_time': Timestamp('2017-11-05 10:00:00'), 'max_temp': 11.7, 'min_temp': 11.7, 'precip': 0.0}
{'date_time': Timestamp('2017-11-06 00:00:00'), 'max_temp': 16.65, 'min_temp': 16.65, 'precip': 0.0}
{'date_time': Timestamp('2017-11-06 04:00:00'), 'max_temp': 16.15, 'min_temp': 16.15, 'precip': 0.0}
{'date_time': Timestamp('2017-11-06 06:00:00'),

In [413]:
# Erneut Ordnen und neue Nummern vergeben
df_2017weather_unique = df_2017weather_unique.sort_values(by=['date_time'])
df_2017weather_unique['Order'] = np.arange(len(df_2017weather_unique))
df_2017weather_unique = df_2017weather_unique.set_index('Order')

df_2017weather_unique.tail(20)

Unnamed: 0_level_0,date_time,max_temp,min_temp,precip
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8739,2017-12-31 04:00:00,-6.1,-6.1,0.0
8740,2017-12-31 05:00:00,-6.7,-6.7,0.0
8741,2017-12-31 06:00:00,-6.7,-6.7,0.0
8742,2017-12-31 07:00:00,-7.2,-7.2,0.0
8743,2017-12-31 08:00:00,-7.8,-7.8,0.0
8744,2017-12-31 09:00:00,-8.3,-8.3,0.0
8745,2017-12-31 10:00:00,-8.3,-8.3,0.0
8746,2017-12-31 11:00:00,-8.9,-8.9,0.0
8747,2017-12-31 12:00:00,-9.4,-9.4,0.0
8748,2017-12-31 13:00:00,-10.0,-10.0,0.0


In [387]:
# Keine Missing Values in allen Spalten; Summe der Missing Values ist jeweils 0
df_2017weather.isnull().sum()

date_time    0
max_temp     0
min_temp     0
precip       0
dtype: int64

In [388]:
# Maximale Temperatur
max(df_weather["max_temp"])

36.7

In [389]:
# Minimale Temperatur
min(df_weather["max_temp"])

-16.7

In [212]:
"""
Die Funktion ist an sich jetzt richtig, die Wetterdaten müssen nur noch sortiert werden, hab ich jetzt leider nicht gerafft,
dass sie es nicht gemacht haben / die Daten extra kaputt gemacht haben :D
"""
def get_datetime(date_string):
    new_string = ""
    running_string = ""
    for char in date_string:
        if char == "/" or  char == ":" or char == " ":
            if len(running_string) == 1:
                running_string = "0" + running_string
            new_string += running_string + char
            running_string = ""
        else:
            running_string += char
    new_string += running_string
    if len(new_string) > 16:
        return datetime.strptime(new_string, '%m/%d/%Y %H:%M:%S')
    else:
        return datetime.strptime(new_string, '%m/%d/%Y %H:%M')

def would_be_rounded_up(num):
    if round(num) == int(num) + 1:
        return True
    else:
        return False

def get_temperature(start_time, end_time):
    duration_of_travel = end_time-start_time
    average_timestamp = end_time - datetime.fromisoformat("2017-01-01 00:00:00") - duration_of_travel/2
    weather_row = 17521 + average_timestamp.total_seconds()/3600
    as_int = int(weather_row)
    if would_be_rounded_up(weather_row):
        rows = [[df_weather["min_temp"][as_int],df_weather["max_temp"][as_int]], [df_weather["min_temp"][as_int+1],df_weather["max_temp"][as_int+1]]]
    else:
        rows = [[df_weather["min_temp"][as_int],df_weather["max_temp"][as_int]], [df_weather["min_temp"][as_int-1],df_weather["max_temp"][as_int-1]]]
    result = 0;
    avg_temp = (rows[0][0] + rows[0][1]) / 2
    result += avg_temp/(weather_row - int(weather_row))
    avg_temp = (rows[1][0] + rows[1][1]) / 2
    result += avg_temp/(1 - weather_row - int(weather_row))

    return result



def temperature_at_time(start_time, end_time):
    start_time = get_datetime(start_time)
    end_time = get_datetime(end_time)
    print(start_time)
    print(get_temperature(start_time, end_time))





temperature_at_time(df_bikes["start_time"][10000],df_bikes["end_time"][10000])

TypeError: 'Timestamp' object is not iterable