# Data preparation Task

Import necessary packages

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

import folium
from folium import plugins
from folium.plugins import HeatMap
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo

import geopandas as gpd
import geopy as gp
from geopy import distance
import geoplot as geoplot
import mapclassify

Load csv files

In [None]:
# Raw bike-rental records, one CSV file per city.
essenData = pd.read_csv("../../Data sets/essen.csv")
cologneData = pd.read_csv("../../Data sets/koeln.csv")

## Get Weekdays from day Object

In [None]:

def weekday_match(ts):
    """Return the weekday index reported by ``ts.weekday()``.

    For ``datetime.date`` / ``datetime.datetime`` objects this is 0 for
    Monday through 6 for Sunday.
    """
    day_index = ts.weekday()
    return day_index

def hour_match(ts):
    """Return the ``hour`` attribute of ``ts`` (a time or datetime object)."""
    hour_of_day = ts.hour
    return hour_of_day

def createDatetime(ts):
    """Parse a ``YYYY-MM-DD`` value into a ``datetime.date``.

    ``ts`` is converted with ``str()`` first; ``strptime`` raises
    ``ValueError`` if the text does not match the format.
    """
    parsed = datetime.strptime(str(ts), '%Y-%m-%d')
    return parsed.date()

def createTime(ts):
    """Parse an ``HH:MM:SS`` string into a ``datetime.time``.

    ``strptime`` raises ``ValueError`` if the text does not match the format.
    """
    parsed = datetime.strptime(ts, '%H:%M:%S')
    return parsed.time()

def timestampNormalize(ts):
    """Truncate ``ts`` to the start of its hour.

    Minutes, seconds and microseconds are set to zero so that the result
    is an exact hour boundary; the date and hour are kept. ``ts`` must
    offer a ``replace`` method accepting these keywords (e.g.
    ``datetime.datetime`` or ``pandas.Timestamp``).
    """
    # microsecond must be reset as well, otherwise a value such as
    # 10:30:15.000123 would become 10:00:00.000123 instead of 10:00:00
    return ts.replace(minute=0, second=0, microsecond=0)
# Builds an hour-resolution timestamp that is comparable to the weather data.
def timestamp_create(stamp):
    """Parse ``'YYYY-MM-DD HH:MM:SS'`` and truncate it to the full hour.

    Returns a ``datetime`` whose minute and second are zero.
    """
    return datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S').replace(minute=0, second=0)

In [None]:
# Derive the hourly timestamp and typed day/time columns for both cities.
for rentals in (essenData, cologneData):
    # join the day and time strings and parse them into a timestamp truncated
    # to the hour, so rentals can be compared with the weather data
    rentals["Zeitstempel"] = (rentals["day"] + ' ' + rentals["time"]).map(timestamp_create)
    # replace the string columns by date / time objects to make them easier
    # to compare and access
    rentals["day"] = rentals["day"].map(createDatetime)
    rentals["time"] = rentals["time"].map(createTime)

In [None]:
# add weekday and hour features (weekday index from the date, hour from the time)
for rentals in (essenData, cologneData):
    rentals["weekday"] = rentals["day"].map(weekday_match)
    rentals["hour"] = rentals["time"].map(hour_match)

In [None]:
# Translate a weekday number into the English name of the day.
def getDayFromWeekdayNumber(argument):
    """Return the day name for a weekday number (0 = Monday ... 6 = Sunday).

    Values outside 0..6 yield ``None``.
    """
    day_names = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
    lookup = dict(enumerate(day_names))
    return lookup.get(argument)

In [None]:
# add the day name that belongs to each weekday number as column "weekday2"
for rentals in (essenData, cologneData):
    rentals["weekday2"] = rentals["weekday"].map(getDayFromWeekdayNumber)

In [None]:
# Keep only rentals whose start coordinates lie inside the bounding box
# 50 <= orig_lat < 52 and 6 <= orig_lng < 8.
cologne_in_box = (
    (cologneData['orig_lat'] >= 50) & (cologneData['orig_lat'] < 52)
    & (cologneData['orig_lng'] >= 6) & (cologneData['orig_lng'] < 8)
)
cologneCleanedData = cologneData[cologne_in_box]

essen_in_box = (
    (essenData['orig_lat'] >= 50) & (essenData['orig_lat'] < 52)
    & (essenData['orig_lng'] >= 6) & (essenData['orig_lng'] < 8)
)
essenCleanedData = essenData[essen_in_box]

## Weather data

In [None]:
def ts_match(ts):
    """Parse a ``YYYYmmddHHMM`` value (number or string) into a ``datetime``."""
    raw = str(ts)
    return datetime.strptime(raw, '%Y%m%d%H%M')

def ts_match2(ts):
    """Parse a ``YYYYmmdd`` value (number or string) into a midnight ``datetime``."""
    raw = str(ts)
    return datetime.strptime(raw, '%Y%m%d')

def getTime(ts):
    """Return the time-of-day part of ``ts`` via its ``time()`` method."""
    time_part = ts.time()
    return time_part

def getDay(ts):
    """Return the calendar-date part of ``ts`` via its ``date()`` method."""
    date_part = ts.date()
    return date_part

### Air Pressure

In [None]:
# Hourly air pressure readings (hPa according to the file name).
airPressure = pd.read_csv("../weather_data/3_weather_data/air_pressure/data_air_pressure_hpa_hourly.csv")

# parse the raw YYYYmmddHHMM timestamps into datetime objects
airPressure["Zeitstempel"] = airPressure["Zeitstempel"].map(ts_match)

# discard the product-code and quality columns, they are not needed
airPressure = airPressure.drop(columns=["Produkt_Code", "Qualitaet_Niveau", "Qualitaet_Byte"])

# split by station id: 2667 feeds the Cologne ("ColBo") frames, 1303 the Essen ones
airPressureColBo = airPressure[airPressure["SDO_ID"] == 2667]
airPressureEss = airPressure[airPressure["SDO_ID"] == 1303]

# move the last two columns of the full frame to the front
# (the station subsets above were taken before and keep the original order)
cols = airPressure.columns.tolist()
cols = cols[-2:] + cols[:-2]
airPressure = airPressure[cols]

# show the Cologne subset
airPressureColBo


### Air Temperature

In [None]:
# Hourly air temperature readings (2 m according to the file name).
airTemperature = pd.read_csv("../weather_data/3_weather_data/air_temperature/data_Temperature_air_2m_hourly.csv")

# parse the raw YYYYmmddHHMM timestamps into datetime objects
airTemperature["Zeitstempel"] = airTemperature["Zeitstempel"].map(ts_match)

# discard the product-code and quality columns, they are not needed
airTemperature = airTemperature.drop(columns=["Produkt_Code", "Qualitaet_Niveau", "Qualitaet_Byte"])

# split by station id: 2667 feeds the Cologne ("ColBo") frames, 1303 the Essen ones
airTemperatureColBo = airTemperature[airTemperature["SDO_ID"] == 2667]
airTemperatureEss = airTemperature[airTemperature["SDO_ID"] == 1303]

# number of readings per station
print(len(airTemperatureColBo))
print(len(airTemperatureEss))

# show the Cologne subset
airTemperatureColBo

### Cloud Coverage

In [None]:
# Hourly cloud coverage observations.
cloudCoverage = pd.read_csv("../weather_data/3_weather_data/cloud_coverage/data_Hourly_observ_cloud_coverage.csv")

# parse the raw YYYYmmddHHMM timestamps into datetime objects
cloudCoverage["Zeitstempel"] = cloudCoverage["Zeitstempel"].map(ts_match)

# discard the product-code and quality columns, they are not needed
cloudCoverage = cloudCoverage.drop(columns=["Produkt_Code", "Qualitaet_Niveau", "Qualitaet_Byte"])

# split by station id: 2667 feeds the Cologne ("ColBo") frames, 1303 the Essen ones
cloudCoverageColBo = cloudCoverage[cloudCoverage["SDO_ID"] == 2667]
cloudCoverageEss = cloudCoverage[cloudCoverage["SDO_ID"] == 1303]

# station ids present in the file
print(cloudCoverage["SDO_ID"].unique())

# number of readings per station
print(len(cloudCoverageColBo))
print(len(cloudCoverageEss))

# show the Cologne subset
cloudCoverageColBo

### Precipitation Form

In [None]:
# Form-of-precipitation observations.
precipitationForm = pd.read_csv("../weather_data/3_weather_data/form_of_precipitation/data_form_of_rain_precipitation.csv")

# parse the raw YYYYmmddHHMM timestamps into datetime objects
precipitationForm["Zeitstempel"] = precipitationForm["Zeitstempel"].map(ts_match)

# discard the product-code and quality columns, they are not needed
precipitationForm = precipitationForm.drop(columns=["Produkt_Code", "Qualitaet_Niveau", "Qualitaet_Byte"])

# split by station id: 2667 feeds the Cologne ("ColBo") frames, 1303 the Essen ones
precipitationFormColBo = precipitationForm[precipitationForm["SDO_ID"] == 2667]
precipitationFormEss = precipitationForm[precipitationForm["SDO_ID"] == 1303]

# station ids present in the file
print(precipitationForm["SDO_ID"].unique())

# number of readings per station
print(len(precipitationFormColBo))
print(len(precipitationFormEss))

# show the Cologne subset
precipitationFormColBo

### Precipitation Amount

In [None]:
# Hourly precipitation amount readings.
precipitationAmount = pd.read_csv("../weather_data/3_weather_data/precipitation_amount/data_volume_rain_precipitation_hourly.csv")

# parse the raw YYYYmmddHHMM timestamps into datetime objects
precipitationAmount["Zeitstempel"] = precipitationAmount["Zeitstempel"].map(ts_match)

# discard the product-code and quality columns, they are not needed
precipitationAmount = precipitationAmount.drop(columns=["Produkt_Code", "Qualitaet_Niveau", "Qualitaet_Byte"])

# split by station id: 2968 ("Col"), 2667 ("ColBo") and 1303 ("Ess")
precipitationAmountCol = precipitationAmount[precipitationAmount["SDO_ID"] == 2968]
precipitationAmountColBo = precipitationAmount[precipitationAmount["SDO_ID"] == 2667]
precipitationAmountEss = precipitationAmount[precipitationAmount["SDO_ID"] == 1303]

# station ids present in the file, then the number of readings per station
print(precipitationAmount["SDO_ID"].unique())
print(len(precipitationAmountCol))
print(len(precipitationAmountColBo))
print(len(precipitationAmountEss))

# show the ColBo subset
precipitationAmountColBo

### Relative Humidity

In [None]:
# Hourly relative humidity readings.
relativeHumidity = pd.read_csv("../weather_data/3_weather_data/relative_humidity_percent/data_relative_humidity_hourly.csv")

# parse the raw YYYYmmddHHMM timestamps into datetime objects
relativeHumidity["Zeitstempel"] = relativeHumidity["Zeitstempel"].map(ts_match)

# discard the product-code and quality columns, they are not needed
relativeHumidity = relativeHumidity.drop(columns=["Produkt_Code", "Qualitaet_Niveau", "Qualitaet_Byte"])

# split by station id: 2968 ("Col"), 2667 ("ColBo") and 1303 ("Ess")
relativeHumidityCol = relativeHumidity[relativeHumidity["SDO_ID"] == 2968]
relativeHumidityColBo = relativeHumidity[relativeHumidity["SDO_ID"] == 2667]
relativeHumidityEss = relativeHumidity[relativeHumidity["SDO_ID"] == 1303]

# station ids present in the file, then the number of readings per station
print(relativeHumidity["SDO_ID"].unique())
print(len(relativeHumidityCol))
print(len(relativeHumidityColBo))
print(len(relativeHumidityEss))

# show the ColBo subset
relativeHumidityColBo

### Soil Temperature

In [None]:
# Hourly soil temperature readings (5 cm depth according to the file name).
soilTemperature = pd.read_csv("../weather_data/3_weather_data/soil_temperatur_depth_5cm/data_Temperature_ground_5cm_hourly.csv")

# parse the raw YYYYmmddHHMM timestamps into datetime objects
soilTemperature["Zeitstempel"] = soilTemperature["Zeitstempel"].map(ts_match)

# discard the product-code and quality columns, they are not needed
soilTemperature = soilTemperature.drop(columns=["Produkt_Code", "Qualitaet_Niveau", "Qualitaet_Byte"])

# split by station id: 2667 feeds the Cologne ("ColBo") frames, 1303 the Essen ones
soilTemperatureColBo = soilTemperature[soilTemperature["SDO_ID"] == 2667]
soilTemperatureEss = soilTemperature[soilTemperature["SDO_ID"] == 1303]

# station ids present in the file, then the number of readings per station
print(soilTemperature["SDO_ID"].unique())
print(len(soilTemperatureColBo))
print(len(soilTemperatureEss))

# show the Cologne subset
soilTemperatureColBo

### Sunshine Duration

In [None]:
# Daily sunshine duration observations (hours according to the file name).
sunshineDuration = pd.read_csv("../weather_data/3_weather_data/sunshine_duration_hours/data_daily_observ_sunshine_duration_hours.csv")

# daily data: the raw timestamps are YYYYmmdd, hence ts_match2
sunshineDuration["Zeitstempel"] = sunshineDuration["Zeitstempel"].map(ts_match2)

# discard the product-code and quality columns, they are not needed
sunshineDuration = sunshineDuration.drop(columns=["Produkt_Code", "Qualitaet_Niveau", "Qualitaet_Byte"])

# split by station id: 2667 feeds the Cologne ("ColBo") frames, 1303 the Essen ones
sunshineDurationColBo = sunshineDuration[sunshineDuration["SDO_ID"] == 2667]
sunshineDurationEss = sunshineDuration[sunshineDuration["SDO_ID"] == 1303]

# station ids present in the file, then the number of readings per station
print(sunshineDuration["SDO_ID"].unique())
print(len(sunshineDurationColBo))
print(len(sunshineDurationEss))

# show the Cologne subset
sunshineDurationColBo

### Wind Velocity

In [None]:
# Hourly wind velocity readings (10 m according to the file name).
windVelocity = pd.read_csv("../weather_data/3_weather_data/wind_velocity/data_Wind_velocity_10m_hourly.csv")

# parse the raw YYYYmmddHHMM timestamps into datetime objects
windVelocity["Zeitstempel"] = windVelocity["Zeitstempel"].map(ts_match)

# discard the product-code and quality columns, they are not needed
windVelocity = windVelocity.drop(columns=["Produkt_Code", "Qualitaet_Niveau", "Qualitaet_Byte"])

# split by station id: 2667 feeds the Cologne ("ColBo") frames, 1303 the Essen ones
windVelocityColBo = windVelocity[windVelocity["SDO_ID"] == 2667]
windVelocityEss = windVelocity[windVelocity["SDO_ID"] == 1303]

# station ids present in the file, then the number of readings per station
print(windVelocity["SDO_ID"].unique())
print(len(windVelocityColBo))
print(len(windVelocityEss))

# show the first rows of the full (all stations) frame
windVelocity.head()

Display the first 5 rows

# Merging Weatherdata to one Dataframe

### Merging the relevant weather data for our correlation coefficients (Cologne)

In [None]:
# Build one hourly weather table for Cologne: start from air pressure and
# join the other measurements one by one on timestamp and station id.
# merge() defaults to an inner join, so timestamps missing in any frame are
# dropped (the original notes record one lost row when joining cloud
# coverage and none for wind velocity).
# After each join the columns are renamed by position, which relies on every
# join contributing exactly one additional value column.
weatherDataColBo = airPressureColBo
_weather_columns = ['SDO_ID', 'Zeitstempel', 'airPressure']
for _measure, _station_frame in (
        ('airTemperature', airTemperatureColBo),
        ('cloudCoverage', cloudCoverageColBo),
        ('windVelocity', windVelocityColBo),
        ('precipitationAmount', precipitationAmountColBo),
        ('relativeHumidity', relativeHumidityColBo),
        ('soilTemperature', soilTemperatureColBo)):
    weatherDataColBo = weatherDataColBo.merge(
        _station_frame,
        left_on=['Zeitstempel', 'SDO_ID'],
        right_on=['Zeitstempel', 'SDO_ID'])
    _weather_columns = _weather_columns + [_measure]
    weatherDataColBo.columns = _weather_columns




### Merging the relevant weather data for our correlation coefficients (Essen)

In [None]:
# Build one hourly weather table for Essen: start from air pressure and join
# the other measurements one by one on timestamp and station id.
# merge() defaults to an inner join, so timestamps missing in any frame are
# dropped (the original notes record some lost rows when joining cloud
# coverage and none for wind velocity).
# After each join the columns are renamed by position, which relies on every
# join contributing exactly one additional value column.
weatherDataEss = airPressureEss
_weather_columns = ['SDO_ID', 'Zeitstempel', 'airPressure']
for _measure, _station_frame in (
        ('airTemperature', airTemperatureEss),
        ('cloudCoverage', cloudCoverageEss),
        ('windVelocity', windVelocityEss),
        ('precipitationAmount', precipitationAmountEss),
        ('relativeHumidity', relativeHumidityEss),
        ('soilTemperature', soilTemperatureEss)):
    weatherDataEss = weatherDataEss.merge(
        _station_frame,
        left_on=['Zeitstempel', 'SDO_ID'],
        right_on=['Zeitstempel', 'SDO_ID'])
    _weather_columns = _weather_columns + [_measure]
    weatherDataEss.columns = _weather_columns



## computing percentage for the cologne bike fleet

In [None]:
# Hourly utilisation of the Cologne fleet, in percent of all bikes that
# appear in the cleaned rental data.

# temporary dataframe for further manipulation
cologne_bikes_frame = pd.DataFrame({'day': cologneCleanedData["day"],
                                    'time': cologneCleanedData["time"],
                                    'b_number': cologneCleanedData["b_number"],
                                    'hour': cologneCleanedData["hour"]})
# datetime.combine instead of pd.datetime.combine: the pd.datetime alias was
# deprecated in pandas 1.0 and no longer exists in pandas 2.x
cologne_bikes_frame['Datetime'] = cologne_bikes_frame.apply(
    lambda r: datetime.combine(r['day'], r['time']), axis=1)

# number of distinct rental days per bike; one entry per bike, so its length
# is the absolute number of different bikes in the rental dataset
cologne_numberOfBikes = cologne_bikes_frame.groupby("b_number")["day"].nunique()
cNumberOfBikes = cologne_numberOfBikes.count()

# distinct bikes starting a trip in each hour, converted to percent of the fleet
cologne_bikesHourly = cologne_bikes_frame.groupby(pd.Grouper(key='Datetime', freq='H'))["b_number"].nunique()
cologne_bikesHourlyLength = cologne_bikesHourly.count()
cologne_bikesHourly = cologne_bikesHourly / cNumberOfBikes * 100

# trip lengths in full hours, keyed by the (hour-truncated) start timestamp
c_tripTime = pd.DataFrame({'day': cologneCleanedData["day"],
                           'hour': cologneCleanedData["hour"],
                           'trip_duration': cologneCleanedData["trip_duration"],
                           'Datetime': cologne_bikes_frame['Datetime']})
c_tripTime['Datetime'] = c_tripTime['Datetime'].map(timestampNormalize)
c_tripTime['trip_time'] = pd.to_timedelta(c_tripTime['trip_duration']).dt.total_seconds()
c_tripTime['length'] = np.floor(c_tripTime['trip_time'] / 3600)

# adds the bikes used over multiple hours to the following hours:
# a trip lasting at least k full hours is still running k hours after its
# start hour (k = 1, 2). The counts are matched by timestamp rather than by
# row position and scaled to percent like the base value; hours outside the
# observed range are ignored.
for extra_hours in (1, 2):
    still_riding = (c_tripTime.loc[c_tripTime['length'] >= extra_hours, 'Datetime']
                    + pd.Timedelta(hours=extra_hours))
    carried_over = still_riding.value_counts().reindex(cologne_bikesHourly.index, fill_value=0)
    # to_numpy() keeps the series name "b_number" intact for reset_index below
    cologne_bikesHourly = cologne_bikesHourly + carried_over.to_numpy() / cNumberOfBikes * 100

cologne_bikesHourly = cologne_bikesHourly.reset_index(level=['Datetime'])

# range of the possible error created by bikes which are only available in a
# part of the year: share of bikes rented on fewer than 10 distinct days
cologne_numberOfUsagePerBike = int((cologne_numberOfBikes < 10).sum())
cologne_percentageErrorRatio = cologne_numberOfUsagePerBike / cNumberOfBikes
cologne_percentageErrorRatio

## computing percentage of the essen bike fleet

In [None]:
# Hourly utilisation of the Essen fleet, in percent of all bikes that appear
# in the cleaned rental data.

# temporary dataframe for further manipulation
essen_bikes_frame = pd.DataFrame({'day': essenCleanedData["day"],
                                  'time': essenCleanedData["time"],
                                  'b_number': essenCleanedData["b_number"],
                                  'hour': essenCleanedData["hour"]})
# datetime.combine instead of pd.datetime.combine: the pd.datetime alias was
# deprecated in pandas 1.0 and no longer exists in pandas 2.x
essen_bikes_frame['Datetime'] = essen_bikes_frame.apply(
    lambda r: datetime.combine(r['day'], r['time']), axis=1)

# number of distinct rental days per bike; one entry per bike, so its length
# is the absolute number of different bikes in the rental dataset
essen_numberOfBikes = essen_bikes_frame.groupby("b_number")["day"].nunique()
eNumberOfBikes = essen_numberOfBikes.count()

# distinct bikes starting a trip in each hour, converted to percent of the
# Essen fleet (the divisor used to be the Cologne fleet size by mistake)
essen_bikesHourly = essen_bikes_frame.groupby(pd.Grouper(key='Datetime', freq='H'))["b_number"].nunique()
essen_bikesHourlyLength = essen_bikesHourly.count()
essen_bikesHourly = essen_bikesHourly / eNumberOfBikes * 100

# trip lengths in full hours, keyed by the (hour-truncated) start timestamp
e_tripTime = pd.DataFrame({'day': essenCleanedData["day"],
                           'hour': essenCleanedData["hour"],
                           'trip_duration': essenCleanedData["trip_duration"],
                           'Datetime': essen_bikes_frame['Datetime']})
e_tripTime['Datetime'] = e_tripTime['Datetime'].map(timestampNormalize)
e_tripTime['trip_time'] = pd.to_timedelta(e_tripTime['trip_duration']).dt.total_seconds()
e_tripTime['length'] = np.floor(e_tripTime['trip_time'] / 3600)

# adds the bikes used over multiple hours to the following hours:
# a trip lasting at least k full hours is still running k hours after its
# start hour (k = 1, 2). The counts are matched by timestamp rather than by
# row position and scaled to percent like the base value; hours outside the
# observed range are ignored.
for extra_hours in (1, 2):
    still_riding = (e_tripTime.loc[e_tripTime['length'] >= extra_hours, 'Datetime']
                    + pd.Timedelta(hours=extra_hours))
    carried_over = still_riding.value_counts().reindex(essen_bikesHourly.index, fill_value=0)
    # to_numpy() keeps the series name "b_number" intact for reset_index below
    essen_bikesHourly = essen_bikesHourly + carried_over.to_numpy() / eNumberOfBikes * 100

essen_bikesHourly = essen_bikesHourly.reset_index(level=['Datetime'])

# range of the possible error created by bikes which are only available in a
# part of the year: share of bikes rented on fewer than 10 distinct days
essen_numberOfUsagePerBike = int((essen_numberOfBikes < 10).sum())
essen_percentageErrorRatio = essen_numberOfUsagePerBike / eNumberOfBikes
essen_percentageErrorRatio

### Create dataframe with weather data and percentage of bikes used (Cologne)

In [None]:
# Join hourly utilisation with the Cologne weather table on the hour
# timestamp, order by time, drop the duplicate key and the station id, and
# expose the utilisation column under the name "BikeUse".
DataCologne = (
    cologne_bikesHourly
    .merge(weatherDataColBo, left_on=['Datetime'], right_on=['Zeitstempel'])
    .sort_values(by=['Zeitstempel'])
    .drop(columns=["Datetime", "SDO_ID"])
    .rename(columns={"b_number": "BikeUse"})
)

# example interval: 2019-10-01 (inclusive) up to 2019-10-08 (exclusive)
_interval_start = datetime(2019, 10, 1)
_interval_end = datetime(2019, 10, 8)
prA2_1 = DataCologne[(DataCologne["Zeitstempel"] >= _interval_start)
                     & (DataCologne["Zeitstempel"] < _interval_end)]

### Create dataframe with weather data and percentage of bikes used (Essen)

In [None]:
# Join hourly utilisation with the Essen weather table on the hour
# timestamp, order by time, drop the duplicate key and the station id, and
# expose the utilisation column under the name "BikeUse".
DataEssen = (
    essen_bikesHourly
    .merge(weatherDataEss, left_on=['Datetime'], right_on=['Zeitstempel'])
    .sort_values(by=['Zeitstempel'])
    .drop(columns=["Datetime", "SDO_ID"])
    .rename(columns={"b_number": "BikeUse"})
)

# example interval: 2019-11-01 (inclusive) up to 2019-11-04 (exclusive)
_interval_start = datetime(2019, 11, 1)
_interval_end = datetime(2019, 11, 4)
prA1_1 = DataEssen[(DataEssen["Zeitstempel"] >= _interval_start)
                   & (DataEssen["Zeitstempel"] < _interval_end)]

# Visualize percentage

### Percentage of used bikes/Utilization of a year (Cologne)

In [None]:
# Line chart of the hourly utilisation (in %) for Cologne on a very wide canvas.
fig, ax = plt.subplots(figsize=(80, 4), dpi=80)
ax.plot(cologne_bikesHourly['Datetime'], cologne_bikesHourly["b_number"])
ax.set(xlabel="day",
       ylabel="used bikes in %",
       title="bike usage in % over the year")

plt.show()

### Percentage of used bikes/Utilization of a year (Essen)

In [None]:
# Line chart of the hourly utilisation (in %) for Essen on a very wide canvas.
fig, ax = plt.subplots(figsize=(80, 4), dpi=80)
ax.plot(essen_bikesHourly['Datetime'], essen_bikesHourly["b_number"])
ax.set(xlabel="day",
       ylabel="used bikes in %",
       title="bike usage in % over the year")

plt.show()

### Direct comparison of both cities

In [None]:
# Hourly utilisation of both cities in one chart, each with a simple moving
# average (SMA) over a window of 20 hourly values.
rolling_mean = cologne_bikesHourly["b_number"].rolling(window=20).mean()
rolling_mean2 = essen_bikesHourly["b_number"].rolling(window=20).mean()
plt.figure(num=None, figsize=(30, 10), dpi=80, facecolor='w', edgecolor='k')
plt.plot(cologne_bikesHourly['Datetime'], cologne_bikesHourly["b_number"], label='Cologne percentage')
plt.plot(cologne_bikesHourly['Datetime'], rolling_mean, label='Cologne percentage SMA', color='magenta')
plt.plot(essen_bikesHourly['Datetime'], essen_bikesHourly["b_number"], label="Essen percentage")
# a colour of its own: both SMA curves used to be magenta and could not be
# told apart in the chart or the legend
plt.plot(essen_bikesHourly['Datetime'], rolling_mean2, label='Essen percentage SMA', color='green')
plt.legend(loc='upper right')
plt.title("bike usage in % over the year")

### Percentage of used bikes for every hour

The following chart gives the final summary of the computed percentage data. For every hour of the day it shows the median value, the times of peak demand, and the spread of the values.

In [None]:
# Box plot of the utilisation percentage per hour of the day, one box per city.
# Note: the helper columns are added to the hourly frames themselves;
# `cologne` and `essen` are just further names for those frames.
for _hourly, _city in ((cologne_bikesHourly, "cologne"), (essen_bikesHourly, "essen")):
    _hourly['hour'] = _hourly['Datetime'].map(hour_match)
    _hourly['city'] = _city
    _hourly['percentage of used bikes'] = _hourly['b_number']

cologne = cologne_bikesHourly
essen = essen_bikesHourly
cities = [cologne, essen]

fig = plt.gcf()
fig.set_size_inches(16, 10)

# stack both cities row-wise, keeping their original row labels
plotOut = pd.concat(cities)

sns.boxplot(x="hour", y="percentage of used bikes", data=plotOut, palette="rainbow", hue="city")
plt.show()

## Visualizing Bikes Utilization

### Interactive graph for Bike Utilization (Cologne)

In [None]:
# Interactive line chart: hover to read the exact utilisation for a given hour.
pyo.init_notebook_mode()
fig = px.line(
    data_frame=cologne_bikesHourly,
    x='Datetime',
    y="b_number",
    title="Utilization of a year, Cologne",
)
# range slider below the chart for zooming into shorter periods
fig.update_xaxes(rangeslider_visible=True)
fig.show()

Note: in this chart, `b_number` is the percentage of the bike fleet in use.

### Interactive graph for Bike Utilization (Essen)

In [None]:
# Interactive line chart: hover to read the exact utilisation for a given hour.
pyo.init_notebook_mode()
fig = px.line(
    data_frame=essen_bikesHourly,
    x='Datetime',
    y='b_number',
    title="Utilization of a year, Essen",
)
# range slider below the chart for zooming into shorter periods
fig.update_xaxes(rangeslider_visible=True)
fig.show()

Note: in this chart, `b_number` is the percentage of the bike fleet in use.

# Visualizing Weather Data combined with Utilization Data

### Interactive graph comparing Temperature, Precipitation, Utilization (Cologne)

With this interactive graph we can see the exact value for every hour.

In [None]:
# enables plotly to work offline inside the notebook
pyo.init_notebook_mode()
fig = go.Figure()
# one trace per series, all sharing the hourly timestamp axis
for _column, _label in (('BikeUse', "BikeUse%"),
                        ('airTemperature', "Airtemperature(C°)"),
                        ('precipitationAmount', "Precipitation(mm)")):
    fig.add_scatter(x=DataCologne['Zeitstempel'], y=DataCologne[_column], name=_label)
# range slider below the chart for zooming into shorter periods
fig.update_xaxes(rangeslider_visible=True)
fig.show()

### Interactive graph comparing Temperature, Precipitation, Utilization (Essen)

In [None]:
# enables plotly to work offline inside the notebook
pyo.init_notebook_mode()
fig = go.Figure()
# one trace per series, all sharing the hourly timestamp axis
for _column, _label in (('BikeUse', "BikeUse%"),
                        ('airTemperature', "Airtemperature"),
                        ('precipitationAmount', "Precipitation")):
    fig.add_scatter(x=DataEssen['Zeitstempel'], y=DataEssen[_column], name=_label)
# range slider below the chart for zooming into shorter periods
fig.update_xaxes(rangeslider_visible=True)
fig.show()

## corrMatrix with Utilization & Weather

### corrMatrix Cologne

Each matrix shows the correlation coefficient between every pair of columns of the DataFrame. Only the first/second row of the matrix is relevant.

In [None]:
# Correlation heat maps for Cologne.
# Only numeric columns are correlated: the frames also carry the datetime
# column "Zeitstempel", which pandas >= 2.0 no longer drops silently in
# DataFrame.corr() (older versions ignored it).

# correlation matrix of the whole year
plt.figure(figsize=(10, 5))
plt.title('corrMatrix/Year')
corrMatrix_1c = DataCologne.select_dtypes(include='number').corr()
sns.heatmap(corrMatrix_1c, annot=True)
plt.show()

# correlation matrix of a chosen time interval as an example
plt.figure(figsize=(10, 5))
plt.title('Example corrMatrix/Interval')
# restricted time interval from 2019-10-01 up to (not including) 2019-10-08
corrMatrix_2c = prA2_1.select_dtypes(include='number').corr()
sns.heatmap(corrMatrix_2c, annot=True)
plt.show()

### corrMatrix Essen

In [None]:
# Correlation heat maps for Essen.
# Only numeric columns are correlated: the frames also carry the datetime
# column "Zeitstempel", which pandas >= 2.0 no longer drops silently in
# DataFrame.corr() (older versions ignored it).

# correlation matrix of the whole year
plt.figure(figsize=(10, 5))
plt.title('corrMatrix/Year')
corrMatrix = DataEssen.select_dtypes(include='number').corr()
sns.heatmap(corrMatrix, annot=True)
plt.show()

# correlation matrix of a chosen time interval as an example
plt.figure(figsize=(10, 5))
plt.title('Example corrMatrix/Interval')
# restricted time interval from 2019-11-01 up to (not including) 2019-11-04
corrMatrix = prA1_1.select_dtypes(include='number').corr()
sns.heatmap(corrMatrix, annot=True)
plt.show()

# KPI: Revenue

### Some further preparations to calculate the revenue:

Preparing a DataFrame with all the necessary data to calculate the revenue KPIs for Cologne:

In [None]:
import calendar

# Revenue KPI base table for Cologne, built from the unfiltered rental data.
c_kpi_df = cologneData[['day', 'weekday2', 'weekday', 'hour', 'b_number', 'trip_duration']].copy()

# trip duration converted to seconds
c_duration_in_seconds = pd.to_timedelta(c_kpi_df['trip_duration'])
c_kpi_df['trip_time'] = c_duration_in_seconds.dt.total_seconds()

# pricing model of 1€ per 30 min: floor(seconds / 1800 + 1), capped at 9€ per
# trip (according to the Nextbike pricing model)
# NOTE(review): a trip of exactly 1800 s is charged 2€ by this formula - confirm intended
c_kpi_df['cost'] = np.floor(c_kpi_df['trip_time'] / 1800 + 1).clip(upper=9)

# month number taken from the date, then replaced by its abbreviated name
c_kpi_df['month'] = pd.DatetimeIndex(c_kpi_df['day']).month
c_kpi_df['month'] = c_kpi_df['month'].map(lambda month_no: calendar.month_abbr[month_no])
c_kpi_df

Preparing a DataFrame with all the necessary data to calculate the revenue KPIs for Essen:

In [None]:
import calendar

# Revenue KPI base table for Essen, built from the unfiltered rental data.
e_kpi_df = essenData[['day', 'weekday2', 'weekday', 'hour', 'b_number', 'trip_duration']].copy()

# trip duration converted to seconds
e_duration_in_seconds = pd.to_timedelta(e_kpi_df['trip_duration'])
e_kpi_df['trip_time'] = e_duration_in_seconds.dt.total_seconds()

# pricing model of 1€ per 30 min: floor(seconds / 1800 + 1), capped at 9€ per
# trip (according to the Nextbike pricing model)
# NOTE(review): a trip of exactly 1800 s is charged 2€ by this formula - confirm intended
e_kpi_df['cost'] = np.floor(e_kpi_df['trip_time'] / 1800 + 1).clip(upper=9)

# month number taken from the date, then replaced by its abbreviated name
e_kpi_df['month'] = pd.DatetimeIndex(e_kpi_df['day']).month
e_kpi_df['month'] = e_kpi_df['month'].map(lambda month_no: calendar.month_abbr[month_no])
e_kpi_df

## Revenue for Cologne

### Total revenue per hour (Cologne)

In [None]:
# sum the trip costs for every hour of the day over the whole time period in Cologne
c_trip_cost_sum_per_hour = c_kpi_df.groupby('hour')['cost'].sum()
c_trip_cost_sum_per_hour

In [None]:
# line chart: total revenue per hour of the day over the whole time period in Cologne
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(c_trip_cost_sum_per_hour)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))

ax.set_xlabel('Hour of the day', fontsize=12)
ax.set_ylabel('Revenue (in €)', fontsize=12)
ax.set_title('Total Revenue per hour over the whole time period (Cologne)', fontsize=14)

### Average Revenue per hour (Cologne)

In [None]:
# keep only the values of the hourly sums as a plain numpy array
c_trip_cost_sum_per_hour = np.array(c_trip_cost_sum_per_hour)

# Average revenue per hour during a single day in Cologne:
# the total hourly revenue is divided by the number of days.
# NOTE(review): 365 is hard-coded - confirm the data really covers 365 days
c_avg_hourly_revenue = np.divide(c_trip_cost_sum_per_hour, 365)

In [None]:
# line chart: average revenue per hour of the day during a single day in Cologne
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(c_avg_hourly_revenue)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))

ax.set_xlabel('Hour of the day', fontsize=12)
ax.set_ylabel('Revenue (in €)', fontsize=12)
ax.set_title('Average Revenue per hour during a single day (Cologne)', fontsize=14)

### Revenue per hour per bike (Cologne)

In [None]:
# count the distinct bike numbers that occur in the Cologne trips
c_number_of_bikes = len(pd.unique(c_kpi_df["b_number"]))
print("The number of bikes in Cologne is:", c_number_of_bikes)

# hourly revenue divided by the number of bikes
c_hourly_revenue_per_bike = c_trip_cost_sum_per_hour / c_number_of_bikes

In [None]:
# line chart: revenue per hour of the day per bike in Cologne
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(c_hourly_revenue_per_bike)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))

ax.set_xlabel('Hour of the day', fontsize=12)
ax.set_ylabel('Revenue (in €)', fontsize=12)
ax.set_title('Average Revenue per hour per bike (Cologne)', fontsize=14)

### Hourly revenue on weekends vs. workdays (Cologne)

In [None]:
# Compute the hourly revenue for workdays vs. weekends.
# NOTE(review): assumes the 'weekday' column was created with weekday_match,
# i.e. datetime.weekday() where Monday == 0 and Sunday == 6 - confirm.
# Under that convention Saturday (5) and Sunday (6) form the weekend; the
# previous split (<= 5 / > 5) counted Saturday as a workday.
c_revenue_workday = c_kpi_df[c_kpi_df.weekday < 5].groupby('hour').cost.sum()
c_revenue_weekend = c_kpi_df[c_kpi_df.weekday >= 5].groupby('hour').cost.sum()

In [None]:
# left y-axis: hourly revenue on workdays
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(c_revenue_workday, color="steelblue", label="Workday")
ax.set_xlabel("Hour of the day", fontsize=12)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))
ax.set_ylabel("Revenue workday (in €)", color="steelblue", fontsize=12)
ax.legend(loc="upper left")

# right y-axis (twin of the first one): hourly revenue on weekends
ax2 = ax.twinx()
ax2.plot(c_revenue_weekend, color="coral", label="Weekend")
ax2.set_ylabel("Revenue weekend (in €)", color="coral", fontsize=12)
ax2.set_title('Total hourly revenue on weekends vs. workdays (Cologne)', fontsize=14)
ax2.legend(loc="upper right")
# display the diagram
plt.show()


### Revenue per weekday (Cologne)

In [None]:
# sum the trip costs for every day of the week over the whole time period in Cologne
c_trip_cost_sum_per_weekday = c_kpi_df.groupby('weekday2')['cost'].sum()

In [None]:
# bring the weekdays into calendar order (Monday first)
c_trip_cost_sum_per_weekday = pd.DataFrame(c_trip_cost_sum_per_weekday)

sorter = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# weekday name -> position within the week
sorterIndex = {day_name: position for position, day_name in enumerate(sorter)}

# helper column holding the position of each weekday, used as the sort key
c_trip_cost_sum_per_weekday['day_id'] = c_trip_cost_sum_per_weekday.index.map(sorterIndex)
c_trip_cost_sum_per_weekday = c_trip_cost_sum_per_weekday.sort_values('day_id')
c_trip_cost_sum_per_weekday.head(10)

In [None]:
# plotting the total revenue generated per day of the week over the whole time period in Cologne
fig,ax = plt.subplots(figsize=(12,4)) 
# plot only the 'cost' column: passing the whole DataFrame would also draw
# the helper column 'day_id' as a second line
ax.plot(c_trip_cost_sum_per_weekday['cost'])

# Set the range of the y-axis  
ax.set_ylim(120000,200000)


plt.xlabel('Day of the week', fontsize=12)
plt.ylabel('Revenue (in €)', fontsize=12)
plt.title('Total revenue per weekday (Cologne)', fontsize=14)

### Revenue per month (Cologne)

In [None]:
# sum the trip costs for every month in Cologne
c_trip_cost_sum_per_month = c_kpi_df.groupby('month')['cost'].sum()

In [None]:
# bring the months into calendar order (January first)
c_trip_cost_sum_per_month = pd.DataFrame(c_trip_cost_sum_per_month)

month_sorter = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# month abbreviation -> position within the year
month_sorterIndex = {month_name: position for position, month_name in enumerate(month_sorter)}

# helper column holding the position of each month, used as the sort key
c_trip_cost_sum_per_month['month_id'] = c_trip_cost_sum_per_month.index.map(month_sorterIndex)
c_trip_cost_sum_per_month = c_trip_cost_sum_per_month.sort_values('month_id')
c_trip_cost_sum_per_month.head(10)

In [None]:
# plotting the total revenue generated per month in Cologne
fig,ax = plt.subplots(figsize=(12,4)) 
# plot only the 'cost' column: passing the whole DataFrame would also draw
# the helper column 'month_id' as a second line
ax.plot(c_trip_cost_sum_per_month['cost'])

# Set the range of the y-axis  
ax.set_ylim(60000,150000)


plt.xlabel('Month', fontsize=12)
plt.ylabel('Revenue (in €)', fontsize=12)
plt.title('Revenue per month (Cologne)', fontsize=14)

## Revenue for Essen

### Total revenue per hour (Essen)

In [None]:
# sum the trip costs for every hour of the day over the whole time period in Essen
e_trip_cost_sum_per_hour = e_kpi_df.groupby('hour')['cost'].sum()
e_trip_cost_sum_per_hour

In [None]:
# line chart: total revenue per hour of the day over the whole time period in Essen
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(e_trip_cost_sum_per_hour)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))

ax.set_xlabel('Hour of the day', fontsize=12)
ax.set_ylabel('Revenue (in €)', fontsize=12)
ax.set_title('Total revenue per hour over the whole time period (Essen)', fontsize=14)

### Average Revenue per hour (Essen)

In [None]:
# keep only the values of the hourly sums as a plain numpy array
e_trip_cost_sum_per_hour = np.array(e_trip_cost_sum_per_hour)

# Average revenue per hour during a single day in Essen:
# the total hourly revenue is divided by the number of days.
# NOTE(review): 365 is hard-coded - confirm the data really covers 365 days
e_avg_hourly_revenue = np.divide(e_trip_cost_sum_per_hour, 365)

In [None]:
# line chart: average revenue per hour of the day during a single day in Essen
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(e_avg_hourly_revenue)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))

ax.set_xlabel('Hour of the day', fontsize=12)
ax.set_ylabel('Revenue (in €)', fontsize=12)
ax.set_title('Average revenue per hour during a single day (Essen)', fontsize=14)

### Revenue per hour per bike (Essen)

In [None]:
# count the distinct bike numbers that occur in the Essen trips
e_number_of_bikes = len(pd.unique(e_kpi_df["b_number"]))
print("The number of bikes in Essen is:", e_number_of_bikes)

# hourly revenue divided by the number of bikes
e_hourly_revenue_per_bike = e_trip_cost_sum_per_hour / e_number_of_bikes

In [None]:
# line chart: revenue per hour of the day per bike in Essen
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(e_hourly_revenue_per_bike)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))

ax.set_xlabel('Hour of the day', fontsize=12)
ax.set_ylabel('Revenue (in €)', fontsize=12)
ax.set_title('Average Revenue per hour per bike (Essen)', fontsize=14)

### Hourly revenue on weekends vs. workdays (Essen)

In [None]:
# Compute the hourly revenue for workdays vs. weekends.
# NOTE(review): assumes the 'weekday' column was created with weekday_match,
# i.e. datetime.weekday() where Monday == 0 and Sunday == 6 - confirm.
# Under that convention Saturday (5) and Sunday (6) form the weekend; the
# previous split (<= 5 / > 5) counted Saturday as a workday.
e_revenue_workday = e_kpi_df[e_kpi_df.weekday < 5].groupby('hour').cost.sum()
e_revenue_weekend = e_kpi_df[e_kpi_df.weekday >= 5].groupby('hour').cost.sum()

In [None]:
# left y-axis: hourly revenue on workdays in Essen
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(e_revenue_workday, color="steelblue", label="Workday")
ax.set_xlabel("Hour of the day", fontsize=12)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))
ax.set_ylabel("Revenue workday (in €)", color="steelblue", fontsize=12)
ax.legend(loc="upper left")

# right y-axis (twin of the first one): hourly revenue on weekends in Essen
ax2 = ax.twinx()
ax2.plot(e_revenue_weekend, color="coral", label="Weekend")
ax2.set_ylabel("Revenue weekend (in €)", color="coral", fontsize=12)
ax2.set_title('Total hourly revenue on weekends vs. workdays (Essen)', fontsize=14)
ax2.legend(loc="upper right")
# display the diagram
plt.show()

### Revenue per weekday (Essen)

In [None]:
# sum the trip costs for every day of the week over the whole time period in Essen
e_trip_cost_sum_per_weekday = e_kpi_df.groupby('weekday2')['cost'].sum()

In [None]:
# bring the weekdays into calendar order (Monday first)
e_trip_cost_sum_per_weekday = pd.DataFrame(e_trip_cost_sum_per_weekday)

sorter = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# weekday name -> position within the week
sorterIndex = {day_name: position for position, day_name in enumerate(sorter)}

# helper column holding the position of each weekday, used as the sort key
e_trip_cost_sum_per_weekday['day_id'] = e_trip_cost_sum_per_weekday.index.map(sorterIndex)
e_trip_cost_sum_per_weekday = e_trip_cost_sum_per_weekday.sort_values('day_id')
e_trip_cost_sum_per_weekday.head(10)

In [None]:
# plotting the total revenue generated per day of the week over the whole time period in Essen
fig,ax = plt.subplots(figsize=(12,4)) 
# plot only the 'cost' column: passing the whole DataFrame would also draw
# the helper column 'day_id' as a second line
ax.plot(e_trip_cost_sum_per_weekday['cost'])

# Set the range of the y-axis  
ax.set_ylim(5000,14000)


plt.xlabel('Day of the week', fontsize=12)
plt.ylabel('Revenue (in €)', fontsize=12)
plt.title('Total Revenue per weekday over the whole time period (Essen)', fontsize=14)

### Revenue per month (Essen)

In [None]:
# sum the trip costs for every month in Essen
e_trip_cost_sum_per_month = e_kpi_df.groupby('month')['cost'].sum()

In [None]:
# bring the months into calendar order (January first)
e_trip_cost_sum_per_month = pd.DataFrame(e_trip_cost_sum_per_month)

month_sorter = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# month abbreviation -> position within the year
month_sorterIndex = {month_name: position for position, month_name in enumerate(month_sorter)}

# helper column holding the position of each month, used as the sort key
e_trip_cost_sum_per_month['month_id'] = e_trip_cost_sum_per_month.index.map(month_sorterIndex)
e_trip_cost_sum_per_month = e_trip_cost_sum_per_month.sort_values('month_id')
e_trip_cost_sum_per_month.head(10)

In [None]:
# plotting the total revenue generated per month in Essen
fig,ax = plt.subplots(figsize=(12,4)) 
# plot only the 'cost' column: passing the whole DataFrame would also draw
# the helper column 'month_id' as a second line
ax.plot(e_trip_cost_sum_per_month['cost'])

# Set the range of the y-axis  
ax.set_ylim(2000,10000)


plt.xlabel('Month', fontsize=12)
plt.ylabel('Revenue (in €)', fontsize=12)
plt.title('Revenue per month (Essen)', fontsize=14)

## Revenue Comparison Cologne vs. Essen

In this last section on the revenue KPIs we directly compare the results for both cities.

Since the total revenue for Cologne is much higher than for Essen (it is about 17 times as high), we need to adjust the way we display the data within a single diagram. 
In this case we chose a line plot with two different y-axes - one for Cologne and one for Essen.
This makes it easy to spot the differences between the two cities for the given timeframe.

In [None]:
# total revenue for Cologne: sum of all trip costs
c_total_revenue = c_kpi_df.cost.sum()
print("The total revenue generated in Cologne is:", c_total_revenue, "€")

# total revenue for Essen: sum of all trip costs
e_total_revenue = e_kpi_df.cost.sum()
print("The total revenue generated in Essen is:", e_total_revenue, "€")

# ratio of the Cologne total to the Essen total
scale_factor = c_total_revenue / e_total_revenue
print("The total revenue generated in Cologne is",scale_factor, "times higher than in Essen.")

### Total revenue per hour (Cologne vs. Essen)

In [None]:
# total revenue generated per bike in Cologne and in Essen
c_revenue_per_bike = c_total_revenue / c_number_of_bikes
e_revenue_per_bike = e_total_revenue / e_number_of_bikes

print("The revenue generated per bike in Cologne is", c_revenue_per_bike, "€.")
print("The revenue generated per bike in Essen is", e_revenue_per_bike, "€.")

# ratio of the per-bike revenue in Cologne to the one in Essen
revenue_per_bike_diff = c_revenue_per_bike / e_revenue_per_bike
print("The revenue generated per bike in Cologne is", revenue_per_bike_diff, "times higher than in Essen.")

# left y-axis: total hourly revenue in Cologne
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(c_trip_cost_sum_per_hour, color="steelblue", label="Cologne")
ax.set_xlabel("Hour of the day", fontsize=12)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))
ax.set_ylabel("Revenue in Cologne (in €)", color="steelblue", fontsize=12)
ax.legend(loc="upper left")

# right y-axis (twin of the first one): total hourly revenue in Essen
ax2 = ax.twinx()
ax2.plot(e_trip_cost_sum_per_hour, color="coral", label="Essen")
ax2.set_ylabel("Revenue in Essen (in €)", color="coral", fontsize=12)
ax2.set_title('Total Revenue per hour of the day (Cologne vs. Essen)', fontsize=14)
ax2.legend(loc="upper right")
# display the diagram
plt.show()

### Average revenue per hour (Cologne vs. Essen)

In [None]:
# create figure and axis objects with subplots()
fig,ax = plt.subplots(figsize=(12,4))
# make the plot for Cologne
ax.plot(c_avg_hourly_revenue, color="steelblue", label="Cologne")
# set x-axis label
ax.set_xlabel("Hour of the day", fontsize=12)
# show hours on the x-axis in 1-hour-steps
ax.set_xlim(0,23)
ax.set_xticks(range(0,24))
# set y-axis label
ax.set_ylabel("Revenue in Cologne (in €)", color="steelblue", fontsize=12)
# show legend for Cologne
plt.legend(loc="upper left")

# create twin object for two different y-axes on the same plot
ax2=ax.twinx()
# make a plot with different y-axis using second axis object
ax2.plot(e_avg_hourly_revenue,color="coral", label="Essen")
# set y-axis label
ax2.set_ylabel("Revenue in Essen (in €)", color="coral", fontsize=12)
# set diagram title (typo 'Avergae' corrected)
plt.title('Average Revenue per hour for a single day (Cologne vs. Essen)', fontsize=14)
# show legend for Essen
plt.legend(loc="upper right")
# display the diagram
plt.show()

### Revenue per hour per bike (Cologne vs. Essen)

In [None]:
# left y-axis: hourly revenue per bike in Cologne
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(c_hourly_revenue_per_bike, color="steelblue", label="Cologne")
ax.set_xlabel("Hour of the day", fontsize=12)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))
ax.set_ylabel("Revenue in Cologne (in €)", color="steelblue", fontsize=12)
ax.legend(loc="upper left")

# right y-axis (twin of the first one): hourly revenue per bike in Essen
ax2 = ax.twinx()
ax2.plot(e_hourly_revenue_per_bike, color="coral", label="Essen")
ax2.set_ylabel("Revenue in Essen (in €)", color="coral", fontsize=12)
ax2.set_title('Revenue per hour per bike (Cologne vs. Essen)', fontsize=14)
ax2.legend(loc="upper right")
# display the diagram
plt.show()

### Hourly revenue on weekend vs. workdays (Cologne vs. Essen)

In [None]:
# left y-axis: hourly workday revenue in Cologne
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(c_revenue_workday, color="steelblue", label="Cologne")
ax.set_xlabel("Hour of the day", fontsize=12)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))
ax.set_ylabel("Revenue Cologne (in €)", color="steelblue", fontsize=12)
ax.legend(loc="upper left")

# right y-axis (twin of the first one): hourly workday revenue in Essen
ax2 = ax.twinx()
ax2.plot(e_revenue_workday, color="coral", label="Essen")
ax2.set_ylabel("Revenue Essen (in €)", color="coral", fontsize=12)
ax2.set_title('Total hourly revenue on workdays (Cologne vs. Essen)', fontsize=14)
ax2.legend(loc="upper right")
# display the diagram
plt.show()

In [None]:
# left y-axis: hourly weekend revenue in Cologne
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(c_revenue_weekend, color="steelblue", label="Cologne")
ax.set_xlabel("Hour of the day", fontsize=12)
# one tick for each hour from 0 to 23
ax.set_xlim(0, 23)
ax.set_xticks(range(0, 24))
ax.set_ylabel("Revenue Cologne (in €)", color="steelblue", fontsize=12)
ax.legend(loc="upper left")

# right y-axis (twin of the first one): hourly weekend revenue in Essen
ax2 = ax.twinx()
ax2.plot(e_revenue_weekend, color="coral", label="Essen")
ax2.set_ylabel("Revenue Essen (in €)", color="coral", fontsize=12)
ax2.set_title('Total hourly revenue on weekends (Cologne vs. Essen)', fontsize=14)
ax2.legend(loc="upper right")
# display the diagram
plt.show()


### Revenue per weekday (Cologne vs. Essen)

In [None]:
# create figure and axis objects with subplots()
fig,ax = plt.subplots(figsize=(12,4))
# make the plot for Cologne; only the 'cost' column is drawn, otherwise the
# helper column 'day_id' would be plotted as a second line
ax.plot(c_trip_cost_sum_per_weekday['cost'], color="steelblue", label="Cologne")
# set x-axis label
ax.set_xlabel("Weekday", fontsize=12)
# set y-axis label
ax.set_ylabel("Revenue in Cologne (in €)", color="steelblue", fontsize=12)
# Set the range of the y-axis  
ax.set_ylim(130000,200000)

# create twin object for two different y-axes on the same plot
ax2=ax.twinx()
# make the plot for Essen on the second y-axis (again only the 'cost' column)
ax2.plot(e_trip_cost_sum_per_weekday['cost'],color="coral", label="Essen")
# set y-axis label
ax2.set_ylabel("Revenue in Essen (in €)", color="coral", fontsize=12)
# Set the range of the y-axis  
ax2.set_ylim(6000,13000)
# set diagram title
plt.title('Revenue per weekday (Cologne vs. Essen)', fontsize=14)
# display the diagram
plt.show()

### Revenue per month (Cologne vs. Essen)

In [None]:
# create figure and axis objects with subplots()
fig,ax = plt.subplots(figsize=(12,4))
# make the plot for Cologne; only the 'cost' column is drawn, otherwise the
# helper column 'month_id' would be plotted as a second line
ax.plot(c_trip_cost_sum_per_month['cost'], color="steelblue", label="Cologne")
# set x-axis label
ax.set_xlabel("Month", fontsize=12)
# set y-axis label
ax.set_ylabel("Revenue in Cologne (in €)", color="steelblue", fontsize=12)
# Set the range of the y-axis  
ax.set_ylim(70000,140000)

# create twin object for two different y-axes on the same plot
ax2=ax.twinx()
# make the plot for Essen on the second y-axis (again only the 'cost' column)
ax2.plot(e_trip_cost_sum_per_month['cost'],color="coral", label="Essen")
# set y-axis label
ax2.set_ylabel("Revenue in Essen (in €)", color="coral", fontsize=12)
# Set the range of the y-axis  
ax2.set_ylim(2000,10000)
# set diagram title
plt.title('Revenue per month (Cologne vs. Essen)', fontsize=14)
# display the diagram
plt.show()


# KPI: Coverage and peak demand of bikes per borough

## Definition of the Dataframes for Cologne and Essen for Start and Destination

First we define some Dataframes, on which the computing process will depend. Furthermore we import the Shapefiles containing the location and shape of the boroughs, which we need for our visualization.

In [None]:
# read the Essen borough shapes, remove the attribute columns that are not
# needed and expose the row number as an 'index' column
e_boroughs = (
    gpd.read_file("Stadtteil/Essen-Stadtteile-map2.shp")
    .drop(columns=['boundary', 'type', 'source', 'wikipedia', 'admin_leve', 'ref'])
    .reset_index()
)
e_boroughs.head()

In [None]:
# read the Cologne borough shapes; the borough names are taken from the
# 'STT_NAME' column of the second shapefile
c_boroughs = gpd.read_file("Stadtteil/Stadtteil_WGS84.shp")
c_boroughs2 = gpd.read_file("Stadtteil/Stadtteil.shp")
c_boroughs['name'] = c_boroughs2['STT_NAME']
# remove the attribute columns that are not needed and expose the row number
# as an 'index' column
c_boroughs = c_boroughs.drop(columns=['STT_NAME', 'SHAPE_AREA', 'SHAPE_LEN', 'STT_NR']).reset_index()
c_boroughs.head()

In [None]:
# timestamp, hour, bike number and start/end coordinates of every cleaned Cologne trip
cologne_timedGeo = cologneCleanedData[
    ['Zeitstempel', 'hour', 'b_number', 'orig_lat', 'orig_lng', 'dest_lat', 'dest_lng']
].copy()
# round all coordinates to 5 decimals
cologne_timedGeo = cologne_timedGeo.round(
    {'dest_lat': 5, 'dest_lng': 5, 'orig_lat': 5, 'orig_lng': 5}
)

# independent second copy that will carry the destination geometry
cologne_timedGeo1 = cologne_timedGeo.copy()

# one GeoDataFrame with the trip start points and one with the trip end points
cologne_timedGeo_orig = gpd.GeoDataFrame(
    cologne_timedGeo,
    geometry=gpd.points_from_xy(cologne_timedGeo['orig_lng'], cologne_timedGeo['orig_lat']),
)
cologne_timedGeo_dest = gpd.GeoDataFrame(
    cologne_timedGeo1,
    geometry=gpd.points_from_xy(cologne_timedGeo1['dest_lng'], cologne_timedGeo1['dest_lat']),
)

In [None]:
# timestamp, hour, bike number and start/end coordinates of every cleaned Essen trip
essen_timedGeo = essenCleanedData[
    ['Zeitstempel', 'hour', 'b_number', 'orig_lat', 'orig_lng', 'dest_lat', 'dest_lng']
].copy()
# round all coordinates to 5 decimals
essen_timedGeo = essen_timedGeo.round(
    {'dest_lat': 5, 'dest_lng': 5, 'orig_lat': 5, 'orig_lng': 5}
)

# independent second copy that will carry the destination geometry
essen_timedGeo1 = essen_timedGeo.copy()

# one GeoDataFrame with the trip start points and one with the trip end points
essen_timedGeo_orig = gpd.GeoDataFrame(
    essen_timedGeo,
    geometry=gpd.points_from_xy(essen_timedGeo['orig_lng'], essen_timedGeo['orig_lat']),
)
essen_timedGeo_dest = gpd.GeoDataFrame(
    essen_timedGeo1,
    geometry=gpd.points_from_xy(essen_timedGeo1['dest_lng'], essen_timedGeo1['dest_lat']),
)

## Definition of the functions used to compute the KPI values

These functions are used to compute the following for both cities individually:

    1) the percentage of used bikes in all boroughs as a mean value for an hour over the year
    2) matching a point in geopandas format in the boroughs of a city
    3) the percentage of used bikes in all boroughs for one individual hour of the year 

In [None]:
def _borough_share_for_hour(hourly_counts, hours, boroughs):
    """Return ``boroughs`` with each borough's share (in %) of the counts for one hour.

    ``hourly_counts`` must provide the columns 'hour', 'index' (borough index)
    and 'index_right' (count); these are the columns read below.
    """
    selected = hourly_counts[hourly_counts['hour'] == hours]
    selected = selected.drop('hour', axis=1).set_index('index')
    bike = selected['index_right'].rename('number_of_bikes')
    combined = pd.merge(boroughs, bike, how='outer', left_index=True, right_index=True, sort=True)
    # boroughs without any entry for this hour count as 0
    combined['number_of_bikes'] = combined['number_of_bikes'].fillna(0)
    bikes_in_hour = combined.number_of_bikes.sum()
    # an hour without any counts yields 0/0 = NaN, which is mapped back to 0
    combined['number_of_bikes'] = (combined['number_of_bikes'] / bikes_in_hour * 100).fillna(0)
    return combined


def percentage_for_hour_diff_plot (hours , dataframeStart, dataframeDestination , boroughs , scheme_diff , city):
    """Plot, per borough, the difference between start share and destination share for one hour.

    For the given hour the percentage share of every borough is computed once
    from ``dataframeStart`` and once from ``dataframeDestination``; the
    difference (start minus destination, in percentage points) is drawn as a
    choropleth and every borough is labelled with its name at its centroid.

    Parameters:
        hours: hour of the day to evaluate.
        dataframeStart: counts per hour and borough for trip starts
            (columns 'hour', 'index', 'index_right').
        dataframeDestination: same layout for trip destinations.
        boroughs: borough GeoDataFrame; needs the columns 'name' and 'geometry'.
        scheme_diff: classification scheme passed to geoplot; it is replaced
            by 20 quantiles of the computed difference when ``hours`` is 0 and
            ``city`` is 'cologne'.
        city: name of the city; only compared against 'cologne'.

    Returns:
        The Series with the per-borough difference.
    """
    combined_with_boroughStart = _borough_share_for_hour(dataframeStart, hours, boroughs)
    combined_with_boroughDest = _borough_share_for_hour(dataframeDestination, hours, boroughs)

    diff = combined_with_boroughStart['number_of_bikes']-combined_with_boroughDest['number_of_bikes']
    combined_with_boroughStart['number_of_bikes'] = diff

    if hours == 0 and city == 'cologne':
        scheme_diff = mapclassify.Quantiles(diff, k=20)

    plot_of_map = combined_with_boroughStart.number_of_bikes
    plot = geoplot.choropleth(
        combined_with_boroughStart, hue=plot_of_map, scheme=scheme_diff,
        cmap='Reds', figsize=(50, 60) , legend = True
    )
    plot.set_title('Mean change of bike availability per borough at ' + str(hours) + " o'clock")

    # compute all centroids once instead of recomputing them for every borough
    centroids = combined_with_boroughStart.geometry.centroid
    for lon, lat, borough_name in zip(centroids.x, centroids.y, combined_with_boroughStart['name']):
        plt.text(lon, lat, borough_name,
                 horizontalalignment='center', verticalalignment='center',
                 bbox=dict(facecolor='white', alpha=0.5))
    return diff

In [None]:
def match_Points_to_boroughs(city , df):
    """Spatially join the points in ``df`` with the boroughs of ``city``.

    Parameters:
        city: 'cologne' or 'essen'; selects ``c_boroughs`` or ``e_boroughs``.
        df: GeoDataFrame with the point geometries to match.

    Returns:
        The inner spatial join ('intersects'), indexed by 'index_right'
        (the index of the matched borough).

    Raises:
        ValueError: if ``city`` is neither 'cologne' nor 'essen'.
    """
    if city == 'cologne':
        boroughs = c_boroughs
    elif city == 'essen':
        boroughs = e_boroughs
    else:
        # previously an unknown city ended in an UnboundLocalError at the return
        raise ValueError("unknown city: " + repr(city) + " (expected 'cologne' or 'essen')")
    bikes_in_boroughs = gpd.sjoin(df, boroughs, how="inner", op='intersects')
    return bikes_in_boroughs.set_index('index_right')

In [None]:
def compute_PNOB_city_date (city , date , rented_bikes_in_boroughs , boroughs):
    """Return ``boroughs`` with each borough's share (in %) of the trips at ``date``.

    Rows of ``rented_bikes_in_boroughs`` whose 'Zeitstempel' equals ``date``
    are counted per 'index_right' group and merged onto ``boroughs`` by index;
    the counts are stored as percentages in the column 'number_of_bikes'.
    ``city`` is not used by the computation.
    """
    at_timestamp = rented_bikes_in_boroughs['Zeitstempel'] == date
    trips_per_borough = (
        rented_bikes_in_boroughs[at_timestamp]
        .groupby('index_right')['index']
        .count()
        .rename('number_of_bikes')
    )
    combined_with_borough = pd.merge(
        boroughs, trips_per_borough, how='outer',
        left_index=True, right_index=True, sort=True,
    )
    # boroughs without a trip at this timestamp count as 0
    counts = combined_with_borough['number_of_bikes'].fillna(0)
    total = counts.sum()
    # without any trip the division is 0/0 = NaN, which is mapped back to 0
    combined_with_borough['number_of_bikes'] = (counts / total * 100).fillna(0)
    return combined_with_borough

## Visualization of the 24 hour mean-values over the year for Cologne

For a good visualization we look at the differences between the start and end locations of our bikes. This can be achieved with the function defined above. The same scheme is used for every map to increase comparability. In the following we first see two maps containing the start and end distribution of bikes over the boroughs. After this there are 24 maps, each showing a different hour computed as a mean value over the year. The same will be done for the data of Essen afterwards.

In [None]:
# timestamp for which the single-hour distribution is computed;
# pd.datetime was removed in pandas 2.0, so the datetime class imported at the
# top of the notebook is used instead (it is the same class)
date = datetime.strptime('2019-07-13 11:00:00', '%Y-%m-%d %H:%M:%S')

In [None]:
# display the GeoDataFrame with the Cologne trip start points for inspection
cologne_timedGeo_orig

In [None]:
# match the start and destination points of the Cologne trips to the boroughs
rented_bikes_in_boroughs_Start = match_Points_to_boroughs('cologne', cologne_timedGeo_orig)
rented_bikes_in_boroughs_Dest = match_Points_to_boroughs('cologne', cologne_timedGeo_dest)
rbibs, rbibd = rented_bikes_in_boroughs_Start, rented_bikes_in_boroughs_Dest

# number of matched rows per borough name
c_validity_S = rbibs.groupby('name').count()
c_validity_D = rbibd.groupby('name').count()

# percentage of trips per borough at the selected timestamp (starts / destinations)
cbS = compute_PNOB_city_date('cologne', date, rented_bikes_in_boroughs_Start, c_boroughs)
cbD = compute_PNOB_city_date('cologne', date, rented_bikes_in_boroughs_Dest, c_boroughs)

In [None]:
# number of boroughs that do not appear in both the start and the destination
# counts (the merge keeps only borough names present in both)
c_validity = c_validity_S.merge(c_validity_D, left_on='name', right_on='name')
print(len(c_boroughs.index) - len(c_validity.index))

In [None]:
# 20-quantile classification of the start shares; reused by the following
# maps so that they stay comparable
scheme_oneHour = mapclassify.Quantiles(cbS['number_of_bikes'], k=20)

In [None]:
#the map of the renting in Cologne
plot_of_map = cbS.number_of_bikes
plot = geoplot.choropleth(
    cbS, hue=plot_of_map, scheme=scheme_oneHour,
    cmap='Reds', figsize=(50, 60) , legend = True
)
plot = plot.set_title('Distribution of tripstarts at ' + str(date)+ "(Cologne)")
# compute all centroids once (previously they were recomputed twice per
# borough inside the loop) and store them as columns
centroids = cbS.geometry.centroid
cbS['centroid_lon'] = centroids.x
cbS['centroid_lat'] = centroids.y
# label every borough with its name at its centroid
for lon, lat, borough_name in zip(cbS['centroid_lon'], cbS['centroid_lat'], cbS['name']):
    plt.text(lon, lat, borough_name,
             horizontalalignment='center', verticalalignment='center',
             bbox=dict(facecolor='white', alpha=0.5))

In [None]:
#the map of the returning of bikes in Cologne
plot_of_map = cbD.number_of_bikes
plot = geoplot.choropleth(
    cbD, hue=plot_of_map, scheme=scheme_oneHour,
    cmap='Reds', figsize=(50, 60) , legend = True
)
plot = plot.set_title('Distribution of tripdestinations at ' + str(date)+ "(Cologne)")
# compute all centroids once (previously they were recomputed twice per
# borough inside the loop) and store them as columns
centroids = cbD.geometry.centroid
cbD['centroid_lon'] = centroids.x
cbD['centroid_lat'] = centroids.y
# label every borough with its name at its centroid
for lon, lat, borough_name in zip(cbD['centroid_lon'], cbD['centroid_lat'], cbD['name']):
    plt.text(lon, lat, borough_name,
             horizontalalignment='center', verticalalignment='center',
             bbox=dict(facecolor='white', alpha=0.5))

In [None]:
# Spatially join all trip-start and trip-end points with the Cologne boroughs and
# count the joined rows per ('hour', 'index') group; the two resulting tables feed
# the hourly difference maps.
# The spatial predicate is left at its default ('intersects'): passing it as
# `op=` is deprecated in newer geopandas releases (replaced by `predicate=`),
# while omitting it behaves identically on old and new versions.
# NOTE: both names are rebound from point frames to count tables, so this cell
# cannot be re-run without first re-creating the point frames.
# NOTE(review): 'index' presumably identifies the borough - confirm where the column is created.
cologne_timedGeo_orig = (
    gpd.sjoin(cologne_timedGeo_orig, c_boroughs, how="inner")
    .groupby(['hour', 'index'])['index_right']
    .count()
    .reset_index()
)
cologne_timedGeo_dest = (
    gpd.sjoin(cologne_timedGeo_dest, c_boroughs, how="inner")
    .groupby(['hour', 'index'])['index_right']
    .count()
    .reset_index()
)

In [None]:
# Plot the difference between rental and return activity for every hour of the
# day (built from the counts over the whole year) and add up the values returned
# by percentage_for_hour_diff_plot for the hours 0..23 in c_difference.
hours = 0

# The first hour needs a classification before any difference values exist, so
# it is bootstrapped from the single-date trip-start counts.
scheme_diff = mapclassify.Quantiles(cbS['number_of_bikes'], k=20)
c_difference = percentage_for_hour_diff_plot(
    hours, cologne_timedGeo_orig, cologne_timedGeo_dest,
    c_boroughs, scheme_diff, 'cologne',
)

# All remaining hours are classified with 20 quantiles of the hour-0 result.
scheme_diff = mapclassify.Quantiles(c_difference, k=20)

for hours in range(1, 24):
    c_difference = c_difference + percentage_for_hour_diff_plot(
        hours, cologne_timedGeo_orig, cologne_timedGeo_dest,
        c_boroughs, scheme_diff, 'cologne',
    )
hours += 1  # the counter ends at 24, one past the last processed hour

## Visualization of the 24 hour mean-values over the year for Essen

The following cells perform exactly the same computations for the city of Essen as the cells above did for Cologne. 

In [None]:
# Essen, start side: hand the trip-start points to match_Points_to_boroughs and
# count the non-null rows per 'name' group.
# NOTE(review): the helper presumably assigns each point to the borough it lies in - confirm against its definition.
rented_bikes_in_boroughs_Start = match_Points_to_boroughs('essen', essen_timedGeo_orig)
rbibs = rented_bikes_in_boroughs_Start
e_validity_S = rented_bikes_in_boroughs_Start.groupby(by='name').count()

# Essen, destination side: same steps for the trip-end points.
rented_bikes_in_boroughs_Dest = match_Points_to_boroughs('essen', essen_timedGeo_dest)
rbibd = rented_bikes_in_boroughs_Dest
e_validity_D = rented_bikes_in_boroughs_Dest.groupby(by='name').count()

# Frames for the selected `date`, built from the start matches (ebS) and the
# destination matches (ebD); the maps below read them.
ebS = compute_PNOB_city_date(
    'essen', date, rented_bikes_in_boroughs_Start, e_boroughs
)
ebD = compute_PNOB_city_date(
    'essen', date, rented_bikes_in_boroughs_Dest, e_boroughs
)

In [None]:
# Inner-merge the per-borough counts of the start side and the destination side.
# A borough name that is missing on either side drops out of the merge, so the
# printed number is how many rows of e_boroughs have no counterpart in both sides
# (i.e. boroughs without a matched trip start or without a matched trip end).
e_validity = e_validity_S.merge(
    right=e_validity_D,
    how='inner',
    left_on='name',
    right_on='name',
)
print(len(e_boroughs) - len(e_validity))

In [None]:
# Choropleth of the trip starts per Essen borough for the selected date.
# Colours follow `scheme_oneHour`, the classification derived from Cologne's
# trip-start counts, so the Essen and Cologne maps share the same bins.
plot_of_map = ebS.number_of_bikes
plot = geoplot.choropleth(
    ebS, hue=plot_of_map, scheme=scheme_oneHour,
    cmap='Reds', figsize=(50, 60), legend=True
)
plot = plot.set_title('Distribution of tripstarts at ' + str(date) + "(Essen)")

# Derive all centroids in one pass and store their coordinates as columns.
# Computing `geometry.centroid` inside the labelling loop would rebuild the
# complete centroid series twice for every single borough.
centroids = ebS.geometry.centroid
ebS['centroid_lon'] = centroids.x
ebS['centroid_lat'] = centroids.y

# Write each borough name at its centroid. Zipping the columns iterates purely by
# position, so the labels stay correct even if the frame has no 0..n-1 index.
for lon, lat, borough_name in zip(ebS['centroid_lon'], ebS['centroid_lat'], ebS['name']):
    plt.text(lon, lat, borough_name,
             horizontalalignment='center', verticalalignment='center',
             bbox=dict(facecolor='white', alpha=0.5))

In [None]:
# Choropleth of the bike returns (trip destinations) per Essen borough for the
# selected date. Colours follow `scheme_oneHour`, the classification derived from
# Cologne's trip-start counts, so the Essen and Cologne maps share the same bins.
plot_of_map = ebD.number_of_bikes
plot = geoplot.choropleth(
    ebD, hue=plot_of_map, scheme=scheme_oneHour,
    cmap='Reds', figsize=(50, 60), legend=True
)
plot = plot.set_title('Distribution of tripdestinations at ' + str(date) + "(Essen)")

# Derive all centroids in one pass and store their coordinates as columns.
# Computing `geometry.centroid` inside the labelling loop would rebuild the
# complete centroid series twice for every single borough.
centroids = ebD.geometry.centroid
ebD['centroid_lon'] = centroids.x
ebD['centroid_lat'] = centroids.y

# Write each borough name at its centroid. Zipping the columns iterates purely by
# position, so the labels stay correct even if the frame has no 0..n-1 index.
for lon, lat, borough_name in zip(ebD['centroid_lon'], ebD['centroid_lat'], ebD['name']):
    plt.text(lon, lat, borough_name,
             horizontalalignment='center', verticalalignment='center',
             bbox=dict(facecolor='white', alpha=0.5))

In [None]:
# Spatially join all trip-start and trip-end points with the Essen boroughs and
# count the joined rows per ('hour', 'index') group; the two resulting tables feed
# the hourly difference maps.
# The spatial predicate is left at its default ('intersects'): passing it as
# `op=` is deprecated in newer geopandas releases (replaced by `predicate=`),
# while omitting it behaves identically on old and new versions.
# NOTE: both names are rebound from point frames to count tables, so this cell
# cannot be re-run without first re-creating the point frames.
# NOTE(review): 'index' presumably identifies the borough - confirm where the column is created.
essen_timedGeo_orig = (
    gpd.sjoin(essen_timedGeo_orig, e_boroughs, how="inner")
    .groupby(['hour', 'index'])['index_right']
    .count()
    .reset_index()
)
essen_timedGeo_dest = (
    gpd.sjoin(essen_timedGeo_dest, e_boroughs, how="inner")
    .groupby(['hour', 'index'])['index_right']
    .count()
    .reset_index()
)

In [None]:
# Plot the difference between rental and return activity in Essen for every hour
# of the day and add up the values returned by percentage_for_hour_diff_plot for
# the hours 0..23 in e_difference.
# `scheme_diff` is not defined in this cell: the classification left over from
# the Cologne computation is reused for all 24 hours.
hours = 0
e_difference = percentage_for_hour_diff_plot(
    hours, essen_timedGeo_orig, essen_timedGeo_dest,
    e_boroughs, scheme_diff, 'essen',
)

for hours in range(1, 24):
    e_difference = e_difference + percentage_for_hour_diff_plot(
        hours, essen_timedGeo_orig, essen_timedGeo_dest,
        e_boroughs, scheme_diff, 'essen',
    )
hours += 1  # the counter ends at 24, one past the last processed hour

## Visualization of the difference between starting and end boroughs computed for the mean day in Cologne compared to Essen

The histograms show how the size of the difference is distributed across the boroughs.

In [None]:
# Histogram (50 bins, black bar edges) of the accumulated Cologne differences,
# drawn on the current axes.
hist_ax = plt.gca()
hist_ax.hist(c_difference, bins=50, edgecolor='k')
hist_ax.set_title('Cologne')
plt.show()

In [None]:
# Histogram (50 bins, black bar edges) of the accumulated Essen differences,
# drawn on the current axes.
hist_ax = plt.gca()
hist_ax.hist(e_difference, bins=50, edgecolor='k')
hist_ax.set_title('Essen')
plt.show()

In [None]:
# The spread of the differences is the figure that can be compared across the
# two cities: compute the standard deviation for Cologne and for Essen, then
# print them on separate lines (Cologne first).
c_standard_deviation = np.std(c_difference)
e_standard_deviation = np.std(e_difference)
print(c_standard_deviation, e_standard_deviation, sep='\n')