In [76]:
import pandas as pd
import numpy as np
import seaborn as sns
import calendar

In [18]:
# Import our dataset from first week

df1 = pd.read_csv('../raw_data/select_2020.csv',encoding='iso-8859-2',sep=';')
df2 = pd.read_csv('../raw_data/select_2021.csv',encoding='iso-8859-2',sep=';')
df3 = pd.read_csv('../raw_data/select_2022.csv',encoding='iso-8859-2',sep=';')

df = pd.concat([df1, df2, df3])

In [77]:
# Filter train df, clean it up 

#filter for München and Köln 
df = df.query("bhf in ('München Hbf', 'Köln Hbf', 'Köln Messe/Deutz Gl.11-12')")

#cleaning of train names 
#some letters of 'zugnr' are not capitalized
df['zugnr'] = df['zugnr'].str.upper()

#process of 9999 in arrTime and depTime
df['start_or_endpoint'] = 'nan'
        
df.loc[df['arrTime'] == 9999, 'start_or_endpoint'] = 'start'
df.loc[df['depTime'] == 9999, 'start_or_endpoint'] = 'end'

#overwrite 9999 with respective arr/dep time of same observation (in new clean columns)
df['arrTime_clean'] = np.where(df['arrTime'] == 9999, df['depTime'], df['arrTime'])
df['depTime_clean'] = np.where(df['depTime'] == 9999, df['arrTime'], df['depTime'])

#fill with zeros in front of the hours
df['arrTime_clean'] = df['arrTime_clean'].astype(str)
df['arrTime_clean'] = df['arrTime_clean'].map(lambda a: a.zfill(4))
df['depTime_clean'] = df['depTime_clean'].astype(str)
df['depTime_clean'] = df['depTime_clean'].map(lambda a: a.zfill(4))

#add date column
df['date'] = df['datum'] + ' ' + df['arrTime_clean']
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H%M')
#3) add month of the year 
df['weekday'] = df['date'].dt.day_name()
#4) add month of the year 
df['month'] = df['date'].dt.month_name()

df['date'] = df['datum'] + ' ' + df['arrTime_clean'].astype(str)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H%M')
df['sharp_date'] = df['date'].dt.round('H')
df

Unnamed: 0,zugnr,datum,bhf,arrTime,adelay,depTime,ddelay,start_or_endpoint,arrTime_clean,depTime_clean,date,weekday,month,sharp_date
15,EC 6,2019-12-15,Köln Hbf,1906,0,1908,0,,1906,1908,2019-12-15 19:06:00,Sunday,December,2019-12-15 19:00:00
36,EC 6,2019-12-16,Köln Hbf,1906,10,1909,8,,1906,1909,2019-12-16 19:06:00,Monday,December,2019-12-16 19:00:00
56,EC 6,2019-12-17,Köln Hbf,1906,47,1909,47,,1906,1909,2019-12-17 19:06:00,Tuesday,December,2019-12-17 19:00:00
79,EC 6,2019-12-18,Köln Hbf,1906,-1,1909,-1,,1906,1909,2019-12-18 19:06:00,Wednesday,December,2019-12-18 19:00:00
80,EC 6,2019-12-18,Köln Messe/Deutz Gl.11-12,1906,29,1909,30,,1906,1909,2019-12-18 19:06:00,Wednesday,December,2019-12-18 19:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289452,NJ 421,2022-05-13,München Hbf,711,0,728,5,,0711,0728,2022-05-13 07:11:00,Friday,May,2022-05-13 07:00:00
289468,NJ 421,2022-05-14,Köln Hbf,2212,2,2216,1,,2212,2216,2022-05-14 22:12:00,Saturday,May,2022-05-14 22:00:00
289471,NJ 421,2022-05-14,München Hbf,711,0,728,0,,0711,0728,2022-05-14 07:11:00,Saturday,May,2022-05-14 07:00:00
289487,NJ 421,2022-05-15,Köln Hbf,2212,2,2216,1,,2212,2216,2022-05-15 22:12:00,Sunday,May,2022-05-15 22:00:00


In [78]:
# Import journey index, and clean it up with the procedures set up by Marie

#processing
journeys = journeys[journeys.leg1_train.notna()] #delete duplicated trips that once go to köln hbf and once to deutz
journeys = journeys[journeys.leg2_train.notna()] 
journeys = journeys[journeys.leg3_train.notna()] 

journeys = journeys.drop(['Unnamed: 0'],axis=1) 
journeys['key_ID'] = list(range(journeys.shape[0]))

#rename columns (necessary for wide_to_long function)
journeys.columns = [ 'date', 'weekday', 'month', 'journey_origin',
       'journey_destination', 'journey_start', 'journey_end',
       'journey_duration', 'journey_numberlegs', 
       'train_leg1', 'origin_leg1','destination_leg1', 'start_leg1', 'end_leg1', 'duration_leg1',
       'train_leg2', 'origin_leg2', 'destination_leg2', 'start_leg2','end_leg2', 'duration_leg2', 
       'train_leg3', 'origin_leg3','destination_leg3', 'start_leg3', 'end_leg3', 'duration_leg3', 
       'key_ID']

journeys_long = pd.wide_to_long(df = journeys,
                                stubnames=['train', 'origin','destination', 'start', 'end', 'duration'],
                                i=['key_ID'],
                                j='leg',
                                sep = '_',
                                suffix='.+').reset_index()

#delete empty legs
journeys_long = journeys_long[journeys_long.train != '-1']

#reorder columns
journeys_long = journeys_long[['key_ID', 
 'journey_origin', 
 'journey_destination',
 'journey_start',
 'journey_end',
 'journey_duration', 
 'journey_numberlegs',
 'leg', 
 'train', 
 'origin', 
 'destination', 
 'start', 
 'end',
 'duration',
 'date', 'month','weekday']]

for index,i in journeys_long.iterrows(): #this is needed as there was a mismatch between the dfs, comment out if using a more recent journey index
    #print(type(i['weekday']))
    journeys_long['weekday'][index] = calendar.day_name[i['weekday']]
    journeys_long['month'][index] = calendar.month_name[i['month']]

with pd.option_context("display.max_rows", 100, "display.max_columns", None):
    display(journeys_long.head(10))
    
    

# check all legs of one journey (connected by key_ID)   
# with pd.option_context("display.max_rows", 100, "display.max_columns", None):
# display(journeys_long[journeys_long.key_ID == 0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  journeys_long['weekday'][index] = calendar.day_name[i['weekday']]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  journeys_long['month'][index] = calendar.month_name[i['month']]


Unnamed: 0,key_ID,journey_origin,journey_destination,journey_start,journey_end,journey_duration,journey_numberlegs,leg,train,origin,destination,start,end,duration,date,month,weekday
0,0,Köln Hbf,München Hbf,2022-09-04 04:22:00+02:00,2022-09-04 09:13:00+02:00,17460,2,leg1,ICE 521,Köln Hbf,Frankfurt(Main) Flughafen Fernbf,2022-09-04 04:22:00+02:00,2022-09-04 05:26:00+02:00,3840,2022-09-04,September,Sunday
1,1,Köln Hbf,München Hbf,2022-09-04 04:22:00+02:00,2022-09-04 09:39:00+02:00,19020,1,leg1,ICE 521,Köln Hbf,München Hbf,2022-09-04 04:22:00+02:00,2022-09-04 09:39:00+02:00,19020,2022-09-04,September,Sunday
2,2,Köln Hbf,München Hbf,2022-09-04 05:48:00+02:00,2022-09-04 10:27:00+02:00,16740,1,leg1,ICE 511,Köln Hbf,München Hbf,2022-09-04 05:48:00+02:00,2022-09-04 10:27:00+02:00,16740,2022-09-04,September,Sunday
3,3,Köln Hbf,München Hbf,2022-09-04 06:54:00+02:00,2022-09-04 11:27:00+02:00,16380,2,leg1,ICE 101,Köln Hbf,Mannheim Hbf,2022-09-04 06:54:00+02:00,2022-09-04 08:23:00+02:00,5340,2022-09-04,September,Sunday
4,4,Köln Hbf,München Hbf,2022-09-04 07:55:00+02:00,2022-09-04 12:27:00+02:00,16320,1,leg1,ICE 513,Köln Hbf,München Hbf,2022-09-04 07:55:00+02:00,2022-09-04 12:27:00+02:00,16320,2022-09-04,September,Sunday
5,5,Köln Hbf,München Hbf,2022-09-04 08:23:00+02:00,2022-09-04 13:12:00+02:00,17340,3,leg1,ICE 11,Köln Hbf,Frankfurt(Main) Flughafen Fernbf,2022-09-04 08:23:00+02:00,2022-09-04 09:17:00+02:00,3240,2022-09-04,September,Sunday
6,6,Köln Hbf,München Hbf,2022-09-04 08:54:00+02:00,2022-09-04 13:27:00+02:00,16380,2,leg1,ICE 103,Köln Hbf,Mannheim Hbf,2022-09-04 08:54:00+02:00,2022-09-04 10:23:00+02:00,5340,2022-09-04,September,Sunday
7,7,Köln Hbf,München Hbf,2022-09-04 09:55:00+02:00,2022-09-04 14:27:00+02:00,16320,1,leg1,ICE 515,Köln Hbf,München Hbf,2022-09-04 09:55:00+02:00,2022-09-04 14:27:00+02:00,16320,2022-09-04,September,Sunday
8,8,Köln Hbf,München Hbf,2022-09-04 10:18:00+02:00,2022-09-04 15:12:00+02:00,17640,3,leg1,ICE 13,Köln Hbf,Frankfurt(Main) Flughafen Fernbf,2022-09-04 10:18:00+02:00,2022-09-04 11:17:00+02:00,3540,2022-09-04,September,Sunday
9,9,Köln Hbf,München Hbf,2022-09-04 10:54:00+02:00,2022-09-04 15:28:00+02:00,16440,2,leg1,ICE 255,Köln Hbf,Mannheim Hbf,2022-09-04 10:54:00+02:00,2022-09-04 12:23:00+02:00,5340,2022-09-04,September,Sunday


In [93]:
journeys = pd.read_csv('../data/journeyindex.csv')

In [89]:
journeys_long[journeys_long['end']== 'Mannheim Hbf']

Unnamed: 0,key_ID,journey_origin,journey_destination,journey_start,journey_end,journey_duration,journey_numberlegs,leg,train,origin,destination,start,end,duration,date,month,weekday
986,166,Köln Messe/Deutz Gl.11-12,München Hbf,2022-09-04 06:44:00+02:00,2022-09-04 11:27:00+02:00,16980,3,leg2,ICE 101,Frankfurt(Main) Flughafen Fernbf,Mannheim Hbf,2022-09-04 07:52:00+02:00,Mannheim Hbf,1860,2022-09-04,September,Sunday
990,170,Köln Messe/Deutz Gl.11-12,München Hbf,2022-09-04 08:44:00+02:00,2022-09-04 13:27:00+02:00,16980,3,leg2,ICE 103,Frankfurt(Main) Flughafen Fernbf,Mannheim Hbf,2022-09-04 09:52:00+02:00,Mannheim Hbf,1860,2022-09-04,September,Sunday
995,175,Köln Messe/Deutz Gl.11-12,München Hbf,2022-09-04 12:44:00+02:00,2022-09-04 17:27:00+02:00,16980,3,leg2,ICE 107,Frankfurt(Main) Flughafen Fernbf,Mannheim Hbf,2022-09-04 13:52:00+02:00,Mannheim Hbf,1860,2022-09-04,September,Sunday
1000,180,Köln Messe/Deutz Gl.11-12,München Hbf,2022-09-04 14:44:00+02:00,2022-09-04 19:27:00+02:00,16980,3,leg2,ICE 109,Frankfurt(Main) Flughafen Fernbf,Mannheim Hbf,2022-09-04 15:52:00+02:00,Mannheim Hbf,1860,2022-09-04,September,Sunday
1005,185,Köln Messe/Deutz Gl.11-12,München Hbf,2022-09-04 16:44:00+02:00,2022-09-04 21:27:00+02:00,16980,3,leg2,ICE 201,Frankfurt(Main) Flughafen Fernbf,Mannheim Hbf,2022-09-04 17:52:00+02:00,Mannheim Hbf,1860,2022-09-04,September,Sunday
1010,190,Köln Messe/Deutz Gl.11-12,München Hbf,2022-09-04 18:44:00+02:00,2022-09-04 23:29:00+02:00,17100,3,leg2,ICE 203,Frankfurt(Main) Flughafen Fernbf,Mannheim Hbf,2022-09-04 19:52:00+02:00,Mannheim Hbf,1860,2022-09-04,September,Sunday
1015,195,Köln Messe/Deutz Gl.11-12,München Hbf,2022-09-05 06:44:00+02:00,2022-09-05 11:27:00+02:00,16980,3,leg2,ICE 101,Frankfurt(Main) Flughafen Fernbf,Mannheim Hbf,2022-09-05 07:52:00+02:00,Mannheim Hbf,1860,2022-09-05,September,Monday
1020,200,Köln Messe/Deutz Gl.11-12,München Hbf,2022-09-05 08:44:00+02:00,2022-09-05 13:27:00+02:00,16980,3,leg2,ICE 103,Frankfurt(Main) Flughafen Fernbf,Mannheim Hbf,2022-09-05 09:52:00+02:00,Mannheim Hbf,1860,2022-09-05,September,Monday
1025,205,Köln Messe/Deutz Gl.11-12,München Hbf,2022-09-05 12:44:00+02:00,2022-09-05 17:27:00+02:00,16980,3,leg2,ICE 107,Frankfurt(Main) Flughafen Fernbf,Mannheim Hbf,2022-09-05 13:52:00+02:00,Mannheim Hbf,1860,2022-09-05,September,Monday
1030,210,Köln Messe/Deutz Gl.11-12,München Hbf,2022-09-05 14:44:00+02:00,2022-09-05 19:27:00+02:00,16980,3,leg2,ICE 109,Frankfurt(Main) Flughafen Fernbf,Mannheim Hbf,2022-09-05 15:52:00+02:00,Mannheim Hbf,1860,2022-09-05,September,Monday


In [85]:
df[df['start_or_endpoint'] == 'end']

Unnamed: 0,zugnr,datum,bhf,arrTime,adelay,depTime,ddelay,start_or_endpoint,arrTime_clean,depTime_clean,date,weekday,month,sharp_date
10693,EC 8,2020-08-09,Köln Hbf,1710,70,9999,70,end,1710,1710,2020-08-09 17:10:00,Sunday,August,2020-08-09 17:00:00
21346,IC 2210,2019-12-16,Köln Hbf,2305,61,9999,0,end,2305,2305,2019-12-16 23:05:00,Monday,December,2019-12-16 23:00:00
21354,IC 2210,2019-12-17,Köln Hbf,2305,7,9999,7,end,2305,2305,2019-12-17 23:05:00,Tuesday,December,2019-12-17 23:00:00
21362,IC 2210,2019-12-18,Köln Hbf,2305,6,9999,6,end,2305,2305,2019-12-18 23:05:00,Wednesday,December,2019-12-18 23:00:00
21369,IC 2210,2019-12-19,Köln Hbf,2305,34,9999,34,end,2305,2305,2019-12-19 23:05:00,Thursday,December,2019-12-19 23:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270325,ICE 827,2022-05-11,München Hbf,807,27,9999,0,end,0807,0807,2022-05-11 08:07:00,Wednesday,May,2022-05-11 08:00:00
270337,ICE 827,2022-05-12,München Hbf,807,2,9999,0,end,0807,0807,2022-05-12 08:07:00,Thursday,May,2022-05-12 08:00:00
270349,ICE 827,2022-05-13,München Hbf,807,75,9999,0,end,0807,0807,2022-05-13 08:07:00,Friday,May,2022-05-13 08:00:00
270361,ICE 827,2022-05-14,München Hbf,807,16,9999,0,end,0807,0807,2022-05-14 08:07:00,Saturday,May,2022-05-14 08:00:00


In [58]:
calendar.day_name[journeys_long['weekday'][0]]

'Sunday'

In [60]:
journeys_long

Unnamed: 0,key_ID,journey_origin,journey_destination,journey_start,journey_end,journey_duration,journey_numberlegs,leg,train,origin,destination,start,end,duration,date,month,weekday
0,0,Köln Hbf,München Hbf,2022-09-04 04:22:00+02:00,2022-09-04 09:13:00+02:00,17460,2,leg1,ICE 521,Köln Hbf,Frankfurt(Main) Flughafen Fernbf,2022-09-04 04:22:00+02:00,2022-09-04 05:26:00+02:00,3840,2022-09-04,9,September
1,1,Köln Hbf,München Hbf,2022-09-04 04:22:00+02:00,2022-09-04 09:39:00+02:00,19020,1,leg1,ICE 521,Köln Hbf,München Hbf,2022-09-04 04:22:00+02:00,2022-09-04 09:39:00+02:00,19020,2022-09-04,9,September
2,2,Köln Hbf,München Hbf,2022-09-04 05:48:00+02:00,2022-09-04 10:27:00+02:00,16740,1,leg1,ICE 511,Köln Hbf,München Hbf,2022-09-04 05:48:00+02:00,2022-09-04 10:27:00+02:00,16740,2022-09-04,9,September
3,3,Köln Hbf,München Hbf,2022-09-04 06:54:00+02:00,2022-09-04 11:27:00+02:00,16380,2,leg1,ICE 101,Köln Hbf,Mannheim Hbf,2022-09-04 06:54:00+02:00,2022-09-04 08:23:00+02:00,5340,2022-09-04,9,September
4,4,Köln Hbf,München Hbf,2022-09-04 07:55:00+02:00,2022-09-04 12:27:00+02:00,16320,1,leg1,ICE 513,Köln Hbf,München Hbf,2022-09-04 07:55:00+02:00,2022-09-04 12:27:00+02:00,16320,2022-09-04,9,September
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2438,798,München Hbf,Köln Messe/Deutz Gl.11-12,2022-09-10 10:28:00+02:00,2022-09-10 15:15:00+02:00,17220,3,leg3,ICE 720,Frankfurt(Main) Flughafen Fernbf,Köln Messe/Deutz Gl.11-12,2022-09-10 14:24:00+02:00,2022-09-10 15:15:00+02:00,3060,2022-09-10,9,September
2443,803,München Hbf,Köln Messe/Deutz Gl.11-12,2022-09-10 12:27:00+02:00,2022-09-10 17:15:00+02:00,17280,3,leg3,ICE 626,Frankfurt(Main) Flughafen Fernbf,Köln Messe/Deutz Gl.11-12,2022-09-10 16:24:00+02:00,2022-09-10 17:15:00+02:00,3060,2022-09-10,9,September
2449,809,München Hbf,Köln Messe/Deutz Gl.11-12,2022-09-10 14:27:00+02:00,2022-09-10 19:15:00+02:00,17280,3,leg3,ICE 622,Frankfurt(Main) Flughafen Fernbf,Köln Messe/Deutz Gl.11-12,2022-09-10 18:24:00+02:00,2022-09-10 19:15:00+02:00,3060,2022-09-10,9,September
2454,814,München Hbf,Köln Messe/Deutz Gl.11-12,2022-09-10 16:27:00+02:00,2022-09-10 21:14:00+02:00,17220,3,leg3,ICE 528,Frankfurt(Main) Flughafen Fernbf,Köln Messe/Deutz Gl.11-12,2022-09-10 20:25:00+02:00,2022-09-10 21:14:00+02:00,2940,2022-09-10,9,September


In [27]:
journeys_long['weekday'][0].dt.weekday()

AttributeError: 'numpy.int64' object has no attribute 'dt'

In [54]:
from calendar import weekday, day_name

dayNumber = weekday(2015, 5, 8)
dayName = day_name[6]


TypeError: '_localized_day' object is not callable

In [53]:
dayName

'Sunday'

In [47]:
journeys_long['weekday'][0]

6