# PACKAGES

In [229]:
# packages
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime as dt
from datetime import timedelta

# DATA

In [230]:
df1 = pd.read_csv('../goodtrainbadtrain/data/select2_2020.csv',sep=',')
df2 = pd.read_csv('../goodtrainbadtrain/data/select2_2021.csv',sep=',')
df3 = pd.read_csv('../goodtrainbadtrain/data/select2_2022.csv',sep=',')

df = pd.concat([df1, df2, df3])

# FILTER

In [231]:
#filter for München and Köln
df = df.query("bhf in ('München Hbf', 'Köln Hbf',  'Köln Messe/Deutz Gl.11-12',  'Frankfurt(M) Flughafen Fernbf', 'Mannheim Hbf',  'Stuttgart Hbf','Würzburg Hbf', 'Frankfurt(Main) Hbf','Nürnberg Hbf','Berlin Hbf', 'Berlin Hbf (tief)','Hamburg Hbf','Essen Hbf','Hagen(Westf) Bahnhof','Hagen Hbf','Hannover Hbf','Erfurt Hbf','Göttingen')")

# PROCESS AND CLEAN

In [232]:
#1) cleaning of train names 
#some letters of 'zugnr' are not capitalized
df['zugnr'] = df['zugnr'].str.upper()

#2)process of 9999 in arrTime and depTime: add new column with info 
df['start_or_endpoint'] = 'nan'
df.loc[df['arrTime'] == 9999, 'start_or_endpoint'] = 'start'
df.loc[df['depTime'] == 9999, 'start_or_endpoint'] = 'end'
#overwrite 9999 with respective arr/dep time of same observation (in new clean columns)
df['arrTime_clean'] = np.where(df['arrTime'] == 9999, df['depTime'], df['arrTime'])
df['depTime_clean'] = np.where(df['depTime'] == 9999, df['arrTime'], df['depTime'])

#3)some times need to be filled up with 0's. eg. '5' -> 00:05
df['arrTime_clean'] = df['arrTime_clean'].astype(str)
df['arrTime_clean'] = df['arrTime_clean'].map(lambda a: a.zfill(4))
df['depTime_clean'] = df['depTime_clean'].astype(str)
df['depTime_clean'] = df['depTime_clean'].map(lambda a: a.zfill(4))

# FEATURE ENGENEERING

In [233]:
#1)add city feature (merged Köln Hbf/ Messe Deutz)
df['city'] = df.bhf

city_dictionary = {'Köln Messe/Deutz Gl.11-12':'Köln',
              'Köln Hbf':'Köln',
              'München Hbf':'München',
              'Frankfurt(M) Flughafen Fernbf':'Frankfurt',
              'Frankfurt(Main) Hbf':'Frankfurt',
              'Nürnberg Hbf':'Nürnberg',
              'Stuttgart Hbf':'Stuttgart',
              'Würzburg Hbf':'Würzburg',
              'Mannheim Hbf':'Mannheim',
              'Essen Hbf':'Essen',
              'Hamburg Hbf':'Hamburg',
              'Berlin Hbf':'Berlin',
              'Berlin (tief)':'Berlin',
              'Hannover Hbf':'Hannover',
              'Berlin Hbf (tief)':'Berlin',
              'Hagen Hbf':'Hagen',
              'Erfurt Hbf':'Erfurt', 
              'Göttingen':'Göttingen'} 

for key in city_dictionary.keys():
    df['city'] = df['city'].str.replace(key, city_dictionary[key])
     
#code above does not work for berlin and frankfurt; workaround:
df['city'] = df['city'].apply(lambda x: x.replace('Frankfurt(M) Flughafen Fernbf','Frankfurt'))
df['city'] = df['city'].apply(lambda x: x.replace('Berlin (tief)','Berlin'))

#2) add date column
df['date'] = df['datum'] + ' ' + df['arrTime_clean']
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H%M')

#3a) add weekday 
df['weekday'] = df['date'].dt.day_name()

#3a) add binary weekend-week 
df['weekend'] = (df['weekday'] == 'Sunday') | (df['weekday'] == 'Saturday')

#4) add month of the year 
df['month'] = df['date'].dt.month_name()

#time of the day
df['time_of_day'] = pd.cut(pd.to_datetime(df.date).dt.hour,
       bins=[0, 6, 12, 18, 24],
       labels=['night', 'morning', 'afternoon', 'evening'],
       right=False,
       include_lowest=True)

#5 time of day in circular 
seconds_in_day = 24*60*60
df['seconds'] = pd.to_datetime(df.date).dt.time.astype(str)
df['seconds'] = df['seconds'].map(lambda a: sum(x * int(t) for x, t in zip([3600, 60, 1], a.split(":"))))
df['sin_time'] = np.sin(2*np.pi*df.seconds/seconds_in_day)
df['cos_time'] = np.cos(2*np.pi*df.seconds/seconds_in_day)
df.drop('seconds', axis=1, inplace=True)

#6 circular day of the year
from datetime import date
days_in_year = 366
df['day_of_year'] = df.date.map(lambda x: x.strftime('%j')).astype(int)
df['sin_day'] = np.sin(2*np.pi*df.day_of_year/days_in_year)
df['cos_day'] = np.cos(2*np.pi*df.day_of_year/days_in_year)
df.drop('day_of_year', axis=1, inplace=True)

#todo
#7) add public holiday
holidays = pd.read_csv('../goodtrainbadtrain/data/holidays.csv',sep=',')
holidays = holidays['date'].to_list()

def check_holiday(date):
    if date in holidays:
        return True
    else:
        return False

df['public_holiday'] =  df.datum.map(lambda x: check_holiday(x))

#8) add covid lockdown
def process_time(intime, start, end):
    if start <= intime <= end:
        return True
    else:
        return False

start1 = dt.fromisoformat('2020-03-15'); end1 = dt.fromisoformat('2020-06-23')
start2 = dt.fromisoformat('2021-01-10'); end2 = dt.fromisoformat('2021-05-24')

df['covid_lockdown1'] = df['date'].map(lambda x: process_time(x, start1, end1))
df['covid_lockdown2'] = df['date'].map(lambda x: process_time(x, start2, end2))

df['covid_lockdown'] = df['covid_lockdown1'] | df['covid_lockdown2']
df.drop('covid_lockdown1',axis=1, inplace=True)
df.drop('covid_lockdown2',axis=1, inplace=True)

  df['city'] = df['city'].str.replace(key, city_dictionary[key])


# ADD DIRECTON INFO

In [234]:
journeys = pd.read_csv('../goodtrainbadtrain/data/journeylist_withberlin.csv')

#processing
journeys = journeys[journeys.leg1_train.notna()] #delete duplicated trips that once go to köln hbf and once to deutz
journeys = journeys[journeys.leg2_train.notna()] 
journeys = journeys[journeys.leg3_train.notna()] 

journeys = journeys.drop(['Unnamed: 0'],axis=1) 
journeys['key_ID'] = list(range(journeys.shape[0]))

#rename columns (necessary for wide_to_long function)
journeys.columns = [ 'date', 'weekday', 'month', 'journey_origin',
       'journey_destination', 'journey_start', 'journey_end',
       'journey_duration', 'journey_numberlegs', 
       'train_leg1', 'origin_leg1','destination_leg1', 'start_leg1', 'end_leg1', 'duration_leg1',
       'train_leg2', 'origin_leg2', 'destination_leg2', 'start_leg2','end_leg2', 'duration_leg2', 
       'train_leg3', 'origin_leg3','destination_leg3', 'start_leg3', 'end_leg3', 'duration_leg3', 
       'key_ID']

journeys_long = pd.wide_to_long(df = journeys,
                                stubnames=['train', 'origin','destination', 'start', 'end', 'duration'],
                                i=['key_ID'],
                                j='leg',
                                sep = '_',
                                suffix='.+').reset_index()

#delete empty legs
journeys_long = journeys_long[journeys_long.train != '-1']
journeys_long = journeys_long[journeys_long.origin != '-1']

#reorder columns
journeys_long = journeys_long[['key_ID', 
 'journey_origin', 'journey_destination','journey_start','journey_end','journey_duration', 'journey_numberlegs',
 'leg', 'train', 'origin', 'destination', 'start', 'end','duration',
 'date', 'month','weekday']]

def date_transformation(df, columns):
    for col in columns:
        df[col] = df[col].str.split('+', expand=True)[[0]]
        df[col] = pd.to_datetime(df[col])
    return df

journeys_long = date_transformation(journeys_long, ['journey_start', 'journey_end']) #, 'start', 'end'])
journeys_long['month'] = journeys_long['journey_start'].dt.month_name()
journeys_long['weekday'] = journeys_long['journey_start'].dt.day_name()

#add columns with city name
journeys_long['origin_city'] = journeys_long.origin
journeys_long['destination_city'] = journeys_long.destination

for key in city_dictionary.keys():
    journeys_long['origin_city'] = journeys_long['origin_city'].str.replace(key, city_dictionary[key])
    journeys_long['destination_city'] = journeys_long['destination_city'].str.replace(key, city_dictionary[key])
    
#code above does not work for berlin and frankfurt; workaround:
journeys_long['origin_city'] = journeys_long['origin_city'].apply(lambda x: x.replace('Frankfurt(M) Flughafen Fernbf','Frankfurt'))
journeys_long['origin_city'] = journeys_long['origin_city'].apply(lambda x: x.replace('Frankfurt(M) Flughafen Fernbf','Frankfurt'))
journeys_long['destination_city'] = journeys_long['destination_city'].apply(lambda x: x.replace('Berlin (tief)','Berlin'))
journeys_long['destination_city'] = journeys_long['destination_city'].apply(lambda x: x.replace('Berlin (tief)','Berlin'))


#add column trip (eg. Berlin-Köln)
journeys_long['trip'] = journeys_long['origin_city'] + '-' + journeys_long['destination_city']

ice_df = journeys_long[['origin_city','destination_city','trip','train']].groupby(['origin_city','destination_city','trip'])['train'].apply(','.join).reset_index()

df_list = []

for index, row in ice_df.iterrows():
    data = df[df.city == row.destination_city]
    data = data[data['zugnr'].isin(row.train.split(','))]
    data['trip'] = row.trip
    data['origin_city'] = row.origin_city
    data['destination_city'] = row.destination_city
    df_list.append(data) 

df = pd.concat(df_list, ignore_index = True)

  journeys_long['origin_city'] = journeys_long['origin_city'].str.replace(key, city_dictionary[key])
  journeys_long['destination_city'] = journeys_long['destination_city'].str.replace(key, city_dictionary[key])


# ADD WEATHER

In [235]:
#5) add weather variables
df['sharp_date'] = df['date'].dt.round('H')
#df['sharp_date']  = df.sharp_date.astype('str')

cgn_data = pd.read_csv('../goodtrainbadtrain/data/koln.csv', parse_dates=['time'])
muc_data = pd.read_csv('../goodtrainbadtrain/data/munchen.csv', parse_dates=['time'])
fra_data = pd.read_csv('../goodtrainbadtrain/data/frankfurt.csv', parse_dates=['time'])
man_data = pd.read_csv('../goodtrainbadtrain/data/mannheim.csv', parse_dates=['time'])
nur_data = pd.read_csv('../goodtrainbadtrain/data/nurnberg.csv', parse_dates=['time'])
stu_data = pd.read_csv('../goodtrainbadtrain/data/stuttgart.csv', parse_dates=['time'])
wur_data = pd.read_csv('../goodtrainbadtrain/data/wurzburg.csv', parse_dates=['time'])
ber_data = pd.read_csv('../goodtrainbadtrain/data/berlin.csv', parse_dates=['time'])
erf_data = pd.read_csv('../goodtrainbadtrain/data/erfurt.csv', parse_dates=['time'])
ess_data = pd.read_csv('../goodtrainbadtrain/data/essen.csv', parse_dates=['time'])
got_data = pd.read_csv('../goodtrainbadtrain/data/gottingen.csv', parse_dates=['time'])
hag_data = pd.read_csv('../goodtrainbadtrain/data/hagen.csv', parse_dates=['time'])
ham_data = pd.read_csv('../goodtrainbadtrain/data/hamburg.csv', parse_dates=['time'])
han_data = pd.read_csv('../goodtrainbadtrain/data/hannover.csv', parse_dates=['time'])


#weather = {"Köln": cgn_data, 
#           "München": muc_data, 
#           "Frankfurt": fra_data, 
#           "Mannheim": man_data, 
#           "Nürnberg": nur_data, 
#           "Stuttgart": stu_data,
#           "Würzburg": wur_data,
#           "Berlin": ber_data,
#           "Erfurt" :erf_data,
#            "Essen": ess_data,
#            "Göttingen":got_data,
#            "Hagen":hag_data,
#            "Hamburg":ham_data,
#            "Hannover":han_data
#           }

#for k, w_df in weather.items():
#    w_df['snow'] = w_df['snow'].replace(np.nan, 0)

#df = total_df.drop(columns=['dwpt', 'rhum', 'wdir', 'pres', 'tsun'])
#df['snow'] = df['snow'].replace(np.nan, 0)

# Load coco file
coco = pd.read_csv('../goodtrainbadtrain/data/weather_coco.csv', sep=';')
coco.set_index('Code', inplace=True)
coco = coco.to_dict()['Weather Condition']

# Define new classification for coco
new_classes = {
    '1': [1, 2],
    '2': [3, 4, 7, 14],
    '3': [5, 8, 10, 12, 15, 17, 19, 21, 23, 24, 25],
    '4': [6, 9, 11, 13, 16, 18, 20, 22, 26, 27]
}

# Apply new classification for coco
reclass = {}
for k, values in new_classes.items():
    for v in values:
        for c in range(1, 28):
            if v == c:
                reclass[v] = k

reclass = dict(sorted(reclass.items()))

cgn_data['coco'] = cgn_data['coco'].map(reclass).astype('int')
muc_data['coco'] = muc_data['coco'].map(reclass).astype('int')
ber_data['coco'] = ber_data['coco'].map(reclass).astype('int')

cgn_data['snow'] = cgn_data['snow'].replace(np.nan, 0)
muc_data['snow'] = muc_data['snow'].replace(np.nan, 0)
ber_data['snow'] = ber_data['snow'].replace(np.nan, 0)

cgn_data = cgn_data[['time','temp', 'prcp', 'snow', 'wspd', 'wpgt', 'coco']]
muc_data = muc_data[['time','temp', 'prcp', 'snow', 'wspd', 'wpgt', 'coco']]
ber_data = ber_data[['time','temp', 'prcp', 'snow', 'wspd', 'wpgt', 'coco']]

weather_merged = cgn_data.merge(muc_data, how='left', left_on='time', right_on='time')
weather_merged = weather_merged.merge(ber_data, how='left', left_on='time', right_on='time')

weather_merged['temp_max'] = weather_merged[['temp', 'temp_x','temp_y']].max(axis=1)
weather_merged['temp_min'] = weather_merged[['temp', 'temp_x','temp_y']].min(axis=1)
weather_merged['prcp_max'] = weather_merged[['prcp', 'prcp_x','prcp_y']].max(axis=1)
weather_merged['snow_max'] = weather_merged[['snow', 'snow_x','snow_y']].max(axis=1)
weather_merged['wspd_max'] = weather_merged[['wspd', 'wspd_x','wspd_y']].max(axis=1)
weather_merged['wpgt_max'] = weather_merged[['wpgt', 'wpgt_x','wpgt_y']].max(axis=1)
weather_merged['coco_max'] = weather_merged[['coco', 'coco_x','coco_y']].max(axis=1)

weather_merged = weather_merged[['time','temp_max','temp_min', 'prcp_max', 'snow_max', 'wspd_max', 'wpgt_max', 'coco_max']]

df['time_6_before'] = df.sharp_date - timedelta(hours=int(6))
df['time_12_before'] = df.sharp_date - timedelta(hours=int(12))

df = df.merge(weather_merged, how='left',left_on='sharp_date',right_on='time')
df = df.merge(weather_merged, how='left',left_on='time_6_before',right_on='time')
df = df.merge(weather_merged, how='left',left_on='time_12_before',right_on='time')

df['temp_max_combined'] = df[['temp_max', 'temp_max_x','temp_max_y']].max(axis=1)
df['temp_max_combined'] = df[['temp_max', 'temp_max_x','temp_max_y']].max(axis=1)
df['temp_min_combined'] = df[['temp_min', 'temp_min_x','temp_min_y']].min(axis=1)
df['prcp_max_combined'] = df[['prcp_max', 'prcp_max_x','prcp_max_y']].max(axis=1)
df['snow_max_combined'] = df[['snow_max', 'snow_max_x','snow_max_y']].max(axis=1)
df['wspd_max_combined'] = df[['wspd_max', 'wspd_max_x','wspd_max_y']].max(axis=1)
df['wpgt_max_combined'] = df[['wpgt_max', 'wpgt_max_x','wpgt_max_y']].max(axis=1)
df['coco_max_combined'] = df[['coco_max', 'coco_max_x','coco_max_y']].max(axis=1)

In [None]:

#for keys, values in weather.items():
#for keys, values in weather.items():
#    weather_df = values[['time','temp', 'prcp', 'snow', 'wspd', 'wpgt', 'coco']]
#    test_df = test_df.merge(weather_df, how='left',left_on='sharp_date',right_on='time')


# def weather_transform(row, cases, weather_df):
#     for 
#     v, c, t = case.split('_')
#     return weather_df.loc[weather_df['time'] == row['sharp_date'] - timedelta(hours=int(t)), v].reset_index(drop=True).loc[0].copy()
    
# c_hours = [6, 12]
# c_cities = ['oc', 'dc'] # oc: origin city, dc: destination city
# c_variables = ['temp', 'prcp', 'snow', 'wspd', 'wpgt', 'coco']

# cases = [v + '_' + c + '_' + str(h) for h in c_hours for c in c_cities for v in c_variables] #list of origin and time combination
# for c in list(weather.keys()):
#     test_df[cases] = test_df.apply(lambda row: weather_transform(row, cases, weather[c]), axis=1)

#def weather_transform(row, case, weather_df):
#    v, c, t = case.split('_')
#    return weather_df.loc[weather_df['time'] == row['sharp_date'] - timedelta(hours=int(t)), v].reset_index(drop=True).loc[0].copy()
    
#c_hours = [6, 12]
#c_cities = ['oc', 'dc'] # oc: origin city, dc: destination city
#c_variables = ['temp', 'prcp', 'snow', 'wspd', 'wpgt', 'coco']

#cases = [v + '_' + c + '_' + str(h) for h in c_hours for c in c_cities for v in c_variables] #list of origin and time combination


#def weather_transform(row, case, weather_df):
#    v, c, t = case.split('_')
#    return weather_df.loc[weather_df['time'] == row['sharp_date'] - timedelta(hours=int(t)), v].reset_index(drop=True).loc[0].copy()
    
#c_hours = [6, 12]
#c_cities_oc = ['oc'] # oc: origin city, dc: destination city
#c_cities_dc = ['dc']
#c_variables = ['temp', 'prcp', 'snow', 'wspd', 'wpgt', 'coco']

#cases_dc = [v + '_' + c + '_' + str(h) for h in c_hours for c in c_cities_dc for v in c_variables] 
#cases_oc = [v + '_' + c + '_' + str(h) for h in c_hours for c in c_cities_oc for v in c_variables] #list of origin and time combination

#df_list = list()
#for c in list(weather.keys())[5]:
#    test_df = df[df.origin_city == c]
#    for case in cases:
#        test_df[case] = test_df.apply(lambda row: weather_transform(row, case, weather[c]), axis=1)
#    df_list.append(test_df)

#c= 'Stuttgart'
#case= cases_oc[0] #'temp_oc_6'
#test_df = df[df.origin_city == c]

#for case in cases:
#        test_df[case] = test_df.apply(lambda row: weather_transform(row, case, weather[c]), axis=1)
#df_list.append(test_df)

#total_df =  pd.DataFrame()
#for station, w_df in weather.items():
#    weather_data = weather[station]
#    weather_data['sharp_date']  = weather_data.time.astype('str')
#    station_df = pd.merge(df[df['city'] == station], weather_data, how='left', left_on='sharp_date', right_on='sharp_date')
#    total_df = pd.concat([total_df, station_df])

#total_df =  pd.DataFrame()
#for station, w_df in weather.items():
#    weather_data = weather[station]
#    weather_data['sharp_date']  = weather_data.time.astype('str')
#    station_df = pd.merge(df[df['city'] == station], weather_data, how='left', left_on='sharp_date', right_on='sharp_date')
#    total_df = pd.concat([total_df, station_df])

#total_df =  pd.DataFrame()
#for trip in df.trip.unique():
#    subset = df[df.trip == trip]
#    origin = subset.origin_city[0]
#    destination = subset.destination_city[0]
#    weather_data_org = weather[origin]
#    weather_data_dest = weather[destination]
#    for index, row in subset.iterrows():
#        time = row.sharp_date
#        index_org = weather_data_org.index[weather_data_org.sharp_date == time]
#        index_dest = weather_data_dest.index[weather_data_dest.sharp_date == time]
#        org_6 = weather_data_org.loc[index_org-6]
#        org_12 = weather_data_org.loc[index_org-12]
#        dest_6 = weather_data_org.loc[index_org-6]
#        dest_12 = weather_data_org.loc[index_org-12]
#        org_6 = org_6.drop(columns=['dwpt', 'rhum', 'wdir', 'pres', 'tsun'])
#        org_12 = org_12.drop(columns=['dwpt', 'rhum', 'wdir', 'pres', 'tsun'])
#        dest_6 = dest_6.drop(columns=['dwpt', 'rhum', 'wdir', 'pres', 'tsun'])
#        dest_12 = dest_12.drop(columns=['dwpt', 'rhum', 'wdir', 'pres', 'tsun'])
#        org_6 = org_6.rename(columns=['temp_org_6', 'prcp_org_6', 'snow_org_6', 'wspd_org_6', 'wpgt_org_6', 'coco_org_6'])
#        org_12 = org_12.rename(columns=['temp_org_12', 'prcp_org_12', 'snow_org_12', 'wspd_org_12', 'wpgt_org_12', 'coco_org_12'])
#        dest_6 = dest_6.rename(columns=['temp_dest_6', 'prcp_dest_6', 'snow_dest_6', 'wspd_dest_6', 'wpgt_dest_6', 'coco_dest_6'])
#        dest_12 = dest_12.rename(columns=['temp_dest_12', 'prcp_dest_12', 'snow_dest_12', 'wspd_dest_12', 'wpgt_dest_12','coco_dest_12'])
#        new_row = pd.concat([row, org_6, org_12, dest_6, dest_12], axis=1)
#        total_df = pd.concat([total_df, new_row])

# TARGET PROCESSING

In [236]:
#1a) into several categories
# adelay-> into categories: no delay, small delay, medium delay, (big delay/cancellation)
max = df.adelay.max()
bins = [-2, -0.1, 0, 5, 30, max]
group_names = ['large delay/cancelled','on time','small delay', 'medium delay', 'large delay/cancelled']
df['target'] = pd.cut(df['adelay'], bins, labels=group_names, ordered=False)

#1b) into two categories: good - bad train
df['target_good_bad'] = df['target'].str.replace('large delay/cancelled','bad train')
df['target_good_bad'] = df['target_good_bad'].str.replace('medium delay','bad train')

df['target_good_bad'] = df['target_good_bad'].str.replace('on time','good train')
df['target_good_bad'] = df['target_good_bad'].str.replace('small delay','good train')

df['target_good_bad'] = df['target_good_bad'] == 'good train'

#value counts of target:
#on time                  67336
#small delay              28649
#medium delay             23617
#large delay/cancelled    10677

#2) binary target (on time - or not)
df['target_binary'] =  (df['adelay'] == 0)*1

#3) numeric target (cancelled and extreme values = 120 Min)
df['target_numeric'] = df['adelay']
df['target_numeric'] = np.where(df['target_numeric'] == -1, 120, df['target_numeric'])
df['target_numeric'] = np.where(df['target_numeric'] > 120, 120, df['target_numeric'])
#sns.boxplot(df['target_numeric'])

##Add summarized delay of each train number
df['target_numeric'] = df['adelay']
df['target_numeric'] = np.where(df['target_numeric'] == -1, 120, df['target_numeric'])
df['target_numeric'] = np.where(df['target_numeric'] > 120, 120, df['target_numeric'])

In [239]:
mean_delay_ices = df[['zugnr','target_numeric']].groupby('zugnr').mean('target_numeric').reset_index()
mean_delay_ices.columns = ['zugnr', 'mean_delay']
mean_delay_ices.to_csv('mean_delay_ices.csv')
df = df.merge(mean_delay_ices, how='left', left_on='zugnr', right_on='zugnr')

# CORRELATION / TARGET VISUALIZATION

In [215]:
#sns.barplot(x= df.weekday, y=df.target_numeric)

In [216]:
#sns.barplot(x= df.public_holiday, y=df.target_good_bad)

In [217]:
#sns.barplot(x= df.weekday, y=df.target_good_bad)

In [218]:
#sns.barplot(x= df.weekend, y=df.target_good_bad)

In [219]:
#sns.barplot(x= df.covid_lockdown, y=df.target_good_bad)

In [220]:
#sns.barplot(x= df.month, y=df.target_good_bad)

In [221]:
#binary= df.target_good_bad == True
#sns.barplot( x=binary, y= df.sin_time)

In [222]:
#binary= df.target_good_bad == True
#sns.barplot( x=binary, y= df.cos_time)

In [223]:
#sns.scatterplot(x= df.sin_time, y=df.cos_time)

In [254]:
#sns.barplot(x= df.target_good_bad, y= df.temp_max_combined)

# SAVE DATA

In [245]:
df = df[['zugnr', 'datum', 'bhf', 'adelay','ddelay',
'arrTime_clean', 'depTime_clean', 'city', 'date',
'weekday', 'weekend', 'month', 'time_of_day', 'sin_time', 'cos_time',
'sin_day', 'cos_day', 'public_holiday', 'covid_lockdown', 'trip',
'origin_city', 'destination_city', 'sharp_date', 
'temp_max_combined', 'temp_min_combined',
'prcp_max_combined', 'snow_max_combined', 'wspd_max_combined',
'wpgt_max_combined', 'coco_max_combined', 'target',
'target_good_bad', 'target_binary', 'target_numeric', 'mean_delay']]

In [255]:
df.to_csv('../goodtrainbadtrain/data/data_for_model_final.csv')

In [259]:
#my suggestion for features in the model:

df[[#trip info eg. 'Köln-Berlin'
    'trip',
    #mean delay of that ICE as numeric feature 
    #(file that translates ICE to mean delay: 'mean_delay_ices.csv')
    'mean_delay', 

    #translations of arrival time and date
    'weekday', #categorical variable; we could also use 'weekend' as binary (1 means weekend day)
    'sin_time', 'cos_time', #circular feature of time of the day
    'sin_day', 'cos_day', #circular feature of day of the year

    #special flagged days (both binary)
    'public_holiday', 'covid_lockdown', 

    #extreme weather values from different time points
    'temp_max_combined', 'temp_min_combined','prcp_max_combined', 'snow_max_combined', 'wspd_max_combined','wpgt_max_combined', 'coco_max_combined', 

    #target: 0=bad train, 1= good train
    'target_good_bad']]

Unnamed: 0,trip,mean_delay,weekend,sin_time,cos_time,sin_day,cos_day,public_holiday,covid_lockdown,temp_max_combined,temp_min_combined,prcp_max_combined,snow_max_combined,wspd_max_combined,wpgt_max_combined,coco_max_combined,target_good_bad
0,Berlin-Hannover,7.991597,True,-0.165048,0.986286,-0.051479,0.998674,False,False,3.0,-3.5,0.0,0.0,15.5,24.0,2,True
1,Berlin-Hannover,7.991597,False,-0.165048,0.986286,0.017166,0.999853,True,False,3.9,-2.1,0.0,0.0,20.9,30.0,2,True
2,Berlin-Hannover,7.991597,True,-0.065403,0.997859,0.085731,0.996318,False,False,7.4,-0.8,0.0,0.0,13.0,22.2,2,True
3,Berlin-Hannover,7.991597,True,-0.165048,0.986286,0.204552,0.978856,False,False,7.7,-0.2,0.0,0.0,22.2,42.6,2,True
4,Berlin-Hannover,7.991597,True,-0.165048,0.986286,0.320423,0.947274,False,False,6.0,-0.5,1.1,20.0,10.8,27.0,3,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200037,Würzburg-Köln,9.825666,True,-0.321439,-0.946930,0.565345,-0.824855,False,False,17.3,9.4,0.2,0.0,24.1,51.8,4,True
200038,Würzburg-Köln,9.825666,True,-0.321439,-0.946930,0.551102,-0.834438,False,False,14.9,7.1,0.4,0.0,18.5,41.0,3,True
200039,Würzburg-Köln,9.825666,False,-0.321439,-0.946930,0.536696,-0.843776,False,False,16.0,5.1,0.0,0.0,16.7,33.3,2,True
200040,Würzburg-Köln,9.825666,False,-0.321439,-0.946930,0.522133,-0.852864,False,False,21.3,9.0,0.0,0.0,14.8,32.0,2,False
