# Masterthesis
#### Julian Jetz

In [1]:
import glob
import pandas as pd
import numpy as np
import tensorflow as tf
from scipy import stats
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from pandas.plotting import scatter_matrix
from currency_converter import CurrencyConverter
from datetime import date


In [2]:
# Strings handed to pd.read_csv as `na_values` when the accident CSV files are loaded.
na_values = ['nan', 'N/A', 'NaN', 'NaT']

In [3]:
def get_merged_csv(flist, **kwargs):
    """Read several CSV files and stack their rows into one DataFrame.

    Parameters
    ----------
    flist : iterable of str or path-like
        Paths of the CSV files to read; rows are concatenated in iteration order.
    **kwargs
        Forwarded unchanged to ``pd.read_csv`` for every file.

    Returns
    -------
    pandas.DataFrame
        The rows of all files under a fresh ``0..n-1`` index. Columns are the
        union of the columns of all files, in sorted order.

    Raises
    ------
    ValueError
        If ``flist`` contains no paths.
    """
    paths = list(flist)
    if not paths:
        raise ValueError("get_merged_csv: no CSV files to merge (empty file list)")
    frames = [pd.read_csv(path, **kwargs) for path in paths]
    # The files do not all share the same columns. Passing sort=True explicitly
    # keeps the sorted column order independent of the pandas version's default
    # and silences the FutureWarning older versions emit for an implicit sort.
    return pd.concat(frames, ignore_index=True, sort=True)

In [4]:
# glob.glob yields paths in arbitrary, OS-dependent order. Sorting them makes the
# row order of the merged frame (and its 0..n-1 index) reproducible between runs.
csv_paths = sorted(glob.glob('csvfiles/*.csv'))
df = get_merged_csv(
    csv_paths,
    na_values=na_values,
    index_col=False,
    # These columns are forced to string dtype instead of letting pandas infer one.
    dtype={"STATE": "str", "TYPE": "str", "TYPEQ": "str", "WEATHER": "str", "VISIBLTY": "str"},
)
df.shape[0]

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


116210

In [5]:
# Lookup table of accident cause codes, indexed by ACCAUSE. getAccCauseLvl1 and
# getAccCauseLvl2 read its ACCAUSE_LVL1 / ACCAUSE_LVL2 columns.
acc_causes = pd.read_csv('resources/ACC_CAUSE_LIST.csv',sep=';',encoding='latin1',index_col='ACCAUSE')

In [6]:
#track_speed_limits = pd.read_csv('resources/TRACK_SPEED_LIMIT.csv',sep=';',encoding='latin1',index_col='Track Classification')

In [7]:
#def getTooFast(trnspd, trkcls):

In [8]:
def getAccCauseLvl1(accause):
    """Return the ``ACCAUSE_LVL1`` entry stored in ``acc_causes`` for cause code ``accause``."""
    cause_entry = acc_causes.loc[accause]
    return cause_entry['ACCAUSE_LVL1']

In [9]:
def getAccCauseLvl2(accause):
    """Return the ``ACCAUSE_LVL2`` entry stored in ``acc_causes`` for cause code ``accause``."""
    cause_entry = acc_causes.loc[accause]
    return cause_entry['ACCAUSE_LVL2']

In [10]:
# Translate every ACCAUSE code into its level-1 cause group. Series.apply passes the
# code straight to the lookup, so no full row object is built per record.
df['ACCCAUSE_LVL1'] = df['ACCAUSE'].apply(getAccCauseLvl1)

In [11]:
# Translate every ACCAUSE code into its level-2 cause group. Series.apply passes the
# code straight to the lookup, so no full row object is built per record.
df['ACCCAUSE_LVL2'] = df['ACCAUSE'].apply(getAccCauseLvl2)

In [12]:
# Show the distinct level-1 cause groups that were assigned (missing values included).
df['ACCCAUSE_LVL1'].unique()

array(['Miscellaneous', 'Rail, Joint Bar and Rail Anchoring',
       'Frogs, Switches and Track Appliances', 'Wheels', 'Brakes, Use of',
       'Truck Components', 'Flagging, Fixed, Hand and Radio Signals',
       'Switches, Use of', 'Coupler and Draft System',
       'Loading Procedures', 'Body', 'Train Handling/Train Makeup',
       'General Switching Rules',
       'other software/programming deficiencies, etc.)',
       'SIGNAL AND COMMUNICATION', 'Unusual Operational Situations',
       'Track Geometry', 'Environmental Conditions', 'Brakes',
       'Main Track Authority', 'Roadbed', 'Other Way and Structure',
       'Other Miscellaneous', 'Locomotives', 'Speed',
       'Axles and Journal Bearings', 'Doors',
       'Employee Physical Condition',
       'General Mechanical and Electrical Failures',
       'Trailer or Container on Flatcar', 'Cab Signals', nan],
      dtype=object)

In [13]:
# LOCOMOTIVES1 is the sum of the five *1 columns. Column-wise addition gives the same
# per-row result as a row-wise apply (a missing value in any term still yields NaN)
# without one Python call per row.
df['LOCOMOTIVES1'] = df['HEADEND1'] + df['MIDMAN1'] + df['MIDREM1'] + df['RMAN1'] + df['RREM1']

In [14]:
# LOCOMOTIVES2 is the sum of the five *2 columns. Column-wise addition gives the same
# per-row result as a row-wise apply (a missing value in any term still yields NaN)
# without one Python call per row.
df['LOCOMOTIVES2'] = df['HEADEND2'] + df['MIDMAN2'] + df['MIDREM2'] + df['RMAN2'] + df['RREM2']

In [15]:
def getEuro(damage, year, month, day):
    """Convert a USD amount to EUR at the exchange rate of the given calendar date.

    Parameters
    ----------
    damage : number
        Amount in US dollars.
    year, month, day : int or integral float
        Calendar date of the rate to use. Values such as ``1990.0`` (as produced
        by float-typed DataFrame columns) are accepted and truncated to int.

    Returns
    -------
    The value returned by ``CurrencyConverter.convert`` for USD -> EUR.

    Raises
    ------
    ValueError
        If a date part cannot be converted to an int (e.g. NaN) or the parts do
        not form a valid date.
    """
    # Build the converter once and reuse it: the function is meant to be applied
    # to every row, and constructing a new converter per call is needlessly slow.
    converter = getattr(getEuro, '_converter', None)
    if converter is None:
        converter = CurrencyConverter(fallback_on_missing_rate=True)
        getEuro._converter = converter
    # datetime.date only accepts integers; the date columns may arrive as floats.
    rate_date = date(int(year), int(month), int(day))
    return converter.convert(damage, 'USD', 'EUR', date=rate_date)

#### Schaden in Euro umwandeln

In [16]:
#df['ACCDMG'] = df.apply(lambda row : getEuro(row['ACCDMG'], row['YEAR4'], row['MONTH'], row['DAY']), axis=1)

In [17]:
def isSerious(accdmg, killed, injured):
    """Classify an accident as ``"Significant"`` or ``"Not Significant"``.

    An accident is significant when its damage is at least 150000, or when at
    least one person was injured or killed. The caller passes the summed
    killed / injured counts.
    """
    is_significant = accdmg >= 150000 or injured > 0 or killed > 0
    return "Significant" if is_significant else "Not Significant"

#### Bestimmung der Schwere des Unfalls (Nicht Signifikant oder Signifikant; die Stufe „Schwer“ ist noch nicht umgesetzt)

In [18]:
# Label every accident with isSerious, passing the damage, the sum of the three
# *KLD columns and the sum of the three *INJ columns.
df['ACCTYPE'] = df.apply(
    lambda record: isSerious(
        record['ACCDMG'],
        record['RREMPKLD'] + record['PASSKLD'] + record['OTHERKLD'],
        record['RREMPINJ'] + record['PASSINJ'] + record['OTHERINJ'],
    ),
    axis=1,
)

*Umwandlung Monat zu Jahreszeiten*
*TODO*

In [19]:
# Let pandas display up to 500 columns so that previews of the wide frame are not truncated.
pd.set_option('display.max_columns', 500)
#df.head(50)

In [20]:
#imp = IterativeImputer(missing_values=np.nan, sample_posterior=False, max_iter=10, tol=0.001, n_nearest_features=4, initial_strategy='median')
#imp.fit(df)
#imputed_df = pd.DataFrame(df=imp.transform(df), columns=['TYPTRK'], dtype='int')

In [21]:
# Number of rows whose TYPEQ value is missing.
int(df['TYPEQ'].isnull().sum())

7466

In [22]:
# Number of rows whose TYPTRK value is missing.
int(df['TYPTRK'].isnull().sum())

1345

In [23]:
# Number of rows recorded with a TRNSPD of exactly 0.
int((df['TRNSPD'] == 0).sum())

18002

In [24]:
# Number of rows recorded with TONS equal to exactly 0.
int((df['TONS'] == 0).sum())

49472

#### Entfernung aller Zeilen mit fehlenden Werten (n/a) in TYPEQ oder TYPTRK

In [25]:
# Discard every row that lacks a value in at least one of the required columns.
required_columns = ['TYPEQ', 'TYPTRK']
df = df.dropna(axis=0, how='any', subset=required_columns)

df.shape[0]


107406

In [26]:
# Preview the first 50 rows of the cleaned frame, including the derived columns.
df.head(50)

Unnamed: 0,ACCAUSE,ACCDMG,ACCTRK,ACCTRKCL,ADJUNCT1,ADJUNCT2,ADJUNCT3,ALCOHOL,AMPM,BRAKEMEN,CABOOSE1,CABOOSE2,CARNBR1,CARNBR2,CARS,CARSDMG,CARSHZD,CASINJ,CASINJRR,CASKLD,CASKLDRR,CAUSE,CAUSE2,CDTRHR,CDTRMIN,CNTYCD,CONDUCTR,COUNTY,DAY,DIVISION,DRUG,DUMMY1,DUMMY2,DUMMY3,DUMMY4,DUMMY5,DUMMY6,DUMMY7,EMPTYF1,EMPTYF2,EMPTYP1,EMPTYP2,ENGHR,ENGMIN,ENGRS,EQATT,EQPDMG,EVACUATE,FIREMEN,GXID,HEADEND1,HEADEND2,HIGHSPD,IMO,IMO2,IMO3,INCDTNO,INCDTNO2,INCDTNO3,IYR,IYR2,IYR3,JOINTCD,LOADED1,LOADED2,LOADF1,LOADF2,LOADP1,LOADP2,Latitude,Longitud,METHOD,MIDMAN1,MIDMAN2,MIDREM1,MIDREM2,MILEPOST,MONTH,MOPERA,NARR1,NARR10,NARR11,NARR12,NARR13,NARR14,NARR15,NARR2,NARR3,NARR4,NARR5,NARR6,NARR7,NARR8,NARR9,NARRLEN,OTHERINJ,OTHERKLD,PASSINJ,PASSKLD,PASSTRN,POSITON1,POSITON2,RAILROAD,RCL,REGION,RMAN1,RMAN2,RR2,RR3,RRCAR1,RRCAR2,RRDIV,RREM1,RREM2,RREMPINJ,RREMPKLD,SIGNAL,SSB1,SSB2,STATE,STATION,STCNTY,SUBDIV,TEMP,TIMEHR,TIMEMIN,TONS,TOTINJ,TOTKLD,TRKCLAS,TRKDMG,TRKDNSTY,TRKNAME,TRNDIR,TRNNBR,TRNSPD,TYPE,TYPEQ,TYPRR,TYPSPD,TYPTRK,Unnamed: 145,Unnamed: 146,Unnamed: 147,Unnamed: 148,Unnamed: 149,Unnamed: 150,Unnamed: 151,Unnamed: 152,Unnamed: 153,Unnamed: 154,Unnamed: 155,Unnamed: 156,Unnamed: 157,Unnamed: 158,Unnamed: 159,Unnamed: 160,Unnamed: 161,Unnamed: 162,Unnamed: 163,Unnamed: 164,Unnamed: 165,Unnamed: 166,Unnamed: 167,Unnamed: 168,Unnamed: 169,Unnamed: 170,Unnamed: 171,Unnamed: 172,VISIBLTY,WEATHER,YEAR,YEAR4,ACCCAUSE_LVL1,ACCCAUSE_LVL2,LOCOMOTIVES1,LOCOMOTIVES2,ACCTYPE
0,H999,16500,2,1,,,,,PM,2.0,0,0,376255,,0,0,0,0,0,0,0,H999,,5.0,35.0,53,1.0,HENNEPIN,27,LAKES,,,,0.0,W,0.0,,,56,3,0,0,5.0,35.0,1.0,Y,1000,0,0.0,,2.0,0,5,9,,9,LA1904,,LA1904,90,,90,1.0,N,,56,0,0,0,,,NI,0,0,0,0,0.0,9,,TRAIN 143 INBOUND ON NORTH RUNNER CROSSOVER TO...,,,,,,,71 CROSSOVER.,,,,,,,,113.0,0,0,0,0,N,3.0,,BN,,4,0,0,,BN,BN,,LAKES,0,0,0,0,,,,27,NORTHTOWN,27C053,,50,11,15,7100.0,0,0,1,15500,,671 CROSS,4.0,1143,5.0,1,1,1L,E,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,1,90,1990.0,Miscellaneous,Human Factors,2.0,0,Not Significant
1,T210,6764,4,0,,,,,PM,1.0,0,0,521717,,0,0,0,0,0,0,0,T210,,6.0,40.0,73,1.0,JEFFERSON,28,SYSTEM,,,,0.0,,0.0,,,1,0,0,0,6.0,40.0,1.0,Y,5688,0,0.0,,0.0,0,6,12,,12,RE902,,INDUSTRY,90,,90,1.0,Y,,37,3,0,0,,,N,0,0,0,0,0,12,,"#4 PORT CREW, OPERATING ENGINES 223 AND 224, W...",,,,,,,OTHER CUT OF CARS ON THE BACKSIDE TRACK ON WAR...,"NG SLOWLY SOUTH, A 6 SECTION OF RAIL BROKE AFT...","WHEN THE RAIL BROKE, THE 4TH, 5TH AND 6TH CARS...",,,,,,356.0,0,0,0,0,N,4.0,,BS,,3,2,0,,WGNZ,BN,,SYSTEM,0,0,0,0,,,,1,PORT BIRMINGHAM,01C073,,40,3,10,0.0,0,0,,1076,,BACKSIDE,2.0,4,6.0,1,7,3S,E,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,2,90,1990.0,"Rail, Joint Bar and Rail Anchoring","Track, Roadbed and Structures",2.0,0,Not Significant
2,T314,25000,4,1,,,,,PM,1.0,0,0,80160,,0,0,0,0,0,0,0,T314,,7.0,20.0,133,1.0,YORK,17,SYSTEM,,,,0.0,,0.0,,,0,0,0,0,7.0,20.0,1.0,Y,0,0,0.0,,1.0,0,5,12,,12,YKR0490,,YKR0490,90,,90,1.0,Y,,2,2,0,0,,,N,0,0,0,0,12.5,12,,CREW COUPLED TO RAILCARS INSIDE BUILDING AND B...,,,,,,,SWITCH POINT APPROXIMATELY 100 FEET OUTSIDE OF...,GRADE.,,,,,,,206.0,0,0,0,0,N,1.0,,YKR,,2,0,0,,YKR,TTPX,,SYSTEM,0,0,0,0,,,,42,YORK,42C133,,50,3,40,250.0,0,0,1,25000,,#3,3.0,YARD,5.0,1,7,3S,E,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,1,90,1990.0,"Frogs, Switches and Track Appliances","Track, Roadbed and Structures",1.0,0,Not Significant
3,E64C,31605,2,1,,,,,PM,0.0,0,0,5744,,27,0,0,0,0,0,0,E64C,,7.0,25.0,97,1.0,PORTAGE,11,SYSTEM,,,,0.0,,1.0,,,12,0,0,0,7.0,25.0,1.0,Y,15800,0,0.0,,3.0,0,7,12,,12,G1693,,G1693,90,,90,1.0,Y,,75,5,0,0,,,NM,0,0,0,0,CM249,12,,TRAIN PULLING OUT FROM A-2 SWITCH AND CAR WC 5...,,,,,,,AIL AT THE MAIN LINE CROSSOVER. DUE TO THIN FL...,,,,,,,,166.0,0,0,0,0,N,26.0,,WC,,4,0,0,,WC,WC,,SYSTEM,0,0,0,0,,,,55,STEVENS POINT,55C097,,45,10,25,8744.0,0,0,1,15805,,MAIN LINE CROSSOVER,4.0,L013,7.0,1,1,3L,E,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,4,90,1990.0,Wheels,Mechanical and Electrical Failures,3.0,0,Not Significant
4,H021,7800,2,1,,,,,AM,1.0,0,0,81023,,1,1,0,0,0,0,0,H021,,1.0,30.0,39,1.0,FOND DU LAC,2,SYSTEM,,,,0.0,,1.0,,,0,0,0,0,1.0,30.0,1.0,Y,7500,0,0.0,,0.0,0,4,12,,12,G1694,,G1694,90,,90,1.0,Y,,2,0,0,0,,,NM,0,0,0,0,CM 158,12,,CREW KICKED 2 CARS DOWN TRACK 2 WHICH CLEARED ...,,,,,,,D CAR ON TRACK 2 AS CARS ON TRACK 2 EVIDENTLY ...,,,,,,,,177.0,0,0,0,0,N,1.0,,WC,,4,0,0,,WC,UTLX,,SYSTEM,0,0,0,0,,,,55,FOND DU LAC,55C039,,30,1,30,0.0,0,0,1,0,,YARD TRK NO.1,3.0,1,4.0,4,6,3L,E,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,6,90,1990.0,"Brakes, Use of",Human Factors,0.0,0,Not Significant
5,H021,7800,2,1,,,,,AM,1.0,0,0,195191,,0,0,0,0,0,0,0,H021,,1.0,30.0,39,1.0,FOND DU LAC,2,SYSTEM,,,,0.0,,1.0,,,1,0,0,0,1.0,30.0,1.0,Y,300,0,0.0,,0.0,0,4,12,,12,G1694,,G1694,90,,90,3.0,N,,1,0,0,0,,,NM,0,0,0,0,CM158,12,,CREW KICKED 2 CARS DOWN TRACK 2 WHICH CLEARED ...,,,,,,,ARS TOWARD TRACK 1 WHICH CORNERED CAR WHICH EV...,,,,,,,,196.0,0,0,0,0,N,1.0,,WC,,4,0,0,,WC,MSDR,,SYSTEM,0,0,0,0,,,,55,FOND DU LAC,55C039,,30,1,30,0.0,0,0,1,0,,YARD TRACK NO. 2,,,0.0,4,6,3L,E,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,6,90,1990.0,"Brakes, Use of",Human Factors,0.0,0,Not Significant
6,T311,6500,4,1,,,,0.0,PM,1.0,0,0,473101,,0,0,0,0,0,0,0,T311,,8.0,15.0,29,1.0,HANCOCK,31,PITTSBURGH,0.0,R020,,0.0,E,0.0,,,16,0,0,0,8.0,15.0,1.0,Y,3500,0,0.0,,3.0,0,4,12,,12,0530004,,INDUSTRY,90,,90,1.0,Y,,35,3,0,0,,,N,0,0,0,0,,12,,"WIMJ 37 SHOVING #5 TRACK, WEIRTON STEEL YARD, ...",,,,,,,JUSTMENT.,,,,,,,,109.0,0,0,0,0,N,26.0,,CR,,2,0,0,,WEST,CR,,PITTSB,0,0,0,0,,,,54,WEIRTON,54C029,,24,4,0,2500.0,0,0,1,3000,,#5 WEIRTON STEEL,4.0,WIMJ,4.0,1,1,1L,E,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,1,90,1990.0,"Frogs, Switches and Track Appliances","Track, Roadbed and Structures",3.0,0,Not Significant
7,T221,60000,1,0,,,,,PM,0.0,0,0,1139,,0,0,0,0,0,0,0,T221,T002,7.0,0.0,45,1.0,DALE,3,SYSTEM,,,,0.0,,0.0,102.0,,7,0,0,0,7.0,0.0,1.0,Y,50000,0,0.0,,3.0,0,5,12,,12,5,,5,90,,90,1.0,Y,,54,8,0,0,,,I,0,0,0,0,800,12,,RAIN SOFTENED THE ROADBED CAUSING RAIL TO BREA...,,,,,,,"ERAILED, 11TH & 12TH TURNED OVER.",,,,,,,,133.0,0,0,0,0,N,9.0,,WGCR,,3,0,0,,WGCR,PVGX,,SYSTEM,0,0,0,0,,,,1,WATERFORD,01C045,,60,3,0,2400.0,0,0,X,10000,0.95,MAIN TRACK,1.0,,5.0,1,1,3L,E,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,3,90,1990.0,"Rail, Joint Bar and Rail Anchoring","Track, Roadbed and Structures",3.0,0,Not Significant
8,E40C,65000,1,0,,,,,PM,0.0,0,0,1122,,0,0,0,0,0,0,0,E40C,,7.0,0.0,45,1.0,DALE,21,SYSTEM,,,,0.0,,0.0,,,0,0,0,0,7.0,0.0,1.0,Y,45000,0,0.0,,3.0,0,8,12,,12,8,,8,90,,90,1.0,Y,,15,3,0,0,,,I,0,0,0,0,811,12,,ENGINEMAN RANEW FELT TUG IN TRAIN AND APPLIED ...,,,,,,,,,,,,,,,95.0,0,0,0,0,N,7.0,,WGCR,,3,0,0,,WGCR,PVGX,,SYSTEM,0,0,0,0,,,,1,GERALD,01C045,,60,3,30,1500.0,0,0,X,20000,0.95,MAIN TRACK,1.0,,8.0,1,1,3L,E,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,2,90,1990.0,Truck Components,Mechanical and Electrical Failures,3.0,0,Not Significant
9,T299,5850,2,1,,,,,PM,,0,0,980083,,0,0,0,0,0,0,0,T299,,,,57,,WEBER,22,UTAH,,,,0.0,W,0.0,,,1,0,0,0,,,,,100,0,,,0.0,0,4,12,,12,1290UT214,,1290UT214,90,,90,1.0,N,,0,0,0,0,,,N,0,0,0,0,991.0,12,,SP 292880 DERAILED ACCOUNT BROKEN RAIL; WHEN C...,,,,,,,"DERAILED CAR, RAIL BROKE AND ROLLED OVER DERIL...",,,,,,,,200.0,0,0,0,0,N,1.0,,UP,,7,0,0,,UP,UP,,UTAH,0,0,0,0,,,,49,OGDEN,49C057,,-9,11,0,0.0,0,0,1,0,,EAST 17,4.0,,0.0,1,5,1L,E,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,1,90,1990.0,"Rail, Joint Bar and Rail Anchoring","Track, Roadbed and Structures",0.0,0,Not Significant


In [None]:
#duplicateDF = pd.concat(g for _, g in df.groupby("INCDTNO") if len(g) > 1).copy()
#duplicateDF.sort_values(by=['INCDTNO'], inplace=True)

In [None]:
#duplicateDF

In [None]:
# Keep only the accidents that are not labelled "Not Significant". A boolean mask
# avoids mutating the frame in place and does not depend on the index labels being
# unique, unlike dropping by the labels of the matching rows.
df = df[df['ACCTYPE'] != "Not Significant"]
df.shape[0]

#### Ausreißer entfernen

In [None]:
# Outlier filter: keep a row only if every listed column lies within `std_dev`
# population standard deviations of that column's mean.
std_dev = 3
outlier_columns = ['LOCOMOTIVES1', 'LOADF1', 'LOADP1', 'EMPTYF1', 'EMPTYP1',
                   'LOCOMOTIVES2', 'LOADF2', 'EMPTYF2', 'TONS', 'TEMP', 'ACCDMG']
outlier_values = df[outlier_columns]
# Column-wise z-score with ddof=0, the same definition scipy.stats.zscore uses by
# default. pandas' mean/std skip missing values, so a single NaN no longer turns the
# whole column's scores into NaN (which would make the filter discard every row).
# Rows that themselves contain a NaN in these columns are still removed, because
# NaN < std_dev evaluates to False.
z_scores = (outlier_values - outlier_values.mean()) / outlier_values.std(ddof=0)
df = df[(z_scores.abs() < float(std_dev)).all(axis=1)]
df.shape[0]

##### TODO: Prüfen Adaptive Learning rate

#### [Adam Optimizer](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam) <br>
Die Adam-Optimierung ist ein stochastisches Gradientenabstiegsverfahren, das auf einer adaptiven Schätzung von Momenten erster und zweiter Ordnung basiert. Das Verfahren ist "recheneffizient, hat wenig Speicherbedarf, ist invariant gegenüber der diagonalen Neuskalierung von Gradienten und eignet sich gut für Probleme, die in Bezug auf Daten/Parameter groß sind".
[Arxiv](https://arxiv.org/pdf/1412.6980.pdf)<br><br>
*Learning rate:* Im maschinellen Lernen und in der Statistik ist die Lernrate ein Tuningparameter eines Optimierungsalgorithmus, der die Schrittweite jeder Iteration auf dem Weg zu einem Minimum der Verlustfunktion bestimmt. Da sie beeinflusst, inwieweit neu gewonnene Informationen alte Informationen überschreiben, stellt sie metaphorisch die Geschwindigkeit dar, mit der ein maschinelles Lernmodell "lernt". Bei der Festlegung einer Lernrate gibt es einen Kompromiss zwischen Konvergenzgeschwindigkeit und Überschießen (Overshooting). Während die Richtung zum Minimum in der Regel aus dem Gradienten der Verlustfunktion bestimmt wird, legt die Lernrate fest, wie groß ein Schritt in diese Richtung ist. Eine zu hohe Lernrate führt dazu, dass die Schritte über Minima hinausschießen; bei einer zu niedrigen Lernrate dauert die Konvergenz dagegen zu lange, oder das Verfahren bleibt in einem unerwünschten lokalen Minimum stecken.


In [None]:
# Optimizer passed to the DNNRegressor further down; the commented-out cells that
# follow hold alternative optimizers that were tried in its place.
opti = tf.optimizers.Adam(learning_rate = 0.001)

#### [Nadam Optimizer](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Nadam?authuser=2&hl=vi&version=stable)
[Incorporating Nesterov Momentum into Adam](http://cs229.stanford.edu/proj2015/054_report.pdf)

In [None]:
# opti = tf.optimizers.Nadam(learning_rate = 0.01)

#### [Adagrad Optimizer](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adagrad?hl=vi&authuser=2&version=stable)
Adagrad ist ein Optimierer mit parameter-spezifischen Lernraten, die angepasst werden, je nachdem, wie häufig ein Parameter während des Trainings aktualisiert wird. Je mehr Updates ein Parameter erhält, desto kleiner sind die Updates. [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)

In [None]:
#opti = tf.optimizers.Adagrad(learning_rate = 0.001)

#### [Adamax Optimizer](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adamax?hl=vi&authuser=2&version=stable)
Adamax ist eine Variante von Adam, die auf der Unendlichkeitsnorm basiert. Adamax ist Adam manchmal überlegen, besonders bei Modellen mit Einbettungen. [Arxiv](https://arxiv.org/pdf/1412.6980.pdf)

In [None]:
# opti = tf.optimizers.Adamax(learning_rate = 0.01)

## Complete regression model

Festlegen der unabhängigen Variablen X (Merkmale) und der vorherzusagenden, abhängigen Variable y

In [None]:
# Model inputs and regression target (accident damage).
# 'SPEED' is not requested: df has no such column (the speed column is 'TRNSPD'),
# so selecting it raised a KeyError, and no feature column refers to it.
feature_names = ['YEAR4', 'MONTH', 'LOCOMOTIVES1', 'LOADF1', 'LOADP1', 'EMPTYF1', 'EMPTYP1',
                 'LOCOMOTIVES2', 'LOADF2', 'EMPTYF2', 'TONS', 'TEMP', 'TRNSPD',
                 'ACCCAUSE_LVL1', 'TYPE', 'TYPEQ', 'TYPTRK', 'VISIBLTY', 'WEATHER', 'STATE']
x_data = df[feature_names]
y = df['ACCDMG']

In [None]:
#x_data = df[df['ACCTYPE']=="Significant"][['YEAR4', 'MONTH', 'LOADF1', 'LOADP1', 'EMPTYF1', 'EMPTYP1','LOADF2',  'EMPTYF2', 'TONS', 'TEMP', 'TRNSPD', 'ACCAUSE', 'TYPE', 'TYPEQ', 'TYPTRK', 'VISIBLTY', 'WEATHER', 'COUNTY']].copy()
#y = df[df['ACCTYPE']=="Significant"][['ACCDMG']].copy()

In [None]:
#cause_dummies= pd.get_dummies(df['ACCAUSE'], prefix='cause')
#x_data = pd.concat([x_data, df[['ACCAUSE', 'TYPE', 'TYPEQ', 'TYPTRK', 'VISIBLTY', 'WEATHER', 'COUNTY']], axis=1)

In [None]:
# Summary statistics for every feature column, numeric and non-numeric alike.
x_data.describe(include='all')

In [None]:
# Summary statistics of the target ACCDMG.
y.describe(include='all')

#### Scattermatrix und Korrelationen der Daten
Jahr, Monat, Anzahl der beladenen Frachtwagons und Personenwagons, Gewicht, Temperatur, Geschwindigkeit und Unfallkosten

In [None]:
# Pairwise scatter plots (KDE on the diagonal) for the rows labelled "Significant";
# the figure is also written to Images/scatterALL.png.
scatter_columns = ['YEAR4', 'MONTH', 'LOCOMOTIVES1', 'LOADF1', 'LOADP1', 'EMPTYF1', 'EMPTYP1',
                   'LOCOMOTIVES2', 'LOADF2', 'EMPTYF2', 'TONS', 'TEMP', 'TRNSPD',
                   'ACCCAUSE_LVL1', 'TYPE', 'TYPEQ', 'TYPTRK', 'VISIBLTY', 'WEATHER', 'STATE',
                   'ACCDMG']
significant_accidents = df[df['ACCTYPE'] == "Significant"]
scatter_matrix(significant_accidents[scatter_columns], diagonal='kde', figsize=(18, 18))
plt.savefig(r"Images/scatterALL.png")

In [None]:
colormap = plt.cm.RdBu
corr_columns = ['ACCDMG', 'YEAR4', 'MONTH', 'LOCOMOTIVES1', 'LOADF1', 'LOADP1', 'EMPTYF1', 'EMPTYP1',
                'LOCOMOTIVES2', 'LOADF2', 'EMPTYF2', 'TONS', 'TEMP', 'TRNSPD',
                'ACCCAUSE_LVL1', 'TYPE', 'TYPEQ', 'TYPTRK', 'VISIBLTY', 'WEATHER', 'STATE']
# A correlation is only defined for numeric columns. Selecting them explicitly keeps
# this cell working on pandas versions where DataFrame.corr raises on string columns
# instead of silently leaving them out.
corr = (
    df.loc[df['ACCTYPE'] == "Significant", corr_columns]
    .select_dtypes(include='number')
    .corr()
)
corr

In [None]:
# Annotated heat map of the correlation matrix, also written to Images/corrALL.png.
_, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(
    corr,
    ax=ax,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor='white',
    annot=True,
);
plt.savefig(r"Images/corrALL.png")

#### Aufteilen der Daten in Train und Test Datensatz

In [None]:
# Hold out 30 % of the rows for testing. The fixed random_state makes the split, and
# with it every metric and plot below, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x_data, y, test_size=0.3, random_state=42)

#### Festlegen der numerischen Merkmalsspalten

In [None]:
# One numeric feature column per continuous input. The names on the left are
# unpacked in the same order as the column keys listed below.
(trnspd, year, month,
 locomotives1, loadf1, loadp1, emptyf1, emptyp1,
 locomotives2, loadf2, emptyf2,
 tons, temp) = [
    tf.feature_column.numeric_column(column_key)
    for column_key in (
        'TRNSPD', 'YEAR4', 'MONTH',
        'LOCOMOTIVES1', 'LOADF1', 'LOADP1', 'EMPTYF1', 'EMPTYP1',
        'LOCOMOTIVES2', 'LOADF2', 'EMPTYF2',
        'TONS', 'TEMP',
    )
]

In [None]:
# Display-only: shows the categorical columns cast to str. The result is not
# assigned, so x_data itself is left unchanged by this cell.
x_data[['ACCCAUSE_LVL1', 'TYPE', 'TYPEQ', 'TYPTRK', 'VISIBLTY', 'WEATHER', 'STATE']].astype(str).values

#### Festlegen der kategorischen Merkmalsspalten. 
Anstatt die Daten als einen one-hot Vektor mit vielen Dimensionen darzustellen, stellt eine Einbettungsspalte diese Daten als einen niederdimensionalen, dichten Vektor dar, in dem jede Zelle eine beliebige Zahl enthalten kann, nicht nur 0 oder 1. Die Größe der Einbettung ist ein Parameter, der angepasst werden muss (*TODO*).

In [None]:
# ACCCAUSE_LVL1: hash the values into 10000 buckets and embed them; the embedding
# width is the fourth root of the number of distinct values, rounded down.
distinct_values = len(x_data['ACCCAUSE_LVL1'].unique())
embedding_size = int(math.floor(distinct_values ** 0.25))
accause = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('ACCCAUSE_LVL1', hash_bucket_size=10000),
    dimension=embedding_size,
)

In [None]:
# TYPE: hash the values into 10000 buckets and embed them; the embedding width is
# the fourth root of the number of distinct values, rounded down.
distinct_values = len(x_data['TYPE'].unique())
embedding_size = int(math.floor(distinct_values ** 0.25))
acctype = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('TYPE', hash_bucket_size=10000),
    dimension=embedding_size,
)

In [None]:
# TYPEQ: hash the values into 10000 buckets and embed them; the embedding width is
# the fourth root of the number of distinct values, rounded down.
distinct_values = len(x_data['TYPEQ'].unique())
embedding_size = int(math.floor(distinct_values ** 0.25))
typeq = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('TYPEQ', hash_bucket_size=10000),
    dimension=embedding_size,
)

In [None]:
# TYPTRK: hash the values into 10000 buckets and embed them; the embedding width is
# the fourth root of the number of distinct values, rounded down.
distinct_values = len(x_data['TYPTRK'].unique())
embedding_size = int(math.floor(distinct_values ** 0.25))
typtrk = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('TYPTRK', hash_bucket_size=10000),
    dimension=embedding_size,
)

In [None]:
# VISIBLTY: hash the values into 10000 buckets and embed them; the embedding width
# is the fourth root of the number of distinct values, rounded down.
distinct_values = len(x_data['VISIBLTY'].unique())
embedding_size = int(math.floor(distinct_values ** 0.25))
visibility = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('VISIBLTY', hash_bucket_size=10000),
    dimension=embedding_size,
)

In [None]:
# WEATHER: hash the values into 10000 buckets and embed them; the embedding width is
# the fourth root of the number of distinct values, rounded down.
distinct_values = len(x_data['WEATHER'].unique())
embedding_size = int(math.floor(distinct_values ** 0.25))
weather = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('WEATHER', hash_bucket_size=10000),
    dimension=embedding_size,
)

In [None]:
# STATE: hash the values into 10000 buckets and embed them; the embedding width is
# the fourth root of the number of distinct values, rounded down.
distinct_values = len(x_data['STATE'].unique())
embedding_size = int(math.floor(distinct_values ** 0.25))
state = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('STATE', hash_bucket_size=10000),
    dimension=embedding_size,
)

#### Festlegen der Merkmalsspalten

In [None]:
# Feature columns handed to the estimator.
# NOTE(review): `tons` is defined with the other numeric columns and TONS is part of
# x_data, but it is not listed here, so the model never sees it - confirm whether
# that omission is intentional.
feature_col =[year, month, trnspd, typeq, locomotives1, loadf1, loadp1, emptyf1, emptyp1, locomotives2, loadf2, emptyf2, typtrk, acctype, accause, visibility, weather, temp, state]

#### Aufstellen der Input Funktion

Batch_Size = Größe der zurückgegebenen Batches.<br>
Num_Epochs = Anzahl der Epochen, d. h. wie oft über die Daten iteriert wird.<br>
Shuffle = Sollen die Datensätze in zufälliger Reihenfolge gelesen werden?

In [None]:
# Training input: shuffled batches of 10 labelled rows, iterating over the training
# data for up to 1000 epochs.
input_func = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=x_train,
    y=y_train,
    batch_size=10,
    num_epochs=1000,
    shuffle=True,
)

#### Aufstellen der Eval Input Funktion

In [None]:
# Input functions that make one unshuffled pass in batches of 10:
#   test_input_func  - test features only (used for prediction)
#   eval_input_func  - test features with labels (used for evaluation)
#   train_input_func - training features only (prediction on the training set)
_make_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn
_single_pass = dict(batch_size=10, num_epochs=1, shuffle=False)
test_input_func = _make_input_fn(x=x_test, **_single_pass)
eval_input_func = _make_input_fn(x=x_test, y=y_test, **_single_pass)
train_input_func = _make_input_fn(x=x_train, **_single_pass)

#### Initialisierung des Estimators (DNNRegressor)
hidden_units: Über das Argument hidden_units wird eine Liste mit der Anzahl der Knoten je verdeckter Schicht übergeben. Dadurch lässt sich ein neuronales Netzwerk allein über seine Größe und Form definieren, anstatt es von Grund auf von Hand zu verdrahten. (TODO: fine tune)

In [None]:
# DNN regressor with four hidden layers (15, 30, 45 and 30 units), the optimizer
# chosen above and a dropout rate of 0.5.
estimator = tf.estimator.DNNRegressor(
    feature_columns=feature_col,
    hidden_units=[15, 30, 45, 30],
    optimizer=opti,
    dropout=0.5,
)

#### Train Model

In [None]:
# Train on the shuffled training input, limited to 60000 steps.
estimator.train(input_fn=input_func, max_steps=60000)


#### Evaluate Model mit Eval input function

In [None]:
# Evaluate on the labelled test input and display the returned metrics.
result_eval = estimator.evaluate(input_fn=eval_input_func)
result_eval

#### Scatterplot Vergleich tatsächliche und vorhergesagte Werte

In [None]:
# Predict the damage for every test row (first element of each 'predictions' array,
# as float) and plot it against the actual value.
predictions = [
    output['predictions'][0].astype(float)
    for output in estimator.predict(input_fn=test_input_func)
]
plt.plot(y_test, predictions, 'o')
plt.xlabel('Actual values (test data)')
plt.ylabel('predicted values (test data)')

In [None]:
# Predict the damage for every training row (first element of each 'predictions'
# array, as float) and plot it against the actual value.
train_predictions = [
    output['predictions'][0].astype(float)
    for output in estimator.predict(input_fn=train_input_func)
]
plt.plot(y_train, train_predictions, 'o')
plt.xlabel('Actual values (train data)')
plt.ylabel('predicted values (train data)')

In [None]:
# Root mean squared error on the test set, in the units of ACCDMG. The square root
# is taken exactly once; applying it a second time (`** 0.5` on top of np.sqrt)
# yields the fourth root of the MSE, which is not an error measure in those units.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
rmse

#### Vergleich tatsächliche und vorhergesagte Werte 
Beispiel: 100 zufällig ausgewählte Werte

In [None]:
# Bar chart comparing actual and predicted damage for 100 randomly drawn test rows.
pred = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
pred1 = pred.sample(n=100)

bar_axes = pred1.plot.bar(figsize=(20, 16))
bar_axes.grid(which='major', linestyle='-', linewidth='0.5', color='green')
bar_axes.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

#### Residual Plot 
Abweichung zwischen den vorhergesagten und tatsächlichen Unfallkosten (Testdaten=grün, Trainingsdaten=blau)

In [None]:
    # Residuals (predicted minus actual damage) plotted against the predicted value.
    plt.scatter(train_predictions, train_predictions - y_train, c='b', s=40, alpha=0.5, label='Train Data')
    plt.scatter(predictions, predictions - y_test, c='g', s=40, label='Test Data')
    # Zero-residual reference line. axhline spans the full width of the axes, whereas
    # hlines(xmin=-0.03, xmax=0.2) only covered a sliver next to x = 0 and was
    # invisible at the scale of the predicted damage values.
    plt.axhline(y=0, color='k', linewidth=1)
    plt.title('Residual Plot of DNN Regression')
    plt.ylabel('Residuals')
    plt.xlabel('Accident Damage')
    plt.legend()
    plt.show()