# 00 Preparation
Wir erstellen eine Kopie des DataFrames, ersetzen fehlende Werte (NaN) in den Verspätungsspalten durch 0, entfernen nicht benötigte Spalten und teilen das DataFrame in Features (x) und Zielvariable (y) auf.

In [2]:
# IMPORT LIBRARIES
import pandas as pd

# LOAD DATA
# Read the raw flight records from the CSV file (relative path).
df = pd.read_csv(filepath_or_buffer='05_flight_data.csv')

# COPY DATAFRAME
# Independent (deep) copy, so the cleaning steps below do not modify `df`.
df_cleaned = df.copy(deep=True)

# Display the raw frame as the cell output.
df

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,CRSDepTime,DepTimeBlk,DepDelay,DepDel15,CRSArrTime,ArrTimeBlk,ArrDelay,ArrDel15,Cancelled,Diverted
0,2011,4,10,6,4,WN,13495,12191,1435,1400-1459,2.0,0.0,1550,1500-1559,-6.0,0.0,0,0
1,2011,4,10,6,4,WN,13495,12191,1330,1300-1359,-4.0,0.0,1445,1400-1459,-12.0,0.0,0,0
2,2011,4,10,6,4,WN,13495,12191,1030,1000-1059,-2.0,0.0,1145,1100-1159,-14.0,0.0,0,0
3,2011,4,10,6,4,WN,13495,12889,1900,1900-1959,0.0,0.0,2055,2000-2059,-6.0,0.0,0,0
4,2011,4,10,6,4,WN,13495,12889,1340,1300-1359,-1.0,0.0,1530,1500-1559,2.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504392,2011,4,10,17,1,DL,11433,14635,730,0700-0759,-4.0,0.0,1022,1000-1059,-10.0,0.0,0,0
504393,2011,4,10,17,1,DL,14771,11433,705,0700-0759,-7.0,0.0,1441,1400-1459,-27.0,0.0,0,0
504394,2011,4,10,17,1,DL,11433,14869,1725,1700-1759,1.0,0.0,1924,1900-1959,-4.0,0.0,0,0
504395,2011,4,10,17,1,DL,14679,12478,2243,2200-2259,9.0,0.0,700,0700-0759,-10.0,0.0,0,0


In [3]:
# REPLACE NAN VALUES WITH 0
# Vectorised fillna instead of a per-element Python lambda on every row.
# Only these four columns are imputed; all other columns stay as loaded.
# NOTE(review): a missing delay is treated as "0 minutes" here -- presumably
# these rows belong to cancelled/diverted flights; confirm this is intended.
delay_columns = ['ArrDelay', 'DepDelay', 'DepDel15', 'ArrDel15']
df_cleaned[delay_columns] = df_cleaned[delay_columns].fillna(0)

# DELETE COLUMNS
# drop() returns a new frame, so no explicit copy of df_cleaned is required.
# The three removed columns hold text values in the data preview above.
df_features1 = df_cleaned.drop(columns=['Carrier', 'DepTimeBlk', 'ArrTimeBlk'])

# FEATURES & TARGET
# x: every remaining column except the target, y: arrival delay (ArrDelay).
# NOTE(review): ArrDel15 remains a feature although it looks like it is
# derived from the target ArrDelay (possible target leakage) -- TODO confirm.
x = df_features1.drop(columns=['ArrDelay'])
y = df_features1['ArrDelay']

# 01 Training
Wir nutzen eine **lineare Regression**, um die Ankunftsverspätung der Flüge (`ArrDelay`, in Minuten) vorherzusagen.

In [4]:
# IMPORT LIBRARIES
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# SPLIT DATA
# 40 % of the rows are held out for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=1234,
)

# LINEAR REGRESSION
# fit() returns the fitted estimator, so creation and training can be chained.
lr = LinearRegression().fit(X_train, Y_train)

# PREDICTIONS
# Predicted arrival delays for the held-out test rows.
Y_predict = lr.predict(X_test)

# 02 Evaluate
Wir berechnen den **RMSE** (Root Mean Squared Error, die Wurzel der mittleren quadratischen Abweichung), um die typische **Abweichung** der vorhergesagten von den tatsächlichen Verspätungen zu erhalten.

In [5]:
# IMPORT LIBRARIES
from sklearn.metrics import mean_squared_error
import math

# CALCULATE RMSE
# Square root of the mean squared error between the actual test targets and
# the model predictions, printed with three decimals.
rmse = math.sqrt(
    mean_squared_error(y_true=Y_test, y_pred=Y_predict)
)
print(f"RMSE: {rmse:.3f}")

RMSE: 10.642


# 03 Feature Selection
Wir berechnen für alle Feature-Spalten den **P-Value**, um herauszufinden, welche Spalten für unsere Vorhersage von Flugverspätungen **nicht relevant** sind.

In [6]:
from sklearn.feature_selection import f_regression as fr

# F_REGRESSION (NUMERIC TARGET)
# Univariate F-test of the feature columns in x against the target y.
# The result holds two arrays: F-scores at position 0, p-values at position 1.
result = fr(X=x, y=y)

In [7]:
# PUT RESULTS IN DATAFRAME
# One column each for the F-score (2 decimal places) and the p-value
# (6 decimal places); both end up as formatted strings, not floats.
# NOTE(review): this rebinds the name `df`, which so far held the raw flight
# data loaded from the CSV file.
df = (
    pd.DataFrame({'f_score': result[0], 'p_value': result[1]})
    .assign(
        f_score=lambda scores: scores['f_score'].map('{:4.2f}'.format),
        p_value=lambda scores: scores['p_value'].map('{:2.6f}'.format),
    )
)

In [8]:
# ADD COLUMN FEATURES WITH COLUMN NAMES
columns = list(x.columns)
# Row label i of `df` carries the scores of the i-th column of x, so each
# name is looked up by row label. A purely positional assignment would attach
# the wrong names when this cell is re-run after the rows have been sorted.
df['feature'] = [columns[i] for i in df.index]

# SORT BY P-VALUE (ASCENDING)
# 'p_value' may hold formatted strings, so the order is taken from the numeric
# values instead of a lexicographic string comparison. mergesort is stable:
# rows with equal p-values keep their current relative order, NaN goes last.
# A new frame is bound instead of sorting in place.
p_value_order = pd.to_numeric(df['p_value']).sort_values(kind='mergesort').index
df = df.loc[p_value_order]
df

Unnamed: 0,f_score,p_value,feature
3,445.21,0.0,DayofMonth
5,41.25,0.0,OriginAirportID
7,2660.55,0.0,CRSDepTime
8,2458633.54,0.0,DepDelay
9,312349.17,0.0,DepDel15
10,2603.62,0.0,CRSArrTime
11,431432.9,0.0,ArrDel15
4,0.52,0.469571,DayOfWeek
12,0.18,0.667207,Cancelled
6,0.05,0.819654,DestAirportID


# 04 Erneute Feature Selection
Jetzt entfernen wir alle Spalten mit einem **P-Value > 0.05** (sowie die Spalten `Diverted`, `Year`, `Quarter` und `Month`) und trainieren das Modell erneut, um den **RMSE** anschließend mit dem ersten Ergebnis zu vergleichen.

In [9]:
# DELETE COLUMNS
# Starting again from the cleaned frame, remove the three text columns, the
# features with a p-value above 5 % (DayOfWeek, Cancelled, DestAirportID) and
# additionally Diverted, Year, Quarter and Month.
# NOTE(review): the last four are not listed in the p-value table above --
# presumably they have no valid p-value; confirm.
# drop() hands back a new frame, df_cleaned itself is left unchanged.
df_features2 = df_cleaned.drop(
    columns=[
        'Carrier', 'DepTimeBlk', 'ArrTimeBlk',
        'DayOfWeek', 'Cancelled', 'DestAirportID',
        'Diverted', 'Year', 'Quarter', 'Month',
    ]
)

# FEATURES & TARGET
# y: arrival delay (ArrDelay), x: every other remaining column.
y = df_features2['ArrDelay']
x = df_features2.drop(columns=['ArrDelay'])

In [10]:
# SPLIT DATA
# Same 40 % hold-out and the same random_state as in the first training run.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=1234,
)

# LINEAR REGRESSION
# fit() returns the fitted estimator, so creation and training can be chained.
lr = LinearRegression().fit(X_train, Y_train)

# PREDICTIONS
# Predicted arrival delays for the held-out test rows.
Y_predict = lr.predict(X_test)

In [11]:
# CALCULATE RMSE
# Square root of the mean squared error for the model trained on the reduced
# feature set, printed with three decimals.
rmse = math.sqrt(
    mean_squared_error(y_true=Y_test, y_pred=Y_predict)
)
print(f"RMSE: {rmse:.3f}")

RMSE: 10.649
