## Setup

In [89]:
import os
import requests
import pandas as pd

from dotenv import load_dotenv
from sklearn import set_config
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, log_loss

load_dotenv()
set_config(transform_output = 'pandas')

FROST_ID = os.getenv("FROST_ID")
HISTORICAL_FLIGHTS_PATH = os.getenv("HISTORICAL_FLIGHTS_PATH")

## Load data

In [115]:
hist_flights_df = pd.read_csv(HISTORICAL_FLIGHTS_PATH)
hist_flights_df.head()

Unnamed: 0,flight_id,dep_airport,dep_airport_group,arr_airport,arr_airport_group,service_type,std,sta,cancelled,atd,ata
0,WF149,HOV,B,OSL,,J,2018-01-02T16:40:00.000000,2018-01-02T17:15:00.000000,0,,2018-01-02T18:53:00.000000
1,WF722,OSL,,MJF,D,J,2018-01-28T13:04:00.000000,2018-01-28T14:50:00.000000,0,,
2,WF188,FDE,A,OSL,,J,2018-04-07T07:10:00.000000,2018-04-07T08:10:00.000000,0,,2018-04-07T07:55:00.000000
3,WF176,HOV,B,OSL,,J,2018-04-07T11:00:00.000000,2018-04-07T12:05:00.000000,0,,2018-04-07T12:00:00.000000
4,WF148,HOV,B,OSL,,J,2018-04-30T08:25:00.000000,2018-04-30T09:26:00.000000,0,,2018-04-30T09:36:00.000000


## Preprocess data

In [None]:
hist_flights_df = hist_flights_df[hist_flights_df["cancelled"] == 0]

for col in ["std", "sta", "atd", "ata"]:
    hist_flights_df[col] = pd.to_datetime(hist_flights_df[col])
    
hist_flights_df["std_collision_min"] = hist_flights_df["std"] - pd.Timedelta(minutes=16)
hist_flights_df["std_collision_max"] = hist_flights_df["std"] + pd.Timedelta(minutes=5)
hist_flights_df["sta_collision_min"] = hist_flights_df["sta"] - pd.Timedelta(minutes=15)
hist_flights_df["sta_collision_max"] = hist_flights_df["sta"] + pd.Timedelta(minutes=8)

hist_flights_df["atd_collision_min"] = hist_flights_df["atd"] - pd.Timedelta(minutes=16)
hist_flights_df["atd_collision_max"] = hist_flights_df["atd"] + pd.Timedelta(minutes=5)
hist_flights_df["ata_collision_min"] = hist_flights_df["ata"] - pd.Timedelta(minutes=15)
hist_flights_df["ata_collision_max"] = hist_flights_df["ata"] + pd.Timedelta(minutes=8)

test_df = hist_flights_df[hist_flights_df["dep_airport"] == "HOV"].copy()
test_df["std_"] = test_df["std"].between(test_df["std_collision_min"], test_df["std_collision_max"]).index
test_df.head()

Unnamed: 0,flight_id,dep_airport,dep_airport_group,arr_airport,arr_airport_group,service_type,std,sta,cancelled,atd,ata,std_collision_min,std_collision_max,sta_collision_min,sta_collision_max,atd_collision_min,atd_collision_max,ata_collision_min,ata_collision_max,t
0,WF149,HOV,B,OSL,,J,2018-01-02 16:40:00,2018-01-02 17:15:00,0,NaT,2018-01-02 18:53:00,2018-01-02 16:24:00,2018-01-02 16:45:00,2018-01-02 17:00:00,2018-01-02 17:23:00,NaT,NaT,2018-01-02 18:38:00,2018-01-02 19:01:00,0
3,WF176,HOV,B,OSL,,J,2018-04-07 11:00:00,2018-04-07 12:05:00,0,NaT,2018-04-07 12:00:00,2018-04-07 10:44:00,2018-04-07 11:05:00,2018-04-07 11:50:00,2018-04-07 12:13:00,NaT,NaT,2018-04-07 11:45:00,2018-04-07 12:08:00,3
4,WF148,HOV,B,OSL,,J,2018-04-30 08:25:00,2018-04-30 09:26:00,0,NaT,2018-04-30 09:36:00,2018-04-30 08:09:00,2018-04-30 08:30:00,2018-04-30 09:11:00,2018-04-30 09:34:00,NaT,NaT,2018-04-30 09:21:00,2018-04-30 09:44:00,4
7,WF166,HOV,B,OSL,,J,2018-08-03 18:15:00,2018-08-03 19:15:00,0,NaT,NaT,2018-08-03 17:59:00,2018-08-03 18:20:00,2018-08-03 19:00:00,2018-08-03 19:23:00,NaT,NaT,NaT,NaT,7
41,WF148,HOV,B,OSL,,,2019-01-05 09:25:00,2019-01-05 10:25:00,0,NaT,NaT,2019-01-05 09:09:00,2019-01-05 09:30:00,2019-01-05 10:10:00,2019-01-05 10:33:00,NaT,NaT,NaT,NaT,41


## Split data

In [92]:
X = hist_flights_df.drop(columns = ['target_actual'])
y = hist_flights_df['target_actual']

X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, train_size = 0.7)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, train_size = 0.5, test_size = 0.5)

KeyError: "['target_actual'] not found in axis"

## Temporary

In [None]:
train.head()

Unnamed: 0,airport_group,hour,target_actual,target_sched,flights_cnt,avg_duration,max_duration,passenger_share,cargo_share,charter_share,dow,month,hournum,weekend
0,C,2018-01-01 07:00:00,0,1,2,27.5,35.0,1.0,0.0,0.0,0,1,7,0
1,C,2018-01-01 08:00:00,1,1,2,50.0,80.0,1.0,0.0,0.0,0,1,8,0
2,D,2018-01-01 09:00:00,0,0,2,42.5,50.0,1.0,0.0,0.0,0,1,9,0
3,G,2018-01-01 09:00:00,0,0,1,67.0,67.0,1.0,0.0,0.0,0,1,9,0
4,C,2018-01-01 09:00:00,1,1,2,27.5,30.0,1.0,0.0,0.0,0,1,9,0


## Baseline

In [None]:
# y_val er fasit, target_actual
y_val = val["target_actual"]

# Naiv prediksjon: bruk target_sched direkte
y_pred_baseline = val["target_sched"]

# For log loss og AUC trenger vi "sannsynligheter".
# Vi kan bruke target_sched som sannsynlighet (0 eller 1).
auc_baseline = roc_auc_score(y_val, y_pred_baseline)
eps = 1e-3  # lite justeringsledd
y_pred_proba = val["target_sched"].replace({0: eps, 1: 1-eps})

logloss_baseline = log_loss(y_val, y_pred_proba)

print("Baseline AUC:", auc_baseline)
print("Baseline Log Loss:", logloss_baseline)

Baseline AUC: 0.7924117445456063
Baseline Log Loss: 1.4334834480322787


## Model

## Weather

In [None]:
endpoint = 'https://frost.met.no/observations/v0.jsonld'
parameters = {
    'sources': 'SN18700,SN90450',
    'elements': 'mean(air_temperature P1D),sum(precipitation_amount P1D),mean(wind_speed P1D)',
    'referencetime': '2010-04-01/2010-04-03',
}
r = requests.get(endpoint, parameters, auth=(FROST_ID,''))
json = r.json()

ConnectionError: HTTPSConnectionPool(host='frost.met.no', port=443): Max retries exceeded with url: /observations/v0.jsonld?sources=SN18700%2CSN90450&elements=mean%28air_temperature+P1D%29%2Csum%28precipitation_amount+P1D%29%2Cmean%28wind_speed+P1D%29&referencetime=2010-04-01%2F2010-04-03 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x73f7111e3d90>: Failed to resolve 'frost.met.no' ([Errno -2] Name or service not known)"))

In [None]:
if r.status_code == 200:
    data = json['data']
    print('Data retrieved from frost.met.no!')
else:
    print('Error! Returned status code %s' % r.status_code)
    print('Message: %s' % json['error']['message'])
    print('Reason: %s' % json['error']['reason'])

Data retrieved from frost.met.no!


In [None]:
df = pd.DataFrame()
for i in range(len(data)):
    row = pd.DataFrame(data[i]['observations'])
    row['referenceTime'] = data[i]['referenceTime']
    row['sourceId'] = data[i]['sourceId']
    df = pd.concat([df, row])

df = df.reset_index()

columns = ['sourceId','referenceTime','elementId','value','unit','timeOffset']
df2 = df[columns].copy()
# Convert the time value to something Python understands
df2['referenceTime'] = pd.to_datetime(df2['referenceTime'])

df2.head()

Unnamed: 0,sourceId,referenceTime,elementId,value,unit,timeOffset
0,SN18700:0,2010-04-01 00:00:00+00:00,mean(air_temperature P1D),3.2,degC,PT0H
1,SN18700:0,2010-04-01 00:00:00+00:00,mean(air_temperature P1D),3.0,degC,PT6H
2,SN18700:0,2010-04-01 00:00:00+00:00,sum(precipitation_amount P1D),13.5,mm,PT18H
3,SN18700:0,2010-04-01 00:00:00+00:00,sum(precipitation_amount P1D),29.0,mm,PT6H
4,SN18700:0,2010-04-01 00:00:00+00:00,mean(wind_speed P1D),1.7,m/s,PT0H
