## Import & DF's

In [68]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from Utils.AircraftTransformers import AircraftClusterTransformer
from Utils.DataExploration import explore_dataframe
from Utils.FeatureExploration import explore_features
from Utils.Models import LayeredDelayModel
from Utils.Transformers import FeatureNameCleaner, RemapTransformer, DFMerger, DropFeaturesTransformer, \
    DistanceTransformer, FeatureRenameTransformer, TypeCastDatetimeTransformer, \
    DatetimeDifferenceTransformer, EqualityFlagTransformer, RegexFlagTransformer, ConditionalValueUpdater, \
    DateTimeFeatureExtractor, LocalizeDatetimePerRowTransformer, TypeCastTransformer, TargetCategoryClusterer, \
    SmoothedTargetMeanEncoder, SmoothedTargetPositiveRateEncoder, NumericPolyLogTransformer, InvertibleQuantileCapper, \
    LogRobustYScaler

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
df_airports = pd.read_csv("../data/airportsdata/airports.csv")
df_iata = pd.read_csv("../data/airportsdata/iata_macs.csv")

df_train = pd.read_csv("../data/zindi/train.csv")
df_test = pd.read_csv("../data/zindi/test.csv")

df_train_delayed = df_train.drop(df_train[df_train["target"] == 0].index)

## First EDA

### Airports Data

#### Data Description

Extensive database of location and timezone data for nearly every operational airport and landing strip in the world, with 28,271 entries.

### Each entry consists of the following data:

- icao: ICAO 4-letter Location Indicator (Doc 7910) or (if none) an internal Pseudo-ICAO Identifier [1] (28,271 entries)
- iata: IATA 3-letter Location Code (7,859 entries) or an empty string [2]
- name Official name (diacritized latin script)
- city City (diacritized latin script), ideally using the local language or English
- subd Subdivision (e.g. state, province, region, etc.), ideally using the local-language or English names of ISO 3166-2
- country: ISO 3166-1 alpha-2 country code (plus XK for Kosovo)
- elevation: MSL elevation of the highest point of the landing area, in feet (warning: it is often wrong)
- lat: Latitude (decimal) of the airport reference point (max 5 or 6 decimal digits)
- lon: Longitude (decimal) of the airport reference point (max 5 or 6 decimal digits)
- tz: Timezone expressed as a tz database name (IANA-compliant)
- lid: U.S. FAA Location Identifier (12,610 entries), or an empty string



- [1] See here for an explanation on how the Pseudo-ICAO Identifier is generated for airports and seaplane bases without an ICAO 4-letter Location Indicator. (https://github.com/mborsetti/airportsdata/blob/main/README_identifiers.rst)
- [2] IATA Multi Airport Cities (MAC) are not not airports and therfore not included, but we provide a database and a Python function that returns the above data for all the airports of a IATA MAC. Please see documentation here.
Best efforts are placed to review all contributions for accuracy, but accuracy cannot be guaranteed nor should be expected by users. (https://github.com/mborsetti/airportsdata/blob/main/README_IATA.rst)

### Important notes:
- Timezone was originally sourced from TimeZoneDB;
- No historical data (closed airports are removed);
- No heliports without a IATA code;
- No sea bases without a IATA code;
- No surface transportation stations, even if they have an official IATA code.

### Exploration Results:
- Iata, City, Subd, country and lid is incomplete
- Iata could be inferred from df_iata

In [70]:
#explore_features(df_airports)

In [71]:
#explore_dataframe(df_airports, plot_corr_matrix=False)

In [72]:
#df_airports["name"].unique()

In [73]:
#explore_features(df_iata)

### Flight Data

#### Variable definitions

- DATOP - Date of flight
- FLTID - Flight number
- DEPSTN - dep point
- ARRSTN - arr point
- STD - Scheduled Time dep
- STA - Scheduled Time arr
- STATUS - Flight status
- ETD - Expected Time dep
- ETA - Expected Time arr
- **ATD - Actual Time of dep**
- ATA - Actual Time of arr
- **DELAY1 - Delay code 1**
- **DUR1 - delay time 1**
- **DELAY2 - Delay code 2**
- **DUR2 - delay time 2**
- **DELAY3 - Delay code 3**
- **DUR3 - delay time 3**
- **DELAY4 - Delay code 4**
- **DUR4 - delay time 4****
- AC - Aircraft Code

In [74]:
explore_features(df_train)

ID  Name    non-null count   D-Type   Feature Summary                                                                                                              
0   ID      [38;5;2m107833 non-null[0m  [38;5;1mobject[0m   [38;5;11monly unique Values[0m                                                                                                           
1   DATOP   [38;5;2m107833 non-null[0m  [38;5;1mobject[0m   2018-08-31 (183), 2016-08-25 (179), 2018-08-27 (178), 2016-08-26 (178), 2016-08-30 (177)                                     
2   FLTID   [38;5;2m107833 non-null[0m  [38;5;1mobject[0m   WKL 0000  (3105), TU 0613  (1284), TU 0397  (1184), AOG 0000  (1103), TU 0634  (1075)                                        
3   DEPSTN  [38;5;2m107833 non-null[0m  [38;5;1mobject[0m   TUN (42522), DJE (10252), ORY (6755), MIR (5248), MRS (2845)                                                                 
4   ARRSTN  [38;5;2m107833 non-null[0m  [38;5;1mobject[0m 

In [75]:
#explore_dataframe(df_train)

## Data merging & Second EDA/Feature Engineering

In [76]:
preprocess_pipeline = Pipeline([
    ("clean_feature_names", FeatureNameCleaner()),
    ("rename_dep_airport", RemapTransformer(column="depstn", mapping={"SXF": "BER"})),
    ("rename_arr_airport", RemapTransformer(column="arrstn", mapping={"SXF": "BER"})),
    ("merge_dep_airports", DFMerger(df_to_merge=df_airports, left_on="depstn", right_on="iata", prefix="dep_")),
    ("merge_arr_airports", DFMerger(df_to_merge=df_airports, left_on="arrstn", right_on="iata", prefix="arr_")),

    ## Feature Cleaning
    ("drop_unused_features", DropFeaturesTransformer(["id", "dep_icao", "arr_icao", "dep_iata", "arr_iata", "dep_name", "arr_name", "dep_city", "arr_city", "dep_lid", "arr_lid", "dep_subd", "arr_subd"]),),
    ("rename_features", FeatureRenameTransformer({
        "datop": "date_of_flight",
        "fltid": "flight_id",
        "depstn": "dep_airport",
        "arrstn": "arr_airport",
        "std": "scheduled_dep_time",
        "sta": "scheduled_arr_time",
        "ac": "aircraft_code",
        "dep_tz": "dep_timezone",
        "arr_tz": "arr_timezone",
    })),

    ("object_to_string_type", TypeCastTransformer({
        "dep_airport": "str",
        "arr_airport": "str",
        "status": "str",
        "aircraft_code": "str",
        "dep_country": "str",
        "arr_country": "str"})),
])

In [77]:
df_train_preprocessed = preprocess_pipeline.fit_transform(df_train)
df_test_preprocessed = preprocess_pipeline.fit_transform(df_test)

df_train_preprocessed = df_train_preprocessed.reset_index(drop=True)
df_test_preprocessed = df_test_preprocessed.reset_index(drop=True)

In [78]:
explore_features(df_train_preprocessed)

ID  Name                non-null count   D-Type   Feature Summary                                                                                                              
0   date_of_flight      [38;5;2m107833 non-null[0m  [38;5;1mobject[0m   2018-08-31 (183), 2016-08-25 (179), 2018-08-27 (178), 2016-08-26 (178), 2016-08-30 (177)                                     
1   flight_id           [38;5;2m107833 non-null[0m  [38;5;1mobject[0m   WKL 0000  (3105), TU 0613  (1284), TU 0397  (1184), AOG 0000  (1103), TU 0634  (1075)                                        
2   dep_airport         [38;5;2m107833 non-null[0m  [38;5;1mobject[0m   TUN (42522), DJE (10252), ORY (6755), MIR (5248), MRS (2845)                                                                 
3   arr_airport         [38;5;2m107833 non-null[0m  [38;5;1mobject[0m   TUN (42572), DJE (10198), ORY (6755), MIR (5251), MRS (2845)                                                                 
4   scheduled_de

In [79]:
#explore_dataframe(df_train_preprocessed, target_feature_name="target", plot_data_spread=True, plot_pairs=False, plot_corr_matrix=False, plot_target_correlation=False)

## Quick Baseline model

In [80]:
X_train_baseline, X_test_baseline, y_train_baseline, y_test_baseline = train_test_split(df_train_preprocessed.drop('target', axis=1), df_train_preprocessed['target'], test_size=0.2)

baseline_cat_features = ["flight_id", "dep_airport", "arr_airport", "status", "aircraft_code", "dep_country",
                "arr_country", "dep_timezone", "arr_timezone", ]
baseline_num_features = ["dep_elevation", "dep_lat", "dep_lon", "arr_elevation", "arr_lat", "arr_lon", ]

baseline_clf_params = {
    "iterations": 300,
    "depth": 6,
    "learning_rate": 0.1,
    "loss_function": "Logloss",
    "random_seed": 42
}

baseline_reg_params = {
    "iterations": 300,
    "depth": 6,
    "learning_rate": 0.1,
    "loss_function": "MAE",
    "random_seed": 42
}

baseline_model = LayeredDelayModel(
        clf_params=baseline_clf_params,
        reg_params=baseline_reg_params,
        cat_features_clf=baseline_cat_features,
        num_features_clf=baseline_num_features,
        cat_features_reg=baseline_cat_features,
        num_features_reg=baseline_num_features
    )

layered_baseline_pipeline = Pipeline([
    ("layered_model", baseline_model)
])

if False:
    layered_baseline_pipeline.fit(X_train_baseline, y_train_baseline)
    y_pred_baseline_layered = layered_baseline_pipeline.predict(X_test_baseline)

    baseline_model.evaluate(X_test_baseline, y_test_baseline)

    rmse = np.sqrt(mean_squared_error(
        y_test_baseline,
        y_pred_baseline_layered
    ))

    print(f"Layered RMSE (finale): {rmse:.2f}")

## Feature Engineering


In [81]:
cat_features_clf = ["status"]
num_features_clf = []

cat_features_reg = ["status"]
num_features_reg = []

### Scale-Pipelines

### Target is Zero

Some features reliably predict, that the flight is on time (Target is zero).

- if the flight id Ends with **0000** its an internal flight (e.g. maintance)
    - We set the ```flight_duration_minutes``` to zero for internal flights because they are stationary at one airport
- if the status is **SCH** or **DEL**

We set a ```is_on_time``` flag in both cases

In [82]:
is_on_time_pipeline = Pipeline([
    ("mark_internal_flights", RegexFlagTransformer("flight_id", r"0000$", "is_internal_flight", True)),
    ("is_on_time_flag_status", RegexFlagTransformer("status", r"^SCH|DEL$", "is_on_time", True)),
    ("is_on_time_flag_internal_flights", ConditionalValueUpdater("is_internal_flight", True, "is_on_time", True)),
])

cat_features_clf += ["is_on_time"]

### Time


- we convert all Time strings to ```DateTime```
- using those features we calculate the ```flight_duration_minutes``` Feature
- using the ```dep_timezone``` and ```arr_timezone``` we calculate a ```local_sch_dep_time``` and a ```local_sch_arr_time``` Feature
- We extract ```date_of_flight_month```, ```date_of_flight_weekday```, a ```date_of_flight_is_weekend``` flag and the ```local_sch_dep_time_hour``` and ```local_sch_arr_time_hour``` Feature
- We set a flag if it is an ```is_overnight_flight_flag```
- afterwards we drop features that are already encoded

#### ToDo:

- Clustering
   - Month (Season)
   - Hour (Time of Day)
- smooth flight duration

In [83]:
time_pipeline = Pipeline([
    ("date_of_flight_to_daytime", TypeCastDatetimeTransformer("date_of_flight")),
    ("scheduled_dep_time_to_daytime", TypeCastDatetimeTransformer("scheduled_dep_time")),
    ("scheduled_arr_time_to_daytime", TypeCastDatetimeTransformer("scheduled_arr_time", "%Y-%m-%d %H.%M.%S")),

    ("compute_flight_time", DatetimeDifferenceTransformer("scheduled_dep_time", "scheduled_arr_time", "flight_duration_minutes", "minutes")),

    ("calculate_local_arr_time", LocalizeDatetimePerRowTransformer("scheduled_dep_time", "dep_timezone", "local_sch_dep_time")),
    ("calculate_local_dep_time", LocalizeDatetimePerRowTransformer("scheduled_arr_time", "arr_timezone", "local_sch_arr_time")),

    ("extract_time_features_1", DateTimeFeatureExtractor("date_of_flight", ['month', 'weekday', 'is_weekend'], month_as_category=False, weekday_as_category=False)),
    ("extract_local_time_features_1", DateTimeFeatureExtractor("local_sch_dep_time", ['hour', 'weekday', 'month'])),
    ("extract_local_time_features_2", DateTimeFeatureExtractor("local_sch_arr_time", ['hour', 'weekday', 'month'])),

    ("set_same_timezone_flag", EqualityFlagTransformer("dep_timezone", "arr_timezone", "same_timezone_flag")),
    ("set_overnight_flight_flag", EqualityFlagTransformer("local_sch_arr_time_weekday", "local_sch_dep_time_weekday", "is_overnight_flight_flag", True)),

    ("mean_delay_dep_hour", SmoothedTargetMeanEncoder("local_sch_dep_time_hour", "local_sch_dep_time_hour_mean_delay")),
    ("mean_delay_arr_hour", SmoothedTargetMeanEncoder("local_sch_arr_time_hour", "local_sch_arr_time_hour_mean_delay")),
    ("mean_delay_dep_weekday", SmoothedTargetMeanEncoder("local_sch_dep_time_weekday", "local_sch_dep_time_weekday_mean_delay")),
    ("mean_delay_arr_weekday", SmoothedTargetMeanEncoder("local_sch_arr_time_weekday", "local_sch_arr_time_weekday_mean_delay")),
    ("mean_delay_dep_month", SmoothedTargetMeanEncoder("local_sch_dep_time_month", "local_sch_dep_time_month_mean_delay")),
    ("mean_delay_arr_month", SmoothedTargetMeanEncoder("local_sch_arr_time_month", "local_sch_arr_time_month_mean_delay")),

    ("p_delay_dep_hour", SmoothedTargetPositiveRateEncoder("local_sch_dep_time_hour", "local_sch_dep_time_hour_p_delay")),
    ("p_delay_arr_hour", SmoothedTargetPositiveRateEncoder("local_sch_arr_time_hour", "local_sch_arr_time_hour_p_delay")),
    ("p_delay_dep_weekday", SmoothedTargetPositiveRateEncoder("local_sch_dep_time_weekday", "local_sch_dep_time_weekday_p_delay")),
    ("p_delay_arr_weekday", SmoothedTargetPositiveRateEncoder("local_sch_arr_time_weekday", "local_sch_arr_time_weekday_p_delay")),
    ("p_delay_dep_month", SmoothedTargetPositiveRateEncoder("local_sch_dep_time_month", "local_sch_dep_time_month_p_delay")),
    ("p_delay_arr_month", SmoothedTargetPositiveRateEncoder("local_sch_arr_time_month", "local_sch_arr_time_month_p_delay")),

    ("set_internal_flight_duration_to_0", ConditionalValueUpdater("is_internal_flight", True, "flight_duration_minutes", 0)),

    ("drop_unused_features", DropFeaturesTransformer(["date_of_flight", "scheduled_dep_time", "scheduled_arr_time", "local_sch_dep_time", "local_sch_arr_time",
                                                      "local_sch_arr_time_weekday", "local_sch_dep_time_weekday", "local_sch_dep_time_month", "local_sch_arr_time_month",
                                                      "local_sch_arr_time_hour", "local_sch_dep_time_hour"
                                                      #"dep_timezone", "arr_timezone",
    ])),
])

num_features_time = ["flight_duration_minutes", ]

num_features_clf += ["local_sch_dep_time_weekday_p_delay", "local_sch_arr_time_weekday_p_delay",
                    "local_sch_dep_time_month_p_delay", "local_sch_arr_time_month_p_delay",
                    "local_sch_dep_time_hour_p_delay", "local_sch_arr_time_hour_p_delay",
                    ] + num_features_time

num_features_reg += ["local_sch_arr_time_hour_mean_delay", "local_sch_dep_time_weekday_mean_delay",
                    "local_sch_dep_time_hour_mean_delay", "local_sch_arr_time_weekday_mean_delay",
                    "local_sch_dep_time_month_mean_delay", "local_sch_arr_time_month_mean_delay",
                    ] + num_features_time


cat_features = ["date_of_flight_month", "date_of_flight_weekday", "date_of_flight_is_weekend",
                     "same_timezone_flag", "is_overnight_flight_flag", "dep_timezone", "arr_timezone"]

cat_features_clf += cat_features
cat_features_reg += cat_features

### Geographical Data

- smooth elevation, flight_distance

In [84]:
geographical_pipeline = Pipeline([
    ("compute_distance", DistanceTransformer(lat_1="dep_lat", lon_1="dep_lon", lat_2="arr_lat", lon_2="arr_lon", new_feature_name="flight_distance")),
    ("set_same_airport_flag", EqualityFlagTransformer("dep_airport", "arr_airport", "same_airport_flag")),
    ("set_domestic_flight_flag", EqualityFlagTransformer("dep_country", "arr_country", "domestic_flight_flag")),

    ("mean_delay_dep_airport", SmoothedTargetMeanEncoder("dep_airport", "dep_airport_mean_delay")),
    ("mean_delay_arr_airport", SmoothedTargetMeanEncoder("arr_airport", "arr_airport_mean_delay")),
    ("mean_delay_dep_country", SmoothedTargetMeanEncoder("dep_country", "dep_country_mean_delay")),
    ("mean_delay_arr_country", SmoothedTargetMeanEncoder("arr_country", "arr_country_mean_delay")),

    ("p_delay_dep_airport", SmoothedTargetPositiveRateEncoder("dep_airport", "dep_airport_p_delay")),
    ("p_delay_arr_airport", SmoothedTargetPositiveRateEncoder("arr_airport", "arr_airport_p_delay")),
    ("p_delay_dep_country", SmoothedTargetPositiveRateEncoder("dep_country", "dep_country_p_delay")),
    ("p_delay_arr_country", SmoothedTargetPositiveRateEncoder("arr_country", "arr_country_p_delay")),

    ("drop_unused_features", DropFeaturesTransformer(["dep_airport", "arr_airport", "dep_country", "dep_lat", "dep_lon", "arr_country", "arr_lat", "arr_lon"])),

])

num_features_geographical = ["flight_distance", "dep_elevation", "arr_elevation"]
cat_features_geographical = ["same_airport_flag", "domestic_flight_flag"]

num_features_clf += ["dep_airport_p_delay", "arr_airport_p_delay",
                     "dep_country_p_delay", "arr_country_p_delay",
                    ] + num_features_geographical

num_features_reg += ["dep_airport_mean_delay", "arr_airport_mean_delay",
                     "dep_country_mean_delay", "arr_country_mean_delay",
                    ] + num_features_geographical

cat_features_clf += cat_features_geographical
cat_features_reg += cat_features_geographical

### AC and Flight ID

In [85]:
aircraft_code_pipeline = Pipeline([
    ("mean_delay_aircraft_code", SmoothedTargetMeanEncoder("aircraft_code", "aircraft_code_mean_delay")),
    ("mean_delay_flight_id", SmoothedTargetMeanEncoder("flight_id", "flight_id_mean_delay")),

    ("p_delay_aircraft_code", SmoothedTargetPositiveRateEncoder("aircraft_code", "aircraft_code_p_delay")),
    ("p_delay_flight_id", SmoothedTargetPositiveRateEncoder("flight_id", "flight_id_p_delay")),

    ("remap_aircaft_code", AircraftClusterTransformer("aircraft_code")),
    ("drop_unused_features", DropFeaturesTransformer(["flight_id"])),
    #("cluster_aircraft_codes", TargetCategoryClusterer(cat_feature="aircraft_code", data_feature="target", n_clusters_max=5, new_feature_name="aircraft_codes_clustered")),
])

num_features_clf += ["aircraft_code_p_delay", "flight_id_p_delay"]

num_features_reg += ["aircraft_code_mean_delay", "flight_id_mean_delay"]

cat_features_reg += ["aircraft_code"]
cat_features_clf += ["aircraft_code"]

### Type Transformation and Scaling

In [86]:
category_transformation_pipeline = Pipeline([
    ("poly", NumericPolyLogTransformer(["arr_elevation", "dep_elevation", "flight_distance", "flight_duration_minutes"])),
    ("to_category_type", TypeCastTransformer({
        "status": "category",
        "date_of_flight_weekday": "category",
        "date_of_flight_month": "category",
        "dep_timezone": "category",
        "arr_timezone": "category",
    }))
])

### Model Pipeline

In [87]:
clf_params = {
    "iterations": 500, #300
    "depth": 10,
    "learning_rate": 0.1,
    "loss_function": "Logloss",
    "random_seed": 42
}

reg_params = {
    "iterations": 500, #300
    "depth": 10,
    "learning_rate": 0.1,
    "loss_function": "RMSE",
    "random_seed": 42
}

model = LayeredDelayModel(
        clf_params=clf_params,
        reg_params=reg_params,
        cat_features_clf=cat_features_clf,
        num_features_clf=num_features_clf,
        cat_features_reg=cat_features_reg,
        num_features_reg=num_features_reg
    )

target_scaler = LogRobustYScaler()

process_pipeline = Pipeline([
    ("is_on_time_pipeline", is_on_time_pipeline),
    ("time_pipeline", time_pipeline),
    ("geographical_pipeline", geographical_pipeline),
    ("aircraft_code_pipeline", aircraft_code_pipeline),
    ("category_transformation", category_transformation_pipeline),
    #("target_scaler", target_scaler),
])

model_pipeline = Pipeline([
    ("process_pipeline", process_pipeline),
    ("model", model)
])

In [88]:
X_train, X_test, y_train, y_test = train_test_split(df_train_preprocessed.drop('target', axis=1), df_train_preprocessed['target'], test_size=0.2)

model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)
#y_pred_unscaled = target_scaler.inverse_transform(y_pred)

print(f"\nLayered RMSE (finale): {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}\n")

process_pipeline.fit(X_train, y_train)
X_test_transformed = process_pipeline.transform(X_test)
model.evaluate(X_test_transformed, y_test)

#print()
#explore_features(X_test_transformed)


0:	learn: 0.6154698	total: 169ms	remaining: 1m 24s
100:	learn: 0.3739520	total: 17.3s	remaining: 1m 8s
200:	learn: 0.3425772	total: 36s	remaining: 53.6s
300:	learn: 0.3228553	total: 54.3s	remaining: 35.9s
400:	learn: 0.3057087	total: 1m 12s	remaining: 17.9s
499:	learn: 0.2911243	total: 1m 30s	remaining: 0us
0:	learn: 139.1373605	total: 112ms	remaining: 56s
100:	learn: 120.5675685	total: 11s	remaining: 43.3s
200:	learn: 117.6746326	total: 21.1s	remaining: 31.4s
300:	learn: 114.7586543	total: 32.7s	remaining: 21.6s
400:	learn: 111.9446820	total: 43.2s	remaining: 10.7s
499:	learn: 109.8453703	total: 53.8s	remaining: 0us

Layered RMSE (finale): 104.99

[94m=== Layered Delay Model Evaluation ===[0m
Classifier Accuracy: 0.8076
Regressor RMSE (delay>0): 126.89


In [91]:
import pickle

with open("../models/Johannes_layered_catboost.pkl", "wb") as f:
    pickle.dump(model_pipeline, f)

In [None]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def layered_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(
    layered_rmse,
    greater_is_better=False
)

cv_results = cross_validate(
    model_pipeline,
    df_train_preprocessed.drop("target", axis=1),
    df_train_preprocessed["target"],
    cv=cv,
    scoring=rmse_scorer,
    return_train_score=True,
    n_jobs=-1
)

rmse_train = -cv_results["train_score"]
rmse_test  = -cv_results["test_score"]

print(f"CV Train RMSE: {rmse_train.mean():.2f} ± {rmse_train.std():.2f}")
print(f"CV Test  RMSE: {rmse_test.mean():.2f} ± {rmse_test.std():.2f}")

In [None]:
explore_dataframe(df_test, target_feature_name="target", plot_data_spread=True, plot_pairs=False, plot_target_correlation=False, plot_corr_matrix=False)

### Print cluster Data

In [None]:
cluster_mapping_df = (
    pd.DataFrame.from_dict(
        dep_airport_clustering_transformer.categories_,
        orient="index",
        columns=["cluster"]
    )
    .reset_index()
    .rename(columns={"index": "airport"})
    .sort_values("cluster")
    )

airport_counts = (
    df_train_processed["dep_airport"]
    .value_counts()
    .rename("airport_count")
    .reset_index()
    .rename(columns={"dep_airport": "airport"})
)

result = (
    airport_counts
    .merge(cluster_mapping_df, on="airport", how="left")
    .sort_values(["cluster", "airport_count"], ascending=[True, False])
)

for unique in cluster_mapping_df.cluster.unique():
    print(cluster_mapping_df[cluster_mapping_df["cluster"] == unique])