In [35]:
import numpy as np
import pandas as pd
from flaml import AutoML
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

pd.set_option('display.max_rows', 70)
pd.set_option('display.max_columns', 50)


In [10]:
# Read the cleaned delay dataset, then separate the target column
# (PFPI_MINUTES) from the remaining feature columns.
df = pd.read_csv('../../data/clean_data_final.csv')
y = df['PFPI_MINUTES']
X = df.drop(columns=['PFPI_MINUTES'])

In [29]:
# Binary target named 'late_or_vlate': PFPI_MINUTES in (-1, 10] maps to 1 and
# values in (10, max + 1] map to 0 (pd.cut bins are right-closed by default).
delay_edges = [-1, 10, y.max() + 1]
y_binary = pd.cut(y.rename('late_or_vlate'), bins=delay_edges, labels=[1, 0])

0         1
1         1
2         1
3         1
4         1
         ..
493246    1
493247    1
493248    1
493249    1
493250    1
Name: late_or_vlate, Length: 493251, dtype: category
Categories (2, int64): [1 < 0]

In [32]:
# NOTE(review): `y_concat` is never defined in the cells visible in this
# notebook, so a fresh "Restart & Run All" stops here with a NameError.
# It is rebuilt from `y` and `y_binary` so that `y_train['late_or_vlate']`
# resolves in the training cell below -- confirm this matches the frame the
# original (missing) cell produced.
y_concat = pd.concat([y, y_binary], axis=1)

# Fixed random_state so the 70/30 split is identical on every run
# (37 matches the seed passed to the FLAML regression run).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_concat, test_size=0.3, random_state=37
)

# Coordinate columns are min-max scaled (default feature range [0, 1]).
num_transformer = MinMaxScaler()

# Categories unseen during fit are encoded as all zeros instead of raising.
# NOTE: the `sparse` keyword was renamed `sparse_output` in scikit-learn 1.2;
# update it when moving to a newer release.
cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Columns not listed below are passed through unchanged.
transformer = make_column_transformer(
    (num_transformer, ['Lat_OR', 'Lon_OR', 'Lat_DES', 'Lon_DES']),
    (cat_transformer, ['ENGLISH_DAY_TYPE', 'SERVICE_GROUP_CODE_AFFECTED',
                       'INCIDENT_REASON', 'UNIT_CLASS_AFFECTED',
                       'TRAIN_SERVICE_CODE_AFFECTED', 'PERFORMANCE_EVENT_CODE',
                       'APP_TIMETABLE_FLAG_AFF']),
    remainder='passthrough',
)

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_scaled = transformer.fit_transform(X_train)
X_test_scaled = transformer.transform(X_test)

In [36]:
# 60-second FLAML search for a classifier of the binary 'late_or_vlate' target,
# optimising accuracy on the transformed training features.
automl = AutoML()
automl_settings = {
    "time_budget": 60,  # in seconds
    "metric": 'accuracy',
    "task": 'classification',
    "log_file_name": "late_vlate.log",
    "seed": 37,  # fix FLAML's random seed, same value as the regression run
}

automl.fit(
    X_train=X_train_scaled,
    y_train=y_train['late_or_vlate'],
    **automl_settings,
)

[flaml.automl.logger: 12-05 15:13:37] {1679} INFO - task = classification
[flaml.automl.logger: 12-05 15:13:37] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 12-05 15:13:38] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 12-05 15:13:38] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 12-05 15:13:38] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-05 15:13:38] {2344} INFO - Estimated sufficient time budget=53372s. Estimated necessary time budget=1230s.
[flaml.automl.logger: 12-05 15:13:38] {2391} INFO -  at 0.9s,	estimator lgbm's best error=0.0426,	best estimator lgbm's best error=0.0426
[flaml.automl.logger: 12-05 15:13:38] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-05 15:13:40] {2391} INFO -  at 2.7s,	estimator lgbm's best error=0.0426,	best estimator lgbm's best error=0.0426
[flaml.automl.logger: 12-0

[flaml.automl.logger: 12-05 15:14:11] {2218} INFO - iteration 34, current learner rf
[flaml.automl.logger: 12-05 15:14:12] {2391} INFO -  at 35.4s,	estimator rf's best error=0.0409,	best estimator rf's best error=0.0409
[flaml.automl.logger: 12-05 15:14:12] {2218} INFO - iteration 35, current learner rf
[flaml.automl.logger: 12-05 15:14:13] {2391} INFO -  at 36.2s,	estimator rf's best error=0.0409,	best estimator rf's best error=0.0409
[flaml.automl.logger: 12-05 15:14:13] {2218} INFO - iteration 36, current learner extra_tree
[flaml.automl.logger: 12-05 15:14:14] {2391} INFO -  at 36.7s,	estimator extra_tree's best error=0.0426,	best estimator rf's best error=0.0409
[flaml.automl.logger: 12-05 15:14:14] {2218} INFO - iteration 37, current learner extra_tree
[flaml.automl.logger: 12-05 15:14:14] {2391} INFO -  at 37.4s,	estimator extra_tree's best error=0.0426,	best estimator rf's best error=0.0409
[flaml.automl.logger: 12-05 15:14:14] {2218} INFO - iteration 38, current learner extra_

In [41]:
# Keep only rows whose PFPI_MINUTES lies strictly between 0 and 10, as an
# independent copy of `df`.
# NOTE(review): the binary target built with pd.cut uses the right-closed bin
# (-1, 10], so delays of exactly 0 or 10 minutes fall in class 1 there but are
# excluded here -- confirm the strict bounds are intended.
delay = df['PFPI_MINUTES']
df_10 = df.loc[delay.gt(0) & delay.lt(10)].copy()

In [42]:
# Preview the filtered frame; the display is truncated according to the
# pandas max_rows / max_columns options set at the top of the notebook.
df_10

Unnamed: 0,TRAIN_SERVICE_CODE_AFFECTED,SERVICE_GROUP_CODE_AFFECTED,ENGLISH_DAY_TYPE,APP_TIMETABLE_FLAG_AFF,UNIT_CLASS_AFFECTED,INCIDENT_REASON,PERFORMANCE_EVENT_CODE,PFPI_MINUTES,Lat_OR,Lon_OR,Lat_DES,Lon_DES,ORIG_MONTH_SIN,ORIG_MONTH_COS,ORIG_DAY_SIN,ORIG_DAY_COS,ORIG_HOUR_SIN,ORIG_HOUR_COS,ORIG_MINUTE_SIN,ORIG_MINUTE_COS,DEST_MONTH_SIN,DEST_MONTH_COS,DEST_DAY_SIN,DEST_DAY_COS,DEST_HOUR_SIN,DEST_HOUR_COS,DEST_MINUTE_SIN,DEST_MINUTE_COS
0,22215003,EK03,BH,Y,375.0,M,M,9.0,51.546500,-0.10408,51.546500,-0.10408,0.406737,0.913545,0.207912,0.978148,0.743145,-0.669131,-0.309017,0.951057,0.406737,0.913545,0.207912,0.978148,0.669131,-0.743145,-0.978148,2.079117e-01
1,22215003,EK03,BH,Y,375.0,M,M,6.0,51.546500,-0.10408,51.546120,-0.07513,0.406737,0.913545,0.207912,0.978148,0.994522,-0.104528,-0.951057,-0.309017,0.406737,0.913545,0.207912,0.978148,0.978148,-0.207912,-0.207912,-9.781476e-01
14,25234001,EK04,WD,Y,317.0,M,M,4.0,51.523880,-0.05982,51.517991,-0.08140,0.406737,0.913545,0.309017,0.951057,0.669131,0.743145,-0.743145,0.669131,0.406737,0.913545,0.309017,0.951057,0.743145,0.669131,-0.309017,-9.510565e-01
15,25234001,EK04,WD,Y,317.0,M,M,5.0,51.701930,-0.02399,51.701930,-0.02399,0.406737,0.913545,0.309017,0.951057,0.743145,0.669131,0.743145,-0.669131,0.406737,0.913545,0.309017,0.951057,0.809017,0.587785,0.309017,9.510565e-01
16,21235001,EK04,WD,Y,315.0,M,M,3.0,51.517991,-0.08140,51.517991,-0.08140,0.406737,0.913545,0.309017,0.951057,0.743145,0.669131,-0.951057,0.309017,0.406737,0.913545,0.309017,0.951057,0.809017,0.587785,1.000000,2.832769e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493246,22214000,EK01,WD,Y,378.0,R,M,5.0,51.555480,-0.15136,51.532500,-0.24454,0.500000,0.866025,0.587785,-0.809017,0.743145,-0.669131,0.809017,-0.587785,0.500000,0.866025,0.587785,-0.809017,0.669131,-0.743145,0.669131,-7.431448e-01
493247,22204000,EK01,WD,Y,378.0,R,M,4.0,51.526520,-0.23569,51.532500,-0.24454,0.500000,0.866025,0.587785,-0.809017,0.743145,-0.669131,-0.913545,0.406737,0.500000,0.866025,0.587785,-0.809017,0.669131,-0.743145,-0.587785,8.090170e-01
493248,22214000,EK01,WD,Y,378.0,R,M,7.0,51.491410,-0.27553,51.522740,-0.25487,0.500000,0.866025,0.587785,-0.809017,0.743145,-0.669131,-0.978148,0.207912,0.500000,0.866025,0.587785,-0.809017,0.669131,-0.743145,-0.866025,5.000000e-01
493249,22214000,EK01,WD,Y,378.0,R,M,4.0,51.491410,-0.27553,51.522740,-0.25487,0.500000,0.866025,0.587785,-0.809017,0.669131,-0.743145,0.978148,-0.207912,0.500000,0.866025,0.587785,-0.809017,0.669131,-0.743145,-0.406737,-9.135455e-01


In [45]:
# Features / target for the regression on the filtered (0 < delay < 10) rows.
X_10 = df_10.drop(columns='PFPI_MINUTES')
y_10 = df_10['PFPI_MINUTES']

# Fixed random_state so the 70/30 split -- and therefore the reported test
# error -- can be reproduced (37 matches the seed passed to FLAML below).
X_10_train, X_10_test, y_10_train, y_10_test = train_test_split(
    X_10, y_10, test_size=0.3, random_state=37
)

# Coordinate columns are min-max scaled (default feature range [0, 1]).
num_transformer = MinMaxScaler()

# Categories unseen during fit are encoded as all zeros instead of raising.
# NOTE: the `sparse` keyword was renamed `sparse_output` in scikit-learn 1.2;
# update it when moving to a newer release.
cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Columns not listed below are passed through unchanged.
transformer = make_column_transformer(
    (num_transformer, ['Lat_OR', 'Lon_OR', 'Lat_DES', 'Lon_DES']),
    (cat_transformer, ['ENGLISH_DAY_TYPE', 'SERVICE_GROUP_CODE_AFFECTED',
                       'INCIDENT_REASON', 'UNIT_CLASS_AFFECTED',
                       'TRAIN_SERVICE_CODE_AFFECTED', 'PERFORMANCE_EVENT_CODE',
                       'APP_TIMETABLE_FLAG_AFF']),
    remainder='passthrough',
)

# Fit on the training split only, then apply the fitted transform to the test split.
X_10_train_scaled = transformer.fit_transform(X_10_train)
X_10_test_scaled = transformer.transform(X_10_test)

In [47]:
# FLAML regression search on the filtered delays: 1800-second budget, RMSE as
# the metric to minimise, no explicit estimator list. This rebinds `automl`,
# replacing the AutoML object fitted for the classification task.
automl = AutoML()
settings = dict(
    time_budget=1800,                      # total running time in seconds
    metric='rmse',                         # metric FLAML minimises
    task='regression',                     # task type
    log_file_name='train_delays_10.log',   # flaml log file
    seed=37,                               # random seed
)
automl.fit(X_train=X_10_train_scaled, y_train=y_10_train, **settings)


[flaml.automl.logger: 12-05 15:26:20] {1679} INFO - task = regression
[flaml.automl.logger: 12-05 15:26:20] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 12-05 15:26:21] {1788} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 12-05 15:26:21] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 12-05 15:26:21] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-05 15:26:21] {2344} INFO - Estimated sufficient time budget=6616s. Estimated necessary time budget=47s.
[flaml.automl.logger: 12-05 15:26:21] {2391} INFO -  at 0.2s,	estimator lgbm's best error=1.6812,	best estimator lgbm's best error=1.6812
[flaml.automl.logger: 12-05 15:26:21] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-05 15:26:21] {2391} INFO -  at 0.2s,	estimator lgbm's best error=1.5430,	best estimator lgbm's best error=1.5430
[flaml.automl.logger: 12-05 15:26:21] {2218} IN

[flaml.automl.logger: 12-05 15:26:25] {2218} INFO - iteration 34, current learner lgbm
[flaml.automl.logger: 12-05 15:26:25] {2391} INFO -  at 4.2s,	estimator lgbm's best error=1.4301,	best estimator lgbm's best error=1.4301
[flaml.automl.logger: 12-05 15:26:25] {2218} INFO - iteration 35, current learner rf
[flaml.automl.logger: 12-05 15:26:25] {2391} INFO -  at 4.3s,	estimator rf's best error=1.4585,	best estimator lgbm's best error=1.4301
[flaml.automl.logger: 12-05 15:26:25] {2218} INFO - iteration 36, current learner rf
[flaml.automl.logger: 12-05 15:26:25] {2391} INFO -  at 4.5s,	estimator rf's best error=1.4585,	best estimator lgbm's best error=1.4301
[flaml.automl.logger: 12-05 15:26:25] {2218} INFO - iteration 37, current learner rf
[flaml.automl.logger: 12-05 15:26:25] {2391} INFO -  at 4.7s,	estimator rf's best error=1.4543,	best estimator lgbm's best error=1.4301
[flaml.automl.logger: 12-05 15:26:25] {2218} INFO - iteration 38, current learner lgbm
[flaml.automl.logger: 12-

[flaml.automl.logger: 12-05 15:28:12] {2218} INFO - iteration 70, current learner rf
[flaml.automl.logger: 12-05 15:28:12] {2391} INFO -  at 111.8s,	estimator rf's best error=1.4511,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:28:12] {2218} INFO - iteration 71, current learner extra_tree
[flaml.automl.logger: 12-05 15:28:13] {2391} INFO -  at 112.1s,	estimator extra_tree's best error=1.4571,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:28:13] {2218} INFO - iteration 72, current learner rf
[flaml.automl.logger: 12-05 15:28:13] {2391} INFO -  at 112.5s,	estimator rf's best error=1.4491,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:28:13] {2218} INFO - iteration 73, current learner xgb_limitdepth
[flaml.automl.logger: 12-05 15:28:15] {2391} INFO -  at 114.8s,	estimator xgb_limitdepth's best error=1.4261,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:28:15] {2218} INFO - iteration 74, cu

[flaml.automl.logger: 12-05 15:29:58] {2218} INFO - iteration 105, current learner rf
[flaml.automl.logger: 12-05 15:29:58] {2391} INFO -  at 217.8s,	estimator rf's best error=1.4491,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:29:58] {2218} INFO - iteration 106, current learner xgboost
[flaml.automl.logger: 12-05 15:30:03] {2391} INFO -  at 223.0s,	estimator xgboost's best error=1.3923,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:30:03] {2218} INFO - iteration 107, current learner xgb_limitdepth
[flaml.automl.logger: 12-05 15:30:14] {2391} INFO -  at 234.0s,	estimator xgb_limitdepth's best error=1.4026,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:30:14] {2218} INFO - iteration 108, current learner xgboost
[flaml.automl.logger: 12-05 15:30:17] {2391} INFO -  at 236.9s,	estimator xgboost's best error=1.3900,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:30:17] {2218} INFO - iteratio

[flaml.automl.logger: 12-05 15:37:18] {2218} INFO - iteration 140, current learner xgboost
[flaml.automl.logger: 12-05 15:37:43] {2391} INFO -  at 682.4s,	estimator xgboost's best error=1.3838,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:37:43] {2218} INFO - iteration 141, current learner extra_tree
[flaml.automl.logger: 12-05 15:37:44] {2391} INFO -  at 683.1s,	estimator extra_tree's best error=1.4253,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:37:44] {2218} INFO - iteration 142, current learner xgboost
[flaml.automl.logger: 12-05 15:37:47] {2391} INFO -  at 686.9s,	estimator xgboost's best error=1.3838,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:37:47] {2218} INFO - iteration 143, current learner xgb_limitdepth
[flaml.automl.logger: 12-05 15:37:54] {2391} INFO -  at 693.4s,	estimator xgb_limitdepth's best error=1.3908,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:37:54] {2218}

[flaml.automl.logger: 12-05 15:49:40] {2218} INFO - iteration 175, current learner xgboost
[flaml.automl.logger: 12-05 15:49:44] {2391} INFO -  at 1403.9s,	estimator xgboost's best error=1.3838,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:49:44] {2218} INFO - iteration 176, current learner xgb_limitdepth
[flaml.automl.logger: 12-05 15:51:14] {2391} INFO -  at 1493.9s,	estimator xgb_limitdepth's best error=1.3908,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:51:14] {2218} INFO - iteration 177, current learner xgboost
[flaml.automl.logger: 12-05 15:51:20] {2391} INFO -  at 1499.5s,	estimator xgboost's best error=1.3838,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:51:20] {2218} INFO - iteration 178, current learner xgboost
[flaml.automl.logger: 12-05 15:52:34] {2391} INFO -  at 1573.4s,	estimator xgboost's best error=1.3838,	best estimator lgbm's best error=1.3751
[flaml.automl.logger: 12-05 15:52:34] {2218} I

In [48]:
# Predict on the held-out split of the filtered data; `automl` here is the
# regression search fitted in the previous cell (the name was rebound there).
y_10_pred = automl.predict(X_10_test_scaled)

In [49]:
# Mean absolute error of the regression predictions on the held-out split.
# (`mean_absolute_error` is imported in the notebook's top import cell.)
mean_absolute_error(y_true=y_10_test, y_pred=y_10_pred)

0.9536441735359866

In [None]:
# Display the repr of the AutoML object currently bound to `automl`
# (the regression search). NOTE(review): this cell has no execution count;
# it looks like a leftover scratch cell -- confirm whether it is still needed.
automl