In [11]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

# Загрузка данных

In [2]:
# Both CSVs carry a `reportts` timestamp column; parse it as a datetime and
# expose it under the name `datetime`.
timestamp_rename = {"reportts": "datetime"}

X_train = pd.read_csv('../data/X_train.csv', parse_dates=['reportts']).rename(columns=timestamp_rename)
y_train = pd.read_csv('../data/y_train.csv', parse_dates=['reportts']).rename(columns=timestamp_rename)

# X_test = pd.read_csv('../data/X_test.csv', parse_dates=['reportts']).fillna(0)
# X_test.rename(columns={"reportts": "datetime"}, inplace=True);

# Join features and targets on the shared keys (pandas' default inner join),
# then discard rows that have no `egtm` value.
dataset = (
    X_train
    .merge(y_train, on=['acnum', 'pos', 'datetime'])
    .dropna(subset=['egtm'])
)


In [3]:
# Preview the first rows of `dataset` as a quick sanity check.
dataset.head()

Unnamed: 0,datetime,acnum,pos,fltdes,dep,arr,ivs12,ibe,iaie,iai,...,votm,vsva,w14,pf,wai,nai,prv,hpv,xf,egtm
0,2018-12-24 10:53:22,VQ-BGU,1,8990.0,EDHI,UUDD,,,,,...,,0.0,1160.0,0.53,0.0,0.0,1.0,0.0,0.0,44.437
1,2018-12-24 10:53:22,VQ-BGU,2,8990.0,EDHI,UUDD,,,,,...,,0.0,1160.0,0.53,0.0,0.0,1.0,0.0,0.0,45.869
2,2018-12-25 15:23:23,VQ-BGU,1,1024.0,URSS,UUDD,,,,,...,,0.0,1179.0,0.53,0.0,0.0,1.0,0.0,0.0,44.379
3,2018-12-25 15:23:23,VQ-BGU,2,1024.0,URSS,UUDD,,,,,...,,0.0,1179.0,0.53,0.0,0.0,1.0,0.0,0.0,44.904
4,2018-12-25 20:49:27,VQ-BGU,1,217.0,UUDD,UNBB,,,,,...,,0.0,1302.0,0.51,0.0,0.0,1.0,0.0,0.0,43.742


In [4]:
cat_features = ['pos', 'dep', 'arr']

# Replace each categorical column with integer codes (values are stringified
# first, so a fresh LabelEncoder is fitted per column) and store the result
# with pandas' `category` dtype.
for cat in cat_features:
    lbl = preprocessing.LabelEncoder()
    codes = lbl.fit_transform(dataset[cat].astype(str))
    dataset[cat] = pd.Series(codes, index=dataset.index).astype('category')

# Show every column with its dtype.
dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2827 entries, 0 to 2832
Data columns (total 166 columns):
 #    Column      Dtype         
---   ------      -----         
 0    datetime    datetime64[ns]
 1    acnum       object        
 2    pos         category      
 3    fltdes      float64       
 4    dep         category      
 5    arr         category      
 6    ivs12       float64       
 7    ibe         float64       
 8    iaie        float64       
 9    iai         float64       
 10   zwbp        float64       
 11   acct        float64       
 12   alt         float64       
 13   alt_peak    float64       
 14   alt_rtd     float64       
 15   aoc         float64       
 16   b25         float64       
 17   baf         float64       
 18   bbf         float64       
 19   bbr         float64       
 20   bbv         float64       
 21   btmc        float64       
 22   cas         float64       
 23   cas_peak    float64       
 24   cas_rtd     float64       
 25   

In [6]:
# Module-level pipeline: mean-impute -> standardize -> CatBoost regression.
# The CatBoost step is seeded and silenced exactly like the pipeline built
# inside `train_model`, so repeated fits give the same model and do not flood
# the cell output with per-iteration logs.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
    ('scaler', StandardScaler()),  # Standardize features
    ('catboost', CatBoostRegressor(random_state=42, verbose=False))  # CatBoost Regressor
])

In [15]:
def train_model(X, y, test_size=0.25, random_state=42):
    """Fit an impute -> scale -> CatBoost pipeline and score it on a hold-out split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Must contain the columns 'datetime', 'acnum', 'pos',
        'fltdes', 'dep' and 'arr'; they are dropped before fitting.
    y : array-like
        Target values, one per row of ``X``.
    test_size : float, default 0.25
        Share of rows held out for validation (forwarded to ``train_test_split``).
    random_state : int, default 42
        Seed used for both the train/validation split and the CatBoost model.

    Returns
    -------
    tuple
        ``(rmse, mae, pipeline)``: validation RMSE, validation MAE and the
        fitted pipeline.

    Raises
    ------
    AssertionError
        If ``X`` and ``y`` differ in length.
    """
    assert len(X) == len(y), "X and y must have the same number of rows"

    # Drop identifier / categorical columns first so that the zero-fill only
    # touches the columns that are actually fed to the model.
    # NOTE: because missing values are replaced by 0 here, the mean imputer in
    # the pipeline below receives no NaNs from this frame.
    X = X.drop(columns=[
        'datetime', 'acnum', 'pos', 'fltdes', 'dep', 'arr'
    ]).fillna(0)

    # NOTE(review): the split is random, not chronological; confirm that is
    # intended for time-stamped records.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Define the pipeline
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
        ('scaler', StandardScaler()),  # Standardize features
        ('catboost', CatBoostRegressor(random_state=random_state, verbose=False))  # CatBoost Regressor
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Predict on the validation set (once) and score it
    predicted = pipeline.predict(X_val)

    # RMSE is taken as sqrt(MSE) instead of `squared=False`, which newer
    # scikit-learn releases no longer accept.
    rmse = float(np.sqrt(mean_squared_error(y_val, predicted)))
    mae = mean_absolute_error(y_val, predicted)

    return rmse, mae, pipeline

In [16]:
fleet = ['VQ-BGU', 'VQ-BDU']

# Train and score one model per aircraft, using only that aircraft's rows.
for acnum in fleet:
    is_this_aircraft = dataset['acnum'] == acnum
    aircraft_rows = dataset[is_this_aircraft]

    # Features are every column except the `egtm` target.
    X = aircraft_rows.drop(columns=['egtm'])
    y = aircraft_rows['egtm']

    rmse, mae, model = train_model(X, y)

    print(f'acnum={acnum} RMSE={rmse:.3f} MAE={mae:.3f}')


acnum=VQ-BGU RMSE=1.015 MAE=0.592
acnum=VQ-BDU RMSE=0.498 MAE=0.385
