In [112]:

import pandas as pd
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import numpy as np
from sklearn.preprocessing import StandardScaler
from joblib import dump,load
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

Collecting catboost
  Downloading catboost-1.2.5-cp37-cp37m-manylinux2014_x86_64.whl (98.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m122.0 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:20[0m
[?25hCollecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m116.6 kB/s[0m eta [36m0:00:00[0m1m110.5 kB/s[0m eta [36m0:00:01[0m
Collecting plotly
  Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m701.3 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, graphviz, plotly, catboost
Successfully installed catboost-1.2.5 graphviz-0.20.1 plotly-5.18.0 tenacity-8.2.3


# ОПИСАНИЕ 

# протестировал комбинации catboost ,  randomforest , GradientBoostingRegressor  , elasticnet, SVR в итоге идеальнее всего на 10 разных выборках показали себя svr и бустинг также добавил масштабирование признаков в метод обучения и предсказания 


In [66]:
X_train = pd.read_csv('X_train.csv', parse_dates=['reportts'])


In [67]:
y_train = pd.read_csv('y_train.csv', parse_dates=['reportts'])


In [68]:
X_test = pd.read_csv('X_test.csv',parse_dates=['reportts'])

In [113]:


class AircraftModel:
   
    def __init__(self):
        self.models = {}
        self.scaler = StandardScaler()  
        
    def get_train_dataset(self, X_train, y_train):
        df = X_train.merge(y_train, on=['acnum', 'pos', 'reportts'])
        cols = ['egt', 'n1a', 'n2a', 'nf', 'ff', 'mn', 't2', 'tat', 'oat', 'alt',
                'p2e','wai', 'nai', 'prv', 'hpv', 'xf', 'acnum', 'egtm'] 
        dataset = df[cols]
        return dataset 
    
    def get_dataset(self, X_test):
        cols = ['egt', 'n1a', 'n2a', 'nf', 'ff', 'mn', 't2', 'tat', 'oat', 'alt',
                'p2e','wai', 'nai', 'prv', 'hpv', 'xf', 'acnum'] 
        dataset = X_test[cols] 
        return dataset
    
    def train_model(self, df):
        metrics = {}
        for acnum in df['acnum'].unique():
            model_df = df[df['acnum'] == acnum] 
            valid_data = model_df.drop('acnum', axis=1) 
            
            scaled_data = self.scaler.fit_transform(valid_data.drop(['egtm'], axis=1))
            X_train, X_test, y_train, y_test = train_test_split(scaled_data, valid_data['egtm'], test_size=0.2) 
            
            if acnum == 'VQ-BGU': 
                model = SVR()  
                param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}  
                grid_model = GridSearchCV(estimator=model, param_grid=param_grid, 
                                          scoring='neg_mean_squared_error', cv=5, error_score='raise') 
                grid_model.fit(X_train, y_train) 
                self.models[acnum] = grid_model.best_estimator_ 
                
            elif acnum == 'VQ-BDU':
                model = GradientBoostingRegressor() 
                param_grid = {'n_estimators': [100, 200, 300],
                              'learning_rate': [0.01, 0.1, 0.5],
                              'max_depth': [3, 5, 7]}  
                grid_model = GridSearchCV(estimator=model, param_grid=param_grid, 
                                          scoring='neg_mean_squared_error', cv=5, error_score='raise') 
                grid_model.fit(X_train, y_train)  
                self.models[acnum] = grid_model.best_estimator_ 
               
                

            else: 
                raise ValueError(f"No such aircraft {acnum}") 
            
            predictions = self.models[acnum].predict(X_test)
            mae = mean_absolute_error(y_test, predictions)
            rmse = np.sqrt(mean_squared_error(y_test, predictions))
            mape_score = mean_absolute_percentage_error(y_test, predictions)
            r2score = r2_score(y_test, predictions)

            metrics[acnum] = {'MAE': mae, 'RMSE': rmse, 'MAPE': mape_score, 'R2 Score': r2score}

        return metrics
    
    def predict(self, df): 
        df_prepared = self.get_dataset(df)
        df_cleaned = self.clear_data(df_prepared)
        
        results = {}
        for acnum in df_cleaned['acnum'].unique():
            model = self.models.get(acnum)
            if model:
                X_test = df_cleaned[df_cleaned['acnum'] == acnum].drop('acnum', axis=1)
                
                scaled_X_test = self.scaler.transform(X_test)
                
                predictions = model.predict(scaled_X_test)
                results[acnum] = predictions
            else:
                raise ValueError(f"No predictions stored for aircraft model {acnum}")
        return results
    
    def percent_missing(self, df):
        percent_nan = (df.isnull().sum() / len(df)) * 100
        return percent_nan[percent_nan > 0].sort_values()

    def clear_data(self, df):
        percent_miss = self.percent_missing(df)
        columns_to_drop = percent_miss[percent_miss > 95].index
        df = df.drop(columns=columns_to_drop)
        columns_to_fill = percent_miss[(percent_miss > 0) & (percent_miss <= 5)].index
        for column in columns_to_fill:
            df[column].fillna(df[column].median(), inplace=True)

        return df  


In [114]:
air = AircraftModel()

In [115]:
df = air.get_train_dataset(X_train, y_train)

In [116]:
df = air.clear_data(df)

In [118]:
air.train_model(df)

{'VQ-BGU': {'MAE': 1.8595724550241866,
  'RMSE': 2.8127767954208793,
  'MAPE': 0.061471718025497286,
  'R2 Score': 0.7828963086080526},
 'VQ-BDU': {'MAE': 2.6127635125441886,
  'RMSE': 3.2850177789629584,
  'MAPE': 0.10724916359843563,
  'R2 Score': 0.7600256695593601}}

In [119]:
air.models

{'VQ-BGU': SVR(C=100, gamma=0.1),
 'VQ-BDU': GradientBoostingRegressor(n_estimators=300)}

In [120]:
predict = air.predict(X_test)

In [122]:
predict

{'VQ-BGU': array([31.26051211, 29.94039563, 28.50802506, 28.34785537, 25.49811255,
        26.54508668, 27.83451007, 27.95457129, 26.70293782, 26.00884099,
        25.38326177, 25.16897408, 25.32442979, 27.06755521, 27.72425476,
        27.73220851, 25.71535857, 26.45575312, 25.2253102 , 26.2204536 ,
        25.41959043, 24.40023697, 26.45289372, 26.02886714, 27.45911996,
        27.3466193 , 38.6255674 , 37.20602368, 25.87891061, 26.86583369,
        30.16990226, 31.98706295, 27.83312801, 24.42549967, 26.5490158 ,
        27.7750623 , 30.35843706, 28.91052765, 24.92862689, 23.14766034,
        30.1671295 , 30.4764707 , 28.76381065, 29.20833097, 28.69945168,
        28.14519875, 30.91820462, 29.39058183, 29.72016854, 27.80842749,
        27.79005668, 28.93535128, 22.87654756, 22.41240951, 25.25809205,
        25.85067974, 23.96775238, 25.21023216, 25.3802245 , 25.82371999,
        26.93675998, 27.33287581, 26.27042443, 26.13978692, 24.90359267,
        26.09634523, 29.94220901, 30.0737