In [1]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from tqdm.notebook import tqdm
import catboost as cb
import numpy as np
import pandas as pd
from scipy import stats
from scipy.special import boxcox1p, inv_boxcox1p



# Input locations, given as relative paths.
# Sensor readings; opened with dask.dataframe.read_parquet further down.
PATH_TO_TEST_DATA = 'data/datasets/X_test.parquet'
# NOTE(review): not referenced by the task-3 cells visible in this part of the notebook.
PATH_TO_TEST_INTERVALS = 'data/datasets/test_intervals.xlsx'

In [11]:
# dask view over every column of the test parquet file; rows are materialised
# later via .compute().
X_test = dd.read_parquet(PATH_TO_TEST_DATA, engine="pyarrow")

# TODO (original note: "fix this"): this re-reads the same parquet file,
# restricted to a single column.
X_light = dd.read_parquet(PATH_TO_TEST_DATA, columns=['ЭКСГАУСТЕР 4. ТОК РОТОРА 1'], engine="pyarrow")

In [12]:
def get_single_exgauster_columns_dict(X_test, exg_numbers=(4, 5, 6, 7, 8, 9)):
    """Group the frame's column names by the exhauster they belong to.

    A column is assigned to exhauster ``n`` when its name contains the
    substring ``'ЭКСГАУСТЕР n'``.

    Parameters
    ----------
    X_test : object exposing a ``columns`` attribute
        Frame whose column names are scanned.
    exg_numbers : iterable of int, optional
        Exhauster numbers to collect columns for. Defaults to 4 through 9.

    Returns
    -------
    dict
        Maps each requested exhauster number to the list of matching column
        names, in the frame's column order. A number with no matching
        column maps to an empty list.
    """
    all_columns = list(X_test.columns)
    # NOTE: this is a plain substring test, so 'ЭКСГАУСТЕР 4' would also match a
    # column of a hypothetical 'ЭКСГАУСТЕР 40'.
    return {
        exg_number: [col for col in all_columns if f'ЭКСГАУСТЕР {exg_number}' in col]
        for exg_number in exg_numbers
    }

In [17]:
# Map each exhauster number to the X_test columns whose names mention it.
columns_dict = get_single_exgauster_columns_dict(X_test)

In [58]:
# Show the columns matched for exhauster 4.
columns_dict[4]

['ЭКСГАУСТЕР 4. ТОК РОТОРА 1',
 'ЭКСГАУСТЕР 4. ТОК РОТОРА2',
 'ЭКСГАУСТЕР 4. ТОК СТАТОРА',
 'ЭКСГАУСТЕР 4. ДАВЛЕНИЕ МАСЛА В СИСТЕМЕ',
 'ЭКСГАУСТЕР 4. ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 1',
 'ЭКСГАУСТЕР 4. ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 2',
 'ЭКСГАУСТЕР 4. ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 3',
 'ЭКСГАУСТЕР 4. ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 4',
 'ЭКСГАУСТЕР 4. ТЕМПЕРАТУРА МАСЛА В СИСТЕМЕ',
 'ЭКСГАУСТЕР 4. ТЕМПЕРАТУРА МАСЛА В МАСЛОБЛОКЕ',
 'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 1',
 'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 2',
 'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 3',
 'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 3. ПРОДОЛЬНАЯ.',
 'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 4',
 'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 4. ПРОДОЛЬНАЯ.']

#### Добавляем фичи для задач 1 и 3

In [13]:
from warnings import simplefilter

# Ignore pandas PerformanceWarning from here on — presumably the fragmentation
# warning raised when add_features inserts its derived columns one at a time.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


def add_features(X_test, columns_dict, exg_number):
    """Materialise one exhauster's columns and append rolling-window features.

    Parameters
    ----------
    X_test : lazy frame
        Indexed with a list of column names and then ``.compute()``-d, so it
        is expected to behave like a dask DataFrame. The time-offset windows
        below require a datetime-like index.
    columns_dict : dict
        Exhauster number -> list of that exhauster's column names.
    exg_number : int
        Exhauster to build features for.

    Returns
    -------
    pandas.DataFrame
        The selected source columns (with 'ВИБРАЦИЯ НА ОПОРЕ 4' replaced by
        its absolute value) followed, for every source column ``f``, by:

        * ``f_{7D,30D}_{mean,std,median,max}``: rolling statistics;
        * ``f_{7D,30D}_chg_{mean,median}``: value divided by the rolling
          mean / median;
        * ``f_diff_between_values``: value divided by its window-1 rolling
          mean.

    Notes
    -----
    Column names and their order are kept exactly as before.
    NOTE(review): presumably the saved CatBoost models expect this exact
    layout — confirm before renaming or reordering anything.
    """
    sensor_columns = columns_dict[exg_number]
    features = X_test[sensor_columns].compute()

    vibration_col = f'ЭКСГАУСТЕР {exg_number}. ВИБРАЦИЯ НА ОПОРЕ 4'
    features[vibration_col] = features[vibration_col].abs()

    def compute_window_features(data, name_f):
        # Per the original note, the shorter '5h' and '10h' windows were dropped.
        windows = ['7D', '30D']
        for f in tqdm(name_f):
            values = data[f]
            for window in windows:
                # One Rolling object per window serves all four aggregations.
                rolling = values.rolling(window, min_periods=1)
                data[f"{f}_{window}_mean"] = rolling.mean()
                data[f"{f}_{window}_std"] = rolling.std()
                data[f"{f}_{window}_median"] = rolling.median()
                data[f"{f}_{window}_max"] = rolling.max()

            # Emitted as 7D_chg_mean, 30D_chg_mean, 7D_chg_median, 30D_chg_median.
            for stat in ("mean", "median"):
                for window in windows:
                    data[f"{f}_{window}_chg_{stat}"] = values / data[f"{f}_{window}_{stat}"]

            # Despite the name this is a ratio, not a difference: the value over
            # its own window-1 rolling mean. Kept as-is so the feature set is unchanged.
            data[f"{f}_diff_between_values"] = values / values.rolling(1, min_periods=1).mean()
        return data

    return compute_window_features(features, sensor_columns)

#### Задача №3

In [14]:
from catboost import CatBoostRegressor


def load_catboost_model(exg_number, models_dir='models'):
    """Load the saved CatBoost regressor for one exhauster.

    Parameters
    ----------
    exg_number : int
        Exhauster number; selects the file
        ``cb_regressor_exg_{exg_number}_boxcox_org_loss.cbm``.
    models_dir : str, optional
        Directory holding the ``.cbm`` files. Defaults to ``'models'``.

    Returns
    -------
    CatBoostRegressor
        A regressor with the saved model loaded into it.
    """
    model_path = f'{models_dir}/cb_regressor_exg_{exg_number}_boxcox_org_loss.cbm'
    model = CatBoostRegressor()
    model.load_model(model_path)
    return model

def make_regressor_predictions(X_test, model, exg_number):
    """Return ``model.predict(X_test)`` unchanged.

    ``exg_number`` is part of the signature but is not used by this function.
    """
    return model.predict(X_test)

def postprocess_task_3(pred):
    """Round predictions to the tens place (``decimals=-1``) and cast them to int."""
    rounded_to_tens = np.around(pred, decimals=-1)
    return rounded_to_tens.astype(int)


def make_task_3_preds(X_test, columns_dict, exg_number):
    """Produce post-processed task-3 predictions for one exhauster.

    Builds the feature frame, loads that exhauster's saved regressor, predicts,
    and post-processes the raw predictions.

    Parameters
    ----------
    X_test : lazy frame
        Source data, passed through to ``add_features``.
    columns_dict : dict
        Exhauster number -> list of that exhauster's column names.
    exg_number : int
        Exhauster to predict for.

    Returns
    -------
    The output of ``postprocess_task_3`` applied to the model's predictions.
    """
    data_test = add_features(X_test, columns_dict, exg_number)

    model = load_catboost_model(exg_number)
    # The original called `make_task_3_predictions`, which is not defined anywhere
    # in the notebook; `make_regressor_predictions` is the helper that exists.
    reg_preds = make_regressor_predictions(data_test, model, exg_number)

    return postprocess_task_3(reg_preds)

In [15]:

def make_exg_task_3_prediction(exg_number):
    """Predict task 3 for one exhauster and write the result to CSV.

    Reads the notebook-level ``X_test`` and ``columns_dict``, builds features,
    loads the exhauster's saved regressor, post-processes its predictions and
    writes ``submit_preds/round_preds_exg_{exg_number}.csv`` with the columns
    ``round_preds_exg_{exg_number}`` and ``date``. The frame's default integer
    index is written as the first, unnamed CSV column.

    Parameters
    ----------
    exg_number : int
        Exhauster to predict for.

    Returns
    -------
    None
    """
    data_test = add_features(X_test, columns_dict, exg_number)
    model = load_catboost_model(exg_number)

    reg_preds = make_regressor_predictions(data_test, model, exg_number)
    round_preds = postprocess_task_3(reg_preds)

    preds_df = pd.DataFrame({f'round_preds_exg_{exg_number}': round_preds})
    # Take timestamps from the frame that was actually scored, so every prediction
    # row lines up with its own timestamp. This replaces the global lazy
    # `X_light.index`, which was a separate read of the same parquet file.
    preds_df['date'] = data_test.index
    preds_df.to_csv(f'submit_preds/round_preds_exg_{exg_number}.csv')


In [None]:

# Write one task-3 prediction CSV per exhauster, for exhausters 5 through 9.
# NOTE(review): columns_dict also holds columns for exhauster 4, which is not in
# this list — confirm the omission is intentional.
for exg_number in [5, 6, 7, 8, 9]:
    make_exg_task_3_prediction(exg_number)