In [1]:
!pip install openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from tqdm.notebook import tqdm
import catboost as cb
import numpy as np
import pandas as pd
from scipy import stats
from scipy.special import boxcox1p, inv_boxcox1p



# Input locations, given as relative paths.
PATH_TO_TEST_DATA = 'data/X_test.parquet'  # parquet file opened with dask (dd.read_parquet)
PATH_TO_TEST_INTERVALS = 'data/test_intervals.xlsx'  # Excel sheet read inside make_task_1_preds

In [2]:
# Open the test set lazily with dask; columns are materialised later via .compute().
X_test = dd.read_parquet(PATH_TO_TEST_DATA, engine="pyarrow")

# TODO: the single-column load below is disabled and still needs fixing.
# Presumably meant as a lightweight frame for timestamp lookups — confirm before re-enabling.
# X_light = dd.read_parquet(PATH_TO_TEST_DATA, columns=['ЭКСГАУСТЕР 4. ТОК РОТОРА 1'], engine="pyarrow")

In [3]:
def get_single_exgauster_columns_dict(X_test):
    """Group the column names of ``X_test`` by exhauster number.

    For every exhauster number from 4 to 9 the result lists, in their
    original order, the columns whose name contains the substring
    ``'ЭКСГАУСТЕР <number>'``.

    Args:
        X_test: Any object exposing a ``columns`` iterable of strings.

    Returns:
        dict[int, list[str]]: exhauster number -> matching column names
        (an empty list when nothing matches).
    """
    column_names = list(X_test.columns)
    return {
        number: [name for name in column_names if f'ЭКСГАУСТЕР {number}' in name]
        for number in range(4, 10)
    }

In [4]:
# Map each exhauster number (4-9) to the X_test columns whose name mentions it.
columns_dict = get_single_exgauster_columns_dict(X_test)

#### Добавляем фичи для задач 1 и 3

In [5]:
from warnings import simplefilter

# Silence pandas' PerformanceWarning from here on. Presumably it is raised by
# add_features inserting many columns one at a time — TODO confirm.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


def add_features(X_test, columns_dict, exg_number):
    """Materialise one exhauster's columns and append rolling-window features.

    For every base column ``f`` of the exhauster the following columns are
    appended, in this exact order:

    * ``{f}_{w}_mean``, ``{f}_{w}_std``, ``{f}_{w}_median``, ``{f}_{w}_max``
      for each time window ``w`` in ``'1h', '1D', '7D', '30D'``;
    * ``{f}_{w}_chg_mean`` then ``{f}_{w}_chg_median`` for ``w`` in
      ``'1D', '7D', '30D'`` (current value divided by the rolling statistic);
    * ``{f}_diff_between_values``.

    The column order is kept stable on purpose: the resulting frame is passed
    straight to ``model.predict`` elsewhere in the notebook.

    Args:
        X_test: Lazy (dask-style) frame; ``X_test[cols].compute()`` must return
            a pandas DataFrame. The time-based windows require a monotonic
            datetime-like index.
        columns_dict: Mapping of exhauster number -> list of its column names.
        exg_number: Exhauster number to process.

    Returns:
        pandas.DataFrame with the base columns followed by the derived ones.
    """
    base_columns = columns_dict[exg_number]
    features = X_test[base_columns].compute()

    def compute_window_features(data, name_f):
        """Append the rolling statistics and ratio features for each column in ``name_f``."""
        windows = ['1h', '1D', '7D', '30D']  # '5h' and '10h' were dropped earlier
        ratio_windows = ['1D', '7D', '30D']
        for f in tqdm(name_f):
            series = data[f]
            for window in windows:
                # Build the rolling object once and reuse it for all four statistics.
                roller = series.rolling(window, min_periods=1)
                data[f"{f}_{window}_mean"] = roller.mean()
                data[f"{f}_{window}_std"] = roller.std()
                data[f"{f}_{window}_median"] = roller.median()
                data[f"{f}_{window}_max"] = roller.max()

            # Ratio of the current value to its rolling mean, then rolling median.
            for stat in ("mean", "median"):
                for window in ratio_windows:
                    data[f"{f}_{window}_chg_{stat}"] = series / data[f"{f}_{window}_{stat}"]

            # NOTE(review): a 1-row rolling mean equals the value itself, so this
            # ratio is 1 wherever it is defined. Kept unchanged because the saved
            # model presumably expects this column — confirm before removing.
            data[f"{f}_diff_between_values"] = series / series.rolling(1, min_periods=1).mean()
        return data

    return compute_window_features(features, base_columns)

#### Задача №3

In [None]:
from catboost import CatBoostRegressor


def load_catboost_model(exg_number):
    """Load the saved CatBoost regressor for one exhauster.

    Args:
        exg_number: Exhauster number; selects the file
            ``models/cb_regressor_exg_<exg_number>_boxcox_org_loss.cbm``.

    Returns:
        The ``CatBoostRegressor`` instance after ``load_model`` was called on it.
    """
    weights_path = f'models/cb_regressor_exg_{exg_number}_boxcox_org_loss.cbm'
    regressor = CatBoostRegressor()
    regressor.load_model(weights_path)
    return regressor

def make_regressor_predictions(X_test, model, exg_number):
    """Return ``model.predict(X_test)``.

    Args:
        X_test: Feature matrix handed to the model unchanged.
        model: Any object with a ``predict`` method.
        exg_number: Accepted for call-site symmetry; not used here.

    Returns:
        Whatever ``model.predict`` returns.
    """
    return model.predict(X_test)

def postprocess_task_3(pred):
    """Round predictions to the nearest multiple of ten and cast them to int.

    Args:
        pred: NumPy array (or NumPy scalar) of numeric predictions.

    Returns:
        Integer array of the same shape, every value a multiple of 10.
    """
    rounded_to_tens = np.around(pred, -1)
    return rounded_to_tens.astype(int)


def make_task_3_preds(X_test, columns_dict, exg_number):
    """Run the full Task 3 pipeline for one exhauster.

    Builds the features, loads the saved regressor, predicts and post-processes
    the predictions.

    Args:
        X_test: Lazy test frame passed on to ``add_features``.
        columns_dict: Mapping of exhauster number -> list of its column names.
        exg_number: Exhauster number to process.

    Returns:
        The output of ``postprocess_task_3`` applied to the regressor predictions.
    """
    data_test = add_features(X_test, columns_dict, exg_number)

    model = load_catboost_model(exg_number)
    # The original called the non-existent `make_task_3_predictions`, which
    # raised NameError on every call; the prediction helper defined in this
    # notebook is `make_regressor_predictions`.
    reg_preds = make_regressor_predictions(data_test, model, exg_number)

    return postprocess_task_3(reg_preds)

### Задача №1

In [None]:

def find_interval_by_quantile(model_prediction, bins, y_test, y_preds):
    """Select the ``y_test`` values whose paired prediction shares a quantile bin.

    ``y_preds`` is cut into ``bins`` quantile bins. The bin that contains
    ``model_prediction`` is located, and the ``y_test`` entries whose
    ``y_preds`` value lies inside that bin (both edges inclusive) are returned.

    Args:
        model_prediction: Scalar prediction to place into a bin.
        bins: Number of quantile bins (>= 1). With ``bins=1`` all data is returned.
        y_test: Reference targets (Series, list or array).
        y_preds: Reference predictions paired with ``y_test``. The two are
            matched by index label, so they must share the same index
            (lists/arrays get a default ``RangeIndex``).

    Returns:
        pandas.Series: subset of ``y_test`` that falls into the selected bin.

    Raises:
        ValueError: if ``bins < 1`` or ``model_prediction`` is NaN.
    """
    if bins < 1:
        raise ValueError(f"bins must be >= 1, got {bins}")
    if np.isnan(model_prediction):
        raise ValueError("model_prediction is NaN and cannot be assigned to a bin")

    # Coerce first so plain lists/arrays work: the original called `.quantile`
    # before the conversion and failed on non-Series input.
    y_test = pd.Series(y_test)
    y_preds = pd.Series(y_preds)

    # Interior cut points q(1/bins), ..., q((bins-1)/bins).
    cuts = np.array([y_preds.quantile(i / bins) for i in range(1, bins)], dtype=float)
    edges = np.concatenate(([-np.inf], cuts, [np.inf]))

    # side='right' reproduces the original bin choice and also covers a
    # prediction exactly equal to the last cut, which used to run past the end
    # of the cut list and raise IndexError.
    slot = int(np.searchsorted(cuts, model_prediction, side='right'))
    val_min, val_max = edges[slot], edges[slot + 1]

    in_bin = (y_preds >= val_min) & (y_preds <= val_max)
    return y_test[y_preds[in_bin].index]

def find_probability(interval_data, horizon):
    """Share of ``interval_data`` values that are ``<= horizon``.

    Args:
        interval_data: pandas Series of reference targets.
        horizon: Threshold compared against every value.

    Returns:
        float: fraction in ``[0, 1]``, or NaN when ``interval_data`` is empty
        (the original raised ZeroDivisionError in that case).
    """
    total = interval_data.shape[0]
    if total == 0:
        # No reference data in this bin: the probability is undefined.
        return float('nan')
    return interval_data[interval_data <= horizon].shape[0] / total

def find_prob(model_prediction, horizon, y_test, y_preds, bins=10):
    """Estimate the probability that the target is within ``horizon``.

    Picks the reference targets that share a quantile bin with
    ``model_prediction`` (``find_interval_by_quantile``) and returns the
    fraction of them not exceeding ``horizon`` (``find_probability``).

    Args:
        model_prediction: Scalar prediction to look up.
        horizon: Threshold for the target.
        y_test: Reference targets.
        y_preds: Reference predictions paired with ``y_test``.
        bins: Number of quantile bins (default 10).

    Returns:
        The value returned by ``find_probability``.
    """
    same_bin_targets = find_interval_by_quantile(model_prediction, bins, y_test, y_preds)
    return find_probability(same_bin_targets, horizon)


In [None]:
def make_task_1_preds(reg_preds, X_light, exg_number, boxcox_lmbda: float = 0.7, conf_th: float = 0.05,
                      y_test=None, y_preds=None):
    """Score every interval of the test-interval sheet for one exhauster.

    For each row of the Excel sheet at ``PATH_TO_TEST_INTERVALS`` the function
    finds the last sample of ``X_light`` at or before ``start`` and takes the
    regressor prediction at that position. The time from that sample to
    ``finish`` (in minutes, Box-Cox transformed with ``boxcox1p``) is then
    turned into a probability via ``find_prob``.

    Args:
        reg_preds: Regressor predictions, positionally aligned with ``X_light``.
        X_light: Frame whose index holds the sample timestamps (only the index
            is used).
        exg_number: Exhauster number; used in the output column names.
        boxcox_lmbda: Lambda passed to ``boxcox1p`` for the horizon.
        conf_th: A row is labelled 1 when its probability is ``>= conf_th``.
        y_test: Reference targets for ``find_prob``. When omitted, the
            notebook-level variable ``y_test`` is used (legacy behaviour).
        y_preds: Reference predictions for ``find_prob``. When omitted, the
            notebook-level variable ``y_preds`` is used (legacy behaviour).

    Returns:
        pandas.DataFrame: the interval sheet plus ``preds_proba_<exg_number>``
        (float) and ``preds_label_<exg_number>`` (int) columns.

    Raises:
        NameError: if the reference data is neither passed nor defined globally.
        IndexError: if an interval starts before the first sample of ``X_light``.
    """
    # Backward-compatible fallback to the notebook globals the original relied on.
    if y_test is None or y_preds is None:
        notebook_scope = globals()
        missing = [
            name for name, value in (("y_test", y_test), ("y_preds", y_preds))
            if value is None and name not in notebook_scope
        ]
        if missing:
            raise NameError(
                f"{', '.join(missing)} must be passed to make_task_1_preds or defined globally"
            )
        if y_test is None:
            y_test = notebook_scope["y_test"]
        if y_preds is None:
            y_preds = notebook_scope["y_preds"]

    table = pd.read_excel(PATH_TO_TEST_INTERVALS)
    timestamps = X_light.index

    probas = []
    labels = []
    for start_interval_time, finish_interval_time in zip(table['start'], table['finish']):
        # Compare against the index only; the original filtered (and copied)
        # the whole feature frame twice per interval.
        known = np.asarray(timestamps <= start_interval_time)
        if not known.any():
            raise IndexError(
                f"no sample at or before interval start {start_interval_time!r}"
            )
        last_pos = np.flatnonzero(known)[-1]
        last_seen = timestamps[last_pos]

        # Horizon in minutes from the last known sample to the interval end.
        horizon = (pd.to_datetime(finish_interval_time) - last_seen).total_seconds() / 60
        horizon_boxcox = boxcox1p(horizon, boxcox_lmbda)

        proba = find_prob(reg_preds[last_pos], horizon_boxcox, y_test, y_preds, bins=10)
        probas.append(proba)
        labels.append(1 if proba >= conf_th else 0)

    # Assign whole columns with explicit dtypes instead of writing floats
    # row by row into an int-initialised column (deprecated upcast in pandas).
    table[f'preds_proba_{exg_number}'] = np.asarray(probas, dtype=float)
    table[f'preds_label_{exg_number}'] = np.asarray(labels, dtype=int)

    return table

### exg 4

In [33]:
# Exhauster 4: build features, score them with the saved regressor, then export
# the Task 1 interval probabilities and the Task 3 predictions.
exg_number = 4

data_test = add_features(X_test, columns_dict, exg_number)
model = load_catboost_model(exg_number)
reg_preds = make_regressor_predictions(data_test, model, exg_number)

# Reference targets/predictions for this exhauster. They are kept as notebook-level
# names because make_task_1_preds looks up `y_test` / `y_preds` in the notebook scope.
y_test = pd.read_csv(f'test_preds/y_test_exg_{exg_number}.csv')['y']
y_preds = pd.read_csv(f'test_preds/y_preds_exg_{exg_number}.csv').iloc[:, 1]

# Task 1: re-label with a 0.001 probability threshold (replacing the labels set
# inside make_task_1_preds) and drop the "machine" / "tm" columns before saving.
results = (
    make_task_1_preds(reg_preds, data_test, exg_number, boxcox_lmbda=0.7)
    .assign(
        machine=exg_number,
        **{f"preds_label_{exg_number}": lambda frame: (frame[f'preds_proba_{exg_number}'] > 0.001).astype(int)},
    )
    .drop(columns=["machine", "tm"])
)
results.to_csv(f"results/preds_task_1_machine_{exg_number}.csv", index=False)

# Task 3: undo the Box-Cox transform (lambda 0.7), round and save, indexed like data_test.
# NOTE(review): make_task_1_preds expresses horizons in minutes while this column
# is labelled seconds — confirm the unit of the regressor target.
(
    pd.Series(inv_boxcox1p(reg_preds, 0.7), index=data_test.index, name="time_to_m1_seconds")
    .round(0)
    .to_csv(f"results/preds_task_3_machine_{exg_number}.csv")
)

### exg 5

In [11]:
# Exhauster 5: build features, score them with the saved regressor, then export
# the Task 1 interval probabilities and the Task 3 predictions.
exg_number = 5

data_test = add_features(X_test, columns_dict, exg_number)
model = load_catboost_model(exg_number)
reg_preds = make_regressor_predictions(data_test, model, exg_number)

# Reference targets/predictions for this exhauster. They are kept as notebook-level
# names because make_task_1_preds looks up `y_test` / `y_preds` in the notebook scope.
y_test = pd.read_csv(f'test_preds/y_test_exg_{exg_number}.csv')['y']
y_preds = pd.read_csv(f'test_preds/y_preds_exg_{exg_number}.csv').iloc[:, 1]

# Task 1: re-label with a 0.001 probability threshold (replacing the labels set
# inside make_task_1_preds) and drop the "machine" / "tm" columns before saving.
results = (
    make_task_1_preds(reg_preds, data_test, exg_number, boxcox_lmbda=0.7)
    .assign(
        machine=exg_number,
        **{f"preds_label_{exg_number}": lambda frame: (frame[f'preds_proba_{exg_number}'] > 0.001).astype(int)},
    )
    .drop(columns=["machine", "tm"])
)
results.to_csv(f"results/preds_task_1_machine_{exg_number}.csv", index=False)

# Task 3: undo the Box-Cox transform (lambda 0.7), round and save, indexed like data_test.
# NOTE(review): make_task_1_preds expresses horizons in minutes while this column
# is labelled seconds — confirm the unit of the regressor target.
(
    pd.Series(inv_boxcox1p(reg_preds, 0.7), index=data_test.index, name="time_to_m1_seconds")
    .round(0)
    .to_csv(f"results/preds_task_3_machine_{exg_number}.csv")
)

  0%|          | 0/16 [00:00<?, ?it/s]

### exg 6

In [None]:
# Exhauster 6: build features, score them with the saved regressor, then export
# the Task 1 interval probabilities and the Task 3 predictions.
exg_number = 6

data_test = add_features(X_test, columns_dict, exg_number)
model = load_catboost_model(exg_number)
reg_preds = make_regressor_predictions(data_test, model, exg_number)

# Reference targets/predictions for this exhauster. They are kept as notebook-level
# names because make_task_1_preds looks up `y_test` / `y_preds` in the notebook scope.
y_test = pd.read_csv(f'test_preds/y_test_exg_{exg_number}.csv')['y']
y_preds = pd.read_csv(f'test_preds/y_preds_exg_{exg_number}.csv').iloc[:, 1]

# Task 1: re-label with a 0.001 probability threshold (replacing the labels set
# inside make_task_1_preds) and drop the "machine" / "tm" columns before saving.
results = (
    make_task_1_preds(reg_preds, data_test, exg_number, boxcox_lmbda=0.7)
    .assign(
        machine=exg_number,
        **{f"preds_label_{exg_number}": lambda frame: (frame[f'preds_proba_{exg_number}'] > 0.001).astype(int)},
    )
    .drop(columns=["machine", "tm"])
)
results.to_csv(f"results/preds_task_1_machine_{exg_number}.csv", index=False)

# Task 3: undo the Box-Cox transform (lambda 0.7), round and save, indexed like data_test.
# NOTE(review): make_task_1_preds expresses horizons in minutes while this column
# is labelled seconds — confirm the unit of the regressor target.
(
    pd.Series(inv_boxcox1p(reg_preds, 0.7), index=data_test.index, name="time_to_m1_seconds")
    .round(0)
    .to_csv(f"results/preds_task_3_machine_{exg_number}.csv")
)

  0%|          | 0/16 [00:00<?, ?it/s]

### exg 7

In [None]:
# Exhauster 7: build features, score them with the saved regressor, then export
# the Task 1 interval probabilities and the Task 3 predictions.
exg_number = 7

data_test = add_features(X_test, columns_dict, exg_number)
model = load_catboost_model(exg_number)
reg_preds = make_regressor_predictions(data_test, model, exg_number)

# Reference targets/predictions for this exhauster. They are kept as notebook-level
# names because make_task_1_preds looks up `y_test` / `y_preds` in the notebook scope.
y_test = pd.read_csv(f'test_preds/y_test_exg_{exg_number}.csv')['y']
y_preds = pd.read_csv(f'test_preds/y_preds_exg_{exg_number}.csv').iloc[:, 1]

# Task 1: re-label with a 0.001 probability threshold (replacing the labels set
# inside make_task_1_preds) and drop the "machine" / "tm" columns before saving.
results = (
    make_task_1_preds(reg_preds, data_test, exg_number, boxcox_lmbda=0.7)
    .assign(
        machine=exg_number,
        **{f"preds_label_{exg_number}": lambda frame: (frame[f'preds_proba_{exg_number}'] > 0.001).astype(int)},
    )
    .drop(columns=["machine", "tm"])
)
results.to_csv(f"results/preds_task_1_machine_{exg_number}.csv", index=False)

# Task 3: undo the Box-Cox transform (lambda 0.7), round and save, indexed like data_test.
# NOTE(review): make_task_1_preds expresses horizons in minutes while this column
# is labelled seconds — confirm the unit of the regressor target.
(
    pd.Series(inv_boxcox1p(reg_preds, 0.7), index=data_test.index, name="time_to_m1_seconds")
    .round(0)
    .to_csv(f"results/preds_task_3_machine_{exg_number}.csv")
)

### exg 8

In [None]:
# Exhauster 8: build features, score them with the saved regressor, then export
# the Task 1 interval probabilities and the Task 3 predictions.
exg_number = 8

data_test = add_features(X_test, columns_dict, exg_number)
model = load_catboost_model(exg_number)
reg_preds = make_regressor_predictions(data_test, model, exg_number)

# Reference targets/predictions for this exhauster. They are kept as notebook-level
# names because make_task_1_preds looks up `y_test` / `y_preds` in the notebook scope.
y_test = pd.read_csv(f'test_preds/y_test_exg_{exg_number}.csv')['y']
y_preds = pd.read_csv(f'test_preds/y_preds_exg_{exg_number}.csv').iloc[:, 1]

# Task 1: re-label with a 0.001 probability threshold (replacing the labels set
# inside make_task_1_preds) and drop the "machine" / "tm" columns before saving.
results = (
    make_task_1_preds(reg_preds, data_test, exg_number, boxcox_lmbda=0.7)
    .assign(
        machine=exg_number,
        **{f"preds_label_{exg_number}": lambda frame: (frame[f'preds_proba_{exg_number}'] > 0.001).astype(int)},
    )
    .drop(columns=["machine", "tm"])
)
results.to_csv(f"results/preds_task_1_machine_{exg_number}.csv", index=False)

# Task 3: undo the Box-Cox transform (lambda 0.7), round and save, indexed like data_test.
# NOTE(review): make_task_1_preds expresses horizons in minutes while this column
# is labelled seconds — confirm the unit of the regressor target.
(
    pd.Series(inv_boxcox1p(reg_preds, 0.7), index=data_test.index, name="time_to_m1_seconds")
    .round(0)
    .to_csv(f"results/preds_task_3_machine_{exg_number}.csv")
)

### exg 9

In [None]:
# Exhauster 9: build features, score them with the saved regressor, then export
# the Task 1 interval probabilities and the Task 3 predictions.
exg_number = 9

data_test = add_features(X_test, columns_dict, exg_number)
model = load_catboost_model(exg_number)
reg_preds = make_regressor_predictions(data_test, model, exg_number)

# Reference targets/predictions for this exhauster. They are kept as notebook-level
# names because make_task_1_preds looks up `y_test` / `y_preds` in the notebook scope.
y_test = pd.read_csv(f'test_preds/y_test_exg_{exg_number}.csv')['y']
y_preds = pd.read_csv(f'test_preds/y_preds_exg_{exg_number}.csv').iloc[:, 1]

# Task 1: re-label with a 0.001 probability threshold (replacing the labels set
# inside make_task_1_preds) and drop the "machine" / "tm" columns before saving.
results = (
    make_task_1_preds(reg_preds, data_test, exg_number, boxcox_lmbda=0.7)
    .assign(
        machine=exg_number,
        **{f"preds_label_{exg_number}": lambda frame: (frame[f'preds_proba_{exg_number}'] > 0.001).astype(int)},
    )
    .drop(columns=["machine", "tm"])
)
results.to_csv(f"results/preds_task_1_machine_{exg_number}.csv", index=False)

# Task 3: undo the Box-Cox transform (lambda 0.7), round and save, indexed like data_test.
# NOTE(review): make_task_1_preds expresses horizons in minutes while this column
# is labelled seconds — confirm the unit of the regressor target.
(
    pd.Series(inv_boxcox1p(reg_preds, 0.7), index=data_test.index, name="time_to_m1_seconds")
    .round(0)
    .to_csv(f"results/preds_task_3_machine_{exg_number}.csv")
)