In [1]:
!pip install openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import numpy as np
import pandas as pd
import joblib
import dask.dataframe as dd
from tqdm.notebook import tqdm
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import KFold

# Input locations. The two parquet paths are absolute and tied to the machine
# this notebook was executed on; adjust them before re-running elsewhere.
data_test_path = '/home/jovyan/work/СеверСтальХакатон/X_test.parquet'  # feature table that gets scored
label_path = '/home/jovyan/work/СеверСтальХакатон/y_train.parquet'  # label table; its column names define the prediction targets
PATH_TO_TEST_INTERVALS = 'test_intervals.xlsx'  # Excel workbook, resolved relative to the working directory

In [3]:
def read_parquet(data_path):
    """Load a parquet dataset into memory.

    The dataset is opened through dask with the pyarrow engine and then
    materialised by calling ``.compute()`` on the dask frame.

    Parameters
    ----------
    data_path : str
        Location of the parquet file or dataset.

    Returns
    -------
    The computed (in-memory) result of the dask dataframe.
    """
    return dd.read_parquet(data_path, engine="pyarrow").compute()

In [4]:
def make_features(df_data, feature_window=240):
    """Build rolling-window statistical features from a frame of signal columns.

    Processing steps:

    1. Interpolate missing values (on a new frame; ``df_data`` is not modified).
    2. Min-max scale every column with a scaler fitted on the passed data.
    3. For each scaled column add rolling mean, median, max, min, std,
       25/75/95 % quantiles, range (max - min) and max/min ratio, computed
       over the last ``feature_window`` rows (``min_periods=1``).
    4. Replace +/-inf with NaN, forward-fill, and drop rows that still
       contain NaN.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Input columns; presumably numeric sensor readings, since they are
        passed to ``MinMaxScaler`` -- TODO confirm.
    feature_window : int, default 240
        Rolling window length in rows.

    Returns
    -------
    pandas.DataFrame
        The scaled input columns followed by ten derived columns per input
        column, in a fixed order. Rows containing NaN after forward-fill are
        dropped, so the result can have fewer rows than ``df_data``.

    Notes
    -----
    NOTE(review): the scaler is fitted on the frame being transformed. If the
    models were trained on features scaled with statistics from a different
    dataset, the scaling here will not match -- confirm against the training
    pipeline.
    """
    # Non-inplace interpolate: the caller's frame (often a slice of a larger
    # frame) must not be modified as a side effect.
    df_filled = df_data.interpolate()

    # Normalisation
    scaler = MinMaxScaler()
    df_preprocess = pd.DataFrame(
        scaler.fit_transform(df_filled.values),
        columns=df_filled.columns,
        index=df_filled.index,
    )

    # Feature construction. Derived columns are collected in insertion order
    # and attached with a single concat; the resulting column order matters
    # because the frame is later handed to the model as a bare array.
    derived = {}
    for col in tqdm(df_preprocess.columns):
        # One rolling object per column, shared by every statistic.
        rolling = df_preprocess[col].rolling(min_periods=1, window=feature_window)
        rolling_max = rolling.max()
        rolling_min = rolling.min()

        derived[f'{col}_mean'] = rolling.mean()
        derived[f'{col}_median'] = rolling.median()
        derived[f'{col}_max'] = rolling_max
        derived[f'{col}_min'] = rolling_min
        derived[f'{col}_std'] = rolling.std()
        derived[f'{col}_quantile25'] = rolling.quantile(0.25)
        derived[f'{col}_quantile75'] = rolling.quantile(0.75)
        derived[f'{col}_quantile95'] = rolling.quantile(0.95)
        derived[f'{col}_range'] = rolling_max - rolling_min
        # Division by a zero minimum yields inf/NaN, which is cleaned up below.
        derived[f'{col}_Max/Min'] = rolling_max / rolling_min

    df_features = pd.concat([df_preprocess, pd.DataFrame(derived)], axis=1)

    # Clean-up: turn inf into NaN, forward-fill, then drop rows that could not
    # be filled (e.g. leading rows with no earlier value).
    df_features = (
        df_features
        .replace([np.inf, -np.inf], np.nan)
        .ffill()
        .dropna()
    )

    return df_features

In [5]:
def predict_model(X_pred, model_name = 'random_forest.joblib'):
    """Score a feature frame with a model stored on disk.

    Parameters
    ----------
    X_pred : object exposing ``.values``
        Feature table; only its underlying array is passed to the model.
    model_name : str, default 'random_forest.joblib'
        Path of the serialized model to load with joblib.

    Returns
    -------
    Whatever the loaded model's ``predict`` returns for the feature array.
    """
    feature_matrix = X_pred.values
    # joblib.load deserializes via pickle: only load model files from a trusted source.
    estimator = joblib.load(model_name)
    return estimator.predict(feature_matrix)

In [6]:
def get_single_exgauster_columns_dicts(X_train, y_train):
    """Group feature and target column names by exhauster number.

    For every exhauster number 4..9, a feature column is selected when its
    name contains ``'ЭКСГАУСТЕР <n>'`` and a target column is selected when
    its name contains ``'№<n>'`` (plain substring match).

    Parameters
    ----------
    X_train : object exposing ``.columns``
        Feature table.
    y_train : object exposing ``.columns``
        Target table.

    Returns
    -------
    tuple of (dict, dict)
        ``(x_columns_dict, y_columns_dict)``: each maps the exhauster number
        to the list of matching column names, in original column order. A
        number with no matching columns maps to an empty list.
    """
    exhauster_ids = (4, 5, 6, 7, 8, 9)

    feature_names = list(X_train.columns)
    x_columns_dict = {
        exg_id: [name for name in feature_names if f'ЭКСГАУСТЕР {exg_id}' in name]
        for exg_id in exhauster_ids
    }

    target_names = list(y_train.columns)
    y_columns_dict = {
        exg_id: [name for name in target_names if f'№{exg_id}' in name]
        for exg_id in exhauster_ids
    }

    return x_columns_dict, y_columns_dict

In [7]:
# Load the feature table to score and the label table into memory.
data = read_parquet(data_test_path)
y_data = read_parquet(label_path)

# Map each exhauster number to its feature columns (taken from `data`) and to
# its target columns (taken from `y_data`).
x_columns_dict, y_columns_dict = get_single_exgauster_columns_dicts(data, y_data)

# NOTE(review): `table` is not referenced again in the visible cells -- confirm
# whether a later cell needs it.
table = pd.read_excel(PATH_TO_TEST_INTERVALS)

In [8]:
# Exhauster numbers to score; a model file `exg_<n>.joblib` is loaded for each.
ex_number = [4, 5, 6, 7, 8, 9]


for i in tqdm(ex_number):
    
    # Feature columns and target column names that belong to exhauster `i`.
    X_data = data[x_columns_dict[i]]
    columns_names = y_columns_dict[i]
    # make_features drops rows that still contain NaN, so the index of the
    # returned frame (not of `data`) is the one used for the predictions.
    X_data = make_features(X_data)
    
    # Empty frame aligned with the feature rows, one column per target.
    df_columns = pd.DataFrame(index = X_data.index ,columns = columns_names)
    
    pred = predict_model(X_data, model_name = f'exg_{i}.joblib')
    
    df_columns[columns_names] = pred
    # Recode the value 1 as 2 before saving.
    # NOTE(review): presumably the "M3" submission format expects 2 for this class -- confirm.
    df_columns.replace(1, 2, inplace=True)
    
    # One pickle per exhauster, written to the working directory.
    file_name = f'submit{i}_M3.pkl'
    df_columns[columns_names].to_pickle(file_name)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:   52.3s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:  6.0min finished


  0%|          | 0/16 [00:00<?, ?it/s]

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:  1.2min
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:  8.1min finished


  0%|          | 0/16 [00:00<?, ?it/s]

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:   49.9s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:  5.9min finished


  0%|          | 0/16 [00:00<?, ?it/s]

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:   58.4s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:  6.6min finished


  0%|          | 0/16 [00:00<?, ?it/s]

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:  1.1min
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:  7.8min finished


  0%|          | 0/16 [00:00<?, ?it/s]

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:   52.9s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:  6.5min finished
