In [162]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import seaborn as sns
from datetime import datetime
import random
import os
import optuna

In [163]:
def set_all_seeds(seed):
    """Seed every random source this notebook relies on.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Integer seed handed to each generator.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Single seed for the whole notebook: applied to the global generators here and
# passed as `random_state` to the CV splitter and the classifiers further down.
DEFAULT_RANDOM_SEED = 84
set_all_seeds(seed=DEFAULT_RANDOM_SEED)

In [164]:
# Locations of the raw train / test tables, relative to the notebook.
path_big_train = '../train/data/train.csv'
path_big_test = '../train/data/test.csv'

# Read both tables (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (path_big_train, path_big_test)
)

In [None]:
# Process the dataset: declare which raw columns play which role.

import re

# The two binary targets, modelled separately.
targets = ['target_month', 'target_day']

# Columns parsed with pd.to_datetime during preprocessing.
dates = ['month', 'date_kap', 'date_dep', 'date_pl_rem', 'last_rem_date', 'date_build', 'srok_sl']

# Columns whose missingness is turned into a "<name>_notna" indicator feature.
notna_features = ['date_kap', 'date_dep', 'date_iskl']
notna_features_res = [f'{feature}_notna' for feature in notna_features]

# Categorical inputs; the missingness indicators are treated as categorical too.
categorical = [
    'kod_vrab', 'model', 'zavod_build', 'kuzov', 'telega',
    'expected_srok_sl_y_b', 'date_build_b', 'manage_type', 'rod_id_x',
    'reestr_state', 'tormoz', 'tipvozd', 'tippogl', 'ownertype',
    'is_in_kti', 'season',
] + notna_features_res

# Numerical inputs start with every column whose name begins with digits
# followed by 'd' or 'm' and an underscore (presumably day/month window
# aggregates - confirm against the feature-building code) ...
numerical = [name for name in df_train.columns if re.match(r'\d+[dm]_', name)]

# ... followed by the explicitly listed columns, in this exact order.
numerical.extend([
    "days_to_pl_rem",
    "days_from_last_rem",
    "ost_prob",
    "gruz",
    "cnsi_gruz_capacity",
    "cnsi_volumek",
    "tara",
    "cnsi_probeg_dr",
    "cnsi_probeg_kr",
    "norma_km",
    "expected_srok_sl_y",
    'kod_vrab_2_tr',
    'kod_vrab_3_tr',
    'kod_vrab_5_tr',
    'neis1_kod_tr_max',
    'neis2_kod_tr_max',
    'neis3_kod_tr_max',
    'gr_probeg_tr_max',
    'gr_probeg_tr_mean',
    'gr_probeg_tr_min',
    'por_probeg_tr_max',
    'por_probeg_tr_mean',
    'por_probeg_tr_min',
    '_1m_tr_rems',
    'kod_vrab_0_pr',
    'kod_vrab_1_pr',
    'diff_road_pr',
    'diff_st_pr',
    '_10d_pr_rems',
    '_20d_pr_rems',
    '_1m_pr_rems',
    '_allm_pr_rems',
    '_allm_tr_rems',
    'last_month_for_cnt',
    'months_after_tr_rems',
    'months_after_pr_rems',
    'ost_prob_in_a_month',
    'ost_prob_in_3_months',
    'ost_prob_in_10_days',
    'days_to_srok_sl',
    'days_to_iskl',
    'days_since_kap',
    'days_since_dep',
])

# List every raw training column that is not a target, a numerical or
# categorical feature, or a date column.
print('Unused columns:')
assigned = {*targets, *numerical, *categorical, *dates}
for col in df_train.columns:
    if col in assigned:
        continue
    print(col)

# Model inputs, and the same list extended with both targets.
train_features = numerical + categorical
features = train_features + targets

# Per-column medians of the raw training table; preprocessing uses them to
# fill missing numerical values in both the train and the test frame.
medians = df_train[numerical].median()

def preprcoess_df(df, test=False, id_columns=('wagnum', 'month')):
    """Turn a raw table into the frame consumed by the models.

    Steps, in order:
      1. keep only rows whose ``month`` is on or after 2022-09-01;
      2. parse every column in ``dates`` with ``pd.to_datetime``;
      3. add a ``<col>_notna`` 0/1 indicator for each column in ``notna_features``;
      4. select the model columns;
      5. cast categorical columns to ``str`` and fill missing numerical values
         with the module-level ``medians``.

    The input frame is never modified: all work happens on a copy of the
    filtered rows.

    Args:
        df: Raw frame containing the date, numerical and categorical columns
            (plus the targets when ``test`` is false).
        test: When true the target columns are not selected, and the columns
            named in ``id_columns`` are kept so predictions can be joined back
            to their rows.
        id_columns: Identifier columns retained for test frames when present
            in ``df``. Ignored when ``test`` is false.

    Returns:
        A new DataFrame with ``numerical + categorical`` columns followed by
        the targets (train) or by the available ``id_columns`` (test).
    """
    # NOTE(review): the month cut-off is applied to test frames as well, exactly
    # as before - confirm that dropping older test rows is intended.
    recent = pd.to_datetime(df['month']) >= pd.Timestamp('2022-09-01')
    # Copy once so every assignment below targets our own frame rather than a
    # view of the caller's data (no in-place mutation, no chained assignment).
    df = df.loc[recent].copy()

    for date in dates:
        df[date] = pd.to_datetime(df[date])

    for col in notna_features:
        df[col + '_notna'] = df[col].notna().astype(int)

    if test:
        selected = list(train_features)
        # Keep row identifiers: the prediction step looks them up on the
        # processed test frame, and feature selection downstream ignores them.
        selected += [col for col in id_columns
                     if col in df.columns and col not in selected]
    else:
        selected = numerical + categorical + targets
    df = df[selected].copy()

    df[categorical] = df[categorical].astype('str')
    df[numerical] = df[numerical].fillna(medians)

    return df

# Replace the raw tables with their processed versions (targets kept for train only).
# NOTE(review): both names are rebound, and preprcoess_df needs raw date columns
# that its output no longer contains - re-running this cell without reloading
# the CSVs is expected to fail.
df_train = preprcoess_df(df_train)
df_test = preprcoess_df(df_test, test=True)

In [None]:
# Sanity check: (rows, columns) of the processed train and test frames.
df_train.shape, df_test.shape

In [174]:
# Тренируем XGBoost и LightGBM

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier

def train(target, model_name, features, cat_features):
    """Fit one classifier per stratified CV fold and report fold-wise F1.

    Training data comes from the module-level ``df_train``. Five stratified,
    shuffled folds are used; for each fold a fresh model is fitted on the
    training part and scored with F1 on the held-out part.

    Args:
        target: Name of the target column in ``df_train``.
        model_name: ``'lgbm'`` trains ``LGBMClassifier`` with the categorical
            columns cast to pandas ``category``; any other value trains
            ``XGBClassifier`` with the categorical columns label-encoded.
        features: Non-categorical input column names. Not modified.
        cat_features: Categorical input column names. Not modified.

    Returns:
        ``(clfs, mean_f1, std_f1)``: the list of fitted fold models, the mean
        F1 over folds (computed in float16, as before) and the standard
        deviation of the fold scores rounded to 4 decimals.
    """
    cnt_folds = 5
    kf = StratifiedKFold(n_splits=cnt_folds, shuffle=True, random_state=DEFAULT_RANDOM_SEED)

    # Build the column list locally instead of extending `features` in place:
    # the caller reuses the same list for several calls, and an in-place `+=`
    # would append the categorical columns and the target again each time.
    # dict.fromkeys de-duplicates while preserving order, so a list that already
    # contains some of these names is handled as well.
    columns = list(dict.fromkeys([*features, *cat_features, target]))
    # Copy so the dtype changes below never touch (or warn about) df_train.
    df = df_train[columns].copy()

    if model_name == 'lgbm':
        for cat_feature in cat_features:
            df[cat_feature] = df[cat_feature].astype('category')
    else:
        for cat_feature in cat_features:
            df[cat_feature] = LabelEncoder().fit_transform(df[cat_feature])

    X = df.drop(target, axis=1)
    y = df[target]
    print(df)

    clfs = []
    scores = []
    for ind, (train_index, test_index) in enumerate(kf.split(X=X, y=y)):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if model_name == 'lgbm':
            clf = LGBMClassifier(random_state=DEFAULT_RANDOM_SEED)

            clf.fit(X_train, y_train,
                    eval_set=[(X_test, y_test)])
        else:
            clf = XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=DEFAULT_RANDOM_SEED)

            clf.fit(X_train, y_train,
                    eval_set=[(X_test, y_test)],
                    verbose=150)

        clfs.append(clf)
        y_pred = clf.predict(X_test)
        score = f1_score(y_test, y_pred)
        scores.append(score)
        print(f'{ind} model: {score} f1')

    assert len(clfs) == cnt_folds

    # Compute the summary statistics once and reuse them for print and return.
    mean_score = np.mean(scores, dtype="float16")
    std_score = np.std(scores).round(4)
    print("mean f1 score --", mean_score, std_score)

    return clfs, mean_score, std_score

In [None]:
# Train 5-fold LightGBM and XGBoost ensembles for each of the two targets.
# Each feature list is passed to train() twice (once per model type).
# NOTE(review): months_features, cat_months_features, days_features and
# cat_days_features are not defined in the cells shown here - confirm they are
# created before this cell on a fresh kernel.
month_clfs_lgbm, month_f1_lgbm, month_std_lgbm = train('target_month', 'lgbm', months_features, cat_months_features)
day_clfs_lgbm, day_f1_lgbm, day_std_lgbm = train('target_day', 'lgbm', days_features, cat_days_features)
month_clfs_xg, month_f1_xg, month_std_xg = train('target_month', 'xg', months_features, cat_months_features)
day_clfs_xg, day_f1_xg, day_std_xg = train('target_day', 'xg', days_features, cat_days_features)

In [None]:
# Mean F1 and its fold-to-fold standard deviation, one line per
# (model, target) pair: LightGBM month/day, then XGBoost month/day.
for f1_mean, f1_std in (
    (month_f1_lgbm, month_std_lgbm),
    (day_f1_lgbm, day_std_lgbm),
    (month_f1_xg, month_std_xg),
    (day_f1_xg, day_std_xg),
):
    print(f1_mean, f1_std)

In [None]:
# Unweighted average of the four cross-validated mean F1 scores.
(month_f1_lgbm + day_f1_lgbm + month_f1_xg + day_f1_xg) / 4

In [102]:
# Predict with the blended (LightGBM + XGBoost) models

def predict(clfs, df, name_model, features=None, cat_features=None):
    """Average ``predict_proba`` of several fitted fold models on ``df``.

    Args:
        clfs: Fitted classifiers exposing ``predict_proba``.
        df: Frame holding at least the feature columns.
        name_model: ``'lgbm'`` casts the categorical columns to pandas
            ``category``; any other value integer-encodes them.
        features: Non-categorical input columns, mirroring ``train``.
            Defaults to the module-level ``numerical`` list.
        cat_features: Categorical input columns, mirroring ``train``.
            Defaults to the module-level ``categorical`` list.

    Returns:
        Array of shape ``(len(df), 2)`` with the class probabilities averaged
        over ``clfs``.
    """
    num_cols = list(numerical if features is None else features)
    cat_cols = list(categorical if cat_features is None else cat_features)
    columns = list(dict.fromkeys(num_cols + cat_cols))
    # Work on a copy so the encoding below never writes into the caller's frame.
    X = df[columns].copy()

    if name_model == 'lgbm':
        # NOTE(review): categories are still derived from the scoring frame here;
        # confirm LightGBM aligns them with the categories seen in training.
        for cat_feature in cat_cols:
            X[cat_feature] = X[cat_feature].astype('category')
    else:
        for cat_feature in cat_cols:
            # Encode with the vocabulary of the TRAINING column, not of `df`:
            # training label-encodes df_train, i.e. code = rank among the sorted
            # unique training values. Re-fitting an encoder on the scoring frame
            # would hand the model different integers for the same category.
            # Values never seen in training get the code -1.
            train_classes = sorted(df_train[cat_feature].unique())
            X[cat_feature] = pd.Categorical(X[cat_feature], categories=train_classes).codes

    y_pred = np.zeros((X.shape[0], 2))
    for clf in clfs:
        y_pred += clf.predict_proba(X)
    y_pred /= len(clfs)

    return y_pred

In [None]:
def make_predictions(clfs_day1, clfs_day2, clfs_month1, clfs_month2, df,
                     sample_path='../train/target/y_predict.csv'):
    """Blend two model families per target and align the result with the sample file.

    ``clfs_day1`` / ``clfs_month1`` are scored through the ``'lgbm'`` path of
    ``predict`` and ``clfs_day2`` / ``clfs_month2`` through the ``'xg'`` path.
    For each target the two probability matrices are averaged and the most
    probable class is taken.

    Args:
        clfs_day1: Fold models for ``target_day`` (LightGBM path).
        clfs_day2: Fold models for ``target_day`` (XGBoost path).
        clfs_month1: Fold models for ``target_month`` (LightGBM path).
        clfs_month2: Fold models for ``target_month`` (XGBoost path).
        df: Frame to score; must contain the feature columns and ``wagnum``.
        sample_path: CSV whose rows define the output; it is left-joined with
            the predictions on ``wagnum``.

    Returns:
        The sample frame with ``target_month`` and ``target_day`` merged in.
        Rows of the sample without a matching ``wagnum`` get NaN targets.
    """
    pred_day1 = predict(clfs_day1, df, 'lgbm')
    pred_month1 = predict(clfs_month1, df, 'lgbm')
    pred_day2 = predict(clfs_day2, df, 'xg')
    pred_month2 = predict(clfs_month2, df, 'xg')

    pred_day = ((pred_day1 + pred_day2) / 2).argmax(axis=1)
    pred_month = ((pred_month1 + pred_month2) / 2).argmax(axis=1)

    # Only the identifier is needed next to the predictions, so build the small
    # frame directly instead of copying every feature column (and instead of
    # demanding a `month` column that was never used).
    df_test_new = pd.DataFrame({
        'wagnum': df['wagnum'].to_numpy(),
        'target_month': pred_month,
        'target_day': pred_day,
    })

    sample = pd.read_csv(sample_path)
    # NOTE(review): if the sample file already carries target_month/target_day
    # columns, this merge yields *_x / *_y suffixed columns - confirm its schema.
    sample = sample.merge(df_test_new, on='wagnum', how='left')

    return sample

# Blend the LightGBM and XGBoost fold models for both targets on the test frame.
prediction = make_predictions(day_clfs_lgbm, day_clfs_xg, month_clfs_lgbm, month_clfs_xg, df_test)
# The result has one row per row of the sample file (left merge), so this checks
# that the sample file and the processed test frame have the same length.
assert len(prediction) == len(df_test)

prediction.describe()

In [None]:
# make_predictions takes four model lists (day/month x LightGBM/XGBoost) plus the
# frame; the earlier three-argument call raised TypeError and stopped a full run
# before the submission was written.
# NOTE(review): this now repeats the blended prediction of the previous cell -
# confirm that is the intended content of this cell, or delete the cell.
prediction = make_predictions(day_clfs_lgbm, day_clfs_xg, month_clfs_lgbm, month_clfs_xg, df_test)
assert len(prediction) == len(df_test)

prediction.describe()

In [104]:
# Write the submission file. Create the output folder first: to_csv does not
# create missing directories, so a fresh checkout would otherwise fail here.
os.makedirs('predictions', exist_ok=True)
prediction.to_csv('predictions/lgbm_xg_siam.csv', index=False)