In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_train_raw = pd.read_csv('../train/data/train.csv')
df_test_raw = pd.read_csv('../train/data/test.csv')

df_train_raw.columns

Index(['wagnum', 'month', 'target_month', 'target_day', '10d_nasip_mean',
       '5d_avg_distance_min', '10d_isload_mean', '60d_opor_station_sign_mean',
       '60d_skoroport_mean', '60d_avg_distance_max',
       ...
       'diff_st_pr', '_10d_pr_rems', '_20d_pr_rems', '_1m_pr_rems',
       '_allm_pr_rems', '_allm_tr_rems', 'last_month_for_cnt',
       'months_after_tr_rems', 'months_after_pr_rems', 'is_in_kti'],
      dtype='object', length=147)

In [3]:
import re

targets = ['target_month', 'target_day']
numerical = list(filter(lambda x: re.match(r'\d+[dm]_', x), df_train_raw.columns))
categorical = ['kod_vrab', 'model',  'zavod_build', 'kuzov', 'telega', 'expected_srok_sl_y_b', 'date_build_b', 'manage_type', 'rod_id_x', 'reestr_state',
                'tormoz', 'tipvozd', 'tippogl', 'ownertype']
dates = ['month', 'date_kap', 'date_dep', 'date_pl_rem', 'last_rem_date', 'date_build', 'srok_sl', 'date_iskl']

numerical += [
    "days_to_pl_rem",
    "days_from_last_rem",
    "ost_prob",
    "gruz",
    "cnsi_gruz_capacity",
    "cnsi_volumek",
    "tara",
    "cnsi_probeg_dr",
    "cnsi_probeg_kr",
    "norma_km",
    "expected_srok_sl_y"
]

notna_features = ['date_kap', 'date_dep', 'date_iskl']
notna_features_res = [feature + '_notna' for feature in notna_features]
categorical += notna_features_res

print('Unused columns:')
for col in df_train_raw.columns:
    if col not in targets and col not in numerical and col not in categorical and col not in dates:
        print(col)

features = numerical + categorical + targets
train_features = numerical + categorical

medians = df_train_raw[numerical].median()

def preprcoess_df(df, test=False):
    for date in dates:
        df[date] = pd.to_datetime(df[date])
    df = df[df['month'] > '2022-08-01']

    for col in notna_features:
        df[col + '_notna'] = df[col].notna().astype(int)

    df[categorical] = df[categorical].astype('str')

    return df

df_train = preprcoess_df(df_train_raw)
df_test = preprcoess_df(df_test_raw, test=True)


df_train.shape, df_test.shape

Unused columns:
wagnum
ownership_type
rod_id_y
days_to_iskl
days_to_srok_sl
iskl_in_a_year
ost_prob_in_a_month
kod_vrab_2_tr
kod_vrab_3_tr
kod_vrab_5_tr
neis1_kod_tr_max
neis2_kod_tr_max
neis3_kod_tr_max
gr_probeg_tr_max
gr_probeg_tr_mean
gr_probeg_tr_min
por_probeg_tr_max
por_probeg_tr_mean
por_probeg_tr_min
_1m_tr_rems
kod_vrab_0_pr
kod_vrab_1_pr
diff_road_pr
diff_st_pr
_10d_pr_rems
_20d_pr_rems
_1m_pr_rems
_allm_pr_rems
_allm_tr_rems
last_month_for_cnt
months_after_tr_rems
months_after_pr_rems
is_in_kti


((203610, 147), (33707, 147))

In [4]:
features

['10d_nasip_mean',
 '5d_avg_distance_min',
 '10d_isload_mean',
 '60d_opor_station_sign_mean',
 '60d_skoroport_mean',
 '60d_avg_distance_max',
 '30d_soprovod_mean',
 '60d_another_road_max',
 '5d_st_ferry_sign_mean',
 '60d_avg_distance_min',
 '10d_diff_mean',
 '30d_openvagons_mean',
 '5d_isload_mean',
 '60d_naliv_mean',
 '30d_skoroport_mean',
 '60d_soprovod_mean',
 '30d_naliv_mean',
 '60d_st_river_sign_mean',
 '10d_openvagons_mean',
 '5d_avg_distance_max',
 '30d_st_sea_sign_mean',
 '60d_isload_mean',
 '10d_skoroport_mean',
 '5d_st_river_sign_mean',
 '60d_st_ferry_sign_mean',
 '5d_diff_mean',
 '10d_naval_mean',
 '10d_naliv_mean',
 '30d_naval_mean',
 '30d_avg_distance_max',
 '30d_st_border_sign_mean',
 '5d_st_border_sign_mean',
 '60d_nasip_mean',
 '60d_st_sea_sign_mean',
 '10d_st_car_sign_mean',
 '10d_avg_distance_max',
 '30d_st_river_sign_mean',
 '10d_st_sea_sign_mean',
 '60d_naval_mean',
 '5d_naval_mean',
 '10d_smerz_mean',
 '10d_avg_distance_min',
 '60d_openvagons_mean',
 '5d_naliv_mean

In [5]:
df_train.head()

Unnamed: 0,wagnum,month,target_month,target_day,10d_nasip_mean,5d_avg_distance_min,10d_isload_mean,60d_opor_station_sign_mean,60d_skoroport_mean,60d_avg_distance_max,...,diff_st_pr,_10d_pr_rems,_20d_pr_rems,_1m_pr_rems,_allm_pr_rems,_allm_tr_rems,last_month_for_cnt,months_after_tr_rems,months_after_pr_rems,is_in_kti
0,0,2022-09-01,0,0,0.0,1980.115725,0.0,0.0,0.0,2540.472727,...,0.0,0.0,0.0,0.0,0.0,0.0,760,731.0,731.0,1
1,1,2022-09-01,0,0,0.0,2138.51087,1.0,0.0,0.0,2540.472727,...,0.0,0.0,0.0,0.0,0.0,0.0,760,731.0,731.0,0
2,2,2022-09-01,0,0,0.8,1920.025345,0.7,0.0,0.0,2540.472727,...,0.0,0.0,0.0,0.0,0.0,0.0,760,731.0,731.0,0
3,3,2022-09-01,0,0,0.0,1920.025345,0.0,0.0,0.0,2540.472727,...,0.0,0.0,0.0,0.0,0.0,2.0,760,0.0,731.0,0
4,4,2022-09-01,0,0,0.0,1917.051546,0.0,0.0,0.0,2540.472727,...,0.0,0.0,0.0,0.0,0.0,0.0,760,731.0,731.0,0


In [6]:
df_train[df_train.wagnum == 101][['60d_diff_mean', '30d_diff_mean', '5d_diff_mean', 'target_month', 'ost_prob',]]

Unnamed: 0,60d_diff_mean,30d_diff_mean,5d_diff_mean,target_month,ost_prob
101,144.183333,146.966667,52.4,0,35094.0
34077,191.4,235.833333,192.6,1,28019.0
68053,151.25,73.466667,0.0,0,157931.0
102029,142.05,210.633333,486.4,0,151612.0
136020,149.05,75.533333,0.0,0,148988.0
170000,105.3,153.533333,73.0,0,144382.0


In [7]:
medians = df_train[numerical].median()

def fill_nans(df):
    df['date_iskl'] = df['date_iskl'].fillna(pd.to_datetime('2025-01-01'))
    df['date_iskl'] = pd.to_datetime(df['date_iskl'])
    df['days_from_last_rem'] = df['days_from_last_rem'].fillna(1200)
    df[numerical] = df[numerical].fillna(medians)
    return df

df_train = fill_nans(df_train)
df_test = fill_nans(df_test)

In [8]:
def add_features(df):
    df['days_to_iskl'] = (df['date_iskl'] - df['month']).dt.days
    df['days_to_srok_sl'] = (df['srok_sl'] - df['month']).dt.days
    df['iskl_in_a_year'] = df['days_to_iskl'] < 365
    df['ost_prob_in_a_month'] = df['ost_prob'] - df['30d_diff_mean'] * 30
    df['ost_prob_in_3_months'] = df['ost_prob'] - df['30d_diff_mean'] * 90
    df['ost_prob_in_10_days'] = df['ost_prob'] - df['30d_diff_mean'] * 10
    df['age_y'] = (df['month'] - df['date_build']).dt.days / 365

    df['days_since_kap'] = (df['month'] - df['date_kap']).dt.days
    df['days_since_dep'] = (df['month'] - df['date_dep']).dt.days

    df['30d_next_time_prob'] = df['ost_prob'] - df['30d_avg_distance_max']
    df['5d_next_time_prob'] = df['ost_prob'] - df['5d_avg_distance_max']

    df['season'] = df['month'].dt.month.apply(lambda x: 'winter' if x in [12, 1, 2] else 'spring' if x in [3, 4, 5] else 'summer' if x in [6, 7, 8] else 'autumn')

    return df


df_train = add_features(df_train)
df_test = add_features(df_test)

  df['ost_prob_in_3_months'] = df['ost_prob'] - df['30d_diff_mean'] * 90
  df['ost_prob_in_10_days'] = df['ost_prob'] - df['30d_diff_mean'] * 10
  df['age_y'] = (df['month'] - df['date_build']).dt.days / 365
  df['days_since_kap'] = (df['month'] - df['date_kap']).dt.days
  df['days_since_dep'] = (df['month'] - df['date_dep']).dt.days
  df['30d_next_time_prob'] = df['ost_prob'] - df['30d_avg_distance_max']
  df['5d_next_time_prob'] = df['ost_prob'] - df['5d_avg_distance_max']
  df['season'] = df['month'].dt.month.apply(lambda x: 'winter' if x in [12, 1, 2] else 'spring' if x in [3, 4, 5] else 'summer' if x in [6, 7, 8] else 'autumn')
  df['ost_prob_in_3_months'] = df['ost_prob'] - df['30d_diff_mean'] * 90
  df['ost_prob_in_10_days'] = df['ost_prob'] - df['30d_diff_mean'] * 10
  df['age_y'] = (df['month'] - df['date_build']).dt.days / 365
  df['days_since_kap'] = (df['month'] - df['date_kap']).dt.days
  df['days_since_dep'] = (df['month'] - df['date_dep']).dt.days
  df['30d_next_time_pro

In [9]:
numerical = df_train_raw.select_dtypes(exclude=['object']).columns

corr = df_train_raw[numerical].corr()

print(corr['target_month'].sort_values(ascending=False)[:10])
print(corr['target_month'].sort_values(ascending=True)[:10])

target_month            1.000000
target_day              0.564527
days_from_last_rem      0.136233
last_month_for_cnt      0.108118
months_after_pr_rems    0.106759
wagnum                  0.083299
por_probeg_tr_min       0.082819
por_probeg_tr_mean      0.082756
por_probeg_tr_max       0.082662
gr_probeg_tr_min        0.071579
Name: target_month, dtype: float64
ost_prob              -0.250584
ost_prob_in_a_month   -0.250439
days_to_pl_rem        -0.154548
_allm_pr_rems         -0.106475
diff_st_pr            -0.100198
kod_vrab_0_pr         -0.083434
tara                  -0.067995
expected_srok_sl_y    -0.067550
cnsi_volumek          -0.066799
kod_vrab_1_pr         -0.058093
Name: target_month, dtype: float64


In [10]:
df_train.to_csv('../train/data/train.csv', index=False)
df_test.to_csv('../train/data/test.csv', index=False)