In [323]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import datetime
from graphviz import Digraph
import time
from tqdm import tqdm
from sklearn.model_selection import TimeSeriesSplit

In [307]:
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.2f}'.format

In [379]:
df = pd.read_csv('/Users/kost/Desktop/решено/data/statistics-07-20.csv',sep=';',parse_dates=['operation_date'])
train_ops = pd.read_csv('/Users/kost/Desktop/решено/data/poezd.csv', sep=';', encoding='utf-8')
car_ops = pd.read_csv('/Users/kost/Desktop/решено/data/codes.csv',sep=';', encoding='utf-8')

In [380]:
ndf = df.sort_values(by=['operation_st_id','car_number','operation_date']).drop_duplicates().reset_index(drop=True)

### Первичный анализ датасета

In [381]:
operation_code = 2
print(train_ops[train_ops.index_code == operation_code].to_string(index=False))
print(car_ops[car_ops.index_code == operation_code].to_string(index=False))

 index_code mnem_code                                                         description
          2        От Отпр. поезда со станции (без сдачи на дорогу,отделение,участок ДНЦ)
 index_code mnem_code                            description
          2      ОТПР ОТПРАВЛЕНИЕ ВАГОНА СО СТАНЦИИ         


In [382]:
# уникальные станции по операциям
df.operation_st_id.value_counts()

2,000,038,976.00    207099
2,000,038,600.00    146427
2,000,037,862.00    116936
2,001,933,494.00     96307
2,001,930,816.00     82794
                     ...  
2,000,037,640.00         1
2,000,036,452.00         1
2,000,036,458.00         1
2,000,035,312.00         1
2,000,036,334.00         1
Name: operation_st_id, Length: 744, dtype: int64

In [383]:
# уникальные вагоны по операциям
df.car_number.value_counts()

55864821    310
55822928    290
55927537    287
55626428    273
55701130    272
           ... 
60175320      1
95378899      1
29064912      1
57448243      1
60025954      1
Name: car_number, Length: 441248, dtype: int64

### Избавляемся от пропусков в operation_car

In [384]:
for i in ndf.operation_train[ndf.operation_car.isna()].value_counts().index:
    print(i,train_ops.description[train_ops.index_code == i].values)

2.0 ['Отпр. поезда со станции (без сдачи на дорогу,отделение,участок ДНЦ)']
62.0 ['Отпр. поезда со станции со сдачей на соседний участок']
22.0 ['Отпр. поезда co станции со сдачей на другую дорогу']
42.0 ['Отпр. поезда со станции со сдачей на соседнее отделение']


In [385]:
# все пропуски можно заменить операцией отправления
ndf.operation_car = ndf.operation_car.fillna(2)

### Избавляемся от пропусков в danger

In [315]:
ndf.danger = ndf.danger.fillna(0)

### Отрисовка графов последовательностей операций

In [316]:
def ops_tr_for_set_of_carts(cartsSet: set) -> pd.DataFrame:
    """
    Builds DF with historical data of changes in states from one operation to another for a set of carts IDs.
    :param cartsSet: set with carts numbers from nom_vag_op
    :return: dataframe with following format (note that state changes are not unique)
        from 	to
    0 	3 	    4
    1 	4 	    3
    2 	3 	    4
    3 	4 	    3
    ...
    """
    stateChangeDF = pd.DataFrame(columns=['from', 'to'])
    iter = 0
    for cartID in tqdm(cartsSet):
        iter += 1
        opsListCart = list(ndf.operation_car[ndf.car_number == cartID])
        state_change = pd.DataFrame({'from': opsListCart[:-1], 'to': opsListCart[1:]})
        stateChangeDF = pd.concat(
            [stateChangeDF, state_change],
            axis=0,
            join="outer",
            ignore_index=True,
            keys=None,
            levels=None,
            names=None,
            verify_integrity=False,
            copy=True,
        )
    return stateChangeDF


def count_ops_transitions(cartsSet: set) -> dict:
    """
    Builds a dict with unique operation changes and state change frequency.
    :param cartsSet: set with carts numbers from nom_vag_op
    :return: dict with unique operations and their quantity. See example below:
    {'3->4': 2, '4->3': 2, '3->2': 1, '2->80': 1}
    """
    sChangeList = []
    linksDict: dict = {}
    sChangeDF = ops_tr_for_set_of_carts(cartsSet)
    for i in range(sChangeDF.shape[0]):
        src = str(list(sChangeDF.loc[i])[0])
        trg = str(list(sChangeDF.loc[i])[1])
        sChangeList.append(src + "->" + trg)
    for entry in sChangeList:
        key = str(entry)
        if key in linksDict.keys():
            linksDict[key] += 1
        else:
            linksDict[key] = 1
    return linksDict

In [317]:
# cars
# carSet = {37843901}
# carSet = {94033792}

# stations
# option A - 
# carSet = set(ndf.car_number[ndf.operation_st_id == 2000038976])

# option B - drop all ops chains with less then 3
st = ndf.car_number[ndf.operation_st_id == 2000038976].value_counts()
carSet = set(st[st > 2].index)

In [318]:
sdict = count_ops_transitions(carSet)

DG = Digraph('finite_state_machine', filename='ops_fsm_rzd')
DG.attr(rankdir='LR', size='8,5')

for key in sdict:
    src = car_ops[car_ops.index_code == float(key.split('->')[0])].values[0][1]
    dst = car_ops[car_ops.index_code == float(key.split('->')[1])].values[0][1]
    DG.edge(src, dst, weight=str(sdict[key]), label=str(sdict[key]))

DG.view()

100%|██████████| 38496/38496 [04:11<00:00, 153.00it/s]


'ops_fsm_rzd.pdf'

### Профайлер

In [None]:
## profiler code
dt = datetime.datetime.now().strftime("%m/%d/%y - %H:%M")
profile = ProfileReport(ndf, title=dt, explorative=True)
fname = 'pd-report.html'
profile.to_file(output_file=fname)

### =================

In [319]:
ndf

Unnamed: 0,index_train,length,car_number,destination_esr,adm,danger,gruz,loaded,operation_car,operation_date,operation_st_esr,operation_st_id,operation_train,receiver,rodvag,rod_train,sender,ssp_station_esr,ssp_station_id,tare_weight,weight_brutto
0,880106902880202.00,,30891014,880303.00,,0.00,,2.00,3.00,2020-07-22 13:00:00,880303.00,2000035070.00,4.00,,90.00,83.00,,880202.00,,,47.00
1,880106902880202.00,,30893911,880303.00,,0.00,,2.00,3.00,2020-07-22 13:00:00,880303.00,2000035070.00,4.00,,90.00,83.00,,880202.00,,,47.00
2,,1.38,29068855,893500.00,20.00,0.00,421034.00,,18.00,2020-07-22 13:37:00,880407.00,2000035090.00,,91860990.00,20.00,,33977721.00,,,269.00,
3,880407949880106.00,1.38,29068855,893500.00,,0.00,521016.00,2.00,4.00,2020-07-22 22:30:00,880407.00,2000035090.00,5.00,91860990.00,20.00,72.00,33977721.00,880106.00,2000035130.00,,240.00
4,880407949880106.00,1.38,29068855,893500.00,,0.00,521016.00,,2.00,2020-07-23 00:03:00,880407.00,2000035090.00,2.00,91860990.00,20.00,72.00,33977721.00,880106.00,2000035130.00,,240.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4188133,,,64437627,917207.00,,0.00,,2.00,79.00,2020-07-26 22:10:00,,,,,,,,,,,
4188134,969004044968209.00,1.00,64437627,917207.00,,0.00,161113.00,2.00,4.00,2020-07-26 23:43:00,,,5.00,161246.00,60.00,,52682351.00,968209.00,2000038610.00,,
4188135,969004044968209.00,1.00,64437627,917207.00,,0.00,161113.00,,2.00,2020-07-27 00:36:00,,,2.00,161246.00,60.00,52.00,52682351.00,968209.00,2000038610.00,,1036.00
4188136,,1.00,65635427,693903.00,20.00,0.00,351043.00,,13.00,2020-07-30 16:30:00,,,,0.00,60.00,,1373772.00,,,244.00,


### add target

In [386]:
ind = []
for i in range(ndf.shape[0]):
    if (
        ndf.operation_car[i] == 4
        and ndf.operation_car[i + 1] == 2
        and ndf.car_number[i] == ndf.car_number[i + 1]
    ):
        ind.append(1)
    else:
        ind.append(0)

ind.insert(0, 0)
ind = ind[:-1]

ndf["target"] = ind

In [387]:
ndf = ndf.fillna(0)

In [388]:
ndf

Unnamed: 0,index_train,length,car_number,destination_esr,adm,danger,gruz,loaded,operation_car,operation_date,operation_st_esr,operation_st_id,operation_train,receiver,rodvag,rod_train,sender,ssp_station_esr,ssp_station_id,tare_weight,weight_brutto,target
0,880106902880202.00,0.00,30891014,880303.00,0.00,0.00,0.00,2.00,3.00,2020-07-22 13:00:00,880303.00,2000035070.00,4.00,0.00,90.00,83.00,0.00,880202.00,0.00,0.00,47.00,0
1,880106902880202.00,0.00,30893911,880303.00,0.00,0.00,0.00,2.00,3.00,2020-07-22 13:00:00,880303.00,2000035070.00,4.00,0.00,90.00,83.00,0.00,880202.00,0.00,0.00,47.00,0
2,0.00,1.38,29068855,893500.00,20.00,0.00,421034.00,0.00,18.00,2020-07-22 13:37:00,880407.00,2000035090.00,0.00,91860990.00,20.00,0.00,33977721.00,0.00,0.00,269.00,0.00,0
3,880407949880106.00,1.38,29068855,893500.00,0.00,0.00,521016.00,2.00,4.00,2020-07-22 22:30:00,880407.00,2000035090.00,5.00,91860990.00,20.00,72.00,33977721.00,880106.00,2000035130.00,0.00,240.00,0
4,880407949880106.00,1.38,29068855,893500.00,0.00,0.00,521016.00,0.00,2.00,2020-07-23 00:03:00,880407.00,2000035090.00,2.00,91860990.00,20.00,72.00,33977721.00,880106.00,2000035130.00,0.00,240.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4188133,0.00,0.00,64437627,917207.00,0.00,0.00,0.00,2.00,79.00,2020-07-26 22:10:00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0
4188134,969004044968209.00,1.00,64437627,917207.00,0.00,0.00,161113.00,2.00,4.00,2020-07-26 23:43:00,0.00,0.00,5.00,161246.00,60.00,0.00,52682351.00,968209.00,2000038610.00,0.00,0.00,0
4188135,969004044968209.00,1.00,64437627,917207.00,0.00,0.00,161113.00,0.00,2.00,2020-07-27 00:36:00,0.00,0.00,2.00,161246.00,60.00,52.00,52682351.00,968209.00,2000038610.00,0.00,1036.00,1
4188136,0.00,1.00,65635427,693903.00,20.00,0.00,351043.00,0.00,13.00,2020-07-30 16:30:00,0.00,0.00,0.00,0.00,60.00,0.00,1373772.00,0.00,0.00,244.00,0.00,0


### split to train and test

In [389]:
X = X.drop(columns=['index_train','gruz','adm','ssp_station_esr','ssp_station_id','tare_weight','loaded'])

In [390]:
X = ndf.sort_values(by=['operation_date']).reset_index(drop=True)

In [413]:
test = X.iloc[-500000:]
train = X.iloc[:-500000]

In [414]:
X_train = train.drop(columns='target')
X_test = test.drop(columns='target')
y_train = train.target
y_test = test.target

### Catboost

In [398]:
import catboost

In [415]:
# c параметрами по умолчанию
model = catboost.CatBoostClassifier()
model.fit(X_train, y_train)

Learning rate set to 0.343527
0:	learn: 0.0482186	total: 492ms	remaining: 8m 11s
1:	learn: 0.0112645	total: 985ms	remaining: 8m 11s
2:	learn: 0.0069490	total: 1.44s	remaining: 7m 58s
3:	learn: 0.0056085	total: 1.86s	remaining: 7m 43s
4:	learn: 0.0049766	total: 2.23s	remaining: 7m 23s
5:	learn: 0.0047015	total: 2.68s	remaining: 7m 24s
6:	learn: 0.0044267	total: 3.1s	remaining: 7m 20s
7:	learn: 0.0042646	total: 3.48s	remaining: 7m 12s
8:	learn: 0.0041760	total: 3.91s	remaining: 7m 10s
9:	learn: 0.0040497	total: 4.33s	remaining: 7m 9s
10:	learn: 0.0039842	total: 4.76s	remaining: 7m 7s
11:	learn: 0.0038320	total: 5.17s	remaining: 7m 6s
12:	learn: 0.0037559	total: 5.58s	remaining: 7m 3s
13:	learn: 0.0036512	total: 6s	remaining: 7m 2s
14:	learn: 0.0035227	total: 6.39s	remaining: 6m 59s
15:	learn: 0.0034249	total: 6.79s	remaining: 6m 57s
16:	learn: 0.0034026	total: 7.2s	remaining: 6m 56s
17:	learn: 0.0033756	total: 7.63s	remaining: 6m 56s
18:	learn: 0.0033092	total: 8s	remaining: 6m 53s
19:	l

158:	learn: 0.0015348	total: 54.2s	remaining: 4m 46s
159:	learn: 0.0015348	total: 54.5s	remaining: 4m 46s
160:	learn: 0.0015348	total: 54.9s	remaining: 4m 45s
161:	learn: 0.0015348	total: 55.2s	remaining: 4m 45s
162:	learn: 0.0015348	total: 55.5s	remaining: 4m 44s
163:	learn: 0.0015348	total: 55.8s	remaining: 4m 44s
164:	learn: 0.0015348	total: 56.1s	remaining: 4m 43s
165:	learn: 0.0015348	total: 56.3s	remaining: 4m 43s
166:	learn: 0.0015348	total: 56.6s	remaining: 4m 42s
167:	learn: 0.0015348	total: 57s	remaining: 4m 42s
168:	learn: 0.0015348	total: 57.3s	remaining: 4m 41s
169:	learn: 0.0015348	total: 57.6s	remaining: 4m 41s
170:	learn: 0.0015348	total: 57.9s	remaining: 4m 40s
171:	learn: 0.0015348	total: 58.2s	remaining: 4m 40s
172:	learn: 0.0015348	total: 58.5s	remaining: 4m 39s
173:	learn: 0.0015348	total: 58.8s	remaining: 4m 39s
174:	learn: 0.0015348	total: 59.1s	remaining: 4m 38s
175:	learn: 0.0015348	total: 59.4s	remaining: 4m 38s
176:	learn: 0.0015348	total: 59.7s	remaining: 4m

312:	learn: 0.0015348	total: 1m 40s	remaining: 3m 41s
313:	learn: 0.0015348	total: 1m 41s	remaining: 3m 40s
314:	learn: 0.0015348	total: 1m 41s	remaining: 3m 40s
315:	learn: 0.0015348	total: 1m 41s	remaining: 3m 40s
316:	learn: 0.0015348	total: 1m 42s	remaining: 3m 39s
317:	learn: 0.0015348	total: 1m 42s	remaining: 3m 39s
318:	learn: 0.0015348	total: 1m 42s	remaining: 3m 39s
319:	learn: 0.0015348	total: 1m 42s	remaining: 3m 38s
320:	learn: 0.0015348	total: 1m 43s	remaining: 3m 38s
321:	learn: 0.0015348	total: 1m 43s	remaining: 3m 37s
322:	learn: 0.0015348	total: 1m 43s	remaining: 3m 37s
323:	learn: 0.0015348	total: 1m 44s	remaining: 3m 37s
324:	learn: 0.0015348	total: 1m 44s	remaining: 3m 37s
325:	learn: 0.0015348	total: 1m 44s	remaining: 3m 36s
326:	learn: 0.0015348	total: 1m 45s	remaining: 3m 36s
327:	learn: 0.0015348	total: 1m 45s	remaining: 3m 35s
328:	learn: 0.0015348	total: 1m 45s	remaining: 3m 35s
329:	learn: 0.0015348	total: 1m 45s	remaining: 3m 35s
330:	learn: 0.0015348	total:

466:	learn: 0.0015348	total: 2m 26s	remaining: 2m 47s
467:	learn: 0.0015348	total: 2m 26s	remaining: 2m 46s
468:	learn: 0.0015348	total: 2m 27s	remaining: 2m 46s
469:	learn: 0.0015348	total: 2m 27s	remaining: 2m 46s
470:	learn: 0.0015348	total: 2m 27s	remaining: 2m 46s
471:	learn: 0.0015348	total: 2m 28s	remaining: 2m 45s
472:	learn: 0.0015348	total: 2m 28s	remaining: 2m 45s
473:	learn: 0.0015348	total: 2m 28s	remaining: 2m 44s
474:	learn: 0.0015348	total: 2m 28s	remaining: 2m 44s
475:	learn: 0.0015348	total: 2m 29s	remaining: 2m 44s
476:	learn: 0.0015348	total: 2m 29s	remaining: 2m 43s
477:	learn: 0.0015348	total: 2m 29s	remaining: 2m 43s
478:	learn: 0.0015348	total: 2m 30s	remaining: 2m 43s
479:	learn: 0.0015348	total: 2m 30s	remaining: 2m 42s
480:	learn: 0.0015348	total: 2m 30s	remaining: 2m 42s
481:	learn: 0.0015348	total: 2m 31s	remaining: 2m 42s
482:	learn: 0.0015348	total: 2m 31s	remaining: 2m 41s
483:	learn: 0.0015348	total: 2m 31s	remaining: 2m 41s
484:	learn: 0.0015348	total:

620:	learn: 0.0015348	total: 3m 12s	remaining: 1m 57s
621:	learn: 0.0015348	total: 3m 12s	remaining: 1m 57s
622:	learn: 0.0015348	total: 3m 13s	remaining: 1m 56s
623:	learn: 0.0015348	total: 3m 13s	remaining: 1m 56s
624:	learn: 0.0015348	total: 3m 13s	remaining: 1m 56s
625:	learn: 0.0015348	total: 3m 14s	remaining: 1m 55s
626:	learn: 0.0015348	total: 3m 14s	remaining: 1m 55s
627:	learn: 0.0015348	total: 3m 14s	remaining: 1m 55s
628:	learn: 0.0015348	total: 3m 14s	remaining: 1m 55s
629:	learn: 0.0015348	total: 3m 15s	remaining: 1m 54s
630:	learn: 0.0015348	total: 3m 15s	remaining: 1m 54s
631:	learn: 0.0015348	total: 3m 15s	remaining: 1m 54s
632:	learn: 0.0015348	total: 3m 16s	remaining: 1m 53s
633:	learn: 0.0015348	total: 3m 17s	remaining: 1m 53s
634:	learn: 0.0015348	total: 3m 17s	remaining: 1m 53s
635:	learn: 0.0015348	total: 3m 18s	remaining: 1m 53s
636:	learn: 0.0015348	total: 3m 18s	remaining: 1m 53s
637:	learn: 0.0015348	total: 3m 19s	remaining: 1m 52s
638:	learn: 0.0015348	total:

772:	learn: 0.0015047	total: 3m 59s	remaining: 1m 10s
773:	learn: 0.0015047	total: 4m	remaining: 1m 10s
774:	learn: 0.0015047	total: 4m	remaining: 1m 9s
775:	learn: 0.0015047	total: 4m	remaining: 1m 9s
776:	learn: 0.0015046	total: 4m 1s	remaining: 1m 9s
777:	learn: 0.0015046	total: 4m 1s	remaining: 1m 8s
778:	learn: 0.0015046	total: 4m 1s	remaining: 1m 8s
779:	learn: 0.0015046	total: 4m 1s	remaining: 1m 8s
780:	learn: 0.0015046	total: 4m 2s	remaining: 1m 7s
781:	learn: 0.0015046	total: 4m 2s	remaining: 1m 7s
782:	learn: 0.0015046	total: 4m 2s	remaining: 1m 7s
783:	learn: 0.0015046	total: 4m 3s	remaining: 1m 7s
784:	learn: 0.0015045	total: 4m 3s	remaining: 1m 6s
785:	learn: 0.0015045	total: 4m 3s	remaining: 1m 6s
786:	learn: 0.0015045	total: 4m 4s	remaining: 1m 6s
787:	learn: 0.0015045	total: 4m 4s	remaining: 1m 5s
788:	learn: 0.0015045	total: 4m 4s	remaining: 1m 5s
789:	learn: 0.0015045	total: 4m 4s	remaining: 1m 5s
790:	learn: 0.0015045	total: 4m 5s	remaining: 1m 4s
791:	learn: 0.0015

928:	learn: 0.0015044	total: 4m 46s	remaining: 21.9s
929:	learn: 0.0015044	total: 4m 46s	remaining: 21.6s
930:	learn: 0.0015044	total: 4m 46s	remaining: 21.3s
931:	learn: 0.0015044	total: 4m 47s	remaining: 21s
932:	learn: 0.0015044	total: 4m 47s	remaining: 20.6s
933:	learn: 0.0015044	total: 4m 47s	remaining: 20.3s
934:	learn: 0.0015044	total: 4m 48s	remaining: 20s
935:	learn: 0.0015044	total: 4m 48s	remaining: 19.7s
936:	learn: 0.0015044	total: 4m 48s	remaining: 19.4s
937:	learn: 0.0015044	total: 4m 48s	remaining: 19.1s
938:	learn: 0.0015044	total: 4m 49s	remaining: 18.8s
939:	learn: 0.0015044	total: 4m 49s	remaining: 18.5s
940:	learn: 0.0015044	total: 4m 49s	remaining: 18.2s
941:	learn: 0.0015044	total: 4m 50s	remaining: 17.9s
942:	learn: 0.0015044	total: 4m 50s	remaining: 17.6s
943:	learn: 0.0015044	total: 4m 50s	remaining: 17.3s
944:	learn: 0.0015044	total: 4m 51s	remaining: 16.9s
945:	learn: 0.0015044	total: 4m 51s	remaining: 16.6s
946:	learn: 0.0015044	total: 4m 51s	remaining: 16.

<catboost.core.CatBoostClassifier at 0x13b97d760>

In [416]:
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    428234
           1       0.99      1.00      1.00     71766

    accuracy                           1.00    500000
   macro avg       1.00      1.00      1.00    500000
weighted avg       1.00      1.00      1.00    500000



In [417]:
test_sub = test
test_sub = test_sub.groupby('operation_st_id').agg({'target':np.sum})
test_sub = test_sub[test_sub.target != 0]
test_sub

Unnamed: 0_level_0,target
operation_st_id,Unnamed: 1_level_1
2000035090.00,2
2000035110.00,61
2000035130.00,29
2000035140.00,6
2000035162.00,12
...,...
2001933538.00,123
2002023503.00,2
2002023505.00,5
2002025275.00,1


In [418]:
X_tmp = X_test.copy()

In [419]:
X_tmp['target'] = y_pred

In [420]:
test_s = X_tmp.groupby('operation_st_id').agg({'target':np.sum})
test_s = test_s[test_s.target != 0]
test_s

Unnamed: 0_level_0,target
operation_st_id,Unnamed: 1_level_1
2000035090.00,2
2000035110.00,61
2000035130.00,29
2000035140.00,6
2000035162.00,12
...,...
2001933538.00,123
2002023503.00,2
2002023505.00,5
2002025275.00,1
