# Imports

In [161]:
from datetime import timedelta
import pandas as pd
import numpy as np
from holidays import country_holidays

# Установите максимальное количество отображаемых столбцов и ширину
pd.set_option('display.max_columns', None)  # Показать все столбцы
pd.set_option('display.expand_frame_repr', False)  # Не переносить таблицу на новую строку
pd.set_option('display.max_colwidth', None)  # Показать всю ширину столбцов


# Create Dataset

In [61]:
num_days = 20568
atm_per_day = 50

# Генерация данных для столбцов
dates = pd.date_range(start="2023-11-01", periods=num_days // atm_per_day, freq="D")  # Уникальные даты
dates = np.repeat(dates, atm_per_day)  # Повторение каждой даты 50 раз

# Генерация уникальных ATM ID
atm_ids = [f"ATM{str(10640 + i % 100).zfill(5)}" for i in range(atm_per_day)]  # 100 уникальных ID банкоматов
atm_ids = np.tile(atm_ids, len(dates) // atm_per_day)  # Повторяем их для всех дат

# Применим случайное уменьшение количества банкоматов для некоторых дат
mask = np.random.choice([True, False], size=len(dates), p=[0.95, 0.05])  # В 5% случаев будет меньше 50 банкоматов
dates = dates[mask]
atm_ids = atm_ids[mask]

# Генерация случайных данных для столбцов с префиксом 'o_'
data = np.random.randint(0, 25, size=(len(dates), 7))  # Случайные значения для столбцов с диапазоном 0-25

# Создание DataFrame
columns = ['DATE', 'ATM', 'o_500', 'o_200', 'o_100', 'o_50', 'o_20', 'o_10', 'o_5']
df = pd.DataFrame(np.column_stack([dates.astype(str), atm_ids, data]), columns=columns)

# Приведение числовых столбцов к типу int
for col in columns[2:]:
    df[col] = df[col].astype(int)

# Вывод первых строк для проверки
print(df.head())

# Проверка количества банкоматов для случайной даты
print(df.groupby('DATE').size())

         DATE       ATM  o_500  o_200  o_100  o_50  o_20  o_10  o_5
0  2023-11-01  ATM10640      0     17     10     1     2    14    7
1  2023-11-01  ATM10641      4     15     11    16    10    12    3
2  2023-11-01  ATM10643      8     18      0     5     2    14    7
3  2023-11-01  ATM10644     14      2      3    22     1    16    8
4  2023-11-01  ATM10645     23      3     11    14    15    19    1
DATE
2023-11-01    47
2023-11-02    46
2023-11-03    50
2023-11-04    49
2023-11-05    48
              ..
2024-12-11    44
2024-12-12    48
2024-12-13    47
2024-12-14    46
2024-12-15    47
Length: 411, dtype: int64


In [62]:
df.dtypes

DATE     object
ATM      object
o_500     int32
o_200     int32
o_100     int32
o_50      int32
o_20      int32
o_10      int32
o_5       int32
dtype: object

# Process Date

In [63]:
bel_holidays=country_holidays('BY')
bel_holidays.special_public_holidays

{1998: ((1, 2, 1, 10), (4, 27, 4, 25)),
 1999: ((1, 8, 1, 16), (4, 19, 4, 17)),
 2000: ((5, 8, 5, 13), (11, 6, 11, 11)),
 2001: ((1, 2, 1, 20),
  (3, 9, 3, 3),
  (4, 23, 4, 21),
  (4, 30, 4, 28),
  (7, 2, 7, 7),
  (12, 24, 12, 22),
  (12, 31, 12, 29)),
 2002: ((1, 2, 1, 5), (5, 10, 5, 18), (11, 8, 11, 16)),
 2003: ((1, 6, 1, 4), (5, 5, 5, 3)),
 2004: ((1, 2, 1, 10), (1, 5, 1, 17), (1, 6, 1, 31), (4, 19, 4, 17)),
 2005: (3, 7, 3, 12),
 2006: ((1, 2, 1, 21), (5, 8, 5, 6), (11, 6, 11, 4)),
 2007: ((1, 2, 12, 30, 2006),
  (3, 9, 3, 17),
  (4, 16, 4, 14),
  (4, 30, 5, 5),
  (7, 2, 7, 7),
  (12, 24, 12, 22),
  (12, 31, 12, 29)),
 2008: ((1, 2, 1, 12), (5, 5, 5, 3), (7, 4, 6, 28), (12, 26, 12, 20)),
 2009: ((1, 2, 1, 10), (4, 27, 4, 25)),
 2010: ((1, 8, 1, 23), (4, 12, 4, 17), (5, 10, 5, 15)),
 2011: ((3, 7, 3, 12), (5, 2, 5, 14)),
 2012: ((3, 9, 3, 11),
  (4, 23, 4, 28),
  (7, 2, 6, 30),
  (12, 24, 12, 22),
  (12, 31, 12, 29)),
 2013: ((1, 2, 1, 5), (5, 10, 5, 18)),
 2014: ((1, 2, 1, 4),
  (

In [163]:
def process_date(date: str, bel_holidays=country_holidays('BY')):
    def is_nearby_holiday_after(date):
        for i in range(1, 4):
            days_after = date + timedelta(days=i)
            if days_after in bel_holidays:
                return 1
        return 0

    def is_nearby_holiday_before(date):
        for i in range(1, 4):
            days_before = date - timedelta(days=i)
            if days_before in bel_holidays:
                return 1
        return 0

    dates = pd.date_range(start=date, end=date, freq='D')
    df = pd.DataFrame({'date': dates})

    df['weekday'] = df['date'].dt.weekday
    df['day_of_month'] = df['date'].dt.day
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
    
    df['is_holiday'] = df['date'].apply(lambda x: 1 if x in bel_holidays else 0)
    df['nearby_holiday_before'] = df['date'].apply(is_nearby_holiday_before)
    df['nearby_holiday_after'] = df['date'].apply(is_nearby_holiday_after)

    weekday_ohe = np.zeros(7)
    weekday_ohe[df['weekday'].values[0]] = 1

    day_of_month_ohe = np.zeros(31)
    day_of_month_ohe[df['day_of_month'].values[0] - 1] = 1

    columns = ['is_weekend', 'is_holiday', 'nearby_holiday_before', 'nearby_holiday_after']
    weekday_columns = [f'w_{i}' for i in range(1, 8)]
    day_of_month_columns = [f'dm_{i+1}' for i in range(31)]
    
    output = np.concatenate([df[columns].values.reshape(-1), weekday_ohe, day_of_month_ohe])
    column_names = columns + weekday_columns + day_of_month_columns

    return output, column_names

In [164]:
tmp = df['DATE'].apply(lambda x: process_date(x)[0])
column_names = process_date(df['DATE'].iloc[0])[1]

date_features = pd.DataFrame(tmp.tolist(), columns=column_names)

train = pd.concat([df, date_features], axis=1)

In [165]:
train.iloc[0]

DATE                     2023-11-01
ATM                        ATM10640
o_500                             0
o_200                            17
o_100                            10
o_50                              1
o_20                              2
o_10                             14
o_5                               7
is_weekend                      0.0
is_holiday                      0.0
nearby_holiday_before           0.0
nearby_holiday_after            0.0
w_1                             0.0
w_2                             0.0
w_3                             1.0
w_4                             0.0
w_5                             0.0
w_6                             0.0
w_7                             0.0
dm_1                            1.0
dm_2                            0.0
dm_3                            0.0
dm_4                            0.0
dm_5                            0.0
dm_6                            0.0
dm_7                            0.0
dm_8                        

In [166]:
train[
    (train["DATE"] >= "2023-11-01")
    & (train["DATE"] <= "2023-11-30")
    & (train["ATM"] == "ATM10640")
].groupby(["DATE", "ATM"]).agg(
    {col: "first" if train[col].dtype == "O" else "mean" for col in train.columns}
)

Unnamed: 0_level_0,Unnamed: 1_level_0,DATE,ATM,o_500,o_200,o_100,o_50,o_20,o_10,o_5,is_weekend,is_holiday,nearby_holiday_before,nearby_holiday_after,w_1,w_2,w_3,w_4,w_5,w_6,w_7,dm_1,dm_2,dm_3,dm_4,dm_5,dm_6,dm_7,dm_8,dm_9,dm_10,dm_11,dm_12,dm_13,dm_14,dm_15,dm_16,dm_17,dm_18,dm_19,dm_20,dm_21,dm_22,dm_23,dm_24,dm_25,dm_26,dm_27,dm_28,dm_29,dm_30,dm_31
DATE,ATM,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
2023-11-01,ATM10640,2023-11-01,ATM10640,0.0,17.0,10.0,1.0,2.0,14.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-11-02,ATM10640,2023-11-02,ATM10640,0.0,17.0,3.0,5.0,22.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-11-03,ATM10640,2023-11-03,ATM10640,23.0,22.0,9.0,2.0,2.0,24.0,14.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-11-04,ATM10640,2023-11-04,ATM10640,2.0,23.0,9.0,4.0,3.0,14.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-11-05,ATM10640,2023-11-05,ATM10640,5.0,11.0,18.0,12.0,8.0,8.0,12.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-11-06,ATM10640,2023-11-06,ATM10640,18.0,15.0,22.0,14.0,11.0,4.0,6.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-11-07,ATM10640,2023-11-07,ATM10640,21.0,4.0,1.0,11.0,1.0,7.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-11-08,ATM10640,2023-11-08,ATM10640,9.0,7.0,21.0,2.0,14.0,19.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-11-09,ATM10640,2023-11-09,ATM10640,13.0,4.0,3.0,0.0,12.0,12.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-11-10,ATM10640,2023-11-10,ATM10640,3.0,19.0,23.0,7.0,17.0,6.0,20.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [167]:
train[(train['DATE'] == '2023-11-01') & (train['ATM'] == 'ATM10640')]

Unnamed: 0,DATE,ATM,o_500,o_200,o_100,o_50,o_20,o_10,o_5,is_weekend,is_holiday,nearby_holiday_before,nearby_holiday_after,w_1,w_2,w_3,w_4,w_5,w_6,w_7,dm_1,dm_2,dm_3,dm_4,dm_5,dm_6,dm_7,dm_8,dm_9,dm_10,dm_11,dm_12,dm_13,dm_14,dm_15,dm_16,dm_17,dm_18,dm_19,dm_20,dm_21,dm_22,dm_23,dm_24,dm_25,dm_26,dm_27,dm_28,dm_29,dm_30,dm_31
0,2023-11-01,ATM10640,0,17,10,1,2,14,7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19486 entries, 0 to 19485
Data columns (total 51 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   DATE                   19486 non-null  object 
 1   ATM                    19486 non-null  object 
 2   o_500                  19486 non-null  int32  
 3   o_200                  19486 non-null  int32  
 4   o_100                  19486 non-null  int32  
 5   o_50                   19486 non-null  int32  
 6   o_20                   19486 non-null  int32  
 7   o_10                   19486 non-null  int32  
 8   o_5                    19486 non-null  int32  
 9   is_weekend             19486 non-null  float64
 10  is_holiday             19486 non-null  float64
 11  nearby_holiday_before  19486 non-null  float64
 12  nearby_holiday_after   19486 non-null  float64
 13  w_1                    19486 non-null  float64
 14  w_2                    19486 non-null  float64
 15  w_

In [171]:
train = pd.read_csv('amt/train.csv', index_col=0)

In [172]:
train.head(5)

Unnamed: 0,DATE,ATM,o_500,o_200,o_100,o_50,o_20,o_10,o_5,is_weekend,is_holiday,nearby_holiday_before,nearby_holiday_after,w_1,w_2,w_3,w_4,w_5,w_6,w_7,dm_1,dm_2,dm_3,dm_4,dm_5,dm_6,dm_7,dm_8,dm_9,dm_10,dm_11,dm_12,dm_13,dm_14,dm_15,dm_16,dm_17,dm_18,dm_19,dm_20,dm_21,dm_22,dm_23,dm_24,dm_25,dm_26,dm_27,dm_28,dm_29,dm_30,dm_31
0,2023-11-01,ATM10640,0,17,10,1,2,14,7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-11-01,ATM10641,4,15,11,16,10,12,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2023-11-01,ATM10643,8,18,0,5,2,14,7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-11-01,ATM10644,14,2,3,22,1,16,8,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2023-11-01,ATM10645,23,3,11,14,15,19,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
