# Обучаем простой бейзлайн с использованием транзакционной модальности

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
from lightgbm import LGBMClassifier, Dataset
import lightgbm
import datetime
import pickle
import numpy as np
import polars as pl

RS = 2024
DATA_PATH = "/home/jupyter/datasphere/project/Sber/iProf"

In [2]:
# Загружаем данные для обучения

target_train = pl.read_parquet(f'{DATA_PATH}/train_target.parquet')
trx_train = pl.read_parquet(f'{DATA_PATH}/trx_train.parquet')
geo_train = pl.read_parquet(f'{DATA_PATH}/geo_train.parquet')
dial_train = pl.read_parquet(f'{DATA_PATH}/dial_train.parquet')


In [3]:
len(target_train.select(pl.col('client_id')).unique())

170778

# Transaction

In [4]:
target_train = pl.read_parquet(f'{DATA_PATH}/train_target.parquet')
target_train = target_train.with_columns(
    pl.col('mon').str.to_date("%Y-%m-%d")
)

trx_train = trx_train.with_columns(
    pl.col('event_time').dt.strftime("%Y-%m-%d"))
trx_train = trx_train.with_columns(
    pl.col('event_time').str.to_date("%Y-%m-%d")
)
trx_train

event_time,amount,client_id,event_type,event_subtype,currency,src_type11,src_type12,dst_type11,dst_type12,src_type21,src_type22,src_type31,src_type32,mon
date,f32,str,i32,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
2022-01-18,1.22203,"""522256f58adeaf13fbf098859fe562…",40,51,11.0,72.0,189.0,433.0,10049.0,29715.0,10.0,814.0,26.0,1
2022-01-16,7963.970703,"""522256f58adeaf13fbf098859fe562…",40,51,11.0,72.0,189.0,433.0,10049.0,29715.0,10.0,814.0,26.0,1
2022-01-07,363.814209,"""522256f58adeaf13fbf098859fe562…",35,5,11.0,19.0,344.0,1121.0,8945.0,29715.0,10.0,814.0,26.0,1
2022-01-16,16164.658203,"""522256f58adeaf13fbf098859fe562…",52,12,11.0,128.0,456.0,1302.0,8693.0,29715.0,10.0,814.0,26.0,1
2022-01-19,31682.630859,"""522256f58adeaf13fbf098859fe562…",52,12,11.0,128.0,456.0,1302.0,8693.0,29715.0,10.0,814.0,26.0,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2022-09-05,14250.209961,"""2fc07b1747c11b5688e04a854160fd…",41,10,11.0,19.0,344.0,1166.0,30836.0,42730.0,24.0,1476.0,55.0,9
2022-09-13,19039.734375,"""2fc07b1747c11b5688e04a854160fd…",41,10,11.0,19.0,344.0,1166.0,30836.0,42730.0,24.0,1476.0,55.0,9
2022-09-18,5808.522949,"""2fc07b1747c11b5688e04a854160fd…",41,10,11.0,19.0,344.0,1166.0,30836.0,42730.0,24.0,1476.0,55.0,9
2022-09-17,21264.791016,"""2fc07b1747c11b5688e04a854160fd…",41,10,11.0,19.0,344.0,1166.0,30836.0,42730.0,24.0,1476.0,55.0,9


In [5]:
def amount_agregate(tg, trx_timeseries):
    # Преобразование столбца event_time в datetime
    feats = []
    # unique_months = tg.select(pl.col('mon')).unique()
    unique_months = tg.select(pl.col('mon')).unique().to_series().to_list()
    for mon in unique_months:
        # Фильтрация транзакций до текущего месяца
        trx = trx_timeseries.filter(pl.col('event_time') < mon)
        # mon_datetime = pl.datetime(mon.year, mon.month, 1)
        
        # Группировка и агрегация по клиентам
        feats_mon = trx.group_by('client_id').agg([
            pl.col('amount').sum().alias('sum_amount'),
            pl.col('amount').max().alias('max_amount'),
            pl.col('amount').min().alias('min_amount'),
            pl.col('amount').mean().alias('mean_amount')
        ])

        # Добавление текущего месяца в результаты агрегации
        feats_mon = feats_mon.with_columns(pl.lit(mon).alias('mon'))
        feats.append(feats_mon)

    # Объединение всех агрегированных результатов
    feats = pl.concat(feats)

    return feats

# Пример использования функции
# target_train и trx_train должны быть поларсовскими DataFrame
# target_train = pl.DataFrame(target_train)
# trx_train = pl.DataFrame(trx_train)

feats = amount_agregate(target_train, trx_train)


# Geo Hash

In [6]:
# Преобразование даты в последнюю дату месяца
geo_train = pl.read_parquet(f'{DATA_PATH}/geo_train.parquet')

geo_train = geo_train.with_columns(pl.col('event_time').dt.month_end().alias('mon'))

# Функция для получения последнего дня месяца
ge_train = geo_train.drop('event_time')


In [7]:
# Функция для агрегации геоданных
def geo_aggregate(tg, geo_timeseries):
    feats = []
    unique_months = tg.select(pl.col('mon')).unique().to_series().to_list()

    for mon in unique_months:
        # Фильтрация транзакций до текущего месяца
        geo = geo_timeseries.filter(pl.col('mon') < mon)

        # Группировка и агрегация по клиентам
        feats_mon = geo.group_by('client_id').agg(
            pl.col('client_id').count().alias('count')
        )

        # Добавление текущего месяца в результаты агрегации
        feats_mon = feats_mon.with_columns(pl.lit(mon).alias('mon'))
        feats.append(feats_mon)

    # Объединение всех агрегированных результатов
    feats = pl.concat(feats, how="vertical")

    return feats

# Создание агрегатов по геоданным
geo_feats = geo_aggregate(feats, ge_train)

geo_feats

client_id,count,mon
str,u32,date
"""7ce559e46bee06f95d11b9aac634b6…",65,2022-10-31
"""87e8b1b2382f0d025a9be38a8b49be…",5,2022-10-31
"""c16303fa410fdf81ec75d0c7d08d7d…",32,2022-10-31
"""2b66927be78567ce4ba6daf3c61eb3…",257,2022-10-31
"""c1ea2bc3874950b0ad552bfd204dc0…",146,2022-10-31
…,…,…
"""56f590ee06e454d88f9e219653e85a…",562,2023-01-31
"""d1c3dbed10257b14b6bee29fbfacf1…",231,2023-01-31
"""9dc42532c1793e59e2b20523835f6d…",729,2023-01-31
"""4ceab4c5b3333ad3305722d7897687…",23,2023-01-31


In [8]:
geo_feats.rename({"count": "geo_count"})

client_id,geo_count,mon
str,u32,date
"""7ce559e46bee06f95d11b9aac634b6…",65,2022-10-31
"""87e8b1b2382f0d025a9be38a8b49be…",5,2022-10-31
"""c16303fa410fdf81ec75d0c7d08d7d…",32,2022-10-31
"""2b66927be78567ce4ba6daf3c61eb3…",257,2022-10-31
"""c1ea2bc3874950b0ad552bfd204dc0…",146,2022-10-31
…,…,…
"""56f590ee06e454d88f9e219653e85a…",562,2023-01-31
"""d1c3dbed10257b14b6bee29fbfacf1…",231,2023-01-31
"""9dc42532c1793e59e2b20523835f6d…",729,2023-01-31
"""4ceab4c5b3333ad3305722d7897687…",23,2023-01-31


# Embeddings

In [9]:
# Преобразование даты в последнюю дату месяца
dial_train_path = f'{DATA_PATH}/dial_train.parquet'
dial_train = pl.read_parquet(dial_train_path)

dial_train = dial_train.with_columns(pl.col('event_time').dt.month_end().alias('mon'))
dial_train = dial_train.drop('event_time')
dial_train

client_id,embedding,mon
str,list[f32],datetime[μs]
"""b27b9c54e72728e7bbfbe96ef2f3d4…","[0.341713, -0.052266, … 0.394347]",2022-01-31 12:22:28.151649
"""bff7260208097c052cea083ddc9e96…","[0.251929, -0.057982, … 0.24899]",2022-01-31 10:19:36.296643
"""c977ed2889aacd9aa35420cce56522…","[0.104948, 0.128751, … -0.081843]",2022-01-31 11:56:28.229993
"""d2e003fda662d4362aed928dea8bda…","[0.341845, -0.006655, … 0.24816]",2022-01-31 09:54:14.315763
"""d887ecc28f596b1ccf4d9758c1974d…","[0.208994, -0.203628, … 0.316865]",2022-01-31 06:19:57.134893
…,…,…
"""3b28346c9687dc7b7f293f9a232b33…","[0.391926, -0.042448, … 0.380903]",2022-09-30 18:31:02.682112
"""3b28346c9687dc7b7f293f9a232b33…","[0.494272, -0.377852, … 0.330771]",2022-09-30 11:01:40.032310
"""3b28346c9687dc7b7f293f9a232b33…","[0.0059, -0.169089, … 0.358377]",2022-09-30 14:08:59.343379
"""3b28346c9687dc7b7f293f9a232b33…","[0.286338, 0.039379, … 0.114356]",2022-09-30 08:29:21.048320


In [10]:
dial_train = dial_train.with_columns(
    pl.col('mon').dt.strftime("%Y-%m-%d"))
dial_train = dial_train.with_columns(
    pl.col('mon').str.to_date("%Y-%m-%d")
)

In [11]:
def dio_aggregate(tg, dio_timeseries):
    feats = []
    unique_months = tg.select(pl.col('mon')).unique().to_series().to_list()

    for mon in unique_months:
        # Фильтрация транзакций до текущего месяца
        dio = dio_timeseries.filter(pl.col('mon') < mon)

        # Группировка и агрегация по клиентам
        feats_mon = dio.group_by('client_id').agg(
            pl.col('client_id').count().alias('count')
        )

        # Добавление текущего месяца в результаты агрегации
        feats_mon = feats_mon.with_columns(pl.lit(mon).alias('mon'))
        feats.append(feats_mon)

    # Объединение всех агрегированных результатов
    feats = pl.concat(feats, how="vertical")

    return feats

# Создание агрегатов по геоданным
dial_feats = dio_aggregate(feats, dial_train)
dial_feats

client_id,count,mon
str,u32,date
"""68629310e3da9c283bf71ed5a1d8ba…",2,2022-10-31
"""581863d19d4cf66d6eb65577783ba2…",1,2022-10-31
"""8655c45ed652821bf8389ed3c74556…",2,2022-10-31
"""ba1cf47f80f21a0d64697a6e4ff7c2…",6,2022-10-31
"""5ff65af869516e0f9d902e1805b37f…",3,2022-10-31
…,…,…
"""2e1da5a2dbdaf23708682d8a4cf78b…",1,2023-01-31
"""bd5e72d5e28e497898d826989c8a20…",1,2023-01-31
"""3523c619d944ea11126d83e2bf0a0c…",3,2023-01-31
"""7bd6e520ac23142c3aef469335814c…",12,2023-01-31


In [13]:
# Объединение всех фичей с таргетом
# Преобразование строки в дату
# target_train = target_train.with_column(
#     pl.col('mon').str.strptime(pl.Date, fmt="%Y-%m")
# )

# feats = feats.with_column(
#     pl.col('mon').str.strptime(pl.Date, fmt="%Y-%m")
# )

# Случайное перемешивание данных и установка индекса
# RS = 42
# target_train = target_train.sample(frac=1, seed=RS)

train_data = target_train.join(feats, on=['client_id', 'mon'], how='left')
train_data = train_data.join(geo_feats, on=['client_id', 'mon'], how='left')
train_data = train_data.join(dial_feats, on=['client_id', 'mon'], how='left')

# Объединение данных

# Получение первых 10000 уникальных client_id
ids = train_data.select(pl.col('client_id').unique()).to_series().to_list()[:10000]

# Разделение на обучающую и валидационную выборки
df_val = train_data.filter(pl.col('client_id').is_in(ids))
df_fit = train_data.filter(~pl.col('client_id').is_in(ids))


In [14]:
df_val = df_val.with_columns(
    pl.col('mon').dt.strftime("%Y-%m-%d"))
# df_val = df_val.with_columns(
#     pl.col('mon').str.to_date("%Y-%m-%d")
# )

df_fit = df_fit.with_columns(
    pl.col('mon').dt.strftime("%Y-%m-%d"))
# df_fit = df_fit.with_columns(
#     pl.col('mon').str.to_date("%Y-%m-%d")
# )

In [16]:
# df_train = train_data

df_val = df_val.to_pandas()
df_fit = df_fit.to_pandas()

In [17]:
# Делим на тренировочную и валидационную подвыборки
def encoding(series):
    codes, uniques = pd.factorize(series.values)
    series = pd.Series(codes)
    return series

# df_val = df_val.copy(deep=False)

objects = ['mon', 'client_id']

for object_ in  objects:
    df_val[object_] = encoding(df_val[object_])
    df_fit[object_] = encoding(df_fit[object_])


In [65]:
df_fit.shape

(1929336, 12)

In [66]:
# Для каждого таргета фитим модель
# n_estimat320
# maxdepth4

# Для каждого таргета фитим модель

models = {}

params = {'max_depth': 4, 'objective': 'binary',
          'learning_rate': 0.003, 'verbose': -1, 'random_state': RS}

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_fit[target_col]
    X = df_fit.drop(columns=[f'target_{i}' for i in range(1, 5)])
    trds = Dataset(X, tg)

    tgv = df_val[target_col]
    Xv = df_val.drop(columns=[f'target_{i}' for i in range(1, 5)])
    vds = Dataset(Xv, tgv)
    print(X.shape)
    lgbm = lightgbm.train(params=params, num_boost_round=1000, train_set=trds, valid_sets=[vds],
                          callbacks=[lightgbm.early_stopping(stopping_rounds=5)])

    models[target_col] = lgbm
    print(f'{target_col} score train: {2 * roc_auc_score(tg, lgbm.predict(X)) - 1}')

print('======================================')
metrics = {}
for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_val[target_col]
    X = df_val.drop(columns=[f'target_{i}' for i in range(1, 5)])
    metrics[target_col] = 2 * roc_auc_score(tg, models[target_col].predict(X)) - 1
    print(f'{target_col} score test: {metrics[target_col]}')

print('======================================')
print(f'avg gini: {sum(metrics.values())/4}')

(1929336, 8)
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.0430259
target_1 score train: 0.5445558967200395
(1929336, 8)
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[950]	valid_0's binary_logloss: 0.00620748
target_2 score train: 0.7305086141271098
(1929336, 8)
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.035545
target_3 score train: 0.6697267034844958
(1929336, 8)
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.0260455
target_4 score train: 0.6071560063921853
target_1 score test: 0.5402851077201825
target_2 score test: 0.656848746630502
target_3 score test: 0.653456998721635
target_4 score test: 0.5762021457408064
avg gini: 0.6066982497032815


In [67]:
with open(f'{DATA_PATH}/models.pkl', 'wb') as fl:
    pickle.dump(models, fl)

# Скорим тест

In [68]:
# Загружаем тестовые данные

models = pd.read_pickle(f'{DATA_PATH}/models.pkl')
target_test = pl.read_parquet(f'{DATA_PATH}/test_target.parquet')
trx_test = pl.read_parquet(f'{DATA_PATH}/trx_test.parquet')
geo_test = pl.read_parquet(f'{DATA_PATH}/geo_test.parquet')

In [22]:
# Для таргета каждого месяца делаем агрегаты объема транзакций за предшествующий доступные период
target_test = target_test.with_columns(
    pl.col('mon').str.to_date("%Y-%m-%d")
)


trx_test = trx_test.with_columns(
    pl.col('event_time').dt.strftime("%Y-%m-%d"))
trx_test = trx_test.with_columns(
    pl.col('event_time').str.to_date("%Y-%m-%d")
)

feats_test = amount_agregate(target_test, trx_test)

In [23]:
geo_test = geo_test.with_columns(pl.col('event_time').dt.month_end().alias('mon'))

# Функция для получения последнего дня месяца
geo_test = geo_test.drop('event_time')



In [24]:
dial_test = pl.read_parquet(f'{DATA_PATH}/dial_test.parquet')

dial_test = dial_test.with_columns(
    pl.col('event_time').dt.strftime("%Y-%m-%d"))
dial_test = dial_test.with_columns(
    pl.col('event_time').str.to_date("%Y-%m-%d")
)
dial_test = dial_test.with_columns(pl.col('event_time').dt.month_end().alias('mon'))

# Функция для получения последнего дня месяца
dial_test = dial_test.drop('event_time')


dial_test_feats = dio_aggregate(feats_test, dial_test)

dial_test.head(5)

client_id,embedding,mon
str,list[f32],date
"""08b3569cdfd015e4ef7ed3632cf86b…","[0.110589, -0.000545, … 0.134537]",2022-02-28
"""08b3569cdfd015e4ef7ed3632cf86b…","[0.00209, 0.072185, … 0.015157]",2022-05-31
"""08b3569cdfd015e4ef7ed3632cf86b…","[0.194512, -0.032053, … 0.160809]",2022-05-31
"""08b3569cdfd015e4ef7ed3632cf86b…","[0.045035, 0.042004, … -0.112547]",2022-03-31
"""08b3569cdfd015e4ef7ed3632cf86b…","[0.330715, -0.023786, … 0.342075]",2022-03-31


In [25]:
# Создание агрегатов по геоданным
geo_test_feats = geo_aggregate(feats_test, geo_test)

In [26]:
target_test = target_test.to_pandas()
feats_test = feats_test.to_pandas()
geo_test_feats = geo_test_feats.to_pandas()
dial_test_feats = dial_test_feats.to_pandas()

In [37]:
# target_test = target_test.drop('__index_level_0__')
test_data = target_test.merge(feats_test, how='left', on = ['client_id','mon'])
test_data = test_data.merge(geo_test_feats, how='left', on = ['client_id','mon'])
test_data = test_data.merge(dial_test_feats, how='left', on = ['client_id','mon'])


In [38]:
test_data = test_data.drop(['__index_level_0__'], axis=1)
test_data

Unnamed: 0,mon,target_1,target_2,target_3,target_4,client_id,sum_amount,max_amount,min_amount,mean_amount,count_x,count_y
0,2022-05-31,0,0,0,0,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...,1.113104e+08,9.398446e+06,4.911704,3.698021e+05,,
1,2022-05-31,0,0,0,0,0433d23e224b7a520656da6181efadb8d556bb293158c9...,,,,,1140.0,
2,2022-04-30,0,0,0,0,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...,,,,,94.0,
3,2022-10-31,0,0,0,0,4f807e8b163c653bcaeff9f925983568f4c3e6b1a1f231...,1.257907e+08,7.485376e+06,0.334257,2.500808e+05,407.0,
4,2022-10-31,0,0,0,0,64369f6f8ae1b719332ee1bfb2b454e642b2053d2c9b8a...,5.621085e+07,3.044801e+07,5007.330566,4.323912e+06,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1407666,2022-10-31,0,0,0,0,10720d45fe5c441e85eb7ef5271e620be56edb2de87dc2...,9.021857e+05,1.135340e+05,382.662750,3.222092e+04,302.0,
1407667,2022-07-31,0,0,0,0,c879fae5376c00b5d56098cfe450c755330e44a351eaa4...,3.925381e+06,8.160497e+05,3408.454346,1.509762e+05,238.0,
1407668,2022-11-30,0,0,0,0,771c72f26c0036a4fa6d5e965628e32efa94ed141033db...,2.347081e+07,8.353859e+06,325.971436,6.343463e+05,1925.0,
1407669,2022-10-31,0,0,0,0,c3f8a8f4ff091e711ac102c77f46ec309b5ec27b3a0ed3...,4.231403e+08,4.087177e+07,1.427865,7.735655e+05,111.0,2.0


In [39]:
# df_test = df_test.to_pandas()
for object_ in  objects:
    test_data[object_] = encoding(test_data[object_])
test_data.head(5)

Unnamed: 0,mon,target_1,target_2,target_3,target_4,client_id,sum_amount,max_amount,min_amount,mean_amount,count_x,count_y
0,0,0,0,0,0,0,111310440.0,9398446.0,4.911704,369802.1,,
1,0,0,0,0,0,1,,,,,1140.0,
2,1,0,0,0,0,2,,,,,94.0,
3,2,0,0,0,0,3,125790664.0,7485375.5,0.334257,250080.8,407.0,
4,2,0,0,0,0,4,56210852.0,30448006.0,5007.330566,4323912.0,,


In [59]:
print(test_data.shape)
print(train_data.shape)

(1407671, 12)
(2049336, 12)


In [72]:
test_data = test_data.drop(columns=[f'target_{i}' for i in range(1, 5)])
test_data.shape

(1407671, 8)

In [74]:
scores = []

for t in range(1, 5):
    target_col = f'target_{t}'
    score = pd.Series(models[target_col].predict(test_data))
    scores.append(score)

scores = pd.concat(scores, axis=1)
scores['client_id'] = test_data.reset_index().client_id
scores_test = scores.rename(columns={0: 'target_1', 1: 'target_2', 2: 'target_3', 3: 'target_4'})

In [79]:
scores_test

Unnamed: 0,target_1,target_2,target_3,target_4,client_id
0,0.022673,0.005911,0.020586,0.010568,0
1,0.003129,0.000631,0.000529,0.000806,1
2,0.003129,0.000631,0.000560,0.000806,2
3,0.040583,0.017135,0.043626,0.020874,3
4,0.005491,0.001757,0.004643,0.002436,4
...,...,...,...,...,...
1407666,0.009844,0.000423,0.006700,0.005175,135558
1407667,0.003753,0.000658,0.002297,0.001968,2142
1407668,0.003805,0.001365,0.003950,0.004059,135482
1407669,0.054534,0.015283,0.078992,0.019330,1476


In [81]:
idshnik = pd.read_parquet(f'{DATA_PATH}/test_target_ids.parquet')
idshnik

Unnamed: 0,client_id
0,03478d5f75a2b651bfd3ae66836b0a54313d1cea05d75e...
1,4bf106c9764392df0850cd907daa93e97dad7df8b35cb9...
2,4c9f58011f50bef4ea99b4f22f5a3264ed1cfb60d23b9f...
3,51d17a1af833d5640f5402d450bdf16dea81329a73648d...
4,52c6fd670cfd93f9075fbdd580d3d4819afa2661a39253...
...,...
140483,c6041ce381f3df521d1dae3350ccf9b7a5c295270aaa65...
140484,c4995db29ee447c347d7b92619350762b26c93500b90ce...
140485,c45249a15c44bde22eec62b6881983769cd86bc6958cac...
140486,cc92973ca2f42eab12d0af7bc64a5489691af4a135f42f...


In [82]:
objects = ['client_id']

for object_ in  objects:
    idshnik[object_] = encoding(idshnik[object_])


In [85]:
# target_ids_submit = pd.read_parquet(f'{DATA_PATH}/test_target_ids.parquet').set_index('client_id')
# submission = target_ids_submit.join(scores_test.set_index('client_id'), how='inner')

target_ids_submit = idshnik.set_index('client_id')
submission = target_ids_submit.join(scores_test.set_index('client_id'), how='inner')

In [86]:
submission

Unnamed: 0_level_0,target_1,target_2,target_3,target_4
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.022673,0.005911,0.020586,0.010568
0,0.021474,0.007238,0.036095,0.009002
0,0.008283,0.002051,0.013524,0.007354
0,0.010236,0.004638,0.010972,0.008628
0,0.011715,0.005145,0.024464,0.008735
...,...,...,...,...
140487,0.004610,0.000339,0.002252,0.002105
140487,0.004610,0.000339,0.002252,0.002155
140487,0.003475,0.000530,0.001793,0.001583
140487,0.004940,0.000324,0.003178,0.002767


In [87]:

submission.to_csv(f'{DATA_PATH}/our_submission.csv', index=False)