### Гипотеза

Проверить, влияет ли средний чек клиента или стандартное отклонение его чеков на то, уйдёт ли он от БК. Также посмотрим на количество позиций в одном чеке.

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# Notebook display setup: widen the page/prompt via injected CSS and relax
# the pandas display limits so wide/long frames render in full.
# `IPython.core.display` is a deprecated import location for these helpers
# (it triggers a DeprecationWarning); `IPython.display` is the public module.
from IPython.display import display, HTML, clear_output

display(HTML("<style>.container { width:85% !important; }</style>"))
display(HTML("<style>.prompt { min-width:10ex !important; }</style>"))
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
# Render floats with six fixed decimals instead of scientific notation.
pd.set_option('display.float_format', '{:.6f}'.format)

  from IPython.core.display import display, HTML, clear_output
  from IPython.core.display import display, HTML, clear_output


In [3]:
# Read the train and test purchase logs from their parquet files,
# in that order, and bind them to df_train / df_test.
df_train, df_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/train_dataset_hackaton2023_train.gzip',
        '../data/hackaton2023_test.gzip',
    )
)

In [4]:
# Summary statistics of the raw train rows (displayed as the cell output).
df_train.describe()

Unnamed: 0,customer_id,date_diff_post,buy_post,revenue,startdatetime,ownareaall_sqm
count,12129384.0,9660867.0,12129384.0,12129384.0,12129384,12129384.0
mean,19809542.263361,20.480386,0.796485,106.799476,2023-02-20 12:19:13.394691,215.513003
min,29891.0,0.0,0.0,0.01,2022-09-02 00:00:39,0.0
25%,10108525.0,8.0,1.0,44.99,2022-12-06 16:12:06,104.9
50%,21274586.0,17.0,1.0,79.98,2023-02-22 11:04:22,210.4
75%,28038562.0,30.0,1.0,131.7725,2023-05-09 18:48:02,300.0
max,46661804.0,60.0,1.0,14499.71,2023-08-01 23:57:46,978.7
std,11389040.436254,15.27091,0.402613,107.438512,,122.590689


In [5]:
def calculate_mean_std_feats(
    dataset,
    ids_cols,
    *,
    receipt_col="startdatetime",
    value_col="revenue",
):
    """Build per-customer receipt statistics from item-level purchase rows.

    The aggregation runs in two steps:

    1. Rows are collapsed into receipts: one receipt per unique combination
       of ``ids_cols``, with the receipt total (sum of ``value_col``) and the
       number of rows with a non-null ``value_col`` in it.
    2. Receipts are aggregated over the remaining key columns, i.e.
       ``ids_cols`` without ``receipt_col``.

    Args:
        dataset: DataFrame holding every column in ``ids_cols`` plus
            ``value_col``.
        ids_cols: Columns that together identify one receipt. The columns
            other than ``receipt_col`` become the key columns of the result,
            in the order given here.
        receipt_col: Column in ``ids_cols`` that separates one receipt from
            another within the same key.
        value_col: Numeric column that is summed and counted per receipt.

    Returns:
        DataFrame with one row per key combination: the key columns followed
        by ``receipt_sum_mean``, ``receipt_sum_std``, ``receipt_count``,
        ``items_receipt_mean`` and ``items_receipt_mean_std``. The last column
        is the standard deviation of the per-receipt row count; its name is
        kept as-is for compatibility with existing callers. Standard
        deviations are NaN for keys with a single receipt (pandas' default
        sample standard deviation).

    Raises:
        ValueError: If ``ids_cols`` contains no column besides
            ``receipt_col``, so there is nothing to aggregate the receipts by.
    """
    # Keep the caller's column order (a set would make the order of the
    # output columns vary between interpreter runs) while dropping duplicates.
    customer_keys = list(
        dict.fromkeys(col for col in ids_cols if col != receipt_col)
    )
    if not customer_keys:
        raise ValueError(
            "ids_cols must contain at least one column besides "
            f"{receipt_col!r}"
        )

    # Step 1: one row per receipt.
    per_receipt = (
        dataset.groupby(list(ids_cols))
        .agg(
            revenue_sum=(value_col, "sum"),
            items_count=(value_col, "count"),
        )
        .reset_index()
    )

    # Step 2: one row per customer key. Named aggregation ties every output
    # column to its source explicitly instead of relabelling by position.
    return (
        per_receipt.groupby(customer_keys)
        .agg(
            receipt_sum_mean=("revenue_sum", "mean"),
            receipt_sum_std=("revenue_sum", "std"),
            receipt_count=("revenue_sum", "count"),
            items_receipt_mean=("items_count", "mean"),
            items_receipt_mean_std=("items_count", "std"),
        )
        .reset_index()
    )

In [6]:
# Per-customer receipt features for train. "buy_post" is passed as one of the
# id columns, so it is kept as a key column in the result.
# NOTE(review): presumably buy_post is constant per customer — verify.
df_train_processed = calculate_mean_std_feats(df_train, ["customer_id", "buy_post", "startdatetime"])
df_train_processed.describe()

Unnamed: 0,customer_id,buy_post,receipt_sum_mean,receipt_sum_std,receipt_count,items_receipt_mean,items_receipt_mean_std
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,20246346.095608,0.71819,492.152797,259.144721,5.761172,4.420151,2.179619
std,11496610.350691,0.449882,295.469257,208.546239,5.057282,2.199285,1.582691
min,29891.0,0.0,13.4975,0.0,2.0,1.0,0.0
25%,10731681.25,0.0,292.735,121.704043,3.0,2.846154,1.118034
50%,21725643.0,1.0,419.995833,207.891439,4.0,4.0,1.892969
75%,28452387.0,1.0,613.3,338.487607,6.0,5.6,2.886751
max,46661804.0,1.0,5990.02,9237.078619,245.0,49.75,93.510695


In [7]:
# Same per-customer receipt features for test; "buy_post" is not passed here,
# so the result is keyed by customer_id only.
df_test_processed = calculate_mean_std_feats(df_test, ["customer_id", "startdatetime"])
df_test_processed.describe()

Unnamed: 0,customer_id,receipt_sum_mean,receipt_sum_std,receipt_count,items_receipt_mean,items_receipt_mean_std
count,112334.0,112334.0,112334.0,112334.0,112334.0,112334.0
mean,20748611.321951,493.286188,262.211572,5.272429,4.379857,2.171851
std,11582578.50584,295.966684,211.509668,4.396641,2.179753,1.592661
min,52341.0,24.392,0.0,2.0,1.0,0.0
25%,11041231.25,292.178,120.921234,3.0,2.8,1.070607
50%,22155898.0,421.269,209.435801,4.0,4.0,1.864454
75%,28861910.5,616.623333,342.937719,6.0,5.5,2.886751
max,46668221.0,5486.866667,3891.890319,148.0,30.666667,51.384174


### Information value

IV < 0,02 — отсутствует;

0,02 ≤ IV <0,1 — низкая;

0,1 ≤ IV < 0,3 — средняя;

IV ≥ 0,3 — высокая.

In [8]:
def calculate_information_value(
    dataset, feats, target_col, q=(0, 0.25, 0.5, 0.75, 1.0)
):
    """Compute the information value (IV) of numeric features for a 0/1 target.

    Each feature is cut into quantile bins. For every bin the share of
    target==1 rows and the share of target==0 rows falling into it are
    computed, and then::

        WoE_bin = ln(share_1 / share_0)
        IV      = sum over bins of WoE_bin * (share_1 - share_0)

    Args:
        dataset: DataFrame holding the feature columns and ``target_col``.
            It is not modified.
        feats: Names of the numeric feature columns to score.
        target_col: Name of the target column; the class labels must be
            ``0`` and ``1``.
        q: Quantile edges (or a number of quantiles) passed to ``pd.qcut``.
            Duplicate edges are dropped, so a feature with many tied values
            may end up with fewer bins.

    Returns:
        DataFrame with columns ``feature`` and ``IV``, one row per entry of
        ``feats`` in the given order; empty (with the same columns) when
        ``feats`` is empty.

    Notes:
        Rows where the feature is missing get no bin and are left out of that
        feature's IV. A bin that contains rows of only one class has an
        infinite WoE, which makes the feature's IV infinite.
    """
    quantiles = q if isinstance(q, int) else list(q)
    target = dataset[target_col]

    records = []
    for feat in feats:
        # Integer bin codes are sufficient: the bins are only used as
        # grouping keys, their interval labels are never read.
        bin_codes = pd.qcut(
            dataset[feat], q=quantiles, labels=False, duplicates="drop"
        )
        # Columns are the target classes, normalised so that each column
        # sums to 1 (the distribution of that class over the bins).
        shares = pd.crosstab(bin_codes, target, normalize="columns")
        woe = np.log(shares[1] / shares[0])
        iv = float((woe * (shares[1] - shares[0])).sum())
        records.append((feat, iv))

    return pd.DataFrame(records, columns=["feature", "IV"])

In [9]:
# Show the column names of the aggregated train frame.
df_train_processed.columns

Index(['customer_id', 'buy_post', 'receipt_sum_mean', 'receipt_sum_std',
       'receipt_count', 'items_receipt_mean', 'items_receipt_mean_std'],
      dtype='object')

In [10]:
# Feature columns to score with information value: every column of the
# aggregated train frame except customer_id and the buy_post target.
feats = [
    'receipt_sum_mean', 
    'receipt_sum_std',
    'receipt_count', 
    'items_receipt_mean',
    'items_receipt_mean_std',
]

In [11]:
# Information value of each selected feature against the buy_post target.
iv_df = calculate_information_value(df_train_processed, feats, "buy_post")

In [12]:
# Display the features ordered by information value, highest first.
iv_df.sort_values(by="IV", ascending=False)

Unnamed: 0,feature,IV
2,receipt_count,0.430457
3,items_receipt_mean,0.008995
1,receipt_sum_std,0.006076
4,items_receipt_mean_std,0.004762
0,receipt_sum_mean,0.001557
