In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

import requests

In [3]:
df = pd.read_parquet('train_dataset')


# `df_sim_by_checks` - мера схожести чеков у customer

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

df['item_id'], _ = pd.factorize(df['dish_name'])

df_check_items = df.groupby(['customer_id', 'startdatetime', 'buy_post']).item_id.apply(np.array)
df_check_items.columns = ['customer_id', 'startdatetime', 'buy_post', 'item_id']

df_checks = df.groupby(['customer_id', 'startdatetime', 'buy_post'], as_index=True).agg({
    "revenue": ["sum", "count"],
})
df_checks.columns = ["revenue_sum", "items_count"]
df_checks.reset_index(inplace=True)

df_checks = pd.merge(df_checks, df_check_items, on=['customer_id', 'startdatetime', 'buy_post'])

similarity_by_checks_scores = []
df_grouped_checks = df_checks.groupby(['customer_id'])

for _, group in df_grouped_checks:
  group_onehot = pd.get_dummies(group['item_id'].apply(pd.Series).stack())
  if group_onehot.shape[0] > 1:
    similarity_matrix = cosine_similarity(group_onehot)
    similarity_score = np.mean(similarity_matrix)
    similarity_by_checks_scores.append(similarity_score)
  else:
    similarity_by_checks_scores.append(0)


df_sim_by_checks = pd.DataFrame(list(zip(df_grouped_checks.groups.keys(), similarity_by_checks_scores)), columns=['customer_id', 'cosine_similarity'])

In [26]:
df_sim_by_checks.head()

Unnamed: 0,customer_id,cosine_similarity
0,29891,0.044983
1,30477,0.069067
2,31426,0.109519
3,44491,0.32
4,44939,0.0752


# `df_halves_means` - разница сумм в первой и второй половинах чеков по времени


In [38]:
def calculate_means(group):
    group = group.sort_values('startdatetime')
    middle_date_idx = group['startdatetime'].values.shape[0] // 2

    first_half_df = group.iloc[:middle_date_idx]
    second_half_df = group.iloc[middle_date_idx:]

    return pd.Series({'first_half_mean': first_half_df['revenue_sum'].mean(),
                      'first_half_max': first_half_df['revenue_sum'].max(),
                      'first_half_min': first_half_df['revenue_sum'].min(),
                      'first_half_std': first_half_df['revenue_sum'].std(),
                      'second_half_mean': second_half_df['revenue_sum'].mean(),
                      'second_half_max': second_half_df['revenue_sum'].max(),
                      'second_half_min': second_half_df['revenue_sum'].min(),
                      'second_half_std': second_half_df['revenue_sum'].std()})

df_halves_means = df_checks.groupby('customer_id').apply(calculate_means)
df_halves_means.reset_index(inplace=True)

In [53]:
df_halves_means['diff'] = df_halves_means.second_half_mean - df_halves_means.first_half_mean
df_halves_means['ratio'] = df_halves_means.second_half_mean / df_halves_means.first_half_mean

In [41]:
df_halves_means

Unnamed: 0,customer_id,first_half_mean,first_half_max,first_half_min,first_half_std,second_half_mean,second_half_max,second_half_min,second_half_std,diff
0,29891,208.805000,389.97,45.99,102.497430,198.591538,439.98,1.00,143.726628,-10.213462
1,30477,255.400833,399.99,99.99,124.740158,200.830000,499.95,44.99,124.115843,-54.570833
2,31426,365.152500,954.98,2.00,323.624182,417.646667,1079.97,1.00,358.058023,52.494167
3,44491,197.470000,344.97,49.97,208.596500,59.980000,69.99,49.97,14.156278,-137.490000
4,44939,604.930000,604.93,604.93,,529.950000,554.94,504.96,35.341197,-74.980000
...,...,...,...,...,...,...,...,...,...,...
499995,46624509,514.950000,514.95,514.95,,210.475000,320.96,99.99,156.249385,-304.475000
499996,46625265,349.960000,349.96,349.96,,424.955000,799.92,49.99,530.280588,74.995000
499997,46639170,169.990000,169.99,169.99,,459.965000,519.96,399.97,84.845743,289.975000
499998,46654016,469.960000,469.96,469.96,,374.985000,569.98,179.99,275.764574,-94.975000


In [42]:
def calculate_information_value(dataset, feats, target_col):
    iv_by_feat = {}
    temp = dataset.copy()
    for feat in feats:
        temp["quant_range"] = pd.qcut(
            x=temp[feat], q=[0, 0.25, 0.5, 0.75, 1.0], duplicates="drop"
        )

        bins = {}
        for i, bin in enumerate(temp["quant_range"].unique()):
            bins[bin] = i

        temp["bin"] = temp["quant_range"].apply(lambda x: bins[x])

        iv = (
            pd.crosstab(temp["bin"], temp[target_col], normalize="columns")
            .assign(woe=lambda dfx: np.log(dfx[1] / dfx[0]))
            .assign(iv=lambda dfx: np.sum(dfx["woe"] * (dfx[1] - dfx[0])))
        )["iv"].unique()[0]

        iv_by_feat[feat] = [iv]
    df = pd.DataFrame(iv_by_feat).T
    df.reset_index(inplace=True)
    df.columns = ["feature", "IV"]
    return df


In [54]:
calculate_information_value(df_halves_means, ['first_half_mean', 'first_half_max', 'first_half_min', 'first_half_std',
                                              'second_half_mean', 'second_half_max', 'second_half_min', 'second_half_std', 'diff', 'ratio'], 'buy_post')

Unnamed: 0,feature,IV
0,first_half_mean,0.00555
1,first_half_max,0.017305
2,first_half_min,0.059577
3,first_half_std,0.012299
4,second_half_mean,0.005947
5,second_half_max,0.016568
6,second_half_min,0.003584
7,second_half_std,0.012744
8,diff,0.027565
9,ratio,0.031495


Все фичи в `df_total_features` и в файл 4_9_11_features.parquet

In [55]:
df_total_features = pd.merge(df_halves_means, df_sim_by_checks, on='customer_id')

In [57]:
df_total_features.to_parquet('4_9_11_features.parquet')