### Сборка всех фичей

Сборка всех признаков, которые разработали и подсчет information value

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
from IPython.core.display import display, HTML, clear_output
display(HTML("<style>.container { width:85% !important; }</style>"))
display(HTML("<style>.prompt { min-width:10ex !important; }</style>"))
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.float_format', lambda x: '%.6f' % x) 

  from IPython.core.display import display, HTML, clear_output
  from IPython.core.display import display, HTML, clear_output


In [3]:
df_train = pd.read_parquet('../data/train_dataset_hackaton2023_train.gzip')
df_test = pd.read_parquet('../data/test_dataset_hackaton2023_test.gzip')

In [4]:
def calculate_mean_std_feats(dataset, ids_cols):
    df_sum_agg = dataset.groupby(ids_cols, as_index=True).agg({"revenue": ["sum", "count"]})
    df_sum_agg.columns = ["revenue_sum", "items_count"]
    df_sum_agg.reset_index(inplace=True)
    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))
    df_by_customer_data = (
        df_sum_agg
        .groupby(new_ids_cols)
        .agg({
            "revenue_sum": ['mean', 'std', 'count'],
            "items_count": ['mean', 'std'],
        }).reset_index()
    )
    df_by_customer_data.columns = new_ids_cols + [
        'receipt_sum_mean', 
        'receipt_sum_std',
        'receipt_count', 
        'items_receipt_mean',
        'items_receipt_mean_std',
    ]
    return df_by_customer_data

In [5]:
def calculate_dynamic_receipt_features(dataframe, ids_cols):
    df_sum_agg = dataframe.groupby(ids_cols, as_index=True).agg({"revenue": ["sum", "count"]})
    df_sum_agg.columns = ["revenue_sum", "items_count"]
    df_sum_agg.reset_index(inplace=True)
    df_sum_agg = df_sum_agg.sort_values(by=ids_cols)
    df_sum_agg["lag_sum"] = df_sum_agg.groupby(['customer_id'])['revenue_sum'].shift(1)
    df_sum_agg["lag_count"] = df_sum_agg.groupby(['customer_id'])['items_count'].shift(1)
    df_sum_agg["sum_delta"] = df_sum_agg["revenue_sum"] - df_sum_agg["lag_sum"]
    df_sum_agg["count_delta"] = df_sum_agg["items_count"] - df_sum_agg["lag_count"]
    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))
    df_by_customer_data = (
        df_sum_agg
        .groupby(new_ids_cols, as_index=False)
        .agg({
            "sum_delta": ["min", "max", "mean", "std"], 
            "count_delta": ["min", "max", "mean", "std"], 
        })
    )
    df_by_customer_data.columns = new_ids_cols + [
        f"{col}_{agg_func}" for col in ["sum_delta", "count_delta"]
        for agg_func in ["min", "max", "mean", "std"]
    ]
    df_by_customer_data["abs_sum_delta_mean"] = df_by_customer_data["sum_delta_mean"].abs()
    df_by_customer_data["abs_count_delta_mean"] = df_by_customer_data["count_delta_mean"].abs()
    return df_by_customer_data

In [6]:
def calculate_window_features(dataset, ids_cols, windows):
    df_sum_agg = dataset.groupby(ids_cols, as_index=True).agg({"revenue": ["sum", "count"]})
    df_sum_agg.columns = ["revenue_sum", "items_count"]
    df_sum_agg.reset_index(inplace=True)
    
    df_max_dt = df_sum_agg.groupby(['customer_id'], as_index=False).agg({"startdatetime": ["max"]})
    df_max_dt.columns = ["customer_id", "max_startdatetime"]
    df_sum_agg = df_sum_agg.merge(df_max_dt, on="customer_id", how='inner')
    df_sum_agg['delta_days'] = (df_sum_agg['max_startdatetime'] - df_sum_agg['startdatetime']) / np.timedelta64(1, 's') / 60 / 60 / 24
    
    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))
    agg_dfs = []
    for window in windows:
        filtered_df = df_sum_agg[df_sum_agg['delta_days'] < 30]
        filtered_df = (
            filtered_df
            .groupby(new_ids_cols, as_index=False)
            .agg({
                "revenue_sum": ["max", "mean", "std", "sum", "count"], 
                "items_count": ["max", "mean", "std", "sum"], 
            })
        )
        filtered_df.columns = new_ids_cols + [
            f"{col}_{agg_func}_{window}d" for col in ["revenue_sum", "items_count"]
            for agg_func in ["max", "mean", "std", "sum", "count"]
        ][:-1]
        agg_dfs.append(filtered_df)
    ids_df = df_sum_agg[new_ids_cols].drop_duplicates().reset_index(drop=True)
    for temp_df in agg_dfs:
        ids_df = ids_df.merge(temp_df, on=new_ids_cols, how='left')
    return ids_df

In [7]:
def calculate_places_features(dataset, ids_cols):
    df = dataset.groupby(ids_cols + ["ownareaall_sqm"], as_index=True).agg({"revenue": ["sum", "count"]})
    df.columns = ["revenue_sum", "items_count"]
    df.reset_index(inplace=True)
    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))
    df['ownareaall_sqm'] = np.where(df['ownareaall_sqm']==0, df['ownareaall_sqm'].median(), df['ownareaall_sqm'])
    df_feat = df.groupby(new_ids_cols, as_index=True).agg({"ownareaall_sqm": ["mean", "std", "sem", "var"]})
    df_feat.columns = [f"sqm_place_{func}" for func in ["mean", "std", "sem", "var"]]
    df_feat.reset_index(inplace=True)
    return df_feat

In [8]:
def calculate_time_features(dataset, ids_cols):
    df = dataset.groupby(ids_cols, as_index=True).agg({"revenue": ["sum", "count"]})
    df.columns = ["revenue_sum", "items_count"]
    df.reset_index(inplace=True)
    df['startdatetime'] = df['startdatetime'].dt.round('min')
    df["time"] = df["startdatetime"].dt.to_pydatetime()
    df["minutes"] = df["time"].apply(lambda x: x.hour * 60 + x.minute)
    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))
    df_feat = df.groupby(new_ids_cols, as_index=True).agg({"minutes": ["mean", "std", "sem", "var"]})
    df_feat.columns = [f"minutes_{func}" for func in ["mean", "std", "sem", "var"]]
    df_feat.reset_index(inplace=True)
    return df_feat

In [18]:
def create_base_and_hype_dish_df(df, ids_cols):
    df['fun'] = np.select(
      [ df['dish_name'].str.contains("Игрушка"),
          df['dish_name'].str.contains("Энергет"),
          df['dish_name'].str.contains("Up"),
          df['dish_name'].str.contains("Балтика"),
          df['dish_name'].str.contains("Сбер"),
          df['dish_name'].str.contains("1RUB"),
          df['dish_name'].str.contains("А4"),
          df['dish_name'].str.contains("Соус"),
          df['dish_name'].str.contains("Влажная салфетка"),
          df['dish_name'].str.contains("Пиво"),
          df['dish_name'].str.contains("GIFT")
        ],
     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      default=0
    )

    df_sum_agg_fun = df.groupby(ids_cols, as_index=True).agg({"fun": ["sum"]})
    df_sum_agg_fun.columns = ["fun_sum"]
    df_sum_agg_fun.reset_index(inplace=True)

    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))

    df_sum_agg_fun_sum = df_sum_agg_fun.groupby(new_ids_cols, as_index=True).agg({"fun_sum": ["mean", "sum", "std"]})
    df_sum_agg_fun_sum.columns = ["fun_mean", "fun_sum", "fun_std"]
    df_sum_agg_fun_sum.reset_index(inplace=True)


    return df_sum_agg_fun_sum

In [19]:
def create_toilet_df(df, ids_cols):
    df['no_toilet'] = np.select([ df['format_name'].str.contains("без туалета"),], [1], default=0)
    df['fudcort'] = np.select([ df['format_name'].str.contains("Фудкорт"),], [1], default=0)

    df_sum_1 = df.groupby(ids_cols, as_index=True).agg({"no_toilet": ["max"], "fudcort": ["max"]})
    df_sum_1.columns = ["no_toilet_", "fudcort_"]
    df_sum_1.reset_index(inplace=True)

    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))

    df_sum_1_a = df_sum_1.groupby(new_ids_cols, as_index=True).agg({"no_toilet_": ["mean", "std", "max"], "fudcort_": ["mean", "std", "max"]})
    df_sum_1_a.columns = [
      "no_toilet_mean",
      "no_toilet_std",
      "no_toilet_max",
      "fudcort_mean",
      "fudcort_std",
      "fudcort_max",
      ]
    df_sum_1_a.reset_index(inplace=True)


    return df_sum_1_a

In [20]:
def create_weekends_df(df, ids_cols):
    df['day_of_week_num'] = df['startdatetime'].dt.dayofweek

    df_dn = df.groupby(ids_cols, as_index=True).agg({"day_of_week_num": ["max"]})
    df_dn.columns = ["day_of_week_num_max"]
    df_dn.reset_index(inplace=True)

    df_dn['weekends'] = np.select(
      [df_dn['day_of_week_num_max'] > 2], [1],
      default=0
      )

    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))

    df_dn_median = df_dn.groupby(new_ids_cols, as_index=True).agg({"day_of_week_num_max": ["median", "mean", "std"], "weekends": ["median", "mean", "std"]})
    df_dn_median.columns = ["dow_median", "dow_mean", "dow_std", "weekends_median", "weekends_mean", "weekends_std"]
    df_dn_median.reset_index(inplace=True)


    return df_dn_median

In [21]:
def create_month_df(df, ids_cols):
    df['day_of_month_num'] = df['startdatetime'].dt.day

    df_dn = df.groupby(ids_cols, as_index=True).agg({"day_of_month_num": ["max"]})
    df_dn.columns = ["day_of_month_num_max"]
    df_dn.reset_index(inplace=True)

    df_dn['strange'] = np.select(
      [(df_dn['day_of_month_num_max'] < 22) & (df_dn['day_of_month_num_max'] > 5)], [1],
      default=0
      )

    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))

    df_dn_median = df_dn.groupby(new_ids_cols, as_index=True).agg({"day_of_month_num_max": ["median", "mean", "std"], "strange": ["median", "mean", "std"]})
    df_dn_median.columns = ["dom_median", "dom_mean", "dom_std", "strange_median", "strange_mean", "strange_std"]
    df_dn_median.reset_index(inplace=True)


    return df_dn_median

In [22]:
def get_all_features(dataset, ids_cols):
    windows = [7, 14, 28]
    df_mean_std = calculate_mean_std_feats(dataset, ids_cols)
    df_dynamic = calculate_dynamic_receipt_features(dataset, ids_cols)
    df_windows = calculate_window_features(dataset, ids_cols, windows)
    df_places = calculate_places_features(dataset, ids_cols)
    df_time = calculate_time_features(dataset, ids_cols)
    
    df_hype_dish = create_base_and_hype_dish_df(dataset, ids_cols)
    df_toilet = create_toilet_df(dataset, ids_cols)
    df_weekends = create_weekends_df(dataset, ids_cols)
    df_month = create_month_df(dataset, ids_cols)
    
    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))
    
    df_result = df_mean_std.merge(df_dynamic, on=new_ids_cols, how='left')
    df_result = df_result.merge(df_windows, on=new_ids_cols, how='left')
    df_result = df_result.merge(df_places, on=new_ids_cols, how='left')
    df_result = df_result.merge(df_time, on=new_ids_cols, how='left')
    df_result = df_result.merge(df_hype_dish, on=new_ids_cols, how='left')
    df_result = df_result.merge(df_toilet, on=new_ids_cols, how='left')
    df_result = df_result.merge(df_weekends, on=new_ids_cols, how='left')
    df_result = df_result.merge(df_month, on=new_ids_cols, how='left')
    return df_result

In [23]:
def calculate_promo_features(dataset, ids_cols, df_test, ids_cols_test):
    dataset["1rub"] = dataset["dish_name"].apply(lambda x: "1RUB" in x).astype(int)
    dataset["recommendation"] = dataset["dish_name"].apply(lambda x: "up " in x.lower() or "lim." in x.lower() or "ord." in x.lower()).astype(int)
    dataset['clean_dish'] = dataset["dish_name"].apply(lambda x: x.lower().replace("1rub.", "").replace("up ", "").replace("lim.", "").replace("ord.", ""))
    df_dish_sum = dataset.groupby(["clean_dish"], as_index=True).agg({"revenue": ["median"]})
    df_dish_sum.columns = ["price_median"]
    df_dish_sum.reset_index(inplace=True)
    
    dataset = dataset.merge(df_dish_sum, on='clean_dish', how='left')
    dataset["promo"] = (dataset["revenue"] < dataset["price_median"]).astype(int)
    
    df_sum_agg = dataset.groupby(ids_cols, as_index=True).agg({"promo": ["max"], "1rub": ["max"], "recommendation": ["max"]})
    df_sum_agg.columns = ["was_promo", "was_1rub", "was_rec"]
    df_sum_agg.reset_index(inplace=True)
    
    new_ids_cols = list(set(ids_cols).difference(["startdatetime"]))
    
    df_feat = df_sum_agg.groupby(new_ids_cols, as_index=True).agg({"was_promo": ["mean", "std"], "was_1rub": ["mean", "std"], "was_rec": ["mean", "std"]})
    df_feat.columns = [f"{feat}_{func}" for feat in ["was_promo", "was_1rub", "was_rec"] for func in ["mean", "std"]]
    df_feat.reset_index(inplace=True)
    
    df_test["1rub"] = df_test["dish_name"].apply(lambda x: "1RUB" in x).astype(int)
    df_test["recommendation"] = df_test["dish_name"].apply(lambda x: "up " in x.lower() or "lim." in x.lower() or "ord." in x.lower()).astype(int)
    df_test['clean_dish'] = df_test["dish_name"].apply(lambda x: x.lower().replace("1rub.", "").replace("up ", "").replace("lim.", "").replace("ord.", ""))
    
    df_test = df_test.merge(df_dish_sum, on='clean_dish', how='left')
    df_test["promo"] = (df_test["revenue"] < df_test["price_median"]).astype(int)
    
    df_sum_agg_test = df_test.groupby(ids_cols_test, as_index=True).agg({"promo": ["max"], "1rub": ["max"], "recommendation": ["max"]})
    df_sum_agg_test.columns = ["was_promo", "was_1rub", "was_rec"]
    df_sum_agg_test.reset_index(inplace=True)
    
    new_ids_cols = list(set(ids_cols_test).difference(["startdatetime"]))
    
    df_feat_test = df_sum_agg_test.groupby(new_ids_cols, as_index=True).agg({"was_promo": ["mean", "std"], "was_1rub": ["mean", "std"], "was_rec": ["mean", "std"]})
    df_feat_test.columns = [f"{feat}_{func}" for feat in ["was_promo", "was_1rub", "was_rec"] for func in ["mean", "std"]]
    df_feat_test.reset_index(inplace=True)
    
    return df_feat, df_feat_test

In [24]:
df_train_with_feats = get_all_features(df_train, ["customer_id", "buy_post", "startdatetime"])

  df["time"] = df["startdatetime"].dt.to_pydatetime()


In [25]:
df_test_with_feats = get_all_features(df_test, ["customer_id", "startdatetime"])

  df["time"] = df["startdatetime"].dt.to_pydatetime()


In [28]:
df_train_feat, df_test_feat = calculate_promo_features(df_train, ["customer_id", "buy_post", "startdatetime"], df_test, ["customer_id", "startdatetime"])

In [31]:
extra_feats = pd.read_parquet('../data/4_9_11_features.parquet')
additional_feats = pd.read_parquet('../data/train_additional_feats.parquet')

In [32]:
extra_feats_test = pd.read_parquet('../data/test4_9_11_features.parquet')
additional_feats_test = pd.read_parquet('../data/test_additional_feats.parquet')

In [34]:
df_train_with_feats = (
    df_train_with_feats.merge(extra_feats, on=["customer_id", "buy_post"], how='left')
    .merge(df_train_feat, on=["customer_id", "buy_post"], how='left')
    .merge(additional_feats, on=["customer_id"], how='left')
)

In [36]:
df_train_with_feats.to_parquet("../data/train_with_feats_v4.parquet")

In [35]:
df_test_with_feats = (
    df_test_with_feats.merge(extra_feats_test, on=["customer_id"], how='left')
    .merge(df_test_feat, on=["customer_id"], how='left')
    .merge(additional_feats_test, on=["customer_id"], how='left')
)

In [37]:
df_test_with_feats.to_parquet("../data/test_with_feats_v4.parquet")

In [38]:
df_train_with_feats.shape, df_test_with_feats.shape

((500000, 110), (112334, 109))

### IV

In [39]:
def calculate_information_value(dataset, feats, target_col):
    iv_by_feat = {}
    temp = dataset.copy()
    for feat in feats:
        temp["quant_range"] = pd.qcut(
            x=temp[feat], q=[0, 0.25, 0.5, 0.75, 1.0], duplicates="drop"
        )

        bins = {}
        for i, bin in enumerate(temp["quant_range"].unique()):
            bins[bin] = i

        temp["bin"] = temp["quant_range"].apply(lambda x: bins[x])

        iv = (
            pd.crosstab(temp["bin"], temp[target_col], normalize="columns")
            .assign(woe=lambda dfx: np.log(dfx[1] / dfx[0]))
            .assign(iv=lambda dfx: np.sum(dfx["woe"] * (dfx[1] - dfx[0])))
        )["iv"].unique()[0]

        iv_by_feat[feat] = [iv]
    df = pd.DataFrame(iv_by_feat).T
    df.reset_index(inplace=True)
    df.columns = ["feature", "IV"]
    return df

In [40]:
feats = list(set(df_train_with_feats.columns).difference(["customer_id", "buy_post"]))

In [41]:
iv_df = calculate_information_value(df_train_with_feats, feats, "buy_post")

In [42]:
iv_df.sort_values(by="IV", ascending=False)

Unnamed: 0,feature,IV
62,receipt_count,0.430457
30,weekends_std,0.24315
69,strange_std,0.237534
47,dom_std,0.16164
65,revenue_sum_count_28d,0.155249
1,revenue_sum_count_14d,0.155249
3,revenue_sum_count_7d,0.155249
35,was_promo_std,0.120205
46,fun_sum,0.105191
88,dow_std,0.097692


In [43]:
iv_df.shape

(108, 2)

In [44]:
iv_df.to_csv("../data/feature_information_value.csv", sep=";", index=False)