In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

input_path = Path('.')

## Amex Metric

This is a Python implementation of the metric for the Amex competition. Additional details can be found on the competition [Evaluation page](https://www.kaggle.com/competitions/amex-default-prediction/overview/evaluation).

In [51]:
def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Share of positive labels found in the top-ranked 4% of total weight.

    Rows are ranked by ``prediction`` in descending order. A row with
    ``target == 0`` counts with weight 20 and every other row with weight 1.
    Rows whose cumulative weight does not exceed ``int(0.04 * total weight)``
    form the cutoff set.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column; it is concatenated to
            ``y_true`` column-wise, i.e. aligned on the index.

    Returns:
        Count of ``target == 1`` rows inside the cutoff set divided by the
        count of ``target == 1`` rows overall.

    Note:
        Rows with equal predictions are ranked in whatever order
        ``sort_values`` leaves them, so the result for tied scores depends on
        that ordering.
    """
    df = (pd.concat([y_true, y_pred], axis='columns')
          .sort_values('prediction', ascending=False))
    # Vectorized weights (20 for target == 0, else 1) instead of a per-row lambda.
    df['weight'] = np.where(df['target'] == 0, 20, 1)
    four_pct_cutoff = int(0.04 * df['weight'].sum())
    # Boolean masks share df's index, so they combine row by row.
    in_cutoff = df['weight'].cumsum() <= four_pct_cutoff
    is_positive = df['target'] == 1
    return (is_positive & in_cutoff).sum() / is_positive.sum()

def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Unnormalized weighted Gini score of ``y_pred`` against ``y_true``.

    Rows are ranked by ``prediction`` in descending order. A row with
    ``target == 0`` gets weight 20 and every other row weight 1. For each row
    the gap between the cumulative share of weighted targets found so far and
    the cumulative share of weight seen so far is multiplied by the row weight;
    the return value is the sum of those products.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column; it is concatenated to
            ``y_true`` column-wise, i.e. aligned on the index.

    Returns:
        The summed, weight-scaled gap described above.
    """
    df = (pd.concat([y_true, y_pred], axis='columns')
          .sort_values('prediction', ascending=False))
    # Vectorized weights (20 for target == 0, else 1) instead of a per-row lambda.
    df['weight'] = np.where(df['target'] == 0, 20, 1)
    # Cumulative share of total weight seen so far in the ranking.
    cum_weight_share = (df['weight'] / df['weight'].sum()).cumsum()
    # Cumulative share of weighted targets found so far in the ranking.
    weighted_target = df['target'] * df['weight']
    lorenz = weighted_target.cumsum() / weighted_target.sum()
    return ((lorenz - cum_weight_share) * df['weight']).sum()

In [52]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Average of the normalized weighted Gini and the top-4% capture rate.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column on the same index.

    Returns:
        ``0.5 * (G + D)``, where ``G`` is ``weighted_gini`` of the predictions
        divided by ``weighted_gini`` of the labels scored against themselves,
        and ``D`` is ``top_four_percent_captured``.
    """
    # Feeding the labels in as the prediction gives the reference score used
    # to normalize the Gini term.
    labels_as_prediction = y_true.rename(columns={'target': 'prediction'})
    gini_normalized = (weighted_gini(y_true, y_pred)
                       / weighted_gini(y_true, labels_as_prediction))
    capture_rate = top_four_percent_captured(y_true, y_pred)
    return 0.5 * (gini_normalized + capture_rate)

## Simple Benchmark

We can create a simple benchmark using the average of the feature `P_2` for each customer.

In [3]:
# Read only the customer key and the P_2 feature from the training file,
# indexed by customer_ID.
train_data = pd.read_csv(
    input_path.joinpath('train_data.csv'),
    usecols=['customer_ID', 'P_2'],
    index_col='customer_ID',
)

# Labels use the same customer_ID index.
train_labels = pd.read_csv(
    input_path.joinpath('train_labels.csv'),
    index_col='customer_ID',
)

In [4]:
# Benchmark score: per-customer mean of P_2, divided by the largest mean and
# then complemented (1 - value), so a lower average P_2 gives a higher score.
ave_p2 = (
    train_data
    .groupby('customer_ID')
    .mean()
    .rename(columns={'P_2': 'prediction'})
    .assign(prediction=lambda means: 1.0 - (means['prediction'] / means['prediction'].max()))
)

In [5]:
# NOTE(review): the inline reference value (0.572773) differs from the output
# recorded for this cell (0.5729004331080327) — confirm which one is expected.
print(amex_metric(train_labels, ave_p2)) # 0.572773

0.5729004331080327


In [10]:
# Raw score per customer: groupby(...).last() keeps the final non-null P_2
# value found for each customer_ID.
pred = (
    train_data
    .groupby('customer_ID')
    .last()
    .rename(columns={'P_2': 'prediction'})
)

In [17]:
# Clip the score to [0, 1] and invert it so that a lower P_2 gives a higher score.
# NOTE: `pred` is rebuilt from itself, so re-running this cell without first
# re-running the cell that creates `pred` inverts the scores a second time.
pred = 1-pred.clip(lower=0,upper=1)

In [18]:
# Score the clipped, inverted P_2 prediction currently held in `pred`.
amex_metric(train_labels,pred)

0.6355828140517417

In [49]:
# Build the score in a single pass so the cell can be re-run safely:
# tail(1) keeps the final row of each customer, the second groupby/mean
# collapses that single row back to one value per customer_ID, and the
# result is clipped to [0, 1] and inverted.
pred = 1 - (
    train_data
    .groupby('customer_ID')
    .tail(1)
    .groupby('customer_ID')
    .mean()
    .rename(columns={'P_2': 'prediction'})
    .clip(lower=0, upper=1)
)

In [72]:
# Overwrite every score with the same constant (0.99) so that all predictions
# are tied. Assigning a scalar broadcasts it over the whole column, so no
# per-row lambda is needed; bracket access is the explicit way to set a column.
pred['prediction'] = 0.99

In [73]:
# Show `pred` after the constant overwrite (every 'prediction' should read 0.99).
pred

Unnamed: 0_level_0,prediction
customer_ID,Unnamed: 1_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.99
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.99
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.99
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0.99
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,0.99
...,...
ffff41c8a52833b56430603969b9ca48d208e7c192c6a4081a6acc28cf4f8af7,0.99
ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fdd3e5b57cfcbee30286,0.99
ffff9984b999fccb2b6127635ed0736dda94e544e67e026eee4d20f680639ff6,0.99
ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf388145b2c3d01967fcce461,0.99


In [74]:
# Capture rate for the predictions in `pred`. If they are all tied (constant
# 0.99), the ranking is decided by how sort_values orders equal keys rather
# than by the scores themselves.
top_four_percent_captured(train_labels,pred)

0.03958662941394284

In [75]:
# Unrolled copy of top_four_percent_captured, run at notebook level so the
# intermediate frames (df, df_cutoff) stay available for inspection.
y_true = train_labels
y_pred = pred

# Align labels with predictions on the index and rank from highest score down.
df = (pd.concat([y_true, y_pred], axis='columns')
      .sort_values('prediction', ascending=False))

# Vectorized weights (20 for target == 0, else 1) instead of a per-row lambda.
df['weight'] = np.where(df['target'] == 0, 20, 1)
four_pct_cutoff = int(0.04 * df['weight'].sum())
df['weight_cumsum'] = df['weight'].cumsum()

# Rows whose running weight still fits inside 4% of the total weight.
df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]

# Share of all target == 1 rows that fall inside the cutoff.
df_cutoff_final = (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()

In [76]:
# Capture rate from the unrolled computation; it repeats the steps of
# top_four_percent_captured(train_labels, pred).
df_cutoff_final

0.03958662941394284

In [77]:
# Inspect the ranked rows that fall inside the 4% weight cutoff.
df_cutoff

Unnamed: 0_level_0,target,prediction,weight,weight_cumsum
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0,0.99,20,20
aab6cd60f43333ffbf74b6500af488da9966c9c5d7bb503c90db0ae695f907c4,0,0.99,20,40
aa5f0a0add6369c75c051f901120fbf459925ba2b4a8136b0c05edd48be2f0c5,1,0.99,1,41
aa5ee4d6e08edd3785feb1d22a2b743b360048733afe160963d1ea79e28ac0d3,0,0.99,20,61
aa5ee11132972c48030b6d70bb5581030881c6a719eed89a7d6c2fae7c00b0d5,1,0.99,1,62
...,...,...,...,...
a0570b601e3605438919ac2a88008f6f2380f3ac3c4129a42497c1213e54a5b5,1,0.99,1,276762
a056efaf66b61c203260d648b8e889736265557cb45c1194de6865b0556b9879,1,0.99,1,276763
a056ae85f49c97ec6d67274a8b3a3c5978f1711ea6722ed8c686458b224b1d78,1,0.99,1,276764
a0542f171f09c67feef1f0cb8a69f1323b5effe26f2d232e2282dd7f390ebd6d,0,0.99,20,276784
