In [1]:
import datetime as dt

import pandas as pd


cols = {
    'CustomerIdx': 'customer',
    'IsinIdx': 'bond',
    'BuySell': 'action',
    'NotionalEUR': 'notional',
    'Price': 'price',
    'TradeStatus': 'status',
    'CustomerInterest': 'interest',
}

def parse_date(s):
    return dt.datetime.strptime(s, '%Y%m%d')

train = pd.read_csv('data/Trade.csv', parse_dates=['TradeDateKey'], date_parser=parse_date)\
          .rename(columns={**cols, **{'TradeDateKey': 'date'}})

test = pd.read_csv('data/Challenge_20180423.csv', parse_dates=['DateKey'], date_parser=parse_date)\
         .rename(columns={**cols, **{'DateKey': 'date'}})

# Only keep the relevant part of train that also pertains to the test set
train = train[train['customer'].isin(test['customer']) | train['bond'].isin(test['bond'])].query('interest == 1')

## Features

### Customer features

In [2]:
columns = {
    'CustomerIdx': 'customer',
    'Sector': 'sector',
    'Subsector': 'subsector',
    'Region': 'region',
    'Country': 'country'
}
customers = pd.read_csv('data/Customer.csv').rename(columns=columns)

customers.head()

Unnamed: 0,customer,sector,subsector,region,country
0,2975,Banks and Intermediaries,Bank,Americas,BARBADOS
1,1594,Asset Managers & Hedge Funds,,Americas,BERMUDA
2,399,Corporation,Corp - Comm. & Prof. Services,Americas,BERMUDA
3,836,Asset Owners,Insurance,Americas,BERMUDA
4,816,Asset Owners,Insurance,Americas,BERMUDA


Clean the subsector variable.

In [3]:
most_common_subsectors = customers.groupby('sector').apply(lambda g: g['subsector'].value_counts().index[0])

def clean_subsector(row):
    subsector = row['subsector']
    if not isinstance(subsector, str):
        return most_common_subsectors[row['sector']]
    return subsector


customers['subsector'] = customers.apply(clean_subsector, axis='columns')
counts = customers['subsector'].value_counts()
uncommon = counts[counts < 30]
customers['subsector'] = customers['subsector'].apply(lambda x: 'Rare' if x in uncommon.index else x)

Clean the country column.

In [4]:
counts = customers['country'].value_counts()
uncommon = counts[counts < 15]
customers['country'] = customers['country'].apply(lambda x: 'Rare' if x in uncommon.index else x)

First trade date per customer.

In [5]:
customers = customers.join(train.groupby('customer')['date'].first().rename('first_trade_date'), on='customer')

Convert categorical features.

In [6]:
customers['sector'] = pd.Categorical(customers['sector'])
customers['subsector'] = pd.Categorical(customers['subsector'])
customers['region'] = pd.Categorical(customers['region'])
customers['country'] = pd.Categorical(customers['country'])

Drop unwanted columns.

In [7]:
to_drop = ['sector']
customers.drop(to_drop, axis='columns', inplace=True)

Prepare to join.

In [8]:
customers = customers.set_index('customer')
customers.columns = ['customer_{}'.format(col) for col in customers.columns]
customers.head()

Unnamed: 0_level_0,customer_subsector,customer_region,customer_country,customer_first_trade_date
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2975,Bank,Americas,Rare,2018-03-26
1594,Hedge Fund,Americas,Rare,2018-01-18
399,Rare,Americas,Rare,2017-10-20
836,Insurance,Americas,Rare,2018-02-22
816,Insurance,Americas,Rare,2016-02-02


### Bond features

In [9]:
columns = {
    'IsinIdx': 'bond',
    'TickerIdx': 'provider',
    'ActualMaturityDateKey': 'maturity_date',
    'IssueDateKey': 'issue_date',
    'Seniority': 'seniority',
    'Currency': 'currency',
    'ActivityGroup': 'activity_group',
    'Region': 'region',
    'Activity': 'activity',
    'RiskCaptain': 'risk_captain',
    'Owner': 'owner',
    'CompositeRating': 'composite_rating',
    'IndustrySector': 'industry',
    'IndustrySubgroup': 'subindustry',
    'MarketIssue': 'market_issue',
    'IssuedAmount': 'issued_amount',
    'CouponType': 'coupon_type'
}
bonds = pd.read_csv('data/Isin.csv', parse_dates=['ActualMaturityDateKey', 'IssueDateKey'], date_parser=parse_date)\
          .rename(columns=columns)

bonds.head()

Unnamed: 0,bond,provider,maturity_date,issue_date,seniority,currency,activity_group,region,activity,risk_captain,owner,composite_rating,industry,subindustry,market_issue,issued_amount,coupon_type
0,0,238,2038-12-31,2005-11-29,GOV,USD,FLOW LOCAL MARKET,AMERICAS,ARGENTINA,ARGENTINA,EMK ARGENTINA,NR,Government,Sovereign,Domestic,1246002000.0,STEP CPN
1,1,238,2033-12-31,2005-11-29,GOV,USD,FLOW LOCAL MARKET,AMERICAS,ARGENTINA,ARGENTINA,EMK ARGENTINA,NR,Government,Sovereign,Domestic,4901086000.0,FIXED
2,2,238,2033-12-31,2005-11-29,GOV,ARS,FLOW LOCAL MARKET,AMERICAS,ARGENTINA,ARGENTINA,EMK ARGENTINA,NR,Government,Sovereign,Domestic,15012450000.0,FIXED
3,3,236,2017-04-17,2007-04-17,GOV,USD,FLOW LOCAL MARKET,AMERICAS,ARGENTINA,ARGENTINA,EMK ARGENTINA,B,Government,Sovereign,Domestic,7340076000.0,FIXED
4,4,234,2022-10-04,2010-02-22,GOV,ARS,FLOW LOCAL MARKET,AMERICAS,ARGENTINA,ARGENTINA,EMK ARGENTINA,NR,Government,Sovereign,Domestic,3058452000.0,FLOATING


Difference between issue and maturity dates.

In [10]:
bonds['issue_maturity_diff'] = (bonds['maturity_date'] - bonds['issue_date']).dt.days

Clean currency column.

In [11]:
counts = bonds['currency'].value_counts()
uncommon = counts[counts < 10]
bonds['currency'] = bonds['currency'].apply(lambda x: 'Rare' if x in uncommon.index else x)

Clean `risk_captain` column.

In [12]:
counts = bonds['risk_captain'].value_counts()
uncommon = counts[counts < 50]
bonds['risk_captain'] = bonds['risk_captain'].apply(lambda x: 'Rare' if x in uncommon.index else x)

Clean `owner` column.

In [13]:
counts = bonds['owner'].value_counts()
uncommon = counts[counts < 10]
bonds['owner'] = bonds['owner'].apply(lambda x: 'Rare' if x in uncommon.index else x)

Handle bond rating.

In [14]:
bonds['rating_first_letter'] = bonds['composite_rating'].str.slice(0, 1)

ratings_order = list(reversed([
    'AAA', 'AA+', 'AA', 'AA-', 'A+', 'A', 'A-',
    'BBB+', 'BBB', 'BBB-', 'BB+', 'BB', 'BB-', 'B+', 'B', 'B-',
    'CCC+', 'CCC', 'CCC-', 'CC+', 'CC', 'CC-', 'C+', 'C', 'C-',
    'DDD+', 'DDD', 'DD+', 'D',
    'NR'
]))

bonds['composite_rating'] = bonds['composite_rating'].apply(lambda x: ratings_order.index(x))

Clean industry columns.

In [15]:
bonds['industry'] = bonds['industry'].fillna('Financial')
bonds['subindustry'] = bonds['subindustry'].fillna('Commer Banks Non-US')

Clean market issue.

In [16]:
bonds['market_issue'] = bonds['market_issue'].fillna('Global')
counts = bonds['market_issue'].value_counts()
uncommon = counts[counts < 60]
bonds['market_issue'] = bonds['market_issue'].apply(lambda x: 'Rare' if x in uncommon.index else x)

Convert categorical features.

In [17]:
bonds['seniority'] = pd.Categorical(bonds['currency'])
bonds['region'] = pd.Categorical(bonds['region'])
bonds['activity'] = pd.Categorical(bonds['activity'])
bonds['activity_group'] = pd.Categorical(bonds['activity_group'])
bonds['currency'] = pd.Categorical(bonds['currency'])
bonds['risk_captain'] = pd.Categorical(bonds['risk_captain'])
bonds['owner'] = pd.Categorical(bonds['owner'])
bonds['rating_first_letter'] = pd.Categorical(bonds['rating_first_letter'])
bonds['industry'] = pd.Categorical(bonds['industry'])
bonds['subindustry'] = pd.Categorical(bonds['subindustry'])
bonds['market_issue'] = pd.Categorical(bonds['market_issue'])
bonds['coupon_type'] = pd.Categorical(bonds['coupon_type'])

Only keep relevant features.

In [18]:
to_drop = ['rating_first_letter', 'provider', 'issued_amount', 'coupon_type',
           'industry', 'currency']
bonds.drop(to_drop, axis='columns', inplace=True)

Prepare to join.

In [19]:
bonds = bonds.set_index('bond')
bonds.columns = ['bond_{}'.format(col) for col in bonds.columns]
bonds.head()

Unnamed: 0_level_0,bond_maturity_date,bond_issue_date,bond_seniority,bond_activity_group,bond_region,bond_activity,bond_risk_captain,bond_owner,bond_composite_rating,bond_subindustry,bond_market_issue,bond_issue_maturity_diff
bond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2038-12-31,2005-11-29,USD,FLOW LOCAL MARKET,AMERICAS,ARGENTINA,ARGENTINA,EMK ARGENTINA,0,Sovereign,Domestic,12085
1,2033-12-31,2005-11-29,USD,FLOW LOCAL MARKET,AMERICAS,ARGENTINA,ARGENTINA,EMK ARGENTINA,0,Sovereign,Domestic,10259
2,2033-12-31,2005-11-29,ARS,FLOW LOCAL MARKET,AMERICAS,ARGENTINA,ARGENTINA,EMK ARGENTINA,0,Sovereign,Domestic,10259
3,2017-04-17,2007-04-17,USD,FLOW LOCAL MARKET,AMERICAS,ARGENTINA,ARGENTINA,EMK ARGENTINA,15,Sovereign,Domestic,3653
4,2022-10-04,2010-02-22,ARS,FLOW LOCAL MARKET,AMERICAS,ARGENTINA,ARGENTINA,EMK ARGENTINA,0,Sovereign,Domestic,4607


### Market features

In [20]:
columns = {
    'IsinIdx': 'bond',
    'DateKey': 'date',
    'Price': 'price',
    'Yield': 'yield',
    'ZSpread': 'zspread'
}
market = pd.read_csv('data/Market.csv', parse_dates=['DateKey'], date_parser=parse_date)\
           .rename(columns=columns)

market.head()

Unnamed: 0,bond,date,price,yield,zspread
0,1,2016-01-01,104.25,7.835,5.505
1,7,2016-01-01,107.5,7.52,5.541
2,102,2016-01-01,100.746,4.048,2.085
3,331,2016-01-01,112.79,-0.752,-0.215
4,345,2016-01-01,113.383,-0.667,-0.272


Prepare to join.

In [21]:
market.columns = ['market_{}'.format(col) for col in market.columns]
market.head()

Unnamed: 0,market_bond,market_date,market_price,market_yield,market_zspread
0,1,2016-01-01,104.25,7.835,5.505
1,7,2016-01-01,107.5,7.52,5.541
2,102,2016-01-01,100.746,4.048,2.085
3,331,2016-01-01,112.79,-0.752,-0.215
4,345,2016-01-01,113.383,-0.667,-0.272


## Training set preparation

In [22]:
import numpy as np


real_train = []
n_rows = 0

train['id'] = train['customer'].astype(str) + '_' + train['bond'].astype(str)

train_since = dt.datetime(2018, 1, 15)
train_until = dt.datetime(2018, 4, 16)
train_dates = [
    train_until - dt.timedelta(days=i)
    for i in range(0, int((train_until - train_since) / dt.timedelta(days=1) + 1), 7)
]

#train_since = dt.datetime(2017, 10, 1)
#train_until = dt.datetime(2017, 12, 11)
#train_dates += [
#    train_until - dt.timedelta(days=i)
#    for i in range(0, int((train_until - train_since) / dt.timedelta(days=1) + 1), 7)
#]

for date in train_dates:
    
    six_months = dt.timedelta(6 * 365 / 12)
    before = train[train['date'].between(date - six_months, date - dt.timedelta(days=1))]
    this_week = train[train['date'].between(date, date + dt.timedelta(days=6))]
    this_week = this_week.drop_duplicates(['customer', 'bond'])
    
    # Determine the (customer, bond) pairs that happen before the current date 
    combos = pd.Series(before['id'].unique()).str.split('_', expand=True)
    combos.columns = ['customer', 'bond']
    combos['customer'] = combos['customer'].astype(before['customer'].dtype)
    combos['bond'] = combos['bond'].astype(before['bond'].dtype)
    combos['date'] = date
    
    # Add the events that happen during the current week 
    combos = pd.merge(
        left=combos,
        right=this_week[['customer', 'bond', 'action', 'interest']],
        how='left',
        on=['customer', 'bond']
    )
    combos['interest'] = combos['interest'].fillna(0)
    
    # Downsample because I only have 8 GB of RAM
    rows = np.concatenate((
        combos.query('interest == 1').index,
        np.random.choice(combos.query('interest == 0').index, 150000)
    ))
    combos = combos.loc[rows]
    
    # Considering both buying and selling leads to a natural data augmentation
    combos['action'] = combos['action'].fillna('Buy')
    mirror = combos.copy()
    mirror['action'] = mirror['action'].map({'Buy': 'Sell', 'Sell': 'Buy'})
    mirror['interest'] = 0
    combos = pd.concat((combos, mirror), axis='rows', ignore_index=True)
    
    # Add customer/bond interaction information
    gb = before.groupby(['customer', 'bond'])
    last = gb.last()
    combos = combos.join(last['date'].rename('date_of_last_interaction'), on=['customer', 'bond'])
    combos = combos.join(last['action'].rename('last_action'), on=['customer', 'bond'])
    combos = combos.join(gb.size().rename('n_interactions'), on=['customer', 'bond'])
    
    # Add market information
    #past_week_market = market[market['market_date'].between(date - dt.timedelta(days=7), date - dt.timedelta(days=1))]
    #past_week_market = past_week_market.groupby(['market_bond']).mean().reset_index()
    #combos = pd.merge(
    #    left=combos,
    #    right=past_week_market,
    #    how='left',
    #    left_on='bond',
    #    right_on='market_bond'
    #)
    
    real_train.append(combos)
    n_rows += len(combos)
    
    print('Added {} rows for {} (total is {})'.format(len(combos), date.strftime('%Y-%m-%d'), n_rows))
    

gb = train.groupby(['customer', 'bond'])
last = gb.last()
test = test.join(last['date'].rename('date_of_last_interaction'), on=['customer', 'bond'])
test = test.join(last['action'].rename('last_action'), on=['customer', 'bond'])
test = test.join(gb.size().rename('n_interactions'), on=['customer', 'bond'])

data = pd.concat(real_train + [test], ignore_index=True)
data.head()

Added 316688 rows for 2018-04-16 (total is 316688)
Added 316722 rows for 2018-04-09 (total is 633410)
Added 314870 rows for 2018-04-02 (total is 948280)
Added 317198 rows for 2018-03-26 (total is 1265478)
Added 314960 rows for 2018-03-19 (total is 1580438)
Added 316818 rows for 2018-03-12 (total is 1897256)
Added 316332 rows for 2018-03-05 (total is 2213588)
Added 319062 rows for 2018-02-26 (total is 2532650)
Added 315140 rows for 2018-02-19 (total is 2847790)
Added 318466 rows for 2018-02-12 (total is 3166256)
Added 320872 rows for 2018-02-05 (total is 3487128)
Added 321000 rows for 2018-01-29 (total is 3808128)
Added 316744 rows for 2018-01-22 (total is 4124872)
Added 314116 rows for 2018-01-15 (total is 4438988)


Unnamed: 0,PredictionIdx,action,bond,customer,date,date_of_last_interaction,interest,last_action,n_interactions
0,,Sell,16024,2515,2018-04-16,2018-01-18,1.0,Buy,5
1,,Sell,18122,3122,2018-04-16,2018-01-24,1.0,Buy,4
2,,Buy,23240,2638,2018-04-16,2018-03-19,1.0,Buy,5
3,,Buy,2736,2265,2018-04-16,2018-02-19,1.0,Buy,2
4,,Buy,26101,2752,2018-04-16,2017-11-20,1.0,Buy,2


Counts.

In [23]:
data = data.join(data.groupby('bond').size().rename('bond_count'), on='bond')
data = data.join(data.groupby('customer').size().rename('customer_count'), on='customer')

Smooth target encoding.

In [24]:
def target_encode(df, on, by, m):
    """From https://www.wikiwand.com/en/Bayes_estimator#/Practical_example_of_Bayes_estimators"""
    
    agg = df.groupby(by)[on].agg(['count', 'sum'])
    R = agg['sum'] / agg['count']
    v = agg['count']
    C = df[on].mean()
    W = (R*v + C*m) / (v + m)
    
    # Automatically generate the feature's name
    if not isinstance(by, (list, tuple)):
        by = [by]
    name = f"{on}_mean_by_{'_and_'.join(by)}"  # e.g. 'interest_mean_by_bond_and_action'
    
    return W.rename(name)


data = data.join(target_encode(data, 'interest', 'bond', 100), on='bond')
data = data.join(target_encode(data, 'interest', 'customer', 100), on='customer')
data = data.join(target_encode(data, 'interest', ['bond', 'action'], 100), on=['bond', 'action'])
data = data.join(target_encode(data, 'interest', ['customer', 'action'], 100), on=['customer', 'action'])

Add bond and customer features.

In [25]:
data = data.join(customers, how='left', on='customer')

In [26]:
data = data.join(bonds, how='left', on='bond')

Calculate features that are only computable as of now.

In [27]:
data['days_since_last_interaction'] = (data['date'] - data['date_of_last_interaction']).dt.days
data.drop('date_of_last_interaction', axis='columns', inplace=True)

data['customer_days_since_first_trade'] = (data['date'] - data['customer_first_trade_date']).dt.days

data['bond_days_since_issued'] = (data['date'] - data['bond_issue_date']).dt.days
data['bond_days_until_maturation'] = (data['bond_maturity_date'] - data['date']).dt.days

data['day_of_month'] = data['date'].dt.day

In [28]:
data['bond_interaction_ratio'] = data['bond_count'] / data['bond_days_since_issued']
data['customer_interaction_ratio'] = data['customer_count'] / data['customer_days_since_first_trade']

Convert categorical columns.

In [29]:
data['action'] = pd.Categorical(data['action'])
data['last_action'] = pd.Categorical(data['last_action'])
data['matching_action'] = (data['action'] == data['last_action']).astype(int)

Handle missing values.

In [30]:
data['interest_mean_by_bond'].fillna(data['interest_mean_by_bond'].mean(), inplace=True)
data['interest_mean_by_customer'].fillna(data['interest_mean_by_customer'].mean(), inplace=True)
data['interest_mean_by_bond_and_action'].fillna(data['interest_mean_by_bond'].mean(), inplace=True)
data['interest_mean_by_customer_and_action'].fillna(data['interest_mean_by_customer'].mean(), inplace=True)

Create train and test sets.

In [31]:
not_needed = ['PredictionIdx', 'bond', 'customer', 'date', 'interest',
              'customer_first_trade_date', 'bond_maturity_date', 'bond_issue_date']

train_mask = data['interest'].notnull()
train = data[train_mask].reset_index(drop=True)
X_train = train.drop(not_needed, axis='columns')
y_train = train['interest']

test = data[~train_mask].reset_index(drop=True)
X_test = test.drop(not_needed, axis='columns')
submission = test['PredictionIdx'].to_frame().copy()
submission['CustomerInterest'] = 0

Sanity checks.

In [32]:
assert X_train.isnull().sum().sum() == 0
assert X_test.isnull().sum().sum() == 0
assert len(X_test) == 484758
assert len(submission) == 484758

In [33]:
#del customers
#del bonds
#del real_train
#del train
#del test
#del data

In [34]:
X_train.head()

Unnamed: 0,action,last_action,n_interactions,bond_count,customer_count,interest_mean_by_bond,interest_mean_by_customer,interest_mean_by_bond_and_action,interest_mean_by_customer_and_action,customer_subsector,...,bond_market_issue,bond_issue_maturity_diff,days_since_last_interaction,customer_days_since_first_trade,bond_days_since_issued,bond_days_until_maturation,day_of_month,bond_interaction_ratio,customer_interaction_ratio,matching_action
0,Sell,Buy,5,66,54298,0.040313,0.046165,0.042796,0.064002,Asset Mgr owned by Bank/Insur.,...,Global,10794,88,119,6137,4657,16,0.010754,456.285714,0
1,Sell,Buy,4,228,73694,0.025812,0.048154,0.033628,0.046663,Independent Asset Manager,...,Euro mtn,10957,82,746,6116,4841,16,0.037279,98.785523,0
2,Buy,Buy,5,160,5998,0.043819,0.04657,0.04472,0.06582,Insurance,...,Euro-zone,3653,28,7,1161,2492,16,0.137812,856.857143,1
3,Buy,Buy,2,182,6722,0.018472,0.028818,0.026508,0.027488,Bank,...,Euro mtn,5479,56,819,3139,2340,16,0.05798,8.20757,1
4,Buy,Buy,2,722,31956,0.027663,0.037203,0.039368,0.027505,Asset Mgr owned by Bank/Insur.,...,Euro-zone,2563,147,355,325,2238,16,2.221538,90.016901,1


## Machine learning

In [42]:
import numpy as np
from sklearn import model_selection


params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'num_leaves': 2 ** 5,
    'min_data_per_group': 20000,
    'cat_smooth': 500,
    'scale_pos_weight': 30,
    'min_data_in_leaf': 300,
    'learning_rate': 0.06,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.95,
    'bagging_seed': 42,
    'lambda_l1': 1,
    'lambda_l2': 2,
    'verbosity': 1
}

n_splits = 5
cv = model_selection.StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)
feature_importances_split = pd.DataFrame(index=X_train.columns)
feature_importances_gain = pd.DataFrame(index=X_train.columns)

In [43]:
import lightgbm as lgbm


for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    y_fit = y_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]
    
    fit = lgbm.Dataset(X_fit, y_fit)
    val = lgbm.Dataset(X_val, y_val)

    evals_result = {}
    model = lgbm.train(
        params,
        fit,
        num_boost_round=30000,
        valid_sets=(fit, val),
        valid_names=('fit', 'val'),
        verbose_eval=50,
        early_stopping_rounds=50,
        evals_result=evals_result
    )

    fit_scores[i] = evals_result['fit']['auc'][-1]
    val_scores[i] = evals_result['val']['auc'][-1]
    submission['CustomerInterest'] += model.predict(X_test)
    feature_importances_split[i] = model.feature_importance(importance_type='split')
    feature_importances_gain[i] = model.feature_importance(importance_type='gain')
    
submission['CustomerInterest'] /= n_splits

submission.to_csv(
    'submissions/lgbm_{:.5f}_{:.5f}_{:.5f}_{:.5f}.csv'.format(
        fit_scores.mean(),
        fit_scores.std(),
        val_scores.mean(),
        val_scores.std()
    ),
    index=False
)

print('Local fit ROC AUC: {:.5f} (±{:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Local val ROC AUC: {:.5f} (±{:.5f})'.format(val_scores.mean(), val_scores.std()))



Training until validation scores don't improve for 50 rounds.
[50]	fit's auc: 0.834008	val's auc: 0.829492
[100]	fit's auc: 0.841153	val's auc: 0.834879
[150]	fit's auc: 0.846127	val's auc: 0.837899
[200]	fit's auc: 0.849283	val's auc: 0.839296
[250]	fit's auc: 0.852026	val's auc: 0.84048
[300]	fit's auc: 0.854419	val's auc: 0.841278
[350]	fit's auc: 0.85669	val's auc: 0.842041
[400]	fit's auc: 0.858898	val's auc: 0.842671
[450]	fit's auc: 0.860985	val's auc: 0.8434
[500]	fit's auc: 0.862847	val's auc: 0.843876
[550]	fit's auc: 0.864675	val's auc: 0.844511
[600]	fit's auc: 0.866564	val's auc: 0.845194
[650]	fit's auc: 0.868146	val's auc: 0.845507
[700]	fit's auc: 0.869561	val's auc: 0.845748
[750]	fit's auc: 0.871069	val's auc: 0.846036
[800]	fit's auc: 0.872645	val's auc: 0.84647
[850]	fit's auc: 0.874196	val's auc: 0.846829
[900]	fit's auc: 0.875539	val's auc: 0.847077
[950]	fit's auc: 0.876936	val's auc: 0.847333
[1000]	fit's auc: 0.878389	val's auc: 0.847506
[1050]	fit's auc: 0.879

[100]	fit's auc: 0.84095	val's auc: 0.837169
[150]	fit's auc: 0.845675	val's auc: 0.840044
[200]	fit's auc: 0.849148	val's auc: 0.84175
[250]	fit's auc: 0.851848	val's auc: 0.84278
[300]	fit's auc: 0.854379	val's auc: 0.843676
[350]	fit's auc: 0.856554	val's auc: 0.844395
[400]	fit's auc: 0.858499	val's auc: 0.844867
[450]	fit's auc: 0.86045	val's auc: 0.845383
[500]	fit's auc: 0.862393	val's auc: 0.8459
[550]	fit's auc: 0.864188	val's auc: 0.846258
[600]	fit's auc: 0.865913	val's auc: 0.846581
[650]	fit's auc: 0.867692	val's auc: 0.846976
[700]	fit's auc: 0.86933	val's auc: 0.847391
[750]	fit's auc: 0.870872	val's auc: 0.847667
[800]	fit's auc: 0.872389	val's auc: 0.847854
[850]	fit's auc: 0.873837	val's auc: 0.847978
[900]	fit's auc: 0.875505	val's auc: 0.848476
[950]	fit's auc: 0.876992	val's auc: 0.848793
[1000]	fit's auc: 0.878324	val's auc: 0.84896
[1050]	fit's auc: 0.879622	val's auc: 0.849164
[1100]	fit's auc: 0.880963	val's auc: 0.849461
[1150]	fit's auc: 0.882268	val's auc: 0

In [40]:
y_train.value_counts(normalize=True)

0.0    0.973081
1.0    0.026919
Name: interest, dtype: float64

In [41]:
0.973081 / 0.026919

36.14848248449051