In [123]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [2]:
np.random.seed(3)

In [3]:
def etl(fn='train_ZoGVYWq.csv'):
    # Load X set
    X = pd.read_csv(fn)
    # Rename vars
    X.rename(columns={
        'Income' : 'income',
        'age_in_days' : 'age',
        'perc_premium_paid_by_cash_credit' : 'trad_payment',
        'Count_3-6_months_late' : 'late_3_6m',
        'Count_6-12_months_late' : 'late_6_12m',
        'Count_more_than_12_months_late' : 'late_12m',
        'application_underwriting_score' : 'app_score',
        'no_of_premiums_paid' : 'count_premiums_paid'
        },
        inplace=True)
    # Target variable
    y = X['renewal'].copy()
    X.drop(labels=['id','renewal'], axis=1, inplace=True)
#     X_nans = X.isnull().sum().reset_index(inplace=True)
#     X_nans.rename(columns={0:'count'},inplace=True)
    
    # Rescale non_zero columns
    X['age'] = X['age'] / 365
    X['income'] = np.log(X['income'])

    # binarize urban
    X['urban'] = X['residence_area_type'].isin(['Urban'])
    X.drop(labels=['residence_area_type'], axis=1, inplace=True)
    # encode sourcing - remember to drop E when running prediction (multicollinearity)
    X['source_a'] = X['sourcing_channel'].isin(['A'])
    X['source_b'] = X['sourcing_channel'].isin(['B'])
    X['source_c'] = X['sourcing_channel'].isin(['C'])
    X['source_d'] = X['sourcing_channel'].isin(['D'])
    X['source_e'] = X['sourcing_channel'].isin(['E'])

    X.drop(labels=['sourcing_channel'], axis=1, inplace=True)
    # Fill missing data
    # Predict missing data later with more time!
    # X data has missing values in late, app_score
    X['late_3_6m'].fillna(X['late_3_6m'].median(),inplace=True)
    X['late_6_12m'].fillna(X['late_6_12m'].median(),inplace=True)
    X['late_12m'].fillna(X['late_12m'].median(),inplace=True)
    X['app_score'].fillna(X['app_score'].median(),inplace=True)
    X.drop(labels=['source_e'], axis=1, inplace=True)
    X['inverse'] =  X['late_6_12m'] > (1/X['late_12m'])
    X['premium_differential'] = X['count_premiums_paid']/(X['age']-20) - X['late_3_6m'] - (1/2)* X['late_6_12m'] - (1/4) * X['late_12m']
    X['afford'] = X['premium'] / X['income']
    print(X.columns)

    # Split train test
    return train_test_split(X, y, test_size=0.25, random_state=11)

In [4]:
def feat_eng_train(fn='train_ZoGVYWq.csv'):
    # Load X set
    X = pd.read_csv(fn)
    # Rename vars
    X.rename(columns={
        'Income' : 'income',
        'age_in_days' : 'age',
        'perc_premium_paid_by_cash_credit' : 'trad_payment',
        'Count_3-6_months_late' : 'late_3_6m',
        'Count_6-12_months_late' : 'late_6_12m',
        'Count_more_than_12_months_late' : 'late_12m',
        'application_underwriting_score' : 'app_score',
        'no_of_premiums_paid' : 'count_premiums_paid'
        },
        inplace=True)
    # Target variable
    y = X['renewal'].copy()
    X.drop(labels=['id','renewal'], axis=1, inplace=True)
    
    # Rescale non_zero columns
    X['age'] = X['age'] / 365
    X['income'] = np.log(X['income'])

    # binarize urban
    X['urban'] = X['residence_area_type'].isin(['Urban'])
    X.drop(labels=['residence_area_type'], axis=1, inplace=True)
    # encode sourcing - remember to drop E when running prediction (multicollinearity)
    X['source_a'] = X['sourcing_channel'].isin(['A'])
    X['source_b'] = X['sourcing_channel'].isin(['B'])
    X['source_c'] = X['sourcing_channel'].isin(['C'])
    X['source_d'] = X['sourcing_channel'].isin(['D'])
    X['afford'] = X['income'] / (12 * X['premium'] )
    X.drop(labels=['sourcing_channel'], axis=1, inplace=True) 
    # Don't normalize binary variables
    bi_var = ['urban','source_a', 'source_b', 'source_c', 'source_d']
    nan_var = ['late_3_6m', 'late_6_12m', 'late_12m', 'app_score']
    scale_var = ['trad_payment', 'premium']
    # Rescale scale_var
    rescale_dict = dict()
    rescale_dict['trad_payment_med'] = X['trad_payment'].median()
    rescale_dict['trad_payment_std'] = X['trad_payment'].std()
    rescale_dict['premium_med'] = X['premium'].median()
    rescale_dict['premium_std'] = X['premium'].std()
    rescale_dict['afford_med'] = X['afford'].median()
    rescale_dict['afford_std'] = X['afford'].std()
    # Fill missing data
    # Predict missing data later with more time!
    # X data has missing values in late, app_score
    X['late_3_6m'].fillna(X['late_3_6m'].median(),inplace=True)
    X['late_6_12m'].fillna(X['late_6_12m'].median(),inplace=True)
    X['late_12m'].fillna(X['late_12m'].median(),inplace=True)
    X['app_score'].fillna(X['app_score'].median(),inplace=True)
    X['inverse'] =  X['late_6_12m'] > (1/X['late_12m'])
    X['premium_differential'] = X['count_premiums_paid']/(X['age']-20) - X['late_3_6m'] - (1/2)* X['late_6_12m'] - (1/4) * X['late_12m']
        
    # Rescale missingvlaues
    rescale_dict['late_3_6m_med'] = X['late_3_6m'].median()
    rescale_dict['late_3_6m_std'] = X['late_3_6m'].std()
    rescale_dict['late_6_12m_med'] = X['late_6_12m'].median()
    rescale_dict['late_6_12m_std'] = X['late_6_12m'].std()
    rescale_dict['late_12m_med'] = X['late_12m'].median()
    rescale_dict['late_12m_std'] = X['late_12m'].std()
    rescale_dict['app_score_med'] = X['app_score'].median()
    rescale_dict['app_score_std'] = X['app_score'].std()
    rescale_dict['premium_differential_med'] = X['premium_differential'].median()
    rescale_dict['premium_differential_std'] = X['premium_differential'].std()
    
    for col in nan_var + scale_var:
        X[col] = (X[col] - X[col].median())/X[col].std()
    print(X.columns)
    return X, y, rescale_dict

In [13]:
def feat_eng_test(fn='test_66516Ee.csv', rescale_dict=rescale_dict):
    # Load X set
    X = pd.read_csv(fn)
    # Rename vars
    X.rename(columns={
        'Income' : 'income',
        'age_in_days' : 'age',
        'perc_premium_paid_by_cash_credit' : 'trad_payment',
        'Count_3-6_months_late' : 'late_3_6m',
        'Count_6-12_months_late' : 'late_6_12m',
        'Count_more_than_12_months_late' : 'late_12m',
        'application_underwriting_score' : 'app_score',
        'no_of_premiums_paid' : 'count_premiums_paid'
        },
        inplace=True)
    # Target variable
    y  = X['id']
    X.drop(labels=['id'], axis=1, inplace=True) 
    # Rescale non_zero columns
    X['age'] = X['age'] / 365
    X['income'] = np.log(X['income'])

    # binarize urban
    X['urban'] = X['residence_area_type'].isin(['Urban'])
    X.drop(labels=['residence_area_type'], axis=1, inplace=True)
    # encode sourcing - remember to drop E when running prediction (multicollinearity)
    X['source_a'] = X['sourcing_channel'].isin(['A'])
    X['source_b'] = X['sourcing_channel'].isin(['B'])
    X['source_c'] = X['sourcing_channel'].isin(['C'])
    X['source_d'] = X['sourcing_channel'].isin(['D'])
    X['afford'] = X['income'] / (12 * X['premium'] )
    X.drop(labels=['sourcing_channel'], axis=1, inplace=True) 
    # Don't normalize binary variables

    # Fill missing data
    # Predict missing data later with more time!
    # X data has missing values in late, app_score
    X['late_3_6m'].fillna(X['late_3_6m'].median(),inplace=True)
    X['late_6_12m'].fillna(X['late_6_12m'].median(),inplace=True)
    X['late_12m'].fillna(X['late_12m'].median(),inplace=True)
    X['app_score'].fillna(X['app_score'].median(),inplace=True)
    X['inverse'] =  X['late_6_12m'] > (1/X['late_12m'])
    X['premium_differential'] = X['count_premiums_paid']/(X['age']-20) - X['late_3_6m'] - (1/2)* X['late_6_12m'] - (1/4) * X['late_12m']
    
    # Now use rescale_dict to rescale all variables
    X['trad_payment'] = (X['trad_payment'] - rescale_dict['trad_payment_med']) / rescale_dict['trad_payment_std']
    X['premium']  = (X['premium'] - rescale_dict['premium_med']) / rescale_dict['premium_std']
    X['late_3_6m'] = (X['late_3_6m'] - rescale_dict['late_3_6m_med']) / rescale_dict['late_3_6m_std']
    X['late_6_12m'] = (X['late_6_12m'] - rescale_dict['late_6_12m_med']) / rescale_dict['late_3_6m_std']
    X['late_12m'] = (X['late_12m'] - rescale_dict['late_12m_med']) / rescale_dict['late_12m_std']
    X['app_score'] = (X['app_score'] - rescale_dict['app_score_med']) / rescale_dict['app_score_std']
    X['premium_differential'] = (X['premium_differential'] - rescale_dict['premium_differential_med']) / rescale_dict['premium_differential_std']
    X['afford'] = (X['afford'] - rescale_dict['afford_med']) / rescale_dict['afford_std']
    
    print(X.columns)
    return X, y

In [None]:
fn='train_ZoGVYWq.csv'
X = pd.read_csv(fn)
X.rename(columns={
    'Income' : 'income',
    'age_in_days' : 'age',
    'perc_premium_paid_by_cash_credit' : 'trad_payment',
    'Count_3-6_months_late' : 'late_3_6m',
    'Count_6-12_months_late' : 'late_6_12m',
    'Count_more_than_12_months_late' : 'late_12m',
    'application_underwriting_score' : 'app_score',
    'no_of_premiums_paid' : 'count_premiums_paid'
    },
    inplace=True)
y = X['renewal'].copy()
X.drop(labels='renewal', axis=1, inplace=True)x
# Transform vars
# ln income
X['income'] = np.log(X['income'])
# binarize urban
X['urban'] = X['residence_area_type'].isin(['Urban'])
X.drop(labels=['residence_area_type'], axis=1, inplace=True)
# encode sourcing - remember to drop E when running prediction (multicollinearity)
X['source_a'] = X['sourcing_channel'].isin(['A'])
X['source_b'] = X['sourcing_channel'].isin(['B'])
X['source_c'] = X['sourcing_channel'].isin(['C'])
X['source_d'] = X['sourcing_channel'].isin(['D'])
X['source_e'] = X['sourcing_channel'].isin(['E'])
X['age'] = X['age'] / 365
X.drop(labels=['sourcing_channel'], axis=1, inplace=True)
# Fill missing data
# Predict missing data later with more time!
# X data has missing values in late, app_score
X['late_3_6m'].fillna(X['late_3_6m'].median(),inplace=True)
X['late_6_12m'].fillna(X['late_6_12m'].median(),inplace=True)
X['late_12m'].fillna(X['late_12m'].median(),inplace=True)
X['app_score'].fillna(X['app_score'].median(),inplace=True)
X.drop(labels=['source_e'], axis=1, inplace=True)

In [6]:
def get_accuracy(clf, x, y):
    preds =  clf.predict(x)
    return 1 - sum(np.abs(preds - y.values)) / len(preds)

In [7]:
def sgd(X, y):
    # Define classifier
    clf = SGDClassifier(fit_intercept=True, loss='log', penalty='elasticnet', max_iter=8)
    param_grid = {
    'alpha': [10 ** x for x in range(-6, 2)],
    'l1_ratio': [n * 0.05 for n in range(21)]}

    
    clf_grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc')
    pipeline = make_pipeline(StandardScaler(), clf_grid)
    pipeline.fit(X=X_train, y=y_train)
    best_clf = clf_grid.best_estimator_
    return best_clf, X_train, X_test, y_train, y_test

In [8]:
def rf(X, y):
    # Define classifier
    clf = RandomForestClassifier(criterion='gini')
    param_grid = {
    'n_estimators': [n*2 for n in range(5,10)],
    'max_depth': [n for n in range(4,8)],
    'min_samples_split':[256, 128],
    'min_samples_leaf':[81, 64]}
    
    
    clf_grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc')
    pipeline = make_pipeline(clf_grid)
    pipeline.fit(X=X, y=y)
    best_clf = clf_grid.best_estimator_
    return best_clf

In [104]:
def rf(X, y):
    # Define classifier
    clf = RandomForestClassifier(criterion='gini')
    param_grid = {
    'n_estimators': [n*2 for n in range(5,10)],
    'max_depth': [n for n in range(4,8)],
    'min_samples_split':[512, 256, 128],
    'min_samples_leaf':[81, 64, 32]}
    
    
    clf_grid = GridSearchCV(estimator=clf, cv=4, param_grid=param_grid, scoring='roc_auc')
    pipeline = make_pipeline(clf_grid)
    pipeline.fit(X=X, y=y)
    best_clf = clf_grid.best_estimator_
    return best_clf

In [103]:
def log(X, y):
    # Define classifier
    clf = LogisticRegression(penalty='l2')
    param_grid = {
    'n_estimators': [n*2 for n in range(5,10)],
    'max_depth': [n for n in range(4,8)],
    'min_samples_split':[256, 128],
    'min_samples_leaf':[81, 64]}
    
    
    clf_grid = GridSearchCV(estimator=clf, cv=5, param_grid=param_grid, scoring='roc_auc')
    pipeline = make_pipeline(StandardScaler(), clf_grid)
    pipeline.fit(X=X, y=y)
    best_clf = clf_grid.best_estimator_
    return best_clf, X_train, X_test, y_train, y_test

In [None]:
# best_sgd_clf, X_train, X_test, y_train, y_test =  sgd(X, y)
# sgd_acc = get_accuracy(best_sgd_clf, X_test, y_test)
# print(sgd_acc)

# Start Here

In [105]:
X, y, rescale_dict = feat_eng_train()
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)

Index(['trad_payment', 'age', 'income', 'late_3_6m', 'late_6_12m', 'late_12m',
       'app_score', 'count_premiums_paid', 'premium', 'urban', 'source_a',
       'source_b', 'source_c', 'source_d', 'afford', 'inverse',
       'premium_differential'],
      dtype='object')


In [106]:
best_rf_clf =  rf(X, y)
rf_acc = get_accuracy(best_rf_clf, X_test, y_test)
print(rf_acc)

KeyboardInterrupt: 

In [121]:
roc_auc_score(y, np.array([x[1] for x in best_rf_clf.predict_proba(X)]))

0.8367510921274528

In [14]:
best_rf_clf.score(X_test, y_test)

0.9387898216790223

In [15]:
X_data, y_data = feat_eng_test(fn='test_66516Ee.csv', rescale_dict=rescale_dict)

Index(['trad_payment', 'age', 'income', 'late_3_6m', 'late_6_12m', 'late_12m',
       'app_score', 'count_premiums_paid', 'premium', 'urban', 'source_a',
       'source_b', 'source_c', 'source_d', 'afford', 'inverse',
       'premium_differential'],
      dtype='object')


In [16]:
y_pred = np.array([x[1] for x in best_rf_clf.predict_proba(X_data)])

In [17]:
ids = pd.read_csv('test_66516Ee.csv')['id']

In [18]:
y_submit = pd.DataFrame(data={'id':ids, 'renewal':y_pred})

In [19]:
def p_e(incentives):
    return 20*(1-np.exp(np.negative(10*(1-np.exp(np.negative(incentives)/400)))/5))

In [None]:
X['premium']

In [88]:
incent = np.array(range(0,20000))

In [89]:
ppp = p_e(incent)

In [90]:
effort_df = pd.DataFrame({'incent':incent, 'ppp':ppp})

In [91]:
effort_df.head()

Unnamed: 0,incent,ppp
0,0,0.0
1,1,0.099626
2,2,0.198509
3,3,0.296656
4,4,0.394073


In [23]:
y_train = np.array([x[1] for x in best_rf_clf.predict_proba(X)])

In [81]:
X_part_2  = X[['premium', 'prob','max_prob_add']].copy()

In [82]:
X_part_2['premium'] = (X_part_2['premium'] * rescale_dict['premium_std']) + rescale_dict['premium_med']

In [93]:
for k in X_part_2.itertuples():
    print(k.prob)
    break

0.9602367856672638


In [100]:
incentives = []
for k in X_part_2.itertuples():
    calcs_cost = effort_df[effort_df['incent']< k.premium/3].copy()
    calcs_cost_2 = effort_df[effort_df['ppp']/100< k.max_prob_add].copy()
    calc_df = min([calcs_cost, calcs_cost_2], key=len)
    calc_df['expected_revenue'] = (k.premium - calc_df['incent']) * (calc_df['ppp'] + k.prob)
    incentives.append(calc_df.loc[calc_df['expected_revenue'].idxmax()]['incent'])

In [101]:
X_part_2['incent_guess'] = incentives

In [102]:
X_part_2.head()

Unnamed: 0,premium,prob,max_prob_add,incent_guess
0,3300.0,0.960237,0.039763,46.0
1,18000.0,0.980168,0.019832,21.0
2,3300.0,0.427062,0.572938,545.0
3,9600.0,0.977982,0.022018,24.0
4,9600.0,0.984136,0.015864,16.0


In [99]:
calc_df.loc[calc_df['expected_revenue'].idxmax()]['incent']

46.0

In [84]:
calc_df.shape

(46, 2)

In [85]:
calcs_cost.shape

(1099, 2)

In [86]:
calcs_cost_2.shape

(46, 2)

In [87]:
X_part_2.head()

Unnamed: 0,premium,prob,max_prob_add
0,3300.0,0.960237,0.039763
1,18000.0,0.980168,0.019832
2,3300.0,0.427062,0.572938
3,9600.0,0.977982,0.022018
4,9600.0,0.984136,0.015864


In [109]:
ids = pd.read_csv('test_66516Ee.csv')['id']
y_submit = pd.DataFrame(data={'id':ids, 'renewal':y_pred})

In [111]:
# y_submit.to_csv('halfway.csv', index=False)

In [125]:
# Append probs to X_data
X_data['prob'] = y_pred
X_data['max_prob_add'] = 1 - X_data['prob']
X_part_2  = X_data[['premium', 'prob','max_prob_add']].copy()
# Rescale premium
X_part_2['premium'] = (X_part_2['premium'] * rescale_dict['premium_std']) + rescale_dict['premium_med']

# Create incentives df
incent = np.array(range(0,20000))
ppp = p_e(incent)
effort_df = pd.DataFrame({'incent':incent, 'ppp':ppp})
# Find appropriate level of incentives for each customer, based on expected premium
incentives = []
for k in X_part_2.itertuples():
    calcs_cost = effort_df[effort_df['incent']< k.premium/3].copy()
    calcs_cost_2 = effort_df[effort_df['ppp']/100< k.max_prob_add].copy()
    calc_df = min([calcs_cost, calcs_cost_2], key=len)
    calc_df['expected_revenue'] = k.premium * (calc_df['ppp'] + k.prob) - calc_df['incent']
    incentives.append(calc_df.loc[calc_df['expected_revenue'].idxmax()]['incent'])
X_part_2['incent_guess'] = incentives
y_submit['incentives'] = incentives

In [126]:
y_submit

Unnamed: 0,id,renewal,incentives
0,649,0.963997,41.0
1,81136,0.976873,25.0
2,70762,0.841654,613.0
3,53935,0.970230,33.0
4,15476,0.960003,47.0
5,64797,0.953171,57.0
6,67412,0.862372,349.0
7,44241,0.864731,332.0
8,5069,0.970004,33.0
9,16615,0.982796,18.0


In [120]:
y_submit.to_csv('submit_' + datetime.datetime.now().strftime("%Y%m%d%H%M") + '.csv', index=False)

In [116]:
import datetime

In [124]:
svc_clf = SVC()