In [None]:
import numpy as np
import pandas as pd

import sklearn as sk
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score,average_precision_score,roc_curve,auc,precision_recall_curve
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.ensemble import BaggingClassifier 
from sklearn.calibration import CalibratedClassifierCV


%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.calibration import calibration_curve
from sklearn.externals import joblib

import xgboost as xgb
from scipy import interp

import random

from __future__ import division
from __future__ import print_function

from plotly import tools
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go
init_notebook_mode()

import colorlover as cl
from IPython.display import HTML

# Define useful functions

In [None]:
def get_importance(_bst, _importance_type):
    """Compute per-feature importance by parsing the booster's text dump.

    Parameters
    ----------
    _bst : object exposing ``get_dump(fmap, with_stats=...)`` that returns one
        text dump per tree.
    _importance_type : str
        'weight' counts how many split lines use each feature. Any other value
        (e.g. 'gain', 'cover') is read as ``<name>=<number>`` from every split
        line and averaged over the splits of the feature.

    Returns
    -------
    dict
        feature name -> number of splits ('weight') or mean statistic.
    """
    counting_only = _importance_type == 'weight'
    # the dump without statistics is cheaper and enough for plain counts
    tree_dumps = _bst.get_dump('', with_stats=not counting_only)
    stat_marker = _importance_type + '='

    split_counts = {}
    stat_totals = {}
    for tree_dump in tree_dumps:
        for node_line in tree_dump.split('\n'):
            bracket_parts = node_line.split('[')
            if len(bracket_parts) == 1:
                # no opening bracket: leaf line, nothing to count
                continue

            # "feature<threshold" sits between the brackets, statistics follow
            condition_and_rest = bracket_parts[1].split(']')

            if not counting_only:
                stat_value = float(
                    condition_and_rest[1].split(stat_marker)[1].split(',')[0])

            feature = condition_and_rest[0].split('<')[0]
            seen_before = feature in split_counts
            split_counts[feature] = split_counts[feature] + 1 if seen_before else 1

            if not counting_only:
                stat_totals[feature] = (
                    stat_totals[feature] + stat_value if seen_before else stat_value)

    if counting_only:
        return split_counts

    # average value (gain/cover) per feature
    for feature in stat_totals:
        stat_totals[feature] = stat_totals[feature] / split_counts[feature]
    return stat_totals

# For Targeting

# Read files and preparation before cross-validation

In [None]:
# Pipe-separated export; the two-character marker \N and the word NULL are read as missing.
# The marker is written as a raw string: "\N" only worked because Python 2 keeps
# unknown escapes verbatim, and it is a syntax error on Python 3.
dataset_modelA_clean = pd.read_csv("/group/jiannan/Data/dataset.csv", sep="|", na_values=[r"\N", "NULL"])

In [None]:
# Optionally drop the ModelB targets.
# The switch is off, so the columns are only removed once it is set to True.
new_set_flg = False

if new_set_flg:
    modelB_targets = [
        'target_rider_all_max',
        'target_rider_waiver_premium_max',
        'target_rider_health_max',
        'target_rider_accident_max',
        'target_rider_critical_illness_max',
        'target_rider_term_max',
    ]
    dataset_modelA_clean = dataset_modelA_clean.drop(modelB_targets, axis=1)


In [None]:
# Strip the "_max" suffix from the ModelA target columns.
target_renames = {
    'target_whole_life_max': 'target_whole_life',
    'target_endowment_max': 'target_endowment',
    'target_retirement_max': 'target_retirement',
    'target_short_term_saving_max': 'target_short_term_saving',
    'target_term_life_max': 'target_term_life',
    'target_universal_life_max': 'target_universal_life',
    'target_health_max': 'target_health',
    'target_all_max': 'target_all',
}
dataset_modelA_clean = dataset_modelA_clean.rename(columns=target_renames)

In [None]:
# Every ModelA target column: one per product plus the overall 'target_all' flag.
ALL_TARGETS = [
    'target_whole_life',
    'target_endowment',
    'target_retirement',
    'target_short_term_saving',
    'target_term_life',
    'target_universal_life',
    'target_health',
    'target_all',
]

In [None]:
# Feature names for the targeting model, read from the 'targeting_features' column of the CSV.
targeting_features_file = '/group/jiannan/Cols/modelA_targeting_features.csv'
# raw string: "\N" relied on Python 2 keeping unknown escapes and is a syntax error on Python 3
feature_df_targeting = pd.read_csv(targeting_features_file, na_values=[r"\N", "NULL"])

features_columns_targeting = feature_df_targeting.targeting_features.tolist()

print(len(features_columns_targeting))

In [None]:
# Feature names for the recommendation model, read from the 'reco_features' column of the CSV.
# The path gets its own name so that targeting_features_file keeps pointing at the
# targeting list instead of being silently rebound to the reco list.
reco_features_file = '/group/jiannan/Cols/modelA_reco_features.csv'
# raw string: "\N" relied on Python 2 keeping unknown escapes and is a syntax error on Python 3
feature_df_reco = pd.read_csv(reco_features_file, na_values=[r"\N", "NULL"])

features_columns_reco = feature_df_reco.reco_features.tolist()

print(len(features_columns_reco))

In [None]:
# Targeting data set: targeting features followed by every target column.
targeting_columns = features_columns_targeting + ALL_TARGETS
dataset_modelA_clean_targeting = dataset_modelA_clean[targeting_columns].copy()

# Recommendation data set: reco features followed by every target column.
reco_columns = features_columns_reco + ALL_TARGETS
dataset_modelA_clean_reco = dataset_modelA_clean[reco_columns].copy()

In [None]:
print(ALL_TARGETS)

# Mean of each target column, i.e. the share of positive rows for 0/1 targets.
# (The original printed the very same mean twice per target.)
for target in ALL_TARGETS:
    print("target proportion %s"%target)
    print(dataset_modelA_clean_targeting[target].mean())

# Start CV for xgb targeting

In [None]:
def run_cross_validation(_df, _classifier, _features_columns):
    """Customer-level shuffle-split cross-validation of the 'target_all' model.

    The unique ``svocmasterid`` values are split 70/30 ten times
    (``ShuffleSplit`` with ``random_state=0``); all rows of a customer fall on
    the same side of a split. For every fold the classifier is refit with
    early stopping and the function collects the validation probabilities, the
    ROC curve, the cumulative gain curve, the number of contacts needed to
    reach fixed numbers of cross-sellers, and the gain-based feature
    importances. ROC and cumulative-gain curves are plotted per fold together
    with their mean.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain 'svocmasterid', 'target_all' and all ``_features_columns``.
    _classifier : xgboost sklearn-style classifier
        ``fit`` must accept ``eval_metric``, ``eval_set`` and
        ``early_stopping_rounds``; ``booster()`` must return the fitted booster.
    _features_columns : sequence of str
        Columns given to the model. Must include 'svocmasterid', which is used
        for the split and dropped before fitting.

    Returns
    -------
    results_cv_targeting : pandas.DataFrame
        One row per validation row and fold:
        'svocmasterid', 'target_all', 'fold', 'prob_of_all'.
    feature_importances : pandas.DataFrame
        'feature', 'mean', 'std' of the gain over folds, sorted by 'mean'.
    nb_calls_cv : pandas.DataFrame
        Fold-averaged contact statistics, indexed by 'nb_calls_<k>' label.

    Notes
    -----
    Differences from the previous implementation:
    * fold membership comes straight from ``ShuffleSplit``; the extra reshuffle
      with the unseeded ``random.random()`` made every run irreproducible
      despite ``random_state=0``;
    * percentile boundaries use integer division (float slice bounds are
      rejected by numpy);
    * ``nb_contacts`` is the 1-based number of customers contacted (it used to
      be the 0-based rank, i.e. one contact short).
    """
    n_iter = 10
    target = 'target_all'
    prob_of = 'prob_of_all'
    xseller_goals = [100, 200, 500, 1000, 2000, 3000]
    goal_labels = ['nb_calls_%i' % goal for goal in xseller_goals] + ['nb_calls_all']

    # cross validation type can be changed here
    customer_id = _df.svocmasterid.unique()
    ss = sk.cross_validation.ShuffleSplit(len(customer_id), n_iter=n_iter, test_size=.3, random_state=0)
    model_columns = list(_features_columns) + [target]

    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros(len(mean_fpr))
    percentiles = np.arange(0, 101)
    mean_gain = np.zeros(len(percentiles))
    lift_at_10 = []

    # per-fold frames, concatenated once after the loop
    fold_results = []
    fold_nb_calls = []
    fold_importances = []

    fig = plt.figure(figsize=(6, 12))
    fig.subplots_adjust(bottom=-0.5, left=-0.5, top=0.5, right=1.5)
    ax_roc = fig.add_subplot(2, 2, 1)
    ax_gain = fig.add_subplot(2, 2, 2)

    print ('modeling started')

    for i, (train_index, valid_index) in enumerate(ss):

        train_customer_id = customer_id[train_index]
        valid_customer_id = customer_id[valid_index]

        train = _df.loc[_df.svocmasterid.isin(train_customer_id), model_columns].copy().reset_index(drop=True)
        valid = _df.loc[_df.svocmasterid.isin(valid_customer_id), model_columns].copy().reset_index(drop=True)

        # modeling#
        train_X = train.drop(['svocmasterid', target], axis=1)
        valid_X = valid.drop(['svocmasterid', target], axis=1)

        train_Y = np.array(train[target].astype(np.uint8))
        valid_Y = np.array(valid[target].astype(np.uint8))

        # NOTE(review): the validation fold drives early stopping and is then
        # scored, so the reported metrics are presumably optimistic - confirm.
        probas_ = _classifier.fit(train_X, train_Y, eval_metric='auc', eval_set=[(valid_X, valid_Y)],
                                  early_stopping_rounds=40).predict_proba(valid_X)
        scores = probas_[:, 1]

        fold_result = valid[['svocmasterid', target]].copy()
        fold_result['fold'] = i
        fold_result[prob_of] = scores
        fold_results.append(fold_result)

        ###############################################################################
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = sk.metrics.roc_curve(valid_Y, scores)
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = sk.metrics.auc(fpr, tpr)

        ax_roc.plot(fpr, tpr, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        ###############################################################################
        # cumulative gain per percentile of the population, lift at 10%
        ranked_labels = valid_Y[np.argsort(scores)[::-1]]
        positives = valid_Y.sum()
        # cum_tp[k] = number of positives among the k best-scored rows
        cum_tp = np.concatenate([[0], np.cumsum(ranked_labels, dtype=np.int64)])
        # number of rows in the top q percent, q = 0..100 (integer division on purpose)
        bounds = percentiles * valid_X.shape[0] // 100
        lift = 100.0 * cum_tp[bounds] / positives
        mean_gain += lift
        mean_gain[0] = 0.0
        lift_at_10.append(lift[10] / 10.)

        ax_gain.plot(percentiles, lift, label='Lift fold %d at 10 = %0.2f' % (i, lift[10] / 10.))
        print ('shuffle: %i, AUC: %f, lift at 10 percent: %f' % (i, roc_auc, lift[10] / 10.))

        ###############################################################################
        # Calculate nb calls to make
        ranked_targets = fold_result.sort_values(by=prob_of, ascending=False)[target].values
        cum_xsellers = np.cumsum(ranked_targets)
        total_xsellers = fold_result[target].sum()
        wanted_xsellers = xseller_goals + [total_xsellers]

        nb_contacts = []
        for wanted in wanted_xsellers:
            hits = np.flatnonzero(cum_xsellers == wanted)
            # position is 0-based, the number of customers contacted is position + 1;
            # NaN when the fold never reaches that many cross-sellers
            nb_contacts.append(hits[0] + 1 if len(hits) else np.nan)

        nb_calls = pd.DataFrame({'nb_contacts': nb_contacts}, index=goal_labels)
        nb_calls['total_population'] = fold_result.shape[0]
        nb_calls['total_X_sellers'] = total_xsellers
        nb_calls['nb_Xsellers'] = wanted_xsellers
        nb_calls['Xsell_rate'] = nb_calls.nb_Xsellers/nb_calls.nb_contacts
        nb_calls['Percentage_of_Xseller_found'] = nb_calls.nb_Xsellers/nb_calls.total_X_sellers
        nb_calls['Percentage_of_Population'] = nb_calls.nb_contacts/nb_calls.total_population
        nb_calls['Lift'] = nb_calls.Percentage_of_Xseller_found/nb_calls.Percentage_of_Population
        fold_nb_calls.append(nb_calls)

        ###############################################################################
        # average gain per feature for this fold
        gains = get_importance(_classifier.booster(), 'gain')
        fold_importance = pd.DataFrame({'feature': list(gains.keys()),
                                        'importance': list(gains.values())})
        fold_importance['fold'] = i
        fold_importances.append(fold_importance)

    results_cv_targeting = pd.concat(fold_results, ignore_index=True)
    nb_calls_cv = pd.concat(fold_nb_calls).reset_index().groupby('index').mean().sort_values(by='nb_Xsellers')

    feature_importances = pd.concat(fold_importances).groupby('feature')['importance'].agg(['mean', 'std'])
    feature_importances = feature_importances.sort_values(by='mean')
    feature_importances = feature_importances.reset_index()

    # mean ROC over folds
    mean_tpr /= n_iter
    mean_tpr[-1] = 1.0
    mean_auc = sk.metrics.auc(mean_fpr, mean_tpr)
    ax_roc.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    ax_roc.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    ax_roc.set_xlim([-0.05, 1.05])
    ax_roc.set_ylim([-0.05, 1.05])
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('ROC', fontsize=14)
    ax_roc.grid(True)
    ax_roc.legend(loc="lower right")

    # mean cumulative gain over folds
    mean_gain /= n_iter
    mean_gain[-1] = 100.0
    mean_lift10 = np.mean(lift_at_10)
    print('Mean AUC: %f, Mean lift at 10 percent: %f' % (mean_auc, mean_lift10))
    ax_gain.plot(percentiles, mean_gain, 'k--', label='Mean lift at 10 = %0.2f' % mean_lift10, lw=2)

    ax_gain.plot([0, 100], [0, 100], '--', color=(0.6, 0.6, 0.6))
    ax_gain.set_xlim([-5, 105])
    ax_gain.set_ylim([-5, 105])
    ax_gain.set_xlabel('Percentage of population')
    ax_gain.set_ylabel('Cumulative gain')
    ax_gain.set_title('Lift', fontsize=14)
    ax_gain.grid(True)
    ax_gain.legend(loc="lower right")

    plt.show()

    return results_cv_targeting, feature_importances, nb_calls_cv

In [None]:

# Binary targeting model and its cross-validated evaluation.
classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=6,
    n_estimators=420,
    learning_rate=0.05,
    max_delta_step=1,
    min_child_weight=25,
    gamma=0.1,
    scale_pos_weight=1,
    colsample_bytree=0.85,
    subsample=0.85,
    colsample_bylevel=0.85,
    nthread=14,
    seed=27,
)

results_cv_targeting, feature_importances, nb_calls_cv = run_cross_validation(
    dataset_modelA_clean_targeting, classifier, features_columns_targeting)

In [None]:
feature_importances_sort = feature_importances.sort_values(by='mean',ascending=False)

In [None]:
feature_importances_sort.to_csv('/home/jliu/xgb_feat_cv.txt',sep='|', index=False)

In [None]:
feature_importances_sort['relative_imp'] = feature_importances_sort['mean']/feature_importances_sort['mean'].sum()

In [None]:
feature_importances_top40 = feature_importances_sort[:40][::-1].reset_index(drop=True)

In [None]:
feature_importances_top40

In [None]:
# Horizontal bar chart of the relative importance of the top-40 features.
fig, ax = plt.subplots(figsize=(10, 20))
bar_positions = feature_importances_top40.index
relative_importance = feature_importances_top40['relative_imp']

ax.set_title("Top40 Most Important Feature importances for ModelA")
ax.barh(bar_positions, relative_importance,
        color='#348ABD', align="center", lw='3', edgecolor='#348ABD', alpha=0.6)
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_importances_top40['feature'], fontsize=12)
ax.set_ylim([-1, bar_positions.max()+1])
ax.set_xlim([0, relative_importance.max()*1.1])
plt.show()

In [None]:
# Display the contact statistics returned by run_cross_validation.
nb_calls_cv

# For Recommendation

# Transform target columns

In [None]:
# (rows, columns) of the recommendation data set
print(dataset_modelA_clean_reco.shape)

In [None]:
target = 'target_multi'

# Single source of truth: the position of a product in this list is its class code.
# The probability/target column lists and both maps are derived from it so that
# they cannot drift apart.
products = ['endowment', 'universal_life', 'retirement', 'term_life', 'health', 'short_term_saving', 'whole_life']
products_name = list(products)

probas_cols = ['prob_of_' + product for product in products]
target_cols = ['target_' + product for product in products]

# product -> class code and class code -> product
target_map = dict((product, code) for code, product in enumerate(products))
inv_target_map = dict((code, product) for code, product in enumerate(products))

In [None]:
# Keep only the rows with target_all == 1.
# The mask is taken from the frame being filtered; it used to come from
# dataset_modelA_clean and only worked through index alignment.
is_buyer = dataset_modelA_clean_reco.target_all == 1
dataset_modelA_reco = dataset_modelA_clean_reco.loc[is_buyer].copy().reset_index(drop=True)
print('before transformation', dataset_modelA_reco.shape)

In [None]:
#Start to transform the matrix to do multi-classifier

# One (svocmasterid, product) row per product flagged 1: a customer who bought
# several products gets as many rows. Built with a single concat instead of
# growing a DataFrame with append inside the loop.
dataset_modelA_reco_Y = pd.concat(
    [dataset_modelA_reco.loc[dataset_modelA_reco['target_%s' %(code)]==1, ['svocmasterid']]
                        .assign(target_multi=code)
     for code in products],
    ignore_index=True)

# Replace the target_* columns by the single multi-class target.
# This rebinds dataset_modelA_reco without its target_* columns, so the cell
# cannot be re-run without re-running the cell that creates dataset_modelA_reco.
feature_cols = [col for col in dataset_modelA_reco.columns.values if not col.startswith('target')]
dataset_modelA_reco = dataset_modelA_reco[feature_cols].merge(
    dataset_modelA_reco_Y, on='svocmasterid', how='left')

# product name -> class code
dataset_modelA_reco['target_multi'] = dataset_modelA_reco['target_multi'].map(str).map(target_map)

print('after transformation', dataset_modelA_reco.shape)

In [None]:
# distinct class codes present in the multi-class target
print(np.unique(dataset_modelA_reco.target_multi))
# rows per class code, missing values counted as well (dropna=False)
print(dataset_modelA_reco.target_multi.value_counts(dropna=False))

# What has been indeed bought

In [None]:
# Colour of each product. The colours used to be assigned by row position after
# sorting by count, which ties a colour to a rank rather than to a product and
# fails when fewer than seven products are present.
product_colors = {
    'endowment': 'rgb(140,86,75)',
    'universal_life': 'rgb(31,119,180)',
    'retirement': 'rgb(255,127,14)',
    'term_life': 'rgb(214,39,40)',
    'health': 'rgb(255,105,180)',
    'short_term_saving': 'rgb(44,160,44)',
    'whole_life': 'rgb(148,103,189)',
}

# number and percentage of rows per product, most bought first
products_bought = dataset_modelA_reco.target_multi.value_counts().reset_index()
products_bought.columns = ['product', 'nb_buy']
products_bought['product'] = products_bought['product'].apply(lambda x : inv_target_map[x] )
products_bought['p_buy']=products_bought.nb_buy/products_bought.nb_buy.sum()*100
products_bought['color'] = products_bought['product'].map(product_colors)
products_bought = products_bought.sort_values(by='nb_buy', ascending=False)

data = [go.Bar(
        x=products_bought['product'].astype(str).values,
        y=products_bought['nb_buy'].values,
        marker=dict(color=products_bought['color'].values))
       ]

layout = dict(
    title='Products bought by customer',
    height=600,
    xaxis=dict(autotick=False, ticks='outside', tick0=0, dtick=1, ticklen=8, tickwidth=1),
)

annotations = []

x_data = products_bought['product'].values
y_data = products_bought['p_buy'].values
y_pos = products_bought['nb_buy'].values

# Adding labels: the percentage is written on top of each bar
for xd, yd, ypos in zip(x_data, y_data, y_pos):
    annotations.append(dict(xref='x1', yref='y1', x=xd, y=ypos,
                            text='%.2f' % yd,
                            font=dict(size=16, color='rgb(150,54,3)'),
                            xanchor='center', yanchor='bottom',
                            showarrow=False,))

layout['annotations'] = annotations


fig=go.Figure(data=data, layout=layout)
iplot(fig)

# Adding weights

In [None]:
# One row per target_multi value with its number of rows (missing values included).
weights = pd.DataFrame(dataset_modelA_reco.target_multi.value_counts(dropna=False)).reset_index()
weights.columns = ['target_multi', 'count']
# NOTE(review): the weights are assigned by row position, i.e. in whatever order
# value_counts returned the classes (presumably most frequent first), not by class
# code - confirm this is the intended class/weight pairing. The assignment also
# requires exactly seven distinct values.
weights['weight'] = [1, 15, 15, 15, 10, 10, 1]

In [None]:
# attach the weight of its class to every row
dataset_modelA_reco = dataset_modelA_reco.merge(weights[['target_multi', 'weight']], on ='target_multi')

# Start CV for recommendation

In [None]:
def run_reco_cv(_df, _classifier , _features_columns, _target_cols, _probas_cols):
    """Customer-level shuffle-split cross-validation of the multi-class model.

    The unique ``svocmasterid`` values are split 75/25 ten times
    (``ShuffleSplit`` with ``random_state=0``). For every fold the classifier
    is fit on the training customers, using the 'weight' column as sample
    weight, and the class probabilities and predicted class of the validation
    rows are collected.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain 'svocmasterid', 'target_multi', 'weight' and all
        ``_features_columns``.
    _classifier : sklearn-style classifier
        ``fit`` must accept ``sample_weight``.
    _features_columns : sequence of str
        Columns given to the model. Must include 'svocmasterid', which is used
        for the split and dropped before fitting.
    _target_cols : sequence of str
        Names given to the one-hot columns of 'target_multi'.
    _probas_cols : sequence of str
        Names given to the columns returned by ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        One row per validation row and fold: 'svocmasterid', 'target_multi',
        'fold', the probability columns, 'prediction' and the one-hot targets.

    Notes
    -----
    Differences from the previous implementation: the probability columns come
    from the ``_probas_cols`` argument (the global ``probas_cols`` was read by
    mistake), and fold membership comes straight from ``ShuffleSplit`` - the
    extra reshuffle with the unseeded ``random.random()`` made runs
    irreproducible despite ``random_state=0``.
    """
    customer_id = _df.svocmasterid.unique()
    ss = sk.cross_validation.ShuffleSplit(len(customer_id), n_iter=10, test_size=.25, random_state=0)

    non_feature_columns = ['svocmasterid', 'target_multi', 'weight']
    model_columns = list(_features_columns) + ['target_multi', 'weight']

    # per-fold frames, concatenated once after the loop
    fold_results = []

    for i, (train_index, valid_index) in enumerate(ss):

        print ('shuffle %i'%(i))

        train_customer_id = customer_id[train_index]
        valid_customer_id = customer_id[valid_index]

        train = _df.loc[_df.svocmasterid.isin(train_customer_id), model_columns].copy().reset_index(drop=True)
        valid = _df.loc[_df.svocmasterid.isin(valid_customer_id), model_columns].copy().reset_index(drop=True)

        #modeling#
        train_X = train.drop(non_feature_columns, axis=1)
        valid_X = valid.drop(non_feature_columns, axis=1)

        train_Y = np.array(train['target_multi'].astype(np.uint8))

        _classifier.fit(train_X, train_Y, sample_weight = train['weight'])

        _probas = _classifier.predict_proba(valid_X)
        probabilities = pd.DataFrame(data=_probas, index=valid_X.index, columns=_probas_cols)

        _preds = _classifier.predict(valid_X)
        predictions = pd.DataFrame(data=_preds, index=valid_X.index, columns=['prediction'])

        fold_result = valid[['svocmasterid', 'target_multi']].copy()
        fold_result['fold'] = i
        fold_result = fold_result.join(probabilities, how='left')
        fold_result = fold_result.join(predictions, how='left')
        fold_results.append(fold_result)

    results_cv_multi = pd.concat(fold_results, ignore_index=True)

    # one-hot encode the true class; the renaming is positional, so every class
    # must be present and _target_cols must follow the sorted class order
    binary_targets = pd.get_dummies(results_cv_multi.target_multi)
    binary_targets.columns = _target_cols

    results_cv_multi = results_cv_multi.join(binary_targets,how='left')
    print ('Done!')
    return results_cv_multi

In [None]:
# Print the name of every column that still contains missing values.
has_null = dataset_modelA_reco.isnull().any().values
for col in dataset_modelA_reco.columns.values[has_null]:
    print(col)

In [None]:
# Multi-class recommendation model and its cross-validated predictions.
clf = xgb.XGBClassifier(
    objective="multi:softprob",
    max_depth=7,
    n_estimators=200,
    learning_rate=0.1,
    max_delta_step=0,
    min_child_weight=20,
    gamma=1,
    scale_pos_weight=1,
    colsample_bytree=0.85,
    subsample=0.85,
    colsample_bylevel=0.85,
    nthread=14,
    seed=27,
)

results_cv_multi = run_reco_cv(dataset_modelA_reco, clf, features_columns_reco, target_cols, probas_cols)

# Checking results

In [None]:
#### To check the performance results
# One row per customer: column-wise maximum over all of the customer's rows.
# groupby returns a new frame, so results_cv_multi itself is left untouched.
results_cv_multi_2 = results_cv_multi.groupby(['svocmasterid'], as_index=False).max()

def results_recommendation_cv(_results_test, _probas_cols, _target_cols):
    """Turn class probabilities into ranked product recommendations.

    Parameters
    ----------
    _results_test : pandas.DataFrame
        Must contain 'svocmasterid', all ``_probas_cols`` and all
        ``_target_cols``.
    _probas_cols : sequence of str
        Probability columns; each must be one of the seven 'prob_of_<product>'
        names known to this function (KeyError otherwise).
    _target_cols : sequence of str
        Target columns summed into 'nb_product_bought'.

    Returns
    -------
    pandas.DataFrame
        ``_results_test`` with 'recommendation1' ... 'recommendation<k>'
        (product names, highest probability first, k = len(_probas_cols)) and
        'nb_product_bought' appended. The recommendations are merged back on
        'svocmasterid'.
    """
    dict_reco = {'prob_of_endowment': 'endowment',
                 'prob_of_universal_life': 'universal_life',
                 'prob_of_retirement': 'retirement',
                 'prob_of_term_life': 'term_life',
                 'prob_of_health': 'health',
                 'prob_of_short_term_saving': 'short_term_saving',
                 'prob_of_whole_life': 'whole_life'}

    probas_cols = list(_probas_cols)
    product_names = np.array([dict_reco[col] for col in probas_cols], dtype=object)

    results_reco = _results_test[['svocmasterid'] + probas_cols].set_index(['svocmasterid'])

    # Rank in numpy: argsort is ascending, so reverse every row to get the most
    # probable product first. (Indexing a Series with a 2-D array, as done
    # before, is not supported by current pandas.)
    descending = np.argsort(results_reco.values.astype(float), axis=1)[:, ::-1]

    reco_cols = ['recommendation%i' % (i + 1) for i in range(len(probas_cols))]
    results_reco2 = pd.DataFrame(product_names[descending], index=results_reco.index, columns=reco_cols)
    results_reco2 = results_reco2.reset_index(drop=False)

    results_test2 = _results_test.merge(results_reco2, on=['svocmasterid'], how='left')

    results_test2['nb_product_bought'] = results_test2[list(_target_cols)].sum(axis=1)
    return results_test2

# add the ranked recommendation columns and 'nb_product_bought' to the per-customer results
results_cv_multi_2 = results_recommendation_cv(results_cv_multi_2, probas_cols, target_cols)


def results_product_bought(_df, _products_code, _products_name):
    """Add a 'product_bought' text column listing the products each row bought.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain 'nb_product_bought' and one 'target_<code>' column per
        entry of ``_products_code``. It is not modified.
    _products_code : sequence of str
        Product codes, used to find the 'target_<code>' columns.
    _products_name : sequence of str
        Display name of each code (same order); spaces are removed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``_df`` with 'product_bought' appended:
        * 'Nothing' when nb_product_bought == 0,
        * the bare product name when nb_product_bought == 1,
        * otherwise the names of the flagged products, each followed by a
          space, in ``_products_code`` order.

    Notes
    -----
    The text is built with plain string methods. The previous version called
    the unbound ``str.replace(x, ...)``, which raises TypeError for unicode
    input on Python 2, and concatenated strings through
    ``DataFrame.sum(axis=1)``.
    """
    product_labels = [name.replace(' ', '') for name in _products_name]

    _results_test = _df.copy()

    # one list of booleans per product: did the row buy it?
    bought_flags = [(_results_test['target_%s' % (code)] == 1).tolist() for code in _products_code]
    if bought_flags:
        joined = [''.join(label + ' ' for label, bought in zip(product_labels, row) if bought)
                  for row in zip(*bought_flags)]
    else:
        joined = [''] * len(_results_test)

    product_bought = []
    for text, nb_bought in zip(joined, _results_test['nb_product_bought'].tolist()):
        if nb_bought == 0:
            product_bought.append('Nothing')
        elif nb_bought == 1:
            # single product: no trailing separator
            product_bought.append(text.replace(' ', ''))
        else:
            product_bought.append(text)

    _results_test['product_bought'] = product_bought
    return _results_test

def results_reco_efficiency(_results_test, _print):
    """Flag the rows whose purchase matches one of the top-1/2/3 recommendations.

    Parameters
    ----------
    _results_test : pandas.DataFrame
        Must contain 'recommendation1', 'recommendation2', 'recommendation3'
        and the text column 'product_bought'. It is not modified.
    _print : int
        When equal to 1, print the hit count and rate for top-1, top-2, top-3.

    Returns
    -------
    pandas.DataFrame
        Copy of ``_results_test`` with boolean columns 'correct_reco1',
        'correct_reco2', 'correct_reco3'; 'correct_reco<k>' is True when any of
        the first k recommendations occurs in 'product_bought'.
    """
    effective_buy_product = _results_test.copy()

    product_bought = effective_buy_product['product_bought'].tolist()
    hit_so_far = [False] * len(product_bought)
    for k in (1, 2, 3):
        recommendations = effective_buy_product['recommendation%i' % k].tolist()
        # `in` on the text column is substring containment, exactly as before;
        # it is only exact as long as no product name is contained in another
        hit_so_far = [seen or (reco in bought)
                      for seen, reco, bought in zip(hit_so_far, recommendations, product_bought)]
        effective_buy_product['correct_reco%i' % k] = pd.Series(
            hit_so_far, index=effective_buy_product.index, dtype=bool)

    if _print == 1:
        # the three lines used to be identical; say which cut-off each one reports
        wording = {1: 'bought the product recommended',
                   2: 'bought one of the top 2 products recommended',
                   3: 'bought one of the top 3 products recommended'}
        for k in (1, 2, 3):
            hits = effective_buy_product['correct_reco%i' % k]
            print('out of %i customers, %i (%.2f perc.) %s'
                  % (effective_buy_product.shape[0], hits.sum(), hits.mean() * 100, wording[k]))

    return effective_buy_product


results_cv_multi_3 = results_product_bought(results_cv_multi_2, products, products_name)

# Percentage of customers for whom a bought product is among the top-1/2/3
# recommendations. (A placeholder DataFrame used to be built here and was
# overwritten on the next line; it has been removed.)
effective_buy_cv_multi = results_reco_efficiency(results_cv_multi_3, 1)
effective_buy_cv_multi = effective_buy_cv_multi[['correct_reco1','correct_reco2','correct_reco3']].mean()*100
effective_buy_cv_multi = effective_buy_cv_multi.reset_index()
effective_buy_cv_multi = effective_buy_cv_multi.rename(columns={'index':'nb_product_recommend', 0 : 'correct_reco_mean'})

effective_buy_cv_multi[['nb_product_recommend', 'correct_reco_mean']]

# Recommendation diversity

In [None]:
def figure_diversity(_df_diversity, _model_title):
    """Show a 2x2 grid of donut charts of how often each product is recommended.

    Parameters
    ----------
    _df_diversity : pandas.DataFrame
        Must contain 'product', 'color', 'reco1', 'reco2', 'reco3' and 'sum'.
    _model_title : str
        Title of the figure.

    The four pies show 'reco1' (top left), 'reco2' (top right), 'reco3'
    (bottom left) and 'sum' (bottom right). The figure is displayed with
    ``iplot``; nothing is returned.
    """
    diversity_stat = _df_diversity.copy()

    # (value column, x-domain, y-domain) of every pie
    pie_layout = [
        ('reco1', [0, .45], [.55, 1]),
        ('reco2', [.55, 1], [.55, 1]),
        ('reco3', [0, .45], [0, .45]),
        ('sum', [.55, 1], [0, .45]),
    ]
    # (caption, x, y) written in the hole of every pie
    captions = [
        ("1st Reco", 0.17, 0.79),
        ("2nd Reco", 0.83, 0.79),
        ("3rd Reco", 0.17, 0.21),
        ("Sum Reco", 0.84, 0.21),
    ]

    pies = []
    for column, x_domain, y_domain in pie_layout:
        pies.append({
            "values": diversity_stat[column].values,
            "labels": diversity_stat['product'].values,
            "name": column,
            "hole": .4,
            "type": "pie",
            "rotation": 90,
            "textinfo": "percent+value+label",
            "textposition": "inside",
            "marker": {'colors': diversity_stat['color'].values},
            "domain": {'x': x_domain, 'y': y_domain},
        })

    annotations = []
    for caption, x_pos, y_pos in captions:
        annotations.append({
            "font": {"size": 20},
            "showarrow": False,
            "text": caption,
            "x": x_pos,
            "y": y_pos,
        })

    fig = {
        "data": pies,
        "layout": {
            "title": _model_title,
            "width": 950,
            "height": 1000,
            "margin": go.Margin(l=0, r=0, b=100, t=100, pad=4),
            "showlegend": True,
            "legend": dict(x=0.43, y=0.5),
            "annotations": annotations,
        },
    }
    iplot(fig)

In [None]:
# Colour used for each product in the recommendation charts.
dict_color = {
    'universal_life': 'rgb(31,119,180)',
    'retirement': 'rgb(255,127,14)',
    'short_term_saving': 'rgb(44,160,44)',
    'term_life': 'rgb(214,39,40)',
    'whole_life': 'rgb(148,103,189)',
    'endowment': 'rgb(140,86,75)',
    'health': 'rgb(255,105,180)',
}

In [None]:
def _recommendation_counts(reco_col, count_col):
    """Rows of results_cv_multi_3 per product in reco_col, as columns ['product', count_col]."""
    counts = results_cv_multi_3.groupby([reco_col], as_index=False).count()[[reco_col, 'svocmasterid']]
    counts.columns = ['product', count_col]
    return counts


# how often each product is the 1st, 2nd and 3rd recommendation
reco1 = _recommendation_counts('recommendation1', 'reco1')
reco2 = _recommendation_counts('recommendation2', 'reco2')
reco3 = _recommendation_counts('recommendation3', 'reco3')

# one row per product with its colour and the three counts
recommend_info = pd.DataFrame(list(dict_color))
recommend_info.columns = ['product']
recommend_info['color'] = recommend_info['product'].map(dict_color)

for reco_counts in (reco1, reco2, reco3):
    recommend_info = recommend_info.merge(reco_counts, on='product', how='left')

# a product that was never recommended at a given rank counts as 0
recommend_info = recommend_info.fillna(0)
recommend_info['sum'] = recommend_info['reco1'] + recommend_info['reco2'] + recommend_info['reco3']

recommend_info

In [None]:
# draw the four donut charts (1st, 2nd, 3rd recommendation and their sum)
figure_diversity(recommend_info, "Recommendation Diversify")