title: Onboarding speed and user satisfaction   
author: Fabio Schmidt-Fischbach    
date: 2020-01-01
region: EU
link: https://docs.google.com/presentation/d/1anUHRFVdtRz5MpVS_yzI37KQJs2ryypNWYEaZv8DdUE/edit?usp=sharing    
tags: kyc, acquire, onboarding, ux, speed, yellow flow
summary: In this research, we found that the quicker safened gets back to our customers the more likely they are to turn MAUs. Initial yellow flow customers that get feedback early have KYC completion rates of up to 50%. After 2 hours this is 10pp lower. Suggestive evidence that speed matters at the start (first two hours) - results for wider time horizon (etc days) are much noisier/less clear.

In [None]:
from IPython import get_ipython
import os 
get_ipython().system('pip install -U scikit-learn matplotlib')
get_ipython().system('pip install -U fuzzywuzzy')
get_ipython().system('pip install -U altair')
get_ipython().system('pip install -U seaborn')
get_ipython().system('pip install -U statsmodels')
get_ipython().system('pip install -U vega')
get_ipython().system('pip install -U altair vega_datasets notebook vega')
import os
os.chdir('/app')
# Import Libraries
import pandas as pd
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
from utils.datalib_database import df_from_sql
from multiprocessing import Pool
import time
import gc 
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from scipy.stats import ks_2samp
import scipy.stats as stats
import statsmodels
from pylab import savefig
from statsmodels.sandbox.stats.multicomp import multipletests
from statsmodels.stats.proportion import proportions_ztest
import altair as alt
alt.renderers.enable('notebook')
overall_start = time.time()

# time from kyc init to kyc complete

In [112]:
# sample

query = """ 


with first_kycc  as ( 
    select user_created, kyc_first_initiated, kyc_first_completed, country_tnc_legal
    from dbt.zrh_users 
    where user_created between current_date - interval '12 months' 
            and current_date - interval '35 days'
), 
kyc_experience as ( 
    select first_kycc.*,
            datediff(minutes, kyc_first_initiated, kyc_first_completed) as time_diff, 
            count(1) as attempts
    from cmd_kyc_process as cmd 
    inner join first_kycc on first_kycc.user_created = cmd.user_created 
                                 and cmd.created <= kyc_first_completed
    group by 1,2,3,4,5 
),
final as ( 
select  kyc_experience.*,
        sum(n_ext_total_out+n_ext_total_in) as total,
        sum(amount_cents_ext_total_out+amount_cents_ext_total_in)::float/100 as volume
from kyc_experience 
left join dbt.zrh_txn_day as ztd on ztd.user_created = kyc_experience.user_created 
                                and ztd.txn_date between kyc_experience.kyc_first_completed::date 
                                                    and  kyc_experience.kyc_first_completed::date + interval '35 days'

group by 1,2,3,4,5,6 
)

select final.*,
    case when total > 0 then 1 else 0 end as activated 
from final 



"""

df = df_from_sql("redshiftreader", query)
df.to_pickle("kyc_time.pkl")

OperationalError: (psycopg2.OperationalError) could not connect to server: Connection refused
	Is the server running on host "n26-dwh.cfxsmcyyfcch.eu-central-1.redshift.amazonaws.com" (172.30.0.27) and accepting
	TCP/IP connections on port 5439?

(Background on this error at: http://sqlalche.me/e/e3q8)

In [207]:
data = pd.read_pickle("kyc_time.pkl")

data["c"] = 1
data = data.groupby("time_diff").agg("count").reset_index()

data["group"] = 1
data["total"] = data.groupby(["group"])["c"].transform("sum")

data["perc"] = 100 * data["c"] / data["total"]
data["percentile"] = data["perc"].cumsum()

data = data.loc[data["time_diff"] < 120, :]

alt.Chart(data).mark_line().encode(
    x=alt.X("time_diff:Q", axis=alt.Axis(title="Minutes for KYCc")),
    y=alt.Y("percentile:Q", axis=alt.Axis(title="Percentile")),
).properties(title="How many minutes do people take to do KYC?")

In [16]:
data.loc[:, ["time_diff", "percentile"]].head(6)

Unnamed: 0,time_diff,percentile
0,0,0.213098
1,1,0.213441
2,2,0.213869
3,3,0.25889
4,4,0.856507
5,5,3.833443
6,6,10.862664
7,7,19.826486
8,8,28.367715
9,9,35.857947


In [10]:
data = pd.read_pickle("kyc_time.pkl")

data = data.groupby(["time_diff"]).agg("count").reset_index()

data["total"] = data.groupby(["activated"])["attempts"].transform("sum")

data["perc"] = 100 * data["attempts"] / data["total"]
data["percentile"] = data["perc"].cumsum()

data = data.loc[data["time_diff"] < 100, :]

alt.Chart(data).mark_line().encode(
    x=alt.X("time_diff:Q", axis=alt.Axis(title="Minutes for KYCc")),
    y=alt.Y("percentile:Q", axis=alt.Axis(title="Percentile")),
    color=alt.Color("activated:N"),
).properties(
    title="Users that did a transaction within 35 days of KYCC vs those that did not"
)

In [199]:
data = pd.read_pickle("kyc_time.pkl")


data["time_diff"] = data["time_diff"]
data["time_diff"] = data["time_diff"].round(0)

data = data.groupby(["time_diff"]).agg("mean").reset_index()


data = data.loc[data["time_diff"] < 120, :]

original = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("time_diff:Q", axis=alt.Axis(title="Minutes until KYCc")),
        y=alt.Y("activated:Q", axis=alt.Axis(format="%", title="% ever MAU")),
    )
    .properties(title="Probability of ever turning MAU", width=400)
)


ma = (
    alt.Chart(data)
    .mark_line(color="red", size=3)
    .transform_window(rolling_mean="mean(activated)", frame=[-3, 3])
    .encode(x="time_diff:Q", y="rolling_mean:Q")
)

original + ma

In [200]:
data = pd.read_pickle("kyc_time.pkl")


data["time_diff"] = data["time_diff"] / 60
data["time_diff"] = data["time_diff"].round(0)

data = data.groupby(["time_diff"]).agg("mean").reset_index()


data = data.loc[data["time_diff"] < 80, :]

original = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("time_diff:Q", axis=alt.Axis(title="Hours until KYCc")),
        y=alt.Y("activated:Q", axis=alt.Axis(format="%", title="% ever MAU")),
    )
    .properties(title="Probability of ever turning MAU", width=400)
)


ma = (
    alt.Chart(data)
    .mark_line(color="red", size=3)
    .transform_window(rolling_mean="mean(activated)", frame=[-3, 3])
    .encode(x="time_diff:Q", y="rolling_mean:Q")
)

original + ma

In [204]:
data = pd.read_pickle("kyc_time.pkl")


data = data.loc[data["total"] > 0, :]
data["time_diff"] = data["time_diff"].round(0)

data = data.groupby(["time_diff"]).agg("mean").reset_index()


data = data.loc[data["time_diff"] < 120, :]

original = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("time_diff:Q", axis=alt.Axis(title="Minutes until KYCc")),
        y=alt.Y("total:Q", axis=alt.Axis(title="# transactions within first 35 days")),
    )
    .properties(
        title="# transactions within first 35 days (for those that ever turned MAU)",
        width=400,
    )
)


ma = (
    alt.Chart(data)
    .mark_line(color="red", size=3)
    .transform_window(rolling_mean="mean(total)", frame=[-3, 3])
    .encode(x="time_diff:Q", y="rolling_mean:Q")
)

original + ma

In [205]:
data = pd.read_pickle("kyc_time.pkl")


data = data.loc[data["total"] > 0, :]
data["time_diff"] = data["time_diff"].round(0)

data = data.groupby(["time_diff"]).agg("mean").reset_index()


data = data.loc[data["time_diff"] < 120, :]

original = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("time_diff:Q", axis=alt.Axis(title="Minutes until KYCc")),
        y=alt.Y("volume:Q", axis=alt.Axis(title="volume within first 35 days")),
    )
    .properties(
        title="volume within first 35 days (for those that ever turned MAU)", width=400
    )
)


ma = (
    alt.Chart(data)
    .mark_line(color="red", size=3)
    .transform_window(rolling_mean="mean(volume)", frame=[-3, 3])
    .encode(x="time_diff:Q", y="rolling_mean:Q")
)

original + ma

In [111]:
# of failed attempts

data = pd.read_pickle("kyc_time.pkl")


data = data.groupby("attempts").agg("mean").reset_index()

data = data.loc[data["attempts"] < 20, :]

alt.Chart(data).mark_line().encode(
    alt.X("attempts:Q", axis=alt.Axis(title="Number of attempts before completion")),
    alt.Y("total:Q", axis=alt.Axis(title="Number of transactions (35 days)")),
).properties(title="# attempts and avg. transactions")

# time to response (instead of time from kyci to kyc c).

In [116]:
# sample

query = """ 


with all_tries as ( 

    select user_created, 
            created, 
            date_add('ms',updated,'1970-01-01') as updated,
            row_number() over(partition by user_created order by created) as rn 
    from sb_verification 
    where 1=1 
        and status ilike 'VERIFICATION%'
        and created_on_partner is not null 
        and user_created >= current_date - interval '12 months'
        and user_created < current_date - interval '2 months'
), 
first_try as ( 
    select user_created, 
            created, 
            datediff(hours, created, updated) as hours 
    from all_tries 
    where rn =1  
),
final as ( 
select  kyc_experience.*,
        sum(n_ext_total_out+n_ext_total_in) as total,
        sum(amount_cents_ext_total_out+amount_cents_ext_total_in)::float/100 as volume
from first_try as kyc_experience 
left join dbt.zrh_txn_day as ztd on ztd.user_created = kyc_experience.user_created 
                                and ztd.txn_date between kyc_experience.created::date 
                                                    and  kyc_experience.created::date + interval '35 days'

group by 1,2,3
)

select final.*,
    case when total > 0 then 1 else 0 end as activated 
from final 



"""

df = df_from_sql("redshiftreader", query)
df.to_pickle("response_time.pkl")

{"message": "started", "db": "redshiftreader", "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 114, "funcName": "df_from_sql", "created": "20200224T113930", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "82f12c7b-199c-49de-bb5a-6247983a255d", "hostname": "172.19.0.4"}
{"message": "success", "db": "redshiftreader", "duration": 158.2244, "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 124, "funcName": "df_from_sql", "created": "20200224T114208", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "82f12c7b-199c-49de-bb5a-6247983a255d", "hostname": "172.19.0.4"}


In [121]:
data = pd.read_pickle("response_time.pkl")

data = data.groupby(["hours", "activated"]).agg("count").reset_index()

data["all"] = data.groupby("activated")["total"].transform("sum")
data["perc"] = 100 * data["total"] / data["all"]

data["percentile"] = data.groupby("activated")["perc"].cumsum()

data = data.loc[data["hours"] < 24, :]

alt.Chart(data).mark_line().encode(
    x=alt.X("hours:Q", axis=alt.Axis(title="Hours until first response")),
    y=alt.Y("percentile:Q", axis=alt.Axis(title="Percentile")),
    color=alt.Color("activated:N"),
).properties(
    title="Users that did a transaction within 35 days of KYCC vs those that did not"
)

In [194]:
data = pd.read_pickle("response_time.pkl")

data = data.groupby(["activated"]).agg("mean").reset_index()


alt.Chart(data).mark_bar().encode(
    x=alt.X("activated:N", axis=alt.Axis(title="% ever MAU")),
    y=alt.Y("hours:Q", axis=alt.Axis(title="Avg. hours until first response")),
).properties(title="How long do MAU vs non-MAUs take to get their first response?")

In [189]:
# estimate probit.

data = pd.read_pickle("response_time.pkl")

data = data.groupby(["hours"]).agg("mean").reset_index()

data = data.loc[data["hours"] < 48, :]


original = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("hours:Q", axis=alt.Axis(title="Hours until first feedback")),
        y=alt.Y("activated:Q", axis=alt.Axis(format="%", title="% ever MAU")),
    )
    .properties(title="Probability of ever turning MAU", width=400)
)


ma = (
    alt.Chart(data)
    .mark_line(color="red", size=3)
    .transform_window(rolling_mean="mean(activated)", frame=[-3, 3])
    .encode(x="hours:Q", y="rolling_mean:Q")
)

original + ma

In [161]:
# sample

query = """ 


with all_tries as ( 

    select user_created, 
            created, 
            date_add('ms',updated,'1970-01-01') as updated,
            row_number() over(partition by user_created order by created) as rn,
            status
    from sb_verification 
    where 1=1 
        and status ilike 'VERIFICATION%'
        and created_on_partner is not null 
        and user_created >= current_date - interval '12 months'
        and user_created < current_date - interval '2 months'
), 
first_try as ( 
    select user_created, 
            created, 
            datediff(minutes, created, updated) as minutes,
            status, 
            datediff(hours, created, updated) as hours
    from all_tries 
    where rn =1  
),
final as ( 
select  kyc_experience.*,
        sum(n_ext_total_out+n_ext_total_in) as total,
        sum(amount_cents_ext_total_out+amount_cents_ext_total_in)::float/100 as volume
from first_try as kyc_experience 
left join dbt.zrh_txn_day as ztd on ztd.user_created = kyc_experience.user_created 
                                and ztd.txn_date between kyc_experience.created::date 
                                                    and  kyc_experience.created::date + interval '35 days'

group by 1,2,3,4,5
)

select final.*,
    case when total > 0 then 1 else 0 end as activated 
from final 



"""

df = df_from_sql("redshiftreader", query)
df.to_pickle("response_time_minutes.pkl")

{"message": "started", "db": "redshiftreader", "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 114, "funcName": "df_from_sql", "created": "20200224T134119", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "82f12c7b-199c-49de-bb5a-6247983a255d", "hostname": "172.19.0.4"}
{"message": "success", "db": "redshiftreader", "duration": 199.634, "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 124, "funcName": "df_from_sql", "created": "20200224T134439", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "82f12c7b-199c-49de-bb5a-6247983a255d", "hostname": "172.19.0.4"}


In [190]:
# estimate probit.

data = pd.read_pickle("response_time_minutes.pkl")

data = data.groupby(["minutes"]).agg("mean").reset_index()

data = data.loc[data["minutes"] < 120, :]

original = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("minutes:Q", axis=alt.Axis(title="Minutes until first feedback")),
        y=alt.Y("activated:Q", axis=alt.Axis(format="%", title="% ever MAU")),
    )
    .properties(title="Probability of ever turning MAU", width=400)
)


ma = (
    alt.Chart(data)
    .mark_line(color="red", size=3)
    .transform_window(rolling_mean="mean(activated)", frame=[-3, 3])
    .encode(x="minutes:Q", y="rolling_mean:Q")
)

original + ma

# yellow flow

In [156]:
# sample

query = """ 


with all_tries as ( 

    select user_created, 
            created, 
            date_add('ms',updated,'1970-01-01') as updated,
            row_number() over(partition by user_created order by created) as rn,
            status
    from sb_verification 
    where 1=1 
        and status ilike 'VERIFICATION%'
        and created_on_partner is not null 
        and user_created >= current_date - interval '12 months'
        and user_created < current_date - interval '2 months'
), 
first_try as ( 
    select  user_created, 
            created, 
            datediff(minutes, created, updated) as minutes,
            status, 
            datediff(hours, created, updated) as hours 
    from all_tries 
    where rn =1 and status ilike 'VERIFICATION_MORE_INFO%'
)

select first_try.*, 
        case when kyc_first_completed is not null then 1 else 0 end as kyc_complete 
from first_try 
left join dbt.zrh_users using(user_created) 


"""

df = df_from_sql("redshiftreader", query)
df.to_pickle("yellow_flow.pkl")

{"message": "started", "db": "redshiftreader", "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 114, "funcName": "df_from_sql", "created": "20200224T133926", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "82f12c7b-199c-49de-bb5a-6247983a255d", "hostname": "172.19.0.4"}
{"message": "success", "db": "redshiftreader", "duration": 40.746, "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 124, "funcName": "df_from_sql", "created": "20200224T134007", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "82f12c7b-199c-49de-bb5a-6247983a255d", "hostname": "172.19.0.4"}


In [152]:
data = pd.read_pickle("yellow_flow.pkl")

data = data.groupby(["minutes"]).agg("mean").reset_index()

data = data.loc[data["minutes"] < 120, :]
data = data.loc[data["minutes"] > 1, :]

original = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X(
            "minutes:Q",
            axis=alt.Axis(title="Minutes until first feedback (first two hours)"),
        ),
        y=alt.Y("kyc_complete:Q", axis=alt.Axis(format="%", title="% KYC complete")),
    )
    .properties(title="% complete kyc after first try ended in yellow flow", width=400)
)


ma = (
    alt.Chart(data)
    .mark_line(color="red", size=3)
    .transform_window(rolling_mean="mean(kyc_complete)", frame=[-3, 3])
    .encode(x="minutes:Q", y="rolling_mean:Q")
)

original + ma

In [155]:
data = pd.read_pickle("yellow_flow.pkl")

data = data.groupby(["hours"]).agg("mean").reset_index()

data = data.loc[data["hours"] < 80, :]


original = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("hours:Q", axis=alt.Axis(title="Hours until first feedback")),
        y=alt.Y("kyc_complete:Q", axis=alt.Axis(format="%", title="% KYC complete")),
    )
    .properties(title="% complete kyc after first try ended in yellow flow", width=400)
)


ma = (
    alt.Chart(data)
    .mark_line(color="red", size=3)
    .transform_window(rolling_mean="mean(kyc_complete)", frame=[-3, 3])
    .encode(x="hours:Q", y="rolling_mean:Q")
)

original + ma

In [208]:
# by initial status

data = pd.read_pickle("response_time_minutes.pkl")

data = data.groupby(["hours", "status"]).agg("mean").reset_index()

data = data.loc[data["hours"] < 48, :]

data = data.loc[
    (data["status"] == "VERIFICATION_MORE_INFO_CORE")
    | (data["status"] == "VERIFICATION_MORE_INFO_DOCS")
    | (data["status"] == "VERIFICATION_SUCCESS_STORED"),
    :,
]

ma = (
    alt.Chart(data)
    .mark_line(color="red", size=3)
    .transform_window(rolling_mean="mean(activated)", frame=[-3, 3])
    .encode(
        x="hours:Q",
        y=alt.Y("activated:Q", axis=alt.Axis(format="%", title="% ever MAU")),
        color="status",
    )
    .properties(
        title="% MAU by status of first kyc request and processing time", width=500
    )
)

ma

In [188]:
# by initial status

data = pd.read_pickle("response_time_minutes.pkl")

data = data.groupby(["minutes", "status"]).agg("mean").reset_index()

data = data.loc[data["minutes"] < 120, :]


data = data.loc[
    (data["status"] == "VERIFICATION_MORE_INFO_DOCS")
    | (data["status"] == "VERIFICATION_SUCCESS_STORED"),
    :,
]

ma = (
    alt.Chart(data)
    .mark_line(color="red", size=3)
    .transform_window(rolling_mean="mean(activated)", frame=[-10, 10])
    .encode(
        x="minutes:Q",
        y=alt.Y("activated:Q", axis=alt.Axis(format="%", title="% ever MAU")),
        color="status",
    )
    .properties(title="% MAU by status of first kyc request and processing time")
)

ma

Unnamed: 0,minutes,status,hours,total,volume,activated
0,1,VERIFICATION_MORE_INFO_DOCS,0.000000,5.000000,158.050000,0.500000
3,2,VERIFICATION_MORE_INFO_DOCS,0.026987,10.277363,1967.060429,0.264325
7,3,VERIFICATION_MORE_INFO_DOCS,0.043032,9.907029,2277.238416,0.275819
9,3,VERIFICATION_SUCCESS_STORED,0.250000,8.333333,477.986667,0.750000
12,4,VERIFICATION_MORE_INFO_DOCS,0.063693,9.669769,1754.748426,0.274191
14,4,VERIFICATION_SUCCESS_STORED,0.053463,11.167126,1469.852733,0.609356
17,5,VERIFICATION_MORE_INFO_DOCS,0.083238,10.069488,1784.347129,0.265085
19,5,VERIFICATION_SUCCESS_STORED,0.079930,10.927669,1812.815880,0.603772
22,6,VERIFICATION_MORE_INFO_DOCS,0.096825,10.124588,1667.724181,0.255988
24,6,VERIFICATION_SUCCESS_STORED,0.099008,10.920834,1627.622662,0.589677
