title: Travel spend investigation pt1   
author: Fabio Schmidt-Fischbach   
date: 2020-04-01
region: EU
link: https://docs.google.com/presentation/d/11lt9yUq5WyNWrIGETxpXyMW6Wz3xsF0Bivmpt4aj5yY/edit?usp=sharing   
tags: cashback, travel, premium, memberships
summary: Here we aim to understand the travel spend of our customers for the upcoming cashback feature, namely who spends, how they spend and where they spend. We found that a lot of travel (69%) is short-term (likely an inflated number due to falsely labelled ecommerce transactions), while 11% of trips are longer than 14 days. 60% of users travel more than once and 8% of users travel more than 10 times a year. The top 10% of travellers in non-business premium tiers would receive > 20 Euros per year. The top 10% of Standard travellers (at the Metal cashback rate) would make > 8 Euros per year: this amounts to 45k users, i.e. big upgrade potential.

In [None]:
# Notebook bootstrap: upgrade the plotting/statistics packages in the running
# environment, switch to the project directory, then import everything once.
from IPython import get_ipython
import os 
# NOTE(review): every install uses `-U` without a version pin, so the package
# versions depend on when the notebook is run; altair and vega are requested
# more than once below.
get_ipython().system('pip install -U scikit-learn matplotlib')
get_ipython().system('pip install -U fuzzywuzzy')
get_ipython().system('pip install -U altair')
get_ipython().system('pip install -U seaborn')
get_ipython().system('pip install -U statsmodels')
get_ipython().system('pip install -U vega')
get_ipython().system('pip install -U altair vega_datasets notebook vega')
import os
# The project-local `utils` package is imported after this chdir - presumably it
# is resolved relative to /app; TODO confirm.
os.chdir('/app')
# Import Libraries
import pandas as pd
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
from utils.datalib_database import df_from_sql
from multiprocessing import Pool
import time
import gc 
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from scipy.stats import ks_2samp
import scipy.stats as stats
import statsmodels
from pylab import savefig
from statsmodels.sandbox.stats.multicomp import multipletests
from statsmodels.stats.proportion import proportions_ztest
import altair as alt
# Route Altair output through its 'notebook' renderer.
alt.renderers.enable('notebook')
# Wall-clock start of the run; no visible cell reads it back.
overall_start = time.time()

# how much people spend and how often 
--> by product and market




In [232]:
# Build one row per "travel period" from card-present transactions.
#
# Steps performed by the SQL below:
#   start_data - 'cardpresent' transactions of type 'PT' from the last 12 months,
#                each paired with the region_group of the same user's previous
#                transaction (lag ordered by `created`).
#   cluster    - running count of region_group changes per user; consecutive
#                transactions in the same region_group share a group_id. For a
#                user's first transaction the lag is NULL, so the comparison
#                falls through to the `else 0` branch.
#   counts     - per user / group_id / region_group / country / product:
#                volume in EUR (amount_cents_eur / 100), number of transactions,
#                first and last purchase, and the day difference between them.
#                Only region_group = 'inter' rows for users whose first KYC
#                completion is at least 12 months old are kept.
# Because product_id is a grouping key, a period whose transactions fall under
# two subscriptions comes back as one row per product.
# NOTE(review): 'inter' presumably marks spend outside the home region - TODO confirm.
# NOTE(review): `created` in the zrh_user_product join is unqualified; confirm it
# resolves to cluster.created.

query = """ 

with start_data as ( 
	select user_created, created, amount_cents_eur, region_group, lag(region_group,1) over(partition by user_created order by created) as region_lag 
	from dbt.zrh_card_transactions
	where card_tx_type in ('cardpresent')  and created >= current_date - interval '12 months'
        and type = 'PT'
),
cluster as ( 
select user_created, 
		created, 
		region_group, 
		region_lag,
        amount_cents_eur,
		sum(case when region_group != region_lag then 1 else 0 end )
			over(partition by user_created order by created rows unbounded preceding) as group_id   
from start_data 
),
counts as ( 
select  user_id, 
		group_id, 
		region_group, 
        country_tnc_legal, 
        zup.product_id, 
        sum(amount_cents_eur::float/100) as volume,
		count(1) as transactions, 
		max(cluster.created)  as last_purchase, 
		min(cluster.created)  as first_purchase,
		datediff(days, first_purchase, last_purchase) as days_spent 
from cluster 
inner join dbt.zrh_users on cluster.user_created = dbt.zrh_users.user_created
inner join dbt.zrh_user_product as zup on zup.user_created = cluster.user_created 
        and created between subscription_valid_from and subscription_valid_until 
where region_group = 'inter' and kyc_first_completed <= current_date - interval '12 months'
group by 1,2,3,4,5 
)

select *
from counts 
"""

# `df_from_sql` is a project helper; "redshiftreader" is presumably the name of
# the database connection it should use. The result is cached on disk and the
# later cells re-read this pickle instead of re-querying.
df = df_from_sql("redshiftreader", query)
df.to_pickle("travel_periods.pkl")

## how long do they go? 

In [234]:
# Cumulative distribution of trip length (in days) over all travel periods in `df`.

# Per-column non-null counts for every distinct `days_spent`; the `group_id`
# column is used as the trip counter below.
data = df.groupby("days_spent").agg("count").reset_index()

# Constant key so the grouped transform spans every row, i.e. yields the grand total.
data["group_column"] = 0
all_trips = data.groupby("group_column")["group_id"].transform("sum")
data["total"] = all_trips
data["perc"] = 100 * data["group_id"] / all_trips

# Running share, accumulated before the cut-off below so that long trips still
# contribute to the total.
data["cum_sum"] = data["perc"].cumsum()

# Shift by one so a period whose first and last purchase fall on the same day
# reads as 1 day, then hide trips of 30 days or more.
data["days_spent"] += 1
is_short_trip = data["days_spent"] < 30
data = data.loc[is_short_trip, :]

# Bar chart of the cumulative share per trip length.
days_axis = alt.X("days_spent:Q", axis=alt.Axis(title="Days on travel"))
share_axis = alt.Y("cum_sum:Q", axis=alt.Axis(title="Percentile"))
bars = (
    alt.Chart(data)
    .mark_bar()
    .encode(x=days_axis, y=share_axis)
    .properties(width=600, height=400, title="How long do people travel?")
)

bars

In [237]:
# Share of travel periods per trip-length bucket.

# Work on a copy: `data = df` only aliases the query result, so adding the
# bucket column below would also modify `df` for every cell that runs later.
data = df.copy()

# Buckets are defined on `days_spent` (whole days between the first and last
# purchase of a travel period, so 0 means a same-day trip).
# NOTE(review): the labels overlap at their edges ("1-2"/"2-3", "4-7"/"7-14");
# they are kept verbatim so the chart stays comparable with earlier renders.
days = data["days_spent"]
bucket_rules = [
    (days <= 1, "1-2 days"),
    ((days > 1) & (days <= 3), "2-3 days"),
    ((days > 3) & (days <= 7), "4-7 days"),
    ((days > 7) & (days <= 14), "7-14 days"),
    (days > 14, ">14 days"),
]
# Rows matching no rule (e.g. a missing `days_spent`) keep the empty label.
data["days_category"] = np.select(
    [rule for rule, _ in bucket_rules],
    [label for _, label in bucket_rules],
    default="",
)

# Travel periods per bucket; `group_id` is simply a column to count.
data = data.groupby("days_category")["group_id"].count().reset_index()
data["total"] = data["group_id"].sum()
data["perc"] = (100 * data["group_id"] / data["total"]).round(0)

## plotting results.

bars = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x=alt.X("days_category:O", axis=alt.Axis(title="Days on travel")),
        y=alt.Y("perc:Q", axis=alt.Axis(title="% of travels")),
    )
    .properties(width=600, height=400, title="How long do people travel?")
)

text = bars.mark_text(
    align="left",
    baseline="middle",
    dx=0,
    dy=-5,  # shift the label 5 px upwards so it sits above the top of the bar
).encode(text="perc:Q")

(bars + text).properties(height=900)

In [238]:
# How does trip length differ by product?
# Share of travel periods per trip-length bucket, one panel per product.

# `.copy()` detaches the filtered rows from `df`. Without it the column
# assignment below writes to a slice and pandas emits a SettingWithCopyWarning.
data = df.loc[df["product_id"] != "FLEX_ACCOUNT_MONTHLY", :].copy()

# Buckets are defined on `days_spent` (whole days between the first and last
# purchase of a travel period, so 0 means a same-day trip).
# NOTE(review): the labels overlap at their edges ("1-2"/"2-3", "4-7"/"7-14");
# they are kept verbatim so the chart stays comparable with earlier renders.
days = data["days_spent"]
bucket_rules = [
    (days <= 1, "1-2 days"),
    ((days > 1) & (days <= 3), "2-3 days"),
    ((days > 3) & (days <= 7), "4-7 days"),
    ((days > 7) & (days <= 14), "7-14 days"),
    (days > 14, ">14 days"),
]
# Rows matching no rule (e.g. a missing `days_spent`) keep the empty label.
data["days_category"] = np.select(
    [rule for rule, _ in bucket_rules],
    [label for _, label in bucket_rules],
    default="",
)

# Travel periods per (bucket, product); `group_id` is simply a column to count.
data = data.groupby(["days_category", "product_id"])["group_id"].count().reset_index()

# Percentages are relative to each product's own number of travel periods.
data["total"] = data.groupby("product_id")["group_id"].transform("sum")
data["perc"] = (100 * data["group_id"] / data["total"]).round(0)

## plotting results.

bars = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x=alt.X("days_category:O", axis=alt.Axis(title="Days on travel")),
        y=alt.Y("perc:Q", axis=alt.Axis(title="% of travels")),
        column="product_id:N",
    )
    .properties(width=100, height=400, title="How long do people travel?")
)

bars

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [239]:
# Tabular view of the bucket shares per product (first 50 rows).
data[["days_category", "product_id", "perc"]].head(50)

Unnamed: 0,days_category,product_id,perc
0,1-2 days,BLACK_CARD_MONTHLY,65.0
1,1-2 days,BUSINESS_BLACK,76.0
2,1-2 days,BUSINESS_CARD,75.0
3,1-2 days,METAL_CARD_MONTHLY,76.0
4,1-2 days,STANDARD,69.0
5,2-3 days,BLACK_CARD_MONTHLY,5.0
6,2-3 days,BUSINESS_BLACK,5.0
7,2-3 days,BUSINESS_CARD,4.0
8,2-3 days,METAL_CARD_MONTHLY,5.0
9,2-3 days,STANDARD,4.0


## how often? 

In [241]:
# How many travel periods does a traveller have per year? (cumulative share)
trips = pd.read_pickle("travel_periods.pkl")

# Count rows per (user, product) explicitly. A blanket `.agg("sum")` would also
# try to sum the timestamp and text columns of the frame, which newer pandas
# versions no longer drop silently.
# NOTE(review): a user who changed product during the year appears once per
# product here, each with a partial count - confirm this is intended for the
# overall (not per-product) distribution.
periods_per_user = (
    trips.groupby(["user_id", "product_id"]).size().reset_index(name="count")
)

# Number of (user, product) rows for every distinct period count.
data = periods_per_user.groupby("count").size().reset_index(name="freq")
data["perc"] = 100 * data["freq"] / data["freq"].sum()

# Cumulative share, accumulated before the display cut-off so the tail still
# counts towards 100%.
data = data[["count", "perc"]].assign(cum_sum=lambda d: d["perc"].cumsum())

# visualize: only show travellers with fewer than 20 periods
data = data.loc[data["count"] < 20, :]

bars = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x=alt.X("count:Q", axis=alt.Axis(title="Travel periods in a year")),
        y=alt.Y("cum_sum:Q", axis=alt.Axis(title="Percentile")),
    )
    .properties(width=600, height=400, title="How often do people travel in one year?")
)

bars

In [200]:
# First ten rows of the travel-frequency table currently held in `data`.
data.head(n=10)

Unnamed: 0,count,perc,cum_sum
0,1,40.588361,40.588361
1,2,18.441697,59.030058
2,3,10.745399,69.775457
3,4,6.689405,76.464862
4,5,4.842962,81.307823
5,6,3.502096,84.80992
6,7,2.60037,87.410289
7,8,1.992823,89.403112
8,9,1.584239,90.987352
9,10,1.26128,92.248632


In [242]:
# Travel periods per year, cumulative share, one line per product.
trips = pd.read_pickle("travel_periods.pkl")

# FLEX_ACCOUNT_MONTHLY is left out of the per-product comparison.
trips = trips.loc[trips["product_id"] != "FLEX_ACCOUNT_MONTHLY", :]

# Count rows per (user, product) explicitly instead of adding a helper column to
# a filtered slice and summing every column of the frame (timestamps and text
# columns included).
periods_per_user = (
    trips.groupby(["user_id", "product_id"]).size().reset_index(name="count")
)

# Number of users per (period count, product); rows come back sorted by count,
# which the cumulative sum below relies on.
data = (
    periods_per_user.groupby(["count", "product_id"]).size().reset_index(name="freq")
)
data["total"] = data.groupby("product_id")["freq"].transform("sum")
data["perc"] = 100 * data["freq"] / data["total"]

# Cumulative share within each product, computed before the display cut-off.
data["cum_sum"] = data.groupby("product_id")["perc"].cumsum()

data = data.loc[data["count"] < 20, :]

bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("count:Q", axis=alt.Axis(title="Travel periods in a year")),
        y=alt.Y("cum_sum:Q", axis=alt.Axis(title="Percentile")),
        color=alt.Color("product_id"),
    )
    .properties(width=600, height=400, title="How often do people travel in one year?")
)

bars

## how many transactions/volume

In [243]:
# How much do travellers spend per trip? (cumulative share of trips by volume)
from IPython.display import display

trips = pd.read_pickle("travel_periods.pkl")

# Bucket the trip volume to the nearest 100 (EUR, per the query's
# amount_cents_eur / 100) and count trips per bucket. Counting with `size()`
# avoids summing the frame's timestamp and text columns.
trips["volume"] = trips["volume"].round(-2)
data = trips.groupby("volume").size().reset_index(name="count")

# Relative and cumulative frequency, computed before the display cut-off.
data["total"] = data["count"].sum()
data["perc"] = 100 * data["count"] / data["total"]
data["percentile"] = data["perc"].cumsum()

data = data.loc[data["volume"] < 2000, :]

bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("volume:Q", axis=alt.Axis(title="Average travel spend per trip")),
        y=alt.Y("percentile:Q", axis=alt.Axis(title="Percentile")),
    )
    .properties(width=600, height=400, title="Travel spend per trip")
)

# Only the last expression of a cell is rendered automatically, so the chart has
# to be displayed explicitly when a table follows it.
display(bars)
data.loc[:, ["volume", "percentile"]].head(30)

In [3]:
# How many transactions does a trip contain? (cumulative share of trips)
trips = pd.read_pickle("travel_periods.pkl")

# Trips per distinct transaction count. `size()` counts rows directly instead of
# summing every column of the frame (timestamps and text columns included).
data = trips.groupby("transactions").size().reset_index(name="count")

# Relative and cumulative frequency, computed before the display cut-off.
data["total"] = data["count"].sum()
data["perc"] = 100 * data["count"] / data["total"]
data["percentile"] = data["perc"].cumsum()

data = data.loc[data["transactions"] < 100, :]

bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X(
            "transactions:Q", axis=alt.Axis(title="Number of transactions per trip")
        ),
        y=alt.Y("percentile:Q", axis=alt.Axis(title="Percentile")),
    )
    .properties(width=600, height=400, title="Transactions per trip")
)
bars

In [12]:
# Spend per trip, cumulative share, one line per trip-length bucket.
trips = pd.read_pickle("travel_periods.pkl")

# Buckets are defined on `days_spent` (whole days between the first and last
# purchase of a travel period, so 0 means a same-day trip).
# NOTE(review): the labels overlap at their edges ("1-2"/"2-3", "4-7"/"7-14");
# they are kept verbatim so the chart stays comparable with earlier renders.
days = trips["days_spent"]
bucket_rules = [
    (days <= 1, "1-2 days"),
    ((days > 1) & (days <= 3), "2-3 days"),
    ((days > 3) & (days <= 7), "4-7 days"),
    ((days > 7) & (days <= 14), "7-14 days"),
    (days > 14, ">14 days"),
]
# Rows matching no rule (e.g. a missing `days_spent`) keep the empty label.
trips["days_category"] = np.select(
    [rule for rule, _ in bucket_rules],
    [label for _, label in bucket_rules],
    default="",
)

# Bucket the trip volume to the nearest 10 and count trips per
# (volume, trip length). `size()` counts rows directly instead of summing every
# column of the frame (timestamps and text columns included).
trips["volume"] = trips["volume"].round(-1)
data = trips.groupby(["volume", "days_category"]).size().reset_index(name="count")

# Shares are relative to each trip-length bucket; rows are sorted by volume, so
# the grouped cumulative sum runs from low to high spend.
data["total"] = data.groupby("days_category")["count"].transform("sum")
data["perc"] = 100 * data["count"] / data["total"]
data["percentile"] = data.groupby("days_category")["perc"].cumsum()

data = data.loc[data["volume"] < 2000, :]

bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("volume:Q", axis=alt.Axis(title="Average travel spend per trip")),
        y=alt.Y("percentile:Q", axis=alt.Axis(title="Percentile")),
        color=alt.Color("days_category"),
    )
    .properties(width=600, height=400, title="Travel spend per trip - by trip length")
)

bars

In [14]:
# Cashback per trip at a flat 1% rate, cumulative share per trip-length bucket.
trips = pd.read_pickle("travel_periods.pkl")

# Flat cashback rate applied to every trip in this cell.
CASHBACK_RATE = 0.01

# Buckets are defined on `days_spent` (whole days between the first and last
# purchase of a travel period, so 0 means a same-day trip).
# NOTE(review): the labels overlap at their edges ("1-2"/"2-3", "4-7"/"7-14");
# they are kept verbatim so the chart stays comparable with earlier renders.
days = trips["days_spent"]
bucket_rules = [
    (days <= 1, "1-2 days"),
    ((days > 1) & (days <= 3), "2-3 days"),
    ((days > 3) & (days <= 7), "4-7 days"),
    ((days > 7) & (days <= 14), "7-14 days"),
    (days > 14, ">14 days"),
]
# Rows matching no rule (e.g. a missing `days_spent`) keep the empty label.
trips["days_category"] = np.select(
    [rule for rule, _ in bucket_rules],
    [label for _, label in bucket_rules],
    default="",
)

# Round to whole currency units, as the yearly cashback cells of this notebook
# do. The previous `round(-1)` snapped cashback to steps of 10, which left at
# most five distinct x positions under the `< 50` cut-off below.
trips["cashback"] = (trips["volume"] * CASHBACK_RATE).round(0)

# Trips per (cashback, trip length). `size()` counts rows directly instead of
# summing every column of the frame (timestamps and text columns included).
data = trips.groupby(["cashback", "days_category"]).size().reset_index(name="count")

# Shares are relative to each trip-length bucket; rows are sorted by cashback,
# so the grouped cumulative sum runs from low to high.
data["total"] = data.groupby("days_category")["count"].transform("sum")
data["perc"] = 100 * data["count"] / data["total"]
data["percentile"] = data.groupby("days_category")["perc"].cumsum()

data = data.loc[data["cashback"] < 50, :]

bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("cashback:Q", axis=alt.Axis(title="Travel cashback per trip")),
        y=alt.Y("percentile:Q", axis=alt.Axis(title="Percentile")),
        color=alt.Color("days_category"),
    )
    .properties(width=600, height=400, title="Cashback per trip - by trip length")
)

bars

## volume spend per year

In [246]:
# Total travel spend per user over the 12-month window (cumulative share of users).
trips = pd.read_pickle("travel_periods.pkl")

# Total spend per user, bucketed to the nearest 10. Only `volume` is summed; a
# blanket `.agg("sum")` would also try to add up timestamps and text columns.
yearly_spend = trips.groupby("user_id")["volume"].sum().round(-1).reset_index()

# Number of users per spend bucket.
data = yearly_spend.groupby("volume").size().reset_index(name="count")

# Relative and cumulative frequency, computed before the display cut-off.
data["total"] = data["count"].sum()
data["perc"] = 100 * data["count"] / data["total"]
data["percentile"] = data["perc"].cumsum()

data = data.loc[data["volume"] < 5000, :]

bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("volume:Q", axis=alt.Axis(title="Spend per year")),
        y=alt.Y("percentile:Q", axis=alt.Axis(title="% of users who spent $ abroad")),
    )
    .properties(width=600, height=400, title="Total travel spend per year")
)

bars

In [250]:
# Spend buckets lying above the 88th percentile of the table held in `data`.
above_p88 = data["percentile"] > 88
data.loc[above_p88, ["volume", "percentile"]].head(1000)

Unnamed: 0,volume,percentile
697,1160.0,88.084416
703,1170.0,88.279221
709,1180.0,88.409091
715,1190.0,88.571429
721,1200.0,88.636364
727,1210.0,88.798701
733,1220.0,88.863636
739,1230.0,89.025974
745,1240.0,89.123377
751,1250.0,89.253247


In [251]:
# Total travel spend per user over the 12-month window, one line per product.
trips = pd.read_pickle("travel_periods.pkl")

# Total spend per (user, product), bucketed to the nearest 10. Only `volume` is
# summed; a blanket `.agg("sum")` would also try to add up timestamps and text
# columns.
yearly_spend = (
    trips.groupby(["user_id", "product_id"])["volume"].sum().round(-1).reset_index()
)

# Number of users per (spend bucket, product); rows come back sorted by volume.
data = yearly_spend.groupby(["volume", "product_id"]).size().reset_index(name="count")

# Shares and cumulative shares are relative to each product's user count and are
# computed before the display cut-off.
data["total"] = data.groupby("product_id")["count"].transform("sum")
data["perc"] = 100 * data["count"] / data["total"]
data["percentile"] = data.groupby("product_id")["perc"].cumsum()

data = data.loc[data["volume"] < 5000, :]

bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("volume:Q", axis=alt.Axis(title="Spend per year")),
        y=alt.Y("percentile:Q", axis=alt.Axis(title="% of users who spent $ abroad")),
        color=alt.Color("product_id"),
    )
    .properties(width=600, height=400, title="Total travel spend per year")
)

bars

In [252]:
# Yearly cashback per user at the product-specific rates (card-present spend only).
trips = pd.read_pickle("travel_periods.pkl")

# Cashback rate per product; products not listed here earn nothing.
CASHBACK_RATES = {
    "METAL_CARD_MONTHLY": 0.01,
    "BUSINESS_CARD": 0.0025,
    "BLACK_CARD_MONTHLY": 0.005,
    "BUSINESS_BLACK": 0.0075,
}

# Total spend per (user, product). Only `volume` is summed; a blanket
# `.agg("sum")` would also try to add up timestamps and text columns.
yearly_spend = trips.groupby(["user_id", "product_id"])["volume"].sum().reset_index()

# Look the rate up in one step. This keeps `ic` a float column from the start
# instead of writing fractional values into an integer column.
yearly_spend["ic"] = yearly_spend["product_id"].map(CASHBACK_RATES).fillna(0.0)
yearly_spend["cashback"] = (yearly_spend["volume"] * yearly_spend["ic"]).round(0)

# Number of users per (cashback amount, product); rows come back sorted by cashback.
data = (
    yearly_spend.groupby(["cashback", "product_id"]).size().reset_index(name="count")
)

# Shares and cumulative shares are relative to each product's user count and are
# computed before the display cut-off.
data["total"] = data.groupby("product_id")["count"].transform("sum")
data["perc"] = 100 * data["count"] / data["total"]
data["percentile"] = data.groupby("product_id")["perc"].cumsum()

data = data.loc[data["cashback"] < 100, :]

bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("cashback:Q", axis=alt.Axis(title="Cashback per year")),
        y=alt.Y(
            "percentile:Q", axis=alt.Axis(title="% of users who get $ in cashback")
        ),
        color=alt.Color("product_id"),
    )
    .properties(
        width=600,
        height=400,
        title="Distribution of travel cashback in 12 months (no ecommerce)",
    )
)

bars

In [253]:
# Same travel-period construction as the card-present query earlier in this
# notebook, but over both 'ecomm' and 'cardpresent' transactions.
#
# Steps performed by the SQL below:
#   start_data - 'ecomm' and 'cardpresent' transactions of type 'PT' from the
#                last 12 months, each paired with the region_group of the same
#                user's previous transaction (lag ordered by `created`).
#   cluster    - running count of region_group changes per user; consecutive
#                transactions in the same region_group share a group_id.
#   counts     - per user / group_id / region_group / country / product:
#                volume in EUR (amount_cents_eur / 100), number of transactions,
#                first and last purchase, and the day difference between them.
#                Only region_group = 'inter' rows for users whose first KYC
#                completion is at least 12 months old are kept.
# NOTE(review): 'inter' presumably marks spend outside the home region - TODO confirm.
# NOTE(review): `created` in the zrh_user_product join is unqualified; confirm it
# resolves to cluster.created.

query = """ 

with start_data as ( 
	select user_created, created, amount_cents_eur, region_group, lag(region_group,1) over(partition by user_created order by created) as region_lag 
	from dbt.zrh_card_transactions
	where created >= current_date - interval '12 months' and type = 'PT' and card_tx_type in ('ecomm', 'cardpresent')
),
cluster as ( 
select user_created, 
		created, 
		region_group, 
		region_lag,
        amount_cents_eur,
		sum(case when region_group != region_lag then 1 else 0 end )
			over(partition by user_created order by created rows unbounded preceding) as group_id   
from start_data 
),
counts as ( 
select  user_id, 
		group_id, 
		region_group, 
        country_tnc_legal, 
        zup.product_id, 
        sum(amount_cents_eur::float/100) as volume,
		count(1) as transactions, 
		max(cluster.created)  as last_purchase, 
		min(cluster.created)  as first_purchase,
		datediff(days, first_purchase, last_purchase) as days_spent 
from cluster 
inner join dbt.zrh_users on cluster.user_created = dbt.zrh_users.user_created
inner join dbt.zrh_user_product as zup on zup.user_created = cluster.user_created 
        and created between subscription_valid_from and subscription_valid_until 
where region_group = 'inter' and kyc_first_completed <= current_date - interval '12 months'
group by 1,2,3,4,5 
)

select *
from counts 
"""

# Rebinds the notebook-level `df` to this result and caches it on disk; the
# cashback cells further down read "all_abroad_spend.pkl".
df = df_from_sql("redshiftreader", query)
df.to_pickle("all_abroad_spend.pkl")

In [254]:
# Yearly cashback per user at the product-specific rates (card-present and
# ecommerce spend).
spend = pd.read_pickle("all_abroad_spend.pkl")

# Cashback rate per product; products not listed here earn nothing.
CASHBACK_RATES = {
    "METAL_CARD_MONTHLY": 0.01,
    "BUSINESS_CARD": 0.0025,
    "BLACK_CARD_MONTHLY": 0.005,
    "BUSINESS_BLACK": 0.0075,
}

# Total spend per (user, product). Only `volume` is summed; a blanket
# `.agg("sum")` would also try to add up timestamps and text columns.
yearly_spend = spend.groupby(["user_id", "product_id"])["volume"].sum().reset_index()

# Look the rate up in one step. This keeps `ic` a float column from the start
# instead of writing fractional values into an integer column.
yearly_spend["ic"] = yearly_spend["product_id"].map(CASHBACK_RATES).fillna(0.0)
yearly_spend["cashback"] = (yearly_spend["volume"] * yearly_spend["ic"]).round(0)

# Number of users per (cashback amount, product); rows come back sorted by cashback.
data = (
    yearly_spend.groupby(["cashback", "product_id"]).size().reset_index(name="count")
)

# Shares and cumulative shares are relative to each product's user count and are
# computed before the display cut-off.
data["total"] = data.groupby("product_id")["count"].transform("sum")
data["perc"] = 100 * data["count"] / data["total"]
data["percentile"] = data.groupby("product_id")["perc"].cumsum()

data = data.loc[data["cashback"] < 100, :]

bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("cashback:Q", axis=alt.Axis(title="Cashback per year")),
        y=alt.Y(
            "percentile:Q", axis=alt.Axis(title="% of users who get $ in cashback")
        ),
        color=alt.Color("product_id"),
    )
    .properties(
        width=600,
        height=400,
        title="Distribution of travel cashback in 12 months (with ecommerce)",
    )
)

bars

## when do they travel

In [139]:
# Get MAU data: distinct active users per cohort month and product.

# For every row of dwh_cohort_months whose start_time lies within the last 12
# months, the query counts the distinct users that have an activity row of type
# '1_tx_35' covering the month's end_time, split by the product that is valid at
# that end_time.
# NOTE(review): both joins are left joins, so months without matching activity
# and users without a matching product row (NULL product_id) stay in the result.

query = """ 

select dwh.start_time, 
        product_id, 
        count(distinct act.user_created) as mau
from dwh_cohort_months as dwh 
left join dbt.zrh_user_activity_txn as act on dwh.end_time between act.activity_start and act.activity_end 
                                    and act.activity_type = '1_tx_35'
left join dbt.zrh_user_product as zup on zup.user_created = act.user_created and dwh.end_time between zup.subscription_valid_from and zup.subscription_valid_until 
where start_time >= current_date - interval '12 months' and start_time < current_date
group by 1,2 

"""

# Rebinds the notebook-level `df` to the MAU table and caches it on disk for the
# "when do they travel" cells.
df = df_from_sql("redshiftreader", query)
df.to_pickle("mau_timeseries.pkl")

In [212]:
# Travel periods started per month, normalised by that month's MAU.
trips = pd.read_pickle("travel_periods.pkl")
mau = pd.read_pickle("mau_timeseries.pkl")

# Monthly MAU across all products. Only the `mau` column is summed; a blanket
# `.agg("sum")` would also try to add up the timestamp and product columns.
mau["ym"] = pd.to_datetime(mau["start_time"]).dt.to_period("M")
mau_by_month = mau.groupby("ym")["mau"].sum().reset_index()

# A travel period is assigned to the month of its first purchase.
trips["ym"] = pd.to_datetime(trips["first_purchase"]).dt.to_period("M")
periods_by_month = trips.groupby("ym").size().reset_index(name="count")

# merge to mau (inner join: months missing on either side are dropped)
data = pd.merge(periods_by_month, mau_by_month, how="inner", on="ym")

# Periods are converted to strings so the chart can serialise them.
data["ym"] = data["ym"].astype(str)
data["periods_per_mau"] = data["count"] / data["mau"]


bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("ym:O", axis=alt.Axis(title="Month")),
        # The plotted value is a ratio (periods / MAU), not a percentage.
        y=alt.Y("periods_per_mau:Q", axis=alt.Axis(title="Travel periods per MAU")),
    )
    .properties(width=600, height=400, title="Travel periods by month")
)

bars

In [217]:
# Total travel periods started per month, one line per product.
trips = pd.read_pickle("travel_periods.pkl")
mau = pd.read_pickle("mau_timeseries.pkl")

mau["ym"] = pd.to_datetime(mau["start_time"]).dt.to_period("M")

# A travel period is assigned to the month of its first purchase. `size()`
# counts rows directly instead of summing every column of the frame (timestamps
# and text columns included).
trips["ym"] = pd.to_datetime(trips["first_purchase"]).dt.to_period("M")
periods = trips.groupby(["product_id", "ym"]).size().reset_index(name="count")

# merge to mau (inner join: only month/product pairs present in the MAU table
# are kept)
data = pd.merge(
    periods,
    mau[["ym", "product_id", "mau"]],
    how="inner",
    on=["ym", "product_id"],
)

# Periods are converted to strings so the chart can serialise them.
data["ym"] = data["ym"].astype(str)
# Kept for inspection; the chart below plots the absolute count.
data["periods_per_mau"] = data["count"] / data["mau"]


bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("ym:O", axis=alt.Axis(title="Month")),
        y=alt.Y("count:Q", axis=alt.Axis(title="Total travel periods")),
        color=alt.Color("product_id"),
    )
    .properties(width=600, height=400, title="Travel periods by month")
)

bars

In [219]:
# Total travel volume per month, one line per product.
trips = pd.read_pickle("travel_periods.pkl")
mau = pd.read_pickle("mau_timeseries.pkl")

mau["ym"] = pd.to_datetime(mau["start_time"]).dt.to_period("M")

# A travel period is assigned to the month of its first purchase. Only `volume`
# is summed; a blanket `.agg("sum")` would also try to add up timestamps and
# text columns.
trips["ym"] = pd.to_datetime(trips["first_purchase"]).dt.to_period("M")
volume_by_month = trips.groupby(["product_id", "ym"])["volume"].sum().reset_index()

# merge to mau (inner join: only month/product pairs present in the MAU table
# are kept)
data = pd.merge(
    volume_by_month,
    mau[["ym", "product_id", "mau"]],
    how="inner",
    on=["ym", "product_id"],
)

# Periods are converted to strings so the chart can serialise them.
data["ym"] = data["ym"].astype(str)
# Kept for inspection; the chart below plots the absolute volume.
data["volume_per_mau"] = data["volume"] / data["mau"]


bars = (
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X("ym:O", axis=alt.Axis(title="Month")),
        y=alt.Y("volume:Q", axis=alt.Axis(title="Total travel volume")),
        color=alt.Color("product_id"),
    )
    .properties(width=600, height=400, title="Total travel volume")
)

bars

In [256]:
## Get the current customer base that started early enough: user_id and
## product_id of every user flagged is_mau whose first KYC completion is at
## least 12 months old.
# NOTE(review): the query does not filter on product, so non-premium products
# are returned as well.

query = """ 

select user_id, product_id 
from dbt.zrh_users 
where kyc_first_completed is not null and kyc_first_completed <= current_date - interval '12 months'
        and is_mau = True

"""

# Rebinds the notebook-level `df` to the user/product map and caches it on disk
# for the cells below.
df = df_from_sql("redshiftreader", query)
df.to_pickle("user_product_map.pkl")

In [258]:
# Annual cashback for current MAUs, cumulative share of users per product.
users = pd.read_pickle("user_product_map.pkl")
spend = pd.read_pickle("all_abroad_spend.pkl")

# Cashback rate per product; products not listed here earn nothing.
CASHBACK_RATES = {
    "METAL_CARD_MONTHLY": 0.01,
    "BUSINESS_CARD": 0.0025,
    "BLACK_CARD_MONTHLY": 0.005,
    "BUSINESS_BLACK": 0.0075,
}

# compute cashback.
# Spend per (user, product at the time of spending). Only `volume` is summed; a
# blanket `.agg("sum")` would also try to add up timestamps and text columns.
spend_by_product = spend.groupby(["user_id", "product_id"])["volume"].sum().reset_index()
# Look the rate up in one step so `ic` is a float column from the start.
spend_by_product["ic"] = spend_by_product["product_id"].map(CASHBACK_RATES).fillna(0.0)
spend_by_product["cashback"] = spend_by_product["volume"] * spend_by_product["ic"]

# aggregate once more on the user level. (this accounts for product changes)
cashback_per_user = spend_by_product.groupby("user_id")["cashback"].sum().reset_index()

# merge to total users. Both keys are cast to str so they compare equal
# regardless of how each pickle stored them.
cashback_per_user["user_id"] = cashback_per_user["user_id"].astype(str)
users["user_id"] = users["user_id"].astype(str)

# Left join: users without any abroad spend stay in with zero cashback.
final = pd.merge(users, cashback_per_user, how="left", on="user_id")
final["cashback"] = final["cashback"].fillna(0).round(0)

# count how many people benefit to which extent. `size()` counts rows directly;
# summing the frame would also hit the string `user_id` column.
final = final.groupby(["cashback", "product_id"]).size().reset_index(name="count")

final["total"] = final.groupby("product_id")["count"].transform("sum")
final["perc"] = 100 * final["count"] / final["total"]

# cumulative sum within product (rows are sorted by cashback).
final["cum"] = final.groupby("product_id")["perc"].cumsum()

# Display only the products that carry a cashback rate, below 50 per year.
hidden_products = ["STANDARD", "FLEX_ACCOUNT_MONTHLY"]
final = final.loc[
    ~final["product_id"].isin(hidden_products) & (final["cashback"] < 50), :
]

bars = (
    alt.Chart(final)
    .mark_line()
    .encode(
        x=alt.X("cashback:Q", axis=alt.Axis(title="Cashback return (annual)")),
        y=alt.Y("cum:Q", axis=alt.Axis(title="Percentile")),
        color=alt.Color("product_id"),
    )
    .properties(
        width=600,
        height=400,
        title="Annual cashback for current MAUs (that are at least one year old)",
    )
)

bars

In [5]:
# Annual volume eligible for cashback for current MAUs, cumulative share per product.
users = pd.read_pickle("user_product_map.pkl")
spend = pd.read_pickle("all_abroad_spend.pkl")

# compute volume eligible for cashback by product
# Every product used a multiplier of 1 here, so the eligible volume is simply
# the spend itself. It is summed per (user, product) first and then per user,
# which accounts for product changes. Only `volume` is summed; a blanket
# `.agg("sum")` would also try to add up timestamps and text columns.
spend_by_product = spend.groupby(["user_id", "product_id"])["volume"].sum().reset_index()
volume_per_user = spend_by_product.groupby("user_id")["volume"].sum().reset_index()

# merge to total users. Both keys are cast to str so they compare equal
# regardless of how each pickle stored them.
volume_per_user["user_id"] = volume_per_user["user_id"].astype(str)
users["user_id"] = users["user_id"].astype(str)

# Left join: users without any abroad spend stay in with zero volume.
final = pd.merge(users, volume_per_user, how="left", on="user_id")

# Bucket to the nearest 10.
final["volume"] = final["volume"].fillna(0).round(-1)

# count how many people fall into each bucket. `size()` counts rows directly;
# summing the frame would also hit the string `user_id` column.
final = final.groupby(["volume", "product_id"]).size().reset_index(name="count")

final["total"] = final.groupby("product_id")["count"].transform("sum")
final["perc"] = 100 * final["count"] / final["total"]

# cumulative sum within product (rows are sorted by volume).
final["cum"] = final.groupby("product_id")["perc"].cumsum()

# STANDARD stays in this view; only FLEX_ACCOUNT_MONTHLY is hidden.
final = final.loc[final["product_id"] != "FLEX_ACCOUNT_MONTHLY", :]
final = final.loc[final["volume"] < 2000, :]

bars = (
    alt.Chart(final)
    .mark_line()
    .encode(
        x=alt.X(
            "volume:Q", axis=alt.Axis(title="Volume eligible for cashback (annual)")
        ),
        y=alt.Y("cum:Q", axis=alt.Axis(title="Percentile")),
        color=alt.Color("product_id"),
    )
    .properties(
        width=600,
        height=400,
        title="Volume eligible for cashback for current MAUs (annual)",
    )
)

bars

In [43]:
# Annual volume eligible for cashback for current MAUs, zoomed in on the
# 800-5000 range, followed by the STANDARD rows as a table.
from IPython.display import display

users = pd.read_pickle("user_product_map.pkl")
spend = pd.read_pickle("all_abroad_spend.pkl")

# compute volume eligible for cashback by product
# Every product used a multiplier of 1 here, so the eligible volume is simply
# the spend itself. It is summed per (user, product) first and then per user,
# which accounts for product changes. Only `volume` is summed; a blanket
# `.agg("sum")` would also try to add up timestamps and text columns.
spend_by_product = spend.groupby(["user_id", "product_id"])["volume"].sum().reset_index()
volume_per_user = spend_by_product.groupby("user_id")["volume"].sum().reset_index()

# merge to total users. Both keys are cast to str so they compare equal
# regardless of how each pickle stored them.
volume_per_user["user_id"] = volume_per_user["user_id"].astype(str)
users["user_id"] = users["user_id"].astype(str)

# Left join: users without any abroad spend stay in with zero volume.
final = pd.merge(users, volume_per_user, how="left", on="user_id")

# Bucket to the nearest 100.
final["volume"] = final["volume"].fillna(0).round(-2)

# count how many people fall into each bucket. `size()` counts rows directly;
# summing the frame would also hit the string `user_id` column.
final = final.groupby(["volume", "product_id"]).size().reset_index(name="count")

final["total"] = final.groupby("product_id")["count"].transform("sum")
final["perc"] = 100 * final["count"] / final["total"]

# cumulative sum within product (rows are sorted by volume); computed before the
# range filter so the percentiles refer to all users of the product.
final["cum"] = final.groupby("product_id")["perc"].cumsum()

# STANDARD stays in this view; only FLEX_ACCOUNT_MONTHLY is hidden.
final = final.loc[final["product_id"] != "FLEX_ACCOUNT_MONTHLY", :]
final = final.loc[(final["volume"] > 800) & (final["volume"] < 5000), :]

bars = (
    alt.Chart(final)
    .mark_line()
    .encode(
        x=alt.X(
            "volume:Q", axis=alt.Axis(title="Volume eligible for cashback (annual)")
        ),
        y=alt.Y("cum:Q", axis=alt.Axis(title="Percentile")),
        color=alt.Color("product_id"),
    )
    .properties(
        width=600,
        height=400,
        title="Volume eligible for cashback for current MAUs (annual)",
    )
)

# Only the last expression of a cell is rendered automatically, so the chart has
# to be displayed explicitly when a table follows it.
display(bars)

final.loc[
    final["product_id"] == "STANDARD", ["volume", "product_id", "cum", "count"]
].head(100)

Unnamed: 0,volume,product_id,cum,count
59,900.0,STANDARD,91.008515,3735
65,1000.0,STANDARD,91.752843,3375
71,1100.0,STANDARD,92.405867,2961
77,1200.0,STANDARD,92.990964,2653
83,1300.0,STANDARD,93.518059,2390
89,1400.0,STANDARD,93.962451,2015
95,1500.0,STANDARD,94.376187,1876
101,1600.0,STANDARD,94.754857,1717
107,1700.0,STANDARD,95.096255,1548
113,1800.0,STANDARD,95.410527,1425


In [37]:
# Number of current MAUs per product.
users = pd.read_pickle("user_product_map.pkl")

# Count rows per product directly. Adding a helper column of ones and summing
# the whole frame would also aggregate the `user_id` column.
users = users.groupby("product_id").size().to_frame("MAU")

users.head(10)

Unnamed: 0_level_0,MAU
product_id,Unnamed: 1_level_1
BLACK_CARD_MONTHLY,60754
BUSINESS_BLACK,7098
BUSINESS_CARD,60571
FLEX_ACCOUNT_MONTHLY,12296
METAL_CARD_MONTHLY,29591
STANDARD,453429
