title: Travel spend investigation Pt2    
author: Fabio Schmidt-Fischbach    
date: 2020-04-01
region: EU
link: https://docs.google.com/presentation/d/1lx45oV4j141gv7lxvBTxWL1rV9orJreRJP9GtAItCrk/edit?usp=sharing   
tags: memberships, cashback, travel, cluster, ux, user behavior
summary: This is a follow-up to our “Travel spend investigation pt1” research. Using a k-means model with 5 clusters, we found 4 sizeable groups. Cluster 0: “bureau-bob, who spends his limited 2-week vacations abroad.” Cluster 1: “backpack-betty, who goes on lengthy trips abroad.” Clusters 2-3: “moderate-mike, with little travelling activity.” Cluster 4: “carry-on-kiah, who travels frequently abroad for work/e-commerce?”

In [None]:
# Notebook bootstrap: install/upgrade third-party packages from inside the
# kernel, switch the working directory, then import everything used below.
# NOTE(review): the installs use `-U` without version pins, so a re-run may
# pull different releases than the ones that produced the saved outputs.
from IPython import get_ipython
import os 
get_ipython().system('pip install -U scikit-learn matplotlib')
get_ipython().system('pip install -U fuzzywuzzy')
get_ipython().system('pip install -U altair')
get_ipython().system('pip install -U seaborn')
get_ipython().system('pip install -U statsmodels')
get_ipython().system('pip install -U vega')
# NOTE(review): altair and vega are already upgraded by the lines above.
get_ipython().system('pip install -U altair vega_datasets notebook vega')
# NOTE(review): `os` is already imported at the top of this cell.
import os
# All pickle files below are read/written with relative paths, so they
# resolve against this directory.
os.chdir('/app')
# Import Libraries
import pandas as pd
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
from utils.datalib_database import df_from_sql
from multiprocessing import Pool
import time
import gc 
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from scipy.stats import ks_2samp
import scipy.stats as stats
import statsmodels
from pylab import savefig
from statsmodels.sandbox.stats.multicomp import multipletests
from statsmodels.stats.proportion import proportions_ztest
import altair as alt
# Select Altair's 'notebook' renderer for chart display.
alt.renderers.enable('notebook')
# Wall-clock start timestamp; it is not read again in the visible cells.
overall_start = time.time()
# StandardScaler is also re-imported by several later cells.
from sklearn.preprocessing import StandardScaler

In [None]:
# Extract one row per travel period and cache it as travel_periods.pkl.
#
# How the SQL works:
#   * start_data: card-present 'PT' card transactions from the last 12 months,
#     each with the previous transaction's region_group for the same user.
#   * cluster: a running count of region changes per user; consecutive
#     transactions in the same region share a group_id.
#   * counts: keeps only region_group = 'inter' rows for users whose KYC was
#     completed at least 12 months ago, and aggregates per user / group /
#     region / country / product: volume in EUR, number of transactions and
#     days between first and last purchase.
# NOTE(review): because product_id is a grouping key, a period that spans a
# subscription change would presumably be split into two rows — confirm.

query = """ 

with start_data as ( 
	select user_created, created, amount_cents_eur, region_group, lag(region_group,1) over(partition by user_created order by created) as region_lag 
	from dbt.zrh_card_transactions
	where card_tx_type in ('cardpresent')  and created >= current_date - interval '12 months'
        and type = 'PT'
),
cluster as ( 
select user_created, 
		created, 
		region_group, 
		region_lag,
        amount_cents_eur,
		sum(case when region_group != region_lag then 1 else 0 end )
			over(partition by user_created order by created rows unbounded preceding) as group_id   
from start_data 
),
counts as ( 
select  user_id, 
		group_id, 
		region_group, 
        country_tnc_legal, 
        zup.product_id, 
        sum(amount_cents_eur::float/100) as volume,
		count(1) as transactions, 
		max(cluster.created)  as last_purchase, 
		min(cluster.created)  as first_purchase,
		datediff(days, first_purchase, last_purchase) as days_spent 
from cluster 
inner join dbt.zrh_users on cluster.user_created = dbt.zrh_users.user_created
inner join dbt.zrh_user_product as zup on zup.user_created = cluster.user_created 
        and created between subscription_valid_from and subscription_valid_until 
where region_group = 'inter' and kyc_first_completed <= current_date - interval '12 months'
group by 1,2,3,4,5 
)

select *
from counts 
"""

# Run the query on the "redshiftreader" connection and persist the result so
# later cells can reload it without hitting the database again.
df = df_from_sql("redshiftreader", query)
df.to_pickle("travel_periods.pkl")

In [1]:
# Collapse the per-period rows to one row per user.
# Every pickled row is counted as one trip, so summing the constant 1 yields
# the number of trips per user.
keep_cols = ["user_id", "volume", "transactions", "days_spent", "trips"]

data = pd.read_pickle("travel_periods.pkl")
data["trips"] = 1
data = data[keep_cols].groupby("user_id").agg("sum").reset_index()

# Turn the per-user totals into per-trip averages.
for metric in ("days_spent", "volume", "transactions"):
    data[metric] = data[metric] / data["trips"]

NameError: name 'pd' is not defined

## standardize data

In [77]:
from sklearn.preprocessing import StandardScaler

# Feature matrix for clustering: the three per-trip averages, rescaled by
# StandardScaler so no single metric dominates the k-means distances.
feature_cols = ["volume", "transactions", "days_spent"]
X = StandardScaler().fit_transform(data[feature_cols])

# k means

In [78]:
# Fit one k-means model per candidate cluster count (1 through 9) and record
# each model's score on the training matrix, to pick the number of clusters.
numbers = range(1, 10)

models = []
for n_clusters in numbers:
    models.append(KMeans(n_clusters=n_clusters, random_state=0).fit(X))

scores = []
for fitted in models:
    scores.append(fitted.score(X))

In [79]:
# Pair every candidate cluster count with the score of its fitted model.
df = pd.DataFrame()
df["scores"] = scores
df["number"] = numbers
df = df[["number", "scores"]]

# Elbow chart: model score as a function of the number of clusters.
x_clusters = alt.X("number:Q", axis=alt.Axis(title="Number of clusters"))
y_score = alt.Y("scores:Q", axis=alt.Axis(title="Score"))

alt.Chart(df).mark_line().encode(x=x_clusters, y=y_score).properties(
    title="Number of clusters and model fit"
)

In [80]:
# Share of users assigned to each cluster.
# models[4] is the fit with n_clusters=5, because `numbers` starts at 1.
df = pd.DataFrame(models[4].predict(X), columns=["cluster"])

# Count users per cluster by summing a constant 1.
df["count"] = 1
df = df.groupby("cluster").agg("sum").reset_index()

# Express each cluster's size as a percentage of all clustered users.
df["total"] = df["count"].sum()
df["perc"] = 100 * df["count"] / df["total"]

# The chart shows cluster sizes, so the title names that rather than the
# model-fit wording used by the elbow chart.
alt.Chart(df).mark_bar().encode(
    x=alt.X("cluster:O", axis=alt.Axis(title="Cluster")),
    y=alt.Y("perc:Q", axis=alt.Axis(title="% of users")),
).properties(title="Share of users per cluster", width=400)

In [82]:
# Pairwise scatter plots of the per-trip metrics, coloured by cluster.

# Attach each user's label from the 5-cluster model (models[4]) to the
# per-user metrics; both frames share the same 0..n-1 index.
result = pd.concat(
    [data, pd.DataFrame(models[4].predict(X), columns=["cluster"])], axis=1, sort=False
)

# Down-sample to at most 5000 users for plotting (presumably to stay within
# Altair's default row limit — confirm). Capping n at the frame length avoids
# a ValueError on small data sets, and the fixed seed makes the plotted
# sample reproducible across runs.
result = result.sample(n=min(5000, len(result)), random_state=0)


def _cluster_scatter(frame, x, x_title, y, y_title):
    """Return a point chart of field `x` against field `y`, coloured by cluster."""
    return (
        alt.Chart(frame)
        .mark_point()
        .encode(
            x=alt.X(x, axis=alt.Axis(title=x_title)),
            y=alt.Y(y, axis=alt.Axis(title=y_title)),
            color="cluster:N",
        )
    )


# volume vs. transactions
vt = _cluster_scatter(
    result, "volume:Q", "Transaction volume", "transactions:Q", "Number of transactions"
)

# days vs. transactions
dt = _cluster_scatter(
    result, "days_spent:Q", "Days spent", "transactions:Q", "Number of transactions"
)

# days vs. volume
vd = _cluster_scatter(result, "days_spent:Q", "Days spent", "volume:Q", "Volume spent")

In [83]:
# Display the days-spent vs. volume scatter built in the previous cell.
vd

In [84]:
# Display the volume vs. number-of-transactions scatter.
vt

In [85]:
# Display the days-spent vs. number-of-transactions scatter.
dt

# use log versions of data

In [2]:
# Same per-user aggregation as before, but with log-transformed metrics.
keep_cols = ["user_id", "volume", "transactions", "days_spent", "trips"]

data = pd.read_pickle("travel_periods.pkl")
data["trips"] = 1
data = data[keep_cols].groupby("user_id").agg("sum").reset_index()

# Log of the per-trip value. The +1 is added to the per-user total before
# dividing by the trip count, so exp() of the stored value is
# (total + 1) / trips.
# NOTE(review): this is slightly above the plain per-trip mean — confirm it
# is intended.
for metric in ("days_spent", "volume", "transactions"):
    data[metric] = np.log((data[metric] + 1) / data["trips"])

NameError: name 'pd' is not defined

In [172]:
from sklearn.preprocessing import StandardScaler

# Feature matrix: the three log per-trip metrics plus the raw trip count,
# all rescaled with StandardScaler.
feature_cols = ["volume", "transactions", "days_spent", "trips"]
X = StandardScaler().fit_transform(data[feature_cols])

# Fit one k-means model per candidate cluster count (1 through 9) and keep
# each model's score for the elbow chart.
numbers = range(1, 10)

models = []
for n_clusters in numbers:
    models.append(KMeans(n_clusters=n_clusters, random_state=0).fit(X))

scores = []
for fitted in models:
    scores.append(fitted.score(X))

In [235]:
# Pair every candidate cluster count with the score of its fitted model.
# `numbers` is range(1, 10) and is passed straight to KMeans(n_clusters=...),
# so it already holds the true cluster counts. It must be plotted unshifted;
# adding 1 would label the 1-cluster model as "2", and so on.
df = pd.DataFrame({"number": list(numbers), "scores": scores})


# Elbow chart: model score as a function of the number of clusters.
alt.Chart(df).mark_line().encode(
    x=alt.X("number:Q", axis=alt.Axis(title="Number of clusters")),
    y=alt.Y("scores:Q", axis=alt.Axis(title="Score")),
).properties(title="Number of clusters and model fit")

In [236]:
# Share of users assigned to each cluster.
# models[2] is the fit with n_clusters=3, because `numbers` starts at 1.
df = pd.DataFrame(models[2].predict(X), columns=["cluster"])

# Count users per cluster by summing a constant 1.
df["count"] = 1
df = df.groupby("cluster").agg("sum").reset_index()

# Express each cluster's size as a percentage of all clustered users.
df["total"] = df["count"].sum()
df["perc"] = 100 * df["count"] / df["total"]

# The chart shows cluster sizes, so the title names that rather than the
# model-fit wording used by the elbow chart.
alt.Chart(df).mark_bar().encode(
    x=alt.X("cluster:O", axis=alt.Axis(title="Cluster")),
    y=alt.Y("perc:Q", axis=alt.Axis(title="% of users")),
).properties(title="Share of users per cluster", width=400)

In [185]:
# Pairwise scatter plots of the log-scale metrics, coloured by cluster.

# Attach each user's label from the 3-cluster model (models[2]) to the
# per-user metrics; both frames share the same 0..n-1 index.
result = pd.concat(
    [data, pd.DataFrame(models[2].predict(X), columns=["cluster"])], axis=1, sort=False
)

# Down-sample to at most 5000 users for plotting (presumably to stay within
# Altair's default row limit — confirm). Capping n at the frame length avoids
# a ValueError on small data sets, and the fixed seed makes the plotted
# sample reproducible across runs.
result = result.sample(n=min(5000, len(result)), random_state=0)


def _cluster_scatter(frame, x, x_title, y, y_title):
    """Return a point chart of field `x` against field `y`, coloured by cluster."""
    return (
        alt.Chart(frame)
        .mark_point()
        .encode(
            x=alt.X(x, axis=alt.Axis(title=x_title)),
            y=alt.Y(y, axis=alt.Axis(title=y_title)),
            color="cluster:N",
        )
    )


# volume vs. transactions
vt = _cluster_scatter(
    result,
    "volume:Q",
    "log Transaction volume",
    "transactions:Q",
    "log Number of transactions",
)

# days vs. transactions
dt = _cluster_scatter(
    result,
    "days_spent:Q",
    "log Days spent",
    "transactions:Q",
    "log Number of transactions",
)

# days vs. volume
vd = _cluster_scatter(
    result, "days_spent:Q", "log Days spent", "volume:Q", "log Volume spent"
)

# days vs. number of trips (trips is not log-transformed)
td = _cluster_scatter(result, "days_spent:Q", "log Days spent", "trips:Q", "Trips")

In [186]:
# Display the log volume vs. log number-of-transactions scatter.
vt

In [187]:
# Display the log days-spent vs. log number-of-transactions scatter.
dt

In [188]:
# Display the log days-spent vs. log volume scatter.
vd

In [189]:
# Display the log days-spent vs. trips scatter.
td

In [190]:
# Cumulative distribution of volume per trip within each cluster.

# Label every user with the 3-cluster model (models[2]).
cluster_labels = pd.DataFrame(models[2].predict(X), columns=["cluster"])
result = pd.concat([data, cluster_labels], axis=1, sort=False)

# Undo the log transform and bucket volume to the nearest 10.
result["volume"] = np.exp(result["volume"]).round(-1)

# Count users per (cluster, volume bucket) by summing a constant 1.
result["count"] = 1
result = result.groupby(["cluster", "volume"]).agg("sum").reset_index()

# Share of each bucket within its cluster, then the running total per cluster.
cluster_size = result.groupby("cluster")["count"].transform("sum")
result["total"] = cluster_size
result["perc"] = 100 * result["count"] / result["total"]
result["cum"] = result.groupby("cluster")["perc"].cumsum()

# Trim the long right tail for readability; the cumulative shares above are
# computed before this cut.
result = result[result["volume"] < 1500]

cdf_lines = alt.Chart(result).mark_line().encode(
    x=alt.X("volume:Q", axis=alt.Axis(title="Volume per trip")),
    y=alt.Y("cum:Q", axis=alt.Axis(title="Percentile")),
    color="cluster:N",
)
cdf_lines.properties(title="Volume spent per trip across groups")

In [191]:
# Cumulative distribution of transactions per trip within each cluster.

# Label every user with the 3-cluster model (models[2]).
cluster_labels = pd.DataFrame(models[2].predict(X), columns=["cluster"])
result = pd.concat([data, cluster_labels], axis=1, sort=False)

# Undo the log transform.
result["transactions"] = np.exp(result["transactions"])

# Count users per (cluster, transactions value) by summing a constant 1.
result["count"] = 1
result = result.groupby(["cluster", "transactions"]).agg("sum").reset_index()

# Share of each value within its cluster, then the running total per cluster.
cluster_size = result.groupby("cluster")["count"].transform("sum")
result["total"] = cluster_size
result["perc"] = 100 * result["count"] / result["total"]
result["cum"] = result.groupby("cluster")["perc"].cumsum()

# Trim the long right tail for readability; the cumulative shares above are
# computed before this cut.
result = result[result["transactions"] < 10]

cdf_lines = alt.Chart(result).mark_line().encode(
    x=alt.X("transactions:Q", axis=alt.Axis(title="Transactions per trip")),
    y=alt.Y("cum:Q", axis=alt.Axis(title="Percentile")),
    color="cluster:N",
)
cdf_lines.properties(title="Transactions per trip across groups")

In [192]:
# Cumulative distribution of days per trip within each cluster.

# Label every user with the 3-cluster model (models[2]).
cluster_labels = pd.DataFrame(models[2].predict(X), columns=["cluster"])
result = pd.concat([data, cluster_labels], axis=1, sort=False)

# Undo the log transform.
result["days_spent"] = np.exp(result["days_spent"])

# Count users per (cluster, days value) by summing a constant 1.
result["count"] = 1
result = result.groupby(["cluster", "days_spent"]).agg("sum").reset_index()

# Share of each value within its cluster, then the running total per cluster.
cluster_size = result.groupby("cluster")["count"].transform("sum")
result["total"] = cluster_size
result["perc"] = 100 * result["count"] / result["total"]
result["cum"] = result.groupby("cluster")["perc"].cumsum()

# Trim the long right tail for readability; the cumulative shares above are
# computed before this cut.
result = result[result["days_spent"] < 10]

cdf_lines = alt.Chart(result).mark_line().encode(
    x=alt.X("days_spent:Q", axis=alt.Axis(title="Days per trip")),
    y=alt.Y("cum:Q", axis=alt.Axis(title="Percentile")),
    color="cluster:N",
)
cdf_lines.properties(title="Days per trip across groups")

In [193]:
# Cumulative distribution of the number of trips within each cluster.

# Label every user with the 3-cluster model (models[2]).
cluster_labels = pd.DataFrame(models[2].predict(X), columns=["cluster"])
result = pd.concat([data, cluster_labels], axis=1, sort=False)

# Count users per (cluster, trip count) by summing a constant 1.
# `trips` was never log-transformed, so no exp() is needed here.
result["count"] = 1
result = result.groupby(["cluster", "trips"]).agg("sum").reset_index()

# Share of each trip count within its cluster, then the running total.
cluster_size = result.groupby("cluster")["count"].transform("sum")
result["total"] = cluster_size
result["perc"] = 100 * result["count"] / result["total"]
result["cum"] = result.groupby("cluster")["perc"].cumsum()

# Trim the long right tail for readability; the cumulative shares above are
# computed before this cut.
result = result[result["trips"] < 30]

cdf_lines = alt.Chart(result).mark_line().encode(
    x=alt.X("trips:Q", axis=alt.Axis(title="Trips per year")),
    y=alt.Y("cum:Q", axis=alt.Axis(title="Percentile")),
    color="cluster:N",
)
cdf_lines.properties(title="Trips per year across groups")

In [196]:
# Inspect the standardized feature matrix that is fed to k-means
# (built from the volume, transactions, days_spent and trips columns).

X

array([[ 0.1576524 , -0.04883327, -0.43104519, -0.31130684],
       [ 1.89340718,  1.27915318,  1.59577445, -0.31130684],
       [ 0.74096203, -0.04883327, -0.093543  ,  0.32900205],
       ...,
       [ 0.16741745,  0.87253201,  2.20849293, -0.47138407],
       [-2.47735481, -1.31691904, -1.7199126 ,  0.32900205],
       [ 0.26522118, -0.24121178, -0.2622941 , -0.15122962]])

In [233]:
# Per-cluster averages for the 3-cluster model (models[2]), shown as four
# side-by-side bar charts.
cluster_labels = pd.DataFrame(models[2].predict(X), columns=["cluster"])
result = pd.concat([data, cluster_labels], axis=1, sort=False)

# Undo the log transform before averaging.
for metric in ("transactions", "volume", "days_spent"):
    result[metric] = np.exp(result[metric])

result = result.groupby("cluster").agg("mean").reset_index()


def _cluster_bar(frame, field, y_title, panel_title):
    """Return a bar chart of one per-cluster average."""
    return (
        alt.Chart(frame)
        .mark_bar()
        .encode(
            x=alt.X("cluster:N", axis=alt.Axis(title="Cluster")),
            y=alt.Y(field, axis=alt.Axis(title=y_title)),
        )
        .properties(width=150, title=panel_title)
    )


volume = _cluster_bar(result, "volume:Q", "Average volume per trip", "Volume")
trips = _cluster_bar(
    result, "trips:Q", "Average number of trips per year", "Frequency"
)
transactions = _cluster_bar(
    result, "transactions:Q", "Average # transactions per trip", "Transactions"
)
days_spent = _cluster_bar(
    result, "days_spent:Q", "Average length of trip", "Length"
)

# Concatenate the four panels horizontally and display them.
chart = trips | days_spent | volume | transactions

chart

In [237]:
# Per-cluster averages for the 5-cluster model (models[4]), shown as four
# side-by-side bar charts.
cluster_labels = pd.DataFrame(models[4].predict(X), columns=["cluster"])
result = pd.concat([data, cluster_labels], axis=1, sort=False)

# Undo the log transform before averaging.
for metric in ("transactions", "volume", "days_spent"):
    result[metric] = np.exp(result[metric])

result = result.groupby("cluster").agg("mean").reset_index()


def _cluster_bar(frame, field, y_title, panel_title):
    """Return a bar chart of one per-cluster average."""
    return (
        alt.Chart(frame)
        .mark_bar()
        .encode(
            x=alt.X("cluster:N", axis=alt.Axis(title="Cluster")),
            y=alt.Y(field, axis=alt.Axis(title=y_title)),
        )
        .properties(width=150, title=panel_title)
    )


volume = _cluster_bar(result, "volume:Q", "Average volume per trip", "Volume")
trips = _cluster_bar(
    result, "trips:Q", "Average number of trips per year", "Frequency"
)
transactions = _cluster_bar(
    result, "transactions:Q", "Average # transactions per trip", "Transactions"
)
days_spent = _cluster_bar(
    result, "days_spent:Q", "Average length of trip", "Length"
)

# Concatenate the four panels horizontally and display them.
chart = trips | days_spent | volume | transactions

chart

In [239]:
# Share of users assigned to each cluster.
# models[4] is the fit with n_clusters=5, because `numbers` starts at 1.
df = pd.DataFrame(models[4].predict(X), columns=["cluster"])

# Count users per cluster by summing a constant 1.
df["count"] = 1
df = df.groupby("cluster").agg("sum").reset_index()

# Express each cluster's size as a percentage of all clustered users.
df["total"] = df["count"].sum()
df["perc"] = 100 * df["count"] / df["total"]

# The chart shows cluster sizes, so the title names that rather than the
# model-fit wording used by the elbow chart.
alt.Chart(df).mark_bar().encode(
    x=alt.X("cluster:O", axis=alt.Axis(title="Cluster")),
    y=alt.Y("perc:Q", axis=alt.Axis(title="% of users")),
).properties(title="Share of users per cluster", width=400)

# Run the model without single-transaction trips

In [3]:
# Per-user aggregation on the log scale, ignoring trips that consist of a
# single transaction.
keep_cols = ["user_id", "volume", "transactions", "days_spent", "trips"]

data = pd.read_pickle("travel_periods.pkl")
data["trips"] = 1
data = data[keep_cols]

# Keep only trips with more than one transaction; the filter runs before the
# per-user sum, so dropped trips do not count towards `trips` either.
data = data[data["transactions"] > 1]

data = data.groupby("user_id").agg("sum").reset_index()

# Log of (per-user total + 1) divided by the trip count.
for metric in ("days_spent", "volume", "transactions"):
    data[metric] = np.log((data[metric] + 1) / data["trips"])

NameError: name 'pd' is not defined

In [246]:
from sklearn.preprocessing import StandardScaler

# Feature matrix: the three log per-trip metrics plus the raw trip count.
X = data.loc[:, ["volume", "transactions", "days_spent", "trips"]]

# Standardizing the features
X = StandardScaler().fit_transform(X)

# Fit one k-means model per candidate cluster count (1 through 9) and score
# each on the training matrix.
numbers = range(1, 10)

models = [KMeans(n_clusters=i, random_state=0).fit(X) for i in numbers]
scores = [model.score(X) for model in models]

# Pair every candidate cluster count with the score of its fitted model.
# `numbers` already holds the true n_clusters values, so it is plotted
# unshifted; adding 1 would label the 1-cluster model as "2", and so on.
df = pd.DataFrame({"number": list(numbers), "scores": scores})


# Elbow chart: model score as a function of the number of clusters.
alt.Chart(df).mark_line().encode(
    x=alt.X("number:Q", axis=alt.Axis(title="Number of clusters")),
    y=alt.Y("scores:Q", axis=alt.Axis(title="Score")),
).properties(title="Number of clusters and model fit")

In [243]:
# Per-cluster averages for the 4-cluster model (models[3]), shown as four
# side-by-side bar charts.
cluster_labels = pd.DataFrame(models[3].predict(X), columns=["cluster"])
result = pd.concat([data, cluster_labels], axis=1, sort=False)

# Undo the log transform before averaging.
for metric in ("transactions", "volume", "days_spent"):
    result[metric] = np.exp(result[metric])

result = result.groupby("cluster").agg("mean").reset_index()


def _cluster_bar(frame, field, y_title, panel_title):
    """Return a bar chart of one per-cluster average."""
    return (
        alt.Chart(frame)
        .mark_bar()
        .encode(
            x=alt.X("cluster:N", axis=alt.Axis(title="Cluster")),
            y=alt.Y(field, axis=alt.Axis(title=y_title)),
        )
        .properties(width=150, title=panel_title)
    )


volume = _cluster_bar(result, "volume:Q", "Average volume per trip", "Volume")
trips = _cluster_bar(
    result, "trips:Q", "Average number of trips per year", "Frequency"
)
transactions = _cluster_bar(
    result, "transactions:Q", "Average # transactions per trip", "Transactions"
)
days_spent = _cluster_bar(
    result, "days_spent:Q", "Average length of trip", "Length"
)

# Concatenate the four panels horizontally and display them.
chart = trips | days_spent | volume | transactions

chart

In [244]:
# Share of users assigned to each cluster.
# models[3] is the fit with n_clusters=4, because `numbers` starts at 1.
df = pd.DataFrame(models[3].predict(X), columns=["cluster"])

# Count users per cluster by summing a constant 1.
df["count"] = 1
df = df.groupby("cluster").agg("sum").reset_index()

# Express each cluster's size as a percentage of all clustered users.
df["total"] = df["count"].sum()
df["perc"] = 100 * df["count"] / df["total"]

# The chart shows cluster sizes, so the title names that rather than the
# model-fit wording used by the elbow chart.
alt.Chart(df).mark_bar().encode(
    x=alt.X("cluster:O", axis=alt.Axis(title="Cluster")),
    y=alt.Y("perc:Q", axis=alt.Axis(title="% of users")),
).properties(title="Share of users per cluster", width=400)

## cross-check with demographics

In [79]:
## Monthly non-travel card activity of the current user base.
# users: everyone whose KYC was completed at least 12 months ago.
# activity: per user and cohort month, the count and volume of 'PT' card
# transactions with region_group 'intra' or 'dom'.
# NOTE(review): the recorded run of this cell ended in an OperationalError
# ("server closed the connection unexpectedly"), and non_travel_activity.pkl
# is not read by any visible cell.
# NOTE(review): `users` is joined to the month table only on
# end_time >= kyc_first_completed (no user key), so every user is paired with
# every qualifying month; possibly why the query is so heavy — confirm.
query = """ 

with users as ( 
select  user_id, 
        country_tnc_legal,
        product_id, 
        is_mau,
        age_group, 
        kyc_first_completed,
        user_created 
from dbt.zrh_users 
where kyc_first_completed <= current_date - interval '12 months' 
),
activity as ( 
select  users.*,
        dwh.month, 
        count(act.id) as transactions, 
        sum(amount_cents)::float/100 as volume 
from dwh_cohort_months as dwh 
left join users on dwh.end_time >= kyc_first_completed 
left join dbt.zrh_card_transactions as act 
    on act.user_created = users.user_created 
    and last_day(created) = end_time::date 
    and type = 'PT'
    and region_group in ('intra', 'dom') 
where end_time < current_date and start_time >= current_date - interval '12 months' 
group by 1,2,3,4,5,6,7,8
)


select * 
from activity 

"""

# Run on the "redshiftreader" connection and cache the result on disk.
df = df_from_sql("redshiftreader", query)
df.to_pickle("non_travel_activity.pkl")

{"message": "started", "db": "redshiftreader", "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 114, "funcName": "df_from_sql", "created": "20200226T162858", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "dc5f7d4c-5365-4687-8ec4-04ecc1110bff", "hostname": "172.19.0.4"}
{"message": "DB Exception Triggered", "db": "redshiftreader", "exception_message": "OperationalError('(psycopg2.OperationalError) server closed the connection unexpectedly\\n\\tThis probably means the server terminated abnormally\\n\\tbefore or while processing the request.\\n')", "exc_info": "Traceback (most recent call last):\n  File \"/usr/local/lib/python3.7/site-packages/sqlalchemy/engine/base.py\", line 1246, in _execute_context\n    cursor, statement, parameters, context\n  File \"/usr/local/lib/python3.7/site-packages/sqlalchemy/engi

OperationalError: (psycopg2.OperationalError) server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.

[SQL:  

with users as ( 
select  user_id, 
        country_tnc_legal,
        product_id, 
        is_mau,
        age_group, 
        kyc_first_completed,
        user_created 
from dbt.zrh_users 
where kyc_first_completed <= current_date - interval '12 months' 
),
activity as ( 
select  users.*,
        dwh.month, 
        count(act.id) as transactions, 
        sum(amount_cents)::float/100 as volume 
from dwh_cohort_months as dwh 
left join users on dwh.end_time >= kyc_first_completed 
left join dbt.zrh_card_transactions as act 
    on act.user_created = users.user_created 
    and last_day(created) = end_time::date 
    and type = 'PT'
    and region_group in ('intra', 'dom') 
where end_time < current_date and start_time >= current_date - interval '12 months' 
group by 1,2,3,4,5,6,7,8
)


select * 
from activity 

]
(Background on this error at: http://sqlalche.me/e/e3q8)

In [2]:
## Current user base: one row per user whose KYC was completed at least
## 12 months ago, with market, product, MAU flag and age group.
query = """ 

select  user_id, 
        country_tnc_legal,
        product_id, 
        is_mau,
        age_group, 
        kyc_first_completed,
        user_created 
from dbt.zrh_users 
where kyc_first_completed <= current_date - interval '12 months' 


"""

# Run on the "redshiftreader" connection and cache the result as
# user_base.pkl, which the demographic cells reload.
df = df_from_sql("redshiftreader", query)
df.to_pickle("user_base.pkl")

{"message": "started", "db": "redshiftreader", "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 114, "funcName": "df_from_sql", "created": "20200226T141801", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "dc5f7d4c-5365-4687-8ec4-04ecc1110bff", "hostname": "172.19.0.4"}
{"message": "success", "db": "redshiftreader", "duration": 363.3811, "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 124, "funcName": "df_from_sql", "created": "20200226T142405", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "dc5f7d4c-5365-4687-8ec4-04ecc1110bff", "hostname": "172.19.0.4"}


In [23]:
from sklearn.preprocessing import StandardScaler

# Re-fit the final 4-cluster model from the cached trips and store each
# user's cluster label next to their (log-scale) metrics.
keep_cols = ["user_id", "volume", "transactions", "days_spent", "trips"]
feature_cols = ["volume", "transactions", "days_spent", "trips"]

data = pd.read_pickle("travel_periods.pkl")
data["trips"] = 1
data = data[keep_cols]

# Ignore trips that consist of a single transaction.
data = data[data["transactions"] > 1]

# One row per user, then log of (total + 1) divided by the trip count.
data = data.groupby("user_id").agg("sum").reset_index()
for metric in ("days_spent", "volume", "transactions"):
    data[metric] = np.log((data[metric] + 1) / data["trips"])

# Standardize the features and fit k-means with a fixed seed.
X = StandardScaler().fit_transform(data[feature_cols])
model = KMeans(n_clusters=4, random_state=0).fit(X)

# Put the labels next to the user ids; both frames share a 0..n-1 index.
cluster_labels = pd.DataFrame(model.predict(X), columns=["cluster"])
result = pd.concat([data, cluster_labels], axis=1, sort=False)
result.to_pickle("travel_clusters.pkl")

In [25]:
# Join the cluster labels onto the full user base.
user_base = pd.read_pickle("user_base.pkl")
result = pd.read_pickle("travel_clusters.pkl")

# Left join keeps every user; users without a cluster row get the
# "No travel" label.
final = user_base.merge(result, how="left", on=["user_id"])
final["cluster"] = final["cluster"].fillna("No travel")

In [26]:
# Per-cluster averages for the stored final model, shown as four
# side-by-side bar charts.
result = pd.read_pickle("travel_clusters.pkl")

# Undo the log transform before averaging.
for metric in ("transactions", "volume", "days_spent"):
    result[metric] = np.exp(result[metric])

result = result.groupby("cluster").agg("mean").reset_index()


def _cluster_bar(frame, field, y_title, panel_title):
    """Return a bar chart of one per-cluster average."""
    return (
        alt.Chart(frame)
        .mark_bar()
        .encode(
            x=alt.X("cluster:N", axis=alt.Axis(title="Cluster")),
            y=alt.Y(field, axis=alt.Axis(title=y_title)),
        )
        .properties(width=150, title=panel_title)
    )


volume = _cluster_bar(result, "volume:Q", "Average volume per trip", "Volume")
trips = _cluster_bar(
    result, "trips:Q", "Average number of trips per year", "Frequency"
)
transactions = _cluster_bar(
    result, "transactions:Q", "Average # transactions per trip", "Transactions"
)
days_spent = _cluster_bar(
    result, "days_spent:Q", "Average length of trip", "Length"
)

# Concatenate the four panels horizontally and display them.
chart = trips | days_spent | volume | transactions

chart

In [27]:
# Demographics first: what share of the current MAU base falls in each cluster?
# NOTE(review): pandas groupby leaves out rows with a missing value in any
# key by default — confirm no users are lost to null age_group/product_id.
segment_keys = [
    "user_id",
    "country_tnc_legal",
    "product_id",
    "is_mau",
    "age_group",
    "cluster",
]
final = final.groupby(segment_keys).size().reset_index()
final = final.rename(columns={0: "count"})

# Keep monthly active users only.
final = final[final["is_mau"] == True]

data = final.groupby("cluster").agg("sum").reset_index()

# Constant key so the transform yields the grand total on every row.
data["group"] = 1
data["total"] = data.groupby("group")["count"].transform("sum")

data["perc"] = 100 * data["count"] / data["total"]

share_bars = alt.Chart(data).mark_bar().encode(
    x=alt.X("cluster:O", axis=alt.Axis(title="Cluster")),
    y=alt.Y("perc:Q", axis=alt.Axis(title="% of users")),
)
share_bars.properties(title="Number of clusters and % of currrent MAU base", width=400)

In [39]:
# Cluster mix of the current MAU base within each product, one facet per product.
segment_keys = [
    "user_id",
    "country_tnc_legal",
    "product_id",
    "is_mau",
    "age_group",
    "cluster",
]
final = final.groupby(segment_keys).size().reset_index()
final = final.rename(columns={0: "count"})

# Keep monthly active users only.
final = final[final["is_mau"] == True]

data = final.groupby(["cluster", "product_id"]).agg("sum").reset_index()

data["group"] = 1
# Denominator is the product's total, so percentages add up to 100 per product.
data["total"] = data.groupby("product_id")["count"].transform("sum")

data["perc"] = 100 * data["count"] / data["total"]

share_bars = alt.Chart(data).mark_bar().encode(
    x=alt.X("cluster:O", axis=alt.Axis(title="Cluster")),
    y=alt.Y("perc:Q", axis=alt.Axis(title="% of users")),
)
share_bars = share_bars.properties(
    title="Number of clusters and % of currrent MAU base", width=200
)
share_bars.facet(facet="product_id:N", columns=3)

In [51]:
# Age-group mix of the current MAU base within each cluster.
segment_keys = [
    "user_id",
    "country_tnc_legal",
    "product_id",
    "is_mau",
    "age_group",
    "cluster",
]
final = final.groupby(segment_keys).size().reset_index()
final = final.rename(columns={0: "count"})

# Keep monthly active users only.
final = final[final["is_mau"] == True]

data = final.groupby(["cluster", "age_group"]).agg("sum").reset_index()

data["group"] = 1
# Denominator is the cluster's total, so percentages add up to 100 per cluster.
data["total"] = data.groupby("cluster")["count"].transform("sum")

data["perc"] = 100 * data["count"] / data["total"]

age_lines = alt.Chart(data).mark_line().encode(
    x=alt.X("age_group:O", axis=alt.Axis(title="Age group"), stack="zero"),
    y=alt.Y("perc:Q", axis=alt.Axis(title="% of users")),
    color="cluster:N",
)
age_lines.properties(title="Number of clusters and % of currrent MAU base", width=400)

In [77]:
# Cluster mix of the current MAU base within each market.

# Rebuild `final` from the stored clusters, because earlier cells overwrite it.
result = pd.read_pickle("travel_clusters.pkl")
final = user_base.merge(result, how="left", on=["user_id"])
final["cluster"] = final["cluster"].fillna("No travel")

segment_keys = [
    "user_id",
    "country_tnc_legal",
    "product_id",
    "is_mau",
    "age_group",
    "cluster",
]
final = final.groupby(segment_keys).size().reset_index()
final = final.rename(columns={0: "count"})

# Keep monthly active users only.
final = final[final["is_mau"] == True]
data = final.groupby(["cluster", "country_tnc_legal"]).agg("sum").reset_index()

data["group"] = 1
# Denominator is the market's total, so percentages add up to 100 per market.
data["total"] = data.groupby("country_tnc_legal")["count"].transform("sum")

data["perc"] = 100 * data["count"] / data["total"]

# Market codes come padded with trailing spaces; strip them before matching
# against the list of markets to plot.
markets = ["DEU", "ESP", "FRA", "ITA", "AUT"]
data["country_tnc_legal"] = data["country_tnc_legal"].str.strip()
data = data[data["country_tnc_legal"].isin(markets)]

market_lines = alt.Chart(data).mark_line().encode(
    x=alt.X("country_tnc_legal:O", axis=alt.Axis(title="Market"), stack="zero"),
    y=alt.Y("perc:Q", axis=alt.Axis(title="% of users")),
    color="cluster:N",
)
market_lines.properties(title="Number of clusters and % of currrent MAU base", width=400)

In [73]:
# List the distinct market codes; the saved output shows them padded with
# trailing spaces, which is why the market cell strips them before filtering.
data["country_tnc_legal"].unique()

array(['AUT       ', 'BEL       ', 'DEU       ', 'DNK       ',
       'ESP       ', 'EST       ', 'FIN       ', 'FRA       ',
       'GBR       ', 'GRC       ', 'IRL       ', 'ISL       ',
       'ITA       ', 'LIE       ', 'LTU       ', 'LUX       ',
       'LVA       ', 'NLD       ', 'NOR       ', 'POL       ',
       'PRT       ', 'SVK       ', 'SVN       ', 'SWE       '],
      dtype=object)