title: Website journey - ux investigation.
author: Fabio Schmidt-Fischbach   
date: 2020-02-01   
region: US
link: https://docs.google.com/presentation/d/1z8jmHl4Fso3-jUiwFE-TQp6w6wRfAdmMiuGkemvqaWs/edit?usp=sharing   
tags: website, journey, ux, acquire, growth
summary: Our goal here is to understand how users navigate our website, and how the journey of a converting user differs from that of a non-converting user. We found that our traffic comes from performance marketing (63.8% of sessions), SEO (16.8%) and organic (19%). By platform, mobile devices account for 65% of sessions and computers for 32%. We also found that 85% of sessions on mobile devices and 72% of sessions on computers consist of a single page view. The top 10% of sessions view more than 2 pages on mobile devices and more than 3 pages on computers. Interestingly, each additional page view typically increases the conversion rate to SUI by 1pp.

In [None]:
# Notebook bootstrap: upgrade the plotting/statistics packages inside the
# running kernel, switch the working directory and import the libraries.
from IPython import get_ipython
import os 
# Each call runs a `pip install -U ...` command through the IPython shell.
get_ipython().system('pip install -U scikit-learn matplotlib')
get_ipython().system('pip install -U fuzzywuzzy')
get_ipython().system('pip install -U altair')
get_ipython().system('pip install -U seaborn')
get_ipython().system('pip install -U statsmodels')
get_ipython().system('pip install -U vega')
get_ipython().system('pip install -U altair vega_datasets notebook vega')
import os
# Relative paths used by later cells (pickle / CSV files) resolve against /app.
# NOTE(review): the `utils` import below presumably relies on this directory
# as well - confirm against the deployment.
os.chdir('/app')
# Import Libraries
import pandas as pd
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
from utils.datalib_database import df_from_sql
from multiprocessing import Pool
import time
import gc 
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from scipy.stats import ks_2samp
import scipy.stats as stats
import statsmodels
from pylab import savefig
from statsmodels.sandbox.stats.multicomp import multipletests
from statsmodels.stats.proportion import proportions_ztest
import altair as alt
# Select the Altair renderer registered under the name 'notebook'.
alt.renderers.enable('notebook')
# Timestamp taken once setup has finished; it is not read by the cells shown here.
overall_start = time.time()

# Get data

In [2]:
# sample

# One row per domain_sessionid from dev_dbt.zrh_websessions: earliest and
# latest collector_tstamp, the maximum page_count (aliased "visits") and the
# median of seconds_spent (aliased "median_time").
query = """ 
select  domain_sessionid, 
        min(collector_tstamp) as start, 
        max(collector_tstamp) as end, 
        max(page_count) as visits, 
        median(seconds_spent) as median_time
from dev_dbt.zrh_websessions 
group by 1
"""

# Run the query on the "redshiftreader" connection and cache the result as a
# pickle; the merge cell further down reads "session_stats.pkl".
df = df_from_sql("redshiftreader", query)
df.to_pickle("session_stats.pkl")

NameError: name 'df_from_sql' is not defined

In [3]:
# sample

# One row per (domain_sessionid, shadow_channel) pair: the number of source
# rows (aliased "visits") and the median of seconds_spent ("median_time").
query = """ 

select domain_sessionid, 
        shadow_channel, 
        count(1) as visits,
        median(seconds_spent) as median_time
from dev_dbt.zrh_websessions 
group by 1,2

"""

# Run the query on the "redshiftreader" connection and cache the long-format
# result; the pivot cell further down reads "page_stats.pkl".
df = df_from_sql("redshiftreader", query)
df.to_pickle("page_stats.pkl")

{"message": "started", "db": "redshiftreader", "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 114, "funcName": "df_from_sql", "created": "20200309T090850", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "b36a7d20-2c27-4465-ad9c-b84a1547f59a", "hostname": "172.21.0.4"}
{"message": "success", "db": "redshiftreader", "duration": 953.1336, "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 124, "funcName": "df_from_sql", "created": "20200309T092443", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "b36a7d20-2c27-4465-ad9c-b84a1547f59a", "hostname": "172.21.0.4"}


In [3]:
# sample

# Session-level attributes: the first observed device, language, country and
# marketing channel of every session, the channel bucketed into a coarser
# "attr_info" label, and the session's cta_session flag.
#
# cta_session is taken as the maximum over the WHOLE session partition.  The
# previous version used a running maximum (ordered frame ending at the current
# row); if the flag changes within a session, that yields different values on
# different rows of the same session and the final GROUP BY then emits more
# than one row for it, duplicating the session in every later merge.
# NOTE(review): the table schema is not visible here - if cta_session is
# already constant per session, both forms return the same result.
query = """ 

with d as (
select domain_sessionid, 
       first_value(dvce_type) over(partition by domain_sessionid order by collector_tstamp rows unbounded preceding) as device,
       first_value(lang) over(partition by domain_sessionid order by collector_tstamp rows unbounded preceding) as lang,
       first_value(country) over(partition by domain_sessionid order by collector_tstamp rows unbounded preceding) as country,
       first_value(marketing_channel) over(partition by domain_sessionid order by collector_tstamp rows unbounded preceding) as marketing_channel,
       max(cta_session) over(partition by domain_sessionid) as cta_session
from dev_dbt.zrh_websessions 
)

select domain_sessionid, 
        device, 
        lang,
        country, 
       case when marketing_channel in ('email_friend_referral') then 'friend_referral'
                        when marketing_channel in (
                                                'organic_social',
                                                'other',
                                                'lost_souls',
                                                'crm',
                                                'internal',
                                                'direct', 'app_store', 'brand_influencer') 
                            then  'organic'
                    when marketing_channel = 'organic_search' then 'SEO'
                    when marketing_channel in ('paid_search', 'paid_search_non_brand', 'paid_search_brand',
                                        'paid_search_unknown', 'paid_social',
                                        'affiliate', 'display_ads') then 'performance'
                    when marketing_channel in ('partnership', 'ambassador') then 'partnership'
                    else 'no source' end as attr_info, 
        cta_session
from d 
group by 1,2,3,4,5,6

"""

# The GROUP BY over all six output columns collapses the per-event rows of the
# CTE into distinct rows.  Cache the result for the merge cell below.
df = df_from_sql("redshiftreader", query)
df.to_pickle("session_dem.pkl")

{"message": "started", "db": "redshiftreader", "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 114, "funcName": "df_from_sql", "created": "20200309T095442", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "49de5ee9-bd63-499a-8f71-df471633dd88", "hostname": "172.21.0.4"}
{"message": "success", "db": "redshiftreader", "duration": 894.7337, "name": "datalib-logger", "args": [], "levelname": "INFO", "pathname": "/usr/local/lib/python3.7/site-packages/datalib/database.py", "filename": "database.py", "module": "database", "lineno": 124, "funcName": "df_from_sql", "created": "20200309T100936", "processName": "MainProcess", "service": "fargo", "environment": "local", "loggerId": "49de5ee9-bd63-499a-8f71-df471633dd88", "hostname": "172.21.0.4"}


# Merge dataframes

In [3]:
import pandas as pd

# Session attributes cached by the session_dem query cell.
df = pd.read_pickle("session_dem.pkl")

# Per-session statistics; missing values become 0 before the join.
page_stats = pd.read_pickle("session_stats.pkl")
page_stats = page_stats.fillna(0)

# Join on the session id (pandas' default inner join) and cache the result.
df = df.merge(page_stats, on="domain_sessionid")
df.to_pickle("temp.pkl")

FileNotFoundError: [Errno 2] No such file or directory: 'session_dem.pkl'

In [6]:
import pandas as pd

# Long-format statistics: one row per (session, shadow_channel).
page_stats = pd.read_pickle("page_stats.pkl")

# Reshape to one row per session with one column per (metric, shadow_channel)
# pair; combinations that did not occur are filled with 0.
page_stats = page_stats.pivot(
    index="domain_sessionid", columns="shadow_channel", values=["visits", "median_time"]
).fillna(0)
# Flatten the two-level column index into single names: "<metric> <channel>".
page_stats.columns = [" ".join(col).strip() for col in page_stats.columns.values]
page_stats = page_stats.reset_index()

# DataFrame.to_pickle returns None, so its result must not be assigned back:
# doing so would replace the freshly built frame with None.
page_stats.to_pickle("temp2.pkl")

In [5]:
import pandas as pd

# Reload the two cached intermediate frames written by the cells above:
# "temp.pkl" (merged session frame) and "temp2.pkl" (pivoted page statistics).
df = pd.read_pickle("temp.pkl")
page_stats = pd.read_pickle("temp2.pkl")

print("Import successful")

Import successful


In [4]:
import pandas as pd

# Attach the pivoted per-page statistics to the session frame; the join is on
# the session id and uses pandas' default (inner) join.
final = df.merge(page_stats, on="domain_sessionid")

print("Merge successful")

NameError: name 'df' is not defined

In [7]:
# Persist the merged frame as CSV.  The row index is not written: it carries
# no information, and writing it would make it reappear as an extra
# "Unnamed: 0" column when the file is read back with read_csv.
final.to_csv("websessions_final.csv", index=False)
print("Export successful")

Export successful


In [2]:
import pandas as pd

# Re-read the CSV export and store it as a pickle; the analysis cells below
# load "websessions_final.pkl".
final = pd.read_csv("websessions_final.csv")
final.to_pickle("websessions_final.pkl")

print("Successfully pickled")

Successfully pickled


# Descriptives

In [5]:
df = pd.read_pickle("websessions_final.pkl")

# Count sessions per (marketing channel, country): every row contributes 1.
df["count"] = 1
df = df.loc[:, ["count", "attr_info", "country"]]
df = df.groupby(["attr_info", "country"]).agg("sum").reset_index()

# Exclude the us, gb and ch markets.  The shares computed below are therefore
# relative to the sessions of the remaining markets only.
df = df.loc[~df["country"].isin(["us", "gb", "ch"]), :]

# Share of each cell in the grand total, in percent with one decimal.
df["total"] = df["count"].sum()
df["perc"] = (100 * df["count"] / df["total"]).round(1)

# Configure common options
base = alt.Chart(df).encode(
    alt.X("attr_info:O", axis=alt.Axis(title="Marketing channel")),
    alt.Y("country:O", axis=alt.Axis(title="Market")),
)

# Configure heatmap
heatmap = (
    base.mark_rect()
    .encode(
        color=alt.Color(
            "perc:Q",
            scale=alt.Scale(scheme="viridis"),
            legend=alt.Legend(direction="horizontal"),
        )
    )
    .properties(
        width=400, height=400, title="% of websessions across market and channel"
    )
)

# Configure text
text = base.mark_text(baseline="middle").encode(text="perc:Q", color=alt.value("white"))

# Draw the chart (the last expression of the cell is what gets displayed)
heatmap + text

In [12]:
df = pd.read_pickle("websessions_final.pkl")

# Count sessions per marketing channel: every row contributes 1.
df["count"] = 1

# Keep only the counter and the grouping key.  Carrying the string-valued
# "country" column into a "sum" aggregation is at best wasted work (older
# pandas silently dropped it) and on pandas >= 2.0 concatenates the strings of
# every group instead.
df = df.loc[:, ["count", "attr_info"]]

df = df.groupby(["attr_info"]).agg("sum").reset_index()

# Constant key so that the transform below yields the grand total on every
# row; the column is kept so the displayed table keeps its columns.
df["group"] = 1
df["total"] = df.groupby("group")["count"].transform("sum")

# Share of each channel in percent, one decimal.
df["perc"] = 100 * df["count"] / df["total"]
df["perc"] = df["perc"].round(1)

df.head(10)

Unnamed: 0,attr_info,count,group,total,perc
0,SEO,993806,1,5904682,16.8
1,friend_referral,11153,1,5904682,0.2
2,no source,965,1,5904682,0.0
3,organic,1120103,1,5904682,19.0
4,partnership,10087,1,5904682,0.2
5,performance,3768568,1,5904682,63.8


In [6]:
df = pd.read_pickle("websessions_final.pkl")

# Count sessions per (device, marketing channel): every row contributes 1.
df["count"] = 1
df = df.loc[:, ["count", "device", "attr_info"]]
df = df.groupby(["device", "attr_info"]).agg("sum").reset_index()

# Share of each cell in the grand total, in percent with one decimal.
df["total"] = df["count"].sum()
df["perc"] = (100 * df["count"] / df["total"]).round(1)

# Configure common options
base = alt.Chart(df).encode(
    alt.X("device:O", axis=alt.Axis(title="Device")),
    alt.Y("attr_info:O", axis=alt.Axis(title="Marketing channel")),
)

# Configure heatmap
heatmap = (
    base.mark_rect()
    .encode(
        color=alt.Color(
            "perc:Q",
            scale=alt.Scale(scheme="viridis"),
            legend=alt.Legend(direction="horizontal"),
        )
    )
    .properties(
        width=400, height=400, title="% of websessions across channel and device"
    )
)

# Configure text
text = base.mark_text(baseline="middle").encode(text="perc:Q", color=alt.value("white"))

# Draw the chart (the last expression of the cell is what gets displayed)
heatmap + text

In [13]:
df = pd.read_pickle("websessions_final.pkl")

# Count sessions per device: every row contributes 1.
df["count"] = 1

# Keep only the counter and the grouping key.  Carrying the string-valued
# "country" column into a "sum" aggregation is at best wasted work (older
# pandas silently dropped it) and on pandas >= 2.0 concatenates the strings of
# every group instead.
df = df.loc[:, ["count", "device"]]

df = df.groupby(["device"]).agg("sum").reset_index()

# Constant key so that the transform below yields the grand total on every
# row; the column is kept so the displayed table keeps its columns.
df["group"] = 1
df["total"] = df.groupby("group")["count"].transform("sum")

# Share of each device in percent, one decimal.
df["perc"] = 100 * df["count"] / df["total"]
df["perc"] = df["perc"].round(1)

df.head(10)

Unnamed: 0,device,count,group,total,perc
0,Computer,1940866,1,5904682,32.9
1,Game console,465,1,5904682,0.0
2,Mobile,3876880,1,5904682,65.7
3,Tablet,66402,1,5904682,1.1
4,Unknown,20069,1,5904682,0.3


# length of visit

In [7]:
df = pd.read_pickle("websessions_final.pkl")

# Count sessions per (pages viewed, device): every row contributes 1.
df["count"] = 1
df = df.loc[:, ["count", "visits", "device"]]

# groupby sorts by its keys, so within each device the rows come out in
# ascending order of "visits" - the cumulative sum below relies on that.
df = df.groupby(["visits", "device"]).agg("sum").reset_index()

# Share of each session length within its device, in percent.
df["total"] = df.groupby("device")["count"].transform("sum")
share = 100 * df["count"] / df["total"]

# Accumulate the exact shares and round only afterwards.  Summing values that
# were already rounded to one decimal adds up the rounding error of every row.
df["cum"] = share.groupby(df["device"]).cumsum().round(1)
df["perc"] = share.round(1)

# Plot only the two main devices and sessions shorter than 10 pages.
df = df.loc[df["device"].isin(["Computer", "Mobile"])]
df = df.loc[df["visits"] < 10, :]

alt.Chart(df).mark_line().encode(
    x=alt.X("visits:Q", axis=alt.Axis(title="Session length")),
    y=alt.Y("cum:Q", axis=alt.Axis(title="Percentile")),
    color="device:N",
).properties(title="# of pages seen in session by device")

In [9]:
df = pd.read_pickle("websessions_final.pkl")

# Count sessions per (pages viewed, marketing channel): every row contributes 1.
df["count"] = 1
df = df.loc[:, ["count", "visits", "attr_info"]]

# groupby sorts by its keys, so within each channel the rows come out in
# ascending order of "visits" - the cumulative sum below relies on that.
df = df.groupby(["visits", "attr_info"]).agg("sum").reset_index()

# Share of each session length within its channel, in percent.
df["total"] = df.groupby("attr_info")["count"].transform("sum")
share = 100 * df["count"] / df["total"]

# Accumulate the exact shares and round only afterwards.  Summing values that
# were already rounded to one decimal adds up the rounding error of every row.
df["cum"] = share.groupby(df["attr_info"]).cumsum().round(1)
df["perc"] = share.round(1)

# Plot only the three main channels and sessions shorter than 10 pages.
df = df.loc[df["attr_info"].isin(["performance", "organic", "SEO"])]
df = df.loc[df["visits"] < 10, :]

alt.Chart(df).mark_line().encode(
    x=alt.X("visits:Q", axis=alt.Axis(title="Session length")),
    y=alt.Y("cum:Q", axis=alt.Axis(title="Percentile")),
    color="attr_info:N",
).properties(title="# of pages seen in session by channel")

# Length and conversion

In [None]:
df = pd.read_pickle("websessions_final.pkl")

# Mean of the cta_session flag per (channel, pages viewed), as a percentage.
df = (
    df[["cta_session", "visits", "attr_info"]]
    .groupby(["attr_info", "visits"])
    .mean()
    .reset_index()
)
df["cta_session"] *= 100

# Restrict the plot to the three main channels and sessions under 10 pages.
df = df.loc[
    df["attr_info"].isin(["performance", "organic", "SEO"]) & (df["visits"] < 10), :
]

alt.Chart(df).mark_line().encode(
    color="attr_info:N",
    x=alt.X("visits:Q", axis=alt.Axis(title="Session length (pages viewed)")),
    y=alt.Y("cta_session:Q", axis=alt.Axis(title="% sessions that converted to SUI")),
).properties(title="# of pages seen and % of sessions that converted")

In [27]:
df = pd.read_pickle("websessions_final.pkl")

df = df.loc[:, ["cta_session", "attr_info", "device"]]

# Mean of the cta_session flag per (channel, device), expressed as a
# percentage with one decimal.
df = df.groupby(["attr_info", "device"]).agg("mean").reset_index()
df["cta_session"] = (df["cta_session"] * 100).round(1)

# Configure common options
base = alt.Chart(df).encode(
    alt.X("device:O", axis=alt.Axis(title="Device")),
    alt.Y("attr_info:O", axis=alt.Axis(title="Marketing channel")),
)

# Configure heatmap
heatmap = (
    base.mark_rect()
    .encode(
        color=alt.Color(
            "cta_session:Q",
            scale=alt.Scale(scheme="viridis"),
            legend=alt.Legend(direction="horizontal"),
        )
    )
    .properties(width=400, height=400, title="% conversion across channel and device")
)

# Configure text
text = base.mark_text(baseline="middle").encode(
    text="cta_session:Q", color=alt.value("white")
)

# Draw the chart (the last expression of the cell is what gets displayed)
heatmap + text

# correlation of web pages 

In [44]:
import re
from scipy.spatial.distance import cdist, jaccard, squareform
import pandas as pd
from sklearn.metrics import pairwise_distances
import numpy as np

df = pd.read_pickle("websessions_final.pkl")
# Keep only sessions with more than one page view.
df = df[df["visits"] > 1]

# Columns whose name starts with "visits " followed by lowercase letters,
# i.e. the per-page visit counters produced by the pivot.
visit_cols = [
    name for name in df.columns if re.match("visits [a-z]+", name) is not None
]

# Turn the counters into 0 / 1 indicators: every non-zero entry becomes 1.
X = df[visit_cols]
X = X.mask(X != 0, 1)

In [None]:
# implement Jaccard similarity

In [47]:
# Pairwise Jaccard distance between page columns (X is transposed so that each
# page is one observation); similarity is its complement.
page_distances = pairwise_distances(X.T.to_numpy(), metric="jaccard")
sim = pd.DataFrame(1 - page_distances, index=visit_cols, columns=visit_cols)

# Long format: one row per (index, variable) pair of pages with its value.
sim = sim.reset_index().melt(id_vars=["index"])



Unnamed: 0,index,variable,value
0,visits accountPage,visits accountPage,1.0
1,visits applepayPage,visits accountPage,0.012747
2,visits businessAccountPage,visits accountPage,0.104443
3,visits businessYouPage,visits accountPage,0.012345
4,visits cash26,visits accountPage,0.010159


In [53]:
# Sanity check on the melted matrix: one row per ordered pair of page columns
# and three columns (index, variable, value).
sim.shape

(256, 3)

In [56]:
# Two decimals are enough for the cell labels.
sim["value"] = sim["value"].round(2)

# Configure common options
base = alt.Chart(sim).encode(
    alt.X("index:O", axis=alt.Axis(title="Page")),
    alt.Y("variable:O", axis=alt.Axis(title="Page")),
)

# Configure heatmap.  The plotted values are 1 - Jaccard distance, i.e. the
# Jaccard similarity, so the title says "similarity" rather than "distance".
heatmap = (
    base.mark_rect()
    .encode(
        color=alt.Color(
            "value:Q",
            scale=alt.Scale(scheme="viridis"),
            legend=alt.Legend(direction="horizontal"),
        )
    )
    .properties(
        width=400, height=400, title="Jaccard similarity (correlation) across pages"
    )
)

# Configure text
text = base.mark_text(baseline="middle").encode(
    text="value:Q", color=alt.value("white")
)

# Draw the chart (the last expression of the cell is what gets displayed)
heatmap + text