In [1]:
import numpy as np
import pandas as pd
import os
import altair as alt

Slides : 
https://docs.google.com/presentation/d/1nzPG2N70AS--oT9k6ZKmbiH4NOc6-IMwAT09tWq35GQ/edit?usp=sharing

In [2]:
query = """
select  product_id, 
		status, 
		country, 
		order_id, 
		subscription_id, 
		subscription_valid_from, 
		subscription_valid_until, 
		user_id  
from etl_reporting.lp_user_product  
where product_id in ('BUSINESS_BLACK','BLACK_CARD_MONTHLY','METAL_CARD_MONTHLY','BUSINESS_METAL')
"""

In [35]:
n = pd.read_csv("n26_allianz.csv")

print(n.shape)

n.head()

(847585, 8)


Unnamed: 0,product_id,status,country,order_id,subscription_id,subscription_valid_from,subscription_valid_until,user_id
0,BUSINESS_BLACK,INACTIVE,DEU,BUSINESS_BLACK_d77f3dfe-4379-4270-984d-79d72e4...,,,,4fb91051-ca6c-4e8c-9bfb-e80f4b67c2ad
1,BLACK_CARD_MONTHLY,TO_BE_CANCELLED,DEU,6f50b5e2-3cdc-4c8a-9882-df5ab56b42a3,NUMDE5000007421,2019-10-02 19:39:55,2020-10-01 19:39:55,7d84df4a-a544-4eb7-9a88-2eb690a22883
2,METAL_CARD_MONTHLY,INACTIVE,ESP,METAL_CARD_MONTHLY_26a5d3b1-ec53-46b4-a23b-d9d...,,,,44d2bbe6-a636-4d6a-91a0-3ef6eddfe19a
3,METAL_CARD_MONTHLY,BLOCKED,DEU,A301863E-C5CD-4E86-AA89-E54489CD5B7B,NUMDE4000007179,2019-09-02 21:57:02,2020-05-06 12:15:36,45d44d7f-e397-4bdb-a7fc-204ebf34b866
4,BLACK_CARD_MONTHLY,INACTIVE,ITA,BLACK_CARD_MONTHLY_828ab6bb-a622-45bb-bad2-56e...,,,,2cdc55bc-5658-4254-9b5b-cb5c617eee90


## Drop first all subscriptions that never started. 

In [36]:
n = n.loc[n["subscription_valid_from"].isna() == False, :]

print(n.shape)

(519558, 8)


## Drop those without subscription id 

## Understand allianz data. 

In [11]:
print(a.shape)

(771135, 10)


## Highlight cases where external number is missing in allianz data

In [16]:
print(a.loc[a["External Policy Number"].isna() == True, :].shape)
# drop these
a = a.loc[a["External Policy Number"].isna() == False, :]

print(a.shape)

(0, 10)
(771126, 10)


## Are both datasets unique on subscription id? 

In [81]:
a = pd.read_excel("N26 - Contract policy Details - 24.08.2020.xlsx")
a = a.loc[a["External Policy Number"].isna() == False, :]

# let's look at some examples.
# test = a.loc[a["External Policy Number"].duplicated(),:]
# test = test.sort_values(by=["External Policy Number"])

# the data is non unique on the external policy number. only keep the most recent one.
a = a.sort_values(by=["External Policy Number", "Contract inception date"])
# keep only last row for duplicates
a = a.loc[a["External Policy Number"].duplicated(keep="last") == False, :]

n = pd.read_csv("n26_allianz.csv")
# drop non kycc
n = n.loc[n["subscription_valid_from"].isna() == False, :]
# drop non identifiable.
n = n.loc[n["subscription_id"].isna() == False, :]

# investigate non unique subscription ids.
ids_to_check = n.loc[n["subscription_id"].duplicated(), "subscription_id"].tolist()
test = n.loc[n["subscription_id"].isin(ids_to_check), :]
# these are fine to drop.
test.head(10)
# drop duplicates : country was changed.
n = n.loc[n["subscription_id"].duplicated() == False, :]

############## MERGE FULL OUTER ##########################
final = n.merge(
    a,
    left_on=["subscription_id"],
    right_on=["External Policy Number"],
    how="outer",
    indicator=True,
)
# create new "creation column"

final["created"] = final["Contract inception date"]

final.loc[final["created"].isna() == True, "created"] = pd.to_datetime(
    final["subscription_valid_from"]
).dt.date

final.to_csv("final.csv")

In [132]:
df = pd.read_csv("final.csv")

df["x"] = 1
df = df.groupby(["_merge"])["x"].count().reset_index()
df["perc"] = 100 * df["x"] / sum(df["x"])

df.head()

Unnamed: 0,_merge,x,perc
0,both,464658,85.185667
1,left_only,42136,7.724785
2,right_only,38671,7.089547


## Who are the policies that allianz does not know about? 

In [111]:
import altair as alt

df = pd.read_csv("final.csv")

# df = df.loc[df["_merge"]!='both',:]

df = df.groupby(["status", "_merge"])["subscription_id"].agg("count").reset_index()
df["perc"] = round(
    100
    * df["subscription_id"]
    / df.groupby(["_merge"])["subscription_id"].transform("sum")
)

alt.Chart(df).mark_bar().encode(x="status", y="perc:Q", column="_merge").properties(
    width=500, height=500, title="Status of subs across merge condition"
)

In [122]:
import altair as alt

df = pd.read_csv("final.csv")

df["month"] = pd.to_datetime(df["subscription_valid_from"]).dt.to_period("M")

df = (
    df.groupby(["month", "product_id", "_merge"])["subscription_id"]
    .agg("count")
    .reset_index()
)

df["month"] = df["month"].astype(str)

alt.Chart(df).mark_line().encode(
    x="month",
    y=alt.Y("subscription_id:Q", axis=alt.Axis(title="Number of subs")),
    color="_merge",
).properties(width=400, height=300, title="Issuing dates of subscriptions").facet(
    facet="product_id", columns=2
)

In [2]:
import altair as alt

df = pd.read_csv("final.csv")


df = df.groupby(["status", "_merge"])["subscription_id"].agg("count").reset_index()

alt.Chart(df.loc[df["_merge"] == "left_only", :]).mark_bar().encode(
    x="status",
    y=alt.Y("subscription_id:Q", axis=alt.Axis(title="Number of subs")),
    column="_merge",
).properties(
    width=400, height=300, title="Status of policies that miss in allianz data"
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.loc[df["_merge"] == "left_only", :].head(20)

Unnamed: 0,status,_merge,subscription_id
1,ACTIVE,left_only,9459
3,BLOCKED,left_only,29755
5,CANCELLED,left_only,1185
7,REVOKED,left_only,787
9,TO_BE_CANCELLED,left_only,609
11,UPGRADED,left_only,341


In [8]:
import altair as alt

df = pd.read_csv("final.csv")

df = df.loc[
    df["_merge"].isin(["left_only", "both"]), :
]  ## keep only those that are in the dwh

df["match"] = 0
df.loc[df["_merge"] == "both", "match"] = 1

# look at only active subs
df = df.loc[df["status"].isin(["ACTIVE", "TO_BE_CANCELLED"]), :]

df = df.groupby(["country"])["match"].agg("mean").reset_index()

df.loc[
    df["country"].isin(["DEU", "FRA", "ESP", "ITA", "AUT"]) == False, "country"
] = "Other"

alt.Chart(df).mark_bar().encode(
    x="country",
    y=alt.Y(
        "match:Q",
        axis=alt.Axis(title="% of active subs that are in allianz data", format="%"),
    ),
).properties(
    width=400, height=300, title="% of active policies that are in allianz data"
)

In [133]:
import altair as alt

df = pd.read_csv("final.csv")

df["month"] = pd.to_datetime(df["subscription_valid_from"]).dt.to_period("M")

df = (
    df.groupby(["month", "country", "_merge"])["subscription_id"]
    .agg("count")
    .reset_index()
)

df["month"] = df["month"].astype(str)

alt.Chart(
    df.loc[df["country"].isin(["DEU", "FRA", "ITA", "ESP", "AUT"]), :]
).mark_line().encode(
    x="month",
    y=alt.Y("subscription_id:Q", axis=alt.Axis(title="Number of subs")),
    color="_merge",
).properties(
    width=300, height=200, title="Issuing dates of subscriptions"
).facet(
    facet="country", columns=3
)

  interactivity=interactivity, compiler=compiler, result=result)


## Who are the policies that n26 does not know about? 

In [137]:
import altair as alt

df = pd.read_csv("final.csv")

df["month"] = pd.to_datetime(df["Contract inception date"]).dt.to_period("M")

df = (
    df.groupby(["month", "_merge"])["External Policy Number"].agg("count").reset_index()
)

df["month"] = df["month"].astype(str)

alt.Chart(df).mark_line().encode(
    x="month",
    y=alt.Y("External Policy Number:Q", axis=alt.Axis(title="Number of subs")),
    color="_merge",
).properties(width=400, height=300, title="Issuing dates of subscriptions")

In [161]:
import altair as alt

df = pd.read_csv("final.csv")


df = (
    df.groupby(["Contract policy status", "_merge"])["External Policy Number"]
    .agg("count")
    .reset_index()
)

alt.Chart(df.loc[df["_merge"] == "right_only", :]).mark_bar().encode(
    x="Contract policy status",
    y=alt.Y("External Policy Number:Q", axis=alt.Axis(title="Number of subs")),
    column="_merge",
).properties(width=400, height=300, title="State of subscriptions")

In [173]:
import altair as alt

df = pd.read_csv("final.csv")

df.loc[df["Product package name"].isna() == True, "Product package name"] = ""

df["tier"] = ""
df.loc[df["Product package name"].str.contains("Metal"), "tier"] = "Metal"
df.loc[df["Product package name"].str.contains("Black"), "tier"] = "You"
df.loc[df["Product package name"].str.contains("You"), "tier"] = "You"

df = (
    df.groupby(["tier", "Contract policy status", "_merge"])["External Policy Number"]
    .agg("count")
    .reset_index()
)

alt.Chart(df.loc[df["_merge"] == "right_only", :]).mark_bar().encode(
    x="Contract policy status",
    y=alt.Y("External Policy Number:Q", axis=alt.Axis(title="Number of subs")),
    color="tier",
).properties(width=400, height=300, title="State of subscriptions by tier")

  interactivity=interactivity, compiler=compiler, result=result)


In [176]:
df.loc[(df["_merge"] == "right_only") & (df["tier"] != ""), :].head(10)

Unnamed: 0,tier,Contract policy status,_merge,External Policy Number
4,Metal,Cancelled,right_only,6631
6,Metal,Effective,right_only,1328
8,You,Cancelled,right_only,26914
10,You,Effective,right_only,3797


## How many policies are active according to allianz but not according to n26? 

In [142]:
df = pd.read_csv("final.csv")

df["dwh_active"] = False
df.loc[df["status"].isin(["TO_BE_CANCELLED", "ACTIVE"]), "dwh_active"] = True

df["allianz_active"] = False
df.loc[df["Contract policy status"] == "Effective", "allianz_active"] = True

# compute confusion matrix for those that matched.
df = df.loc[df["_merge"] == "both", :]

df = (
    df.groupby(["dwh_active", "allianz_active"])["subscription_id"]
    .agg("nunique")
    .reset_index()
)

df["perc"] = round(100 * df["subscription_id"] / sum(df["subscription_id"]), 1)


# Configure common options
base = alt.Chart(df).encode(
    alt.X("allianz_active:O", scale=alt.Scale(paddingInner=0)),
    alt.Y("dwh_active:O", scale=alt.Scale(paddingInner=0)),
)

# Configure heatmap
heatmap = (
    base.mark_rect()
    .encode(color=alt.Color("perc:Q", legend=alt.Legend(direction="horizontal")))
    .properties(width=500, height=500)
)

# Configure text
text = base.mark_text(baseline="middle").encode(text="perc:Q")

# Draw the chart
heatmap + text

In [143]:
df.head()

Unnamed: 0,dwh_active,allianz_active,subscription_id,perc
0,False,False,265065,57.0
1,False,True,5343,1.1
2,True,False,29072,6.3
3,True,True,165178,35.5


In [177]:
df = pd.read_csv("final.csv")

df["dwh_active"] = False
df.loc[df["status"].isin(["TO_BE_CANCELLED", "ACTIVE"]), "dwh_active"] = True

df["allianz_active"] = False
df.loc[df["Contract policy status"] == "Effective", "allianz_active"] = True

# compute confusion matrix for those that matched.
df = df.loc[df["_merge"] == "both", :]

df = (
    df.groupby(["product_id", "dwh_active", "allianz_active"])["subscription_id"]
    .agg("nunique")
    .reset_index()
)

df.head(20)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,product_id,dwh_active,allianz_active,subscription_id
0,BLACK_CARD_MONTHLY,False,False,142014
1,BLACK_CARD_MONTHLY,False,True,2098
2,BLACK_CARD_MONTHLY,True,False,15280
3,BLACK_CARD_MONTHLY,True,True,83962
4,BUSINESS_BLACK,False,False,28995
5,BUSINESS_BLACK,False,True,2443
6,BUSINESS_BLACK,True,False,2592
7,BUSINESS_BLACK,True,True,19036
8,BUSINESS_METAL,False,False,187
9,BUSINESS_METAL,False,True,5


### Understand why we disagree

In [153]:
df = pd.read_csv("final.csv")

df["dwh_active"] = False
df.loc[df["status"].isin(["TO_BE_CANCELLED", "ACTIVE"]), "dwh_active"] = True

df["allianz_active"] = False
df.loc[df["Contract policy status"] == "Effective", "allianz_active"] = True

# keep only problem cases
df = df.loc[df["allianz_active"] != df["dwh_active"], :]

df["class"] = "Agree on cases"
df.loc[
    (df["allianz_active"] == False) & (df["dwh_active"] == True), "class"
] = "DWH but not Allianz active"
df.loc[
    (df["allianz_active"] == True) & (df["dwh_active"] == False), "class"
] = "Allianz but not DWH active"

df["month"] = pd.to_datetime(df["created"]).dt.to_period("M")

df["x"] = 1
df = df.groupby(["month", "class"])["x"].agg("sum").reset_index()

df["month"] = df["month"].astype(str)

alt.Chart(df).mark_line().encode(
    x="month:N", y=alt.Y("x:Q", axis=alt.Axis(title="Number of subs")), color="class:N"
).properties(width=400, height=300, title="Issuing dates of subscriptions")