In [1]:
%load_ext lab_black

In [2]:
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas_gbq
import pyarrow
import pandas as pd
import numpy as np
import os

credentials = service_account.Credentials.from_service_account_file(
    "/Users/miguelcouto/Downloads/zattoo-dataeng-e5f45785174f.json"
)

project_id = "zattoo-dataeng"
client = bigquery.Client(credentials=credentials, project=project_id)

In [3]:
sql_calcs = """
   select mt.transaction_id,
       mt.zuid,
       mt.payment_method,
       mt.sku,
       mt.type_of_transaction,
       case
           when mt.country_name = 'Germany' then 'Germany'
           when mt.country_name = 'Austria' then 'Austria'
           else 'Switzerland' end as country_name,
       mt.new_booking_net_CHF,
       mt.renewal_booking_net_CHF,
       mt.new_booking_net_EUR,
       mt.renewal_booking_net_EUR,
       mt.vat_CHF,
       mt.vat_EUR,
       mt.transaction_date,
       mt.term_start,
       mt.term_end
from b2c_middleware.middlelayer_transactions mt
where true
  and mt.app_shop_id = 'datatrans'
  and mt.transaction_date >= '2021-03-01 00:00:00'
  and mt.transaction_date <= '2021-03-31 23:59:59'
           """

In [4]:
## prepare dataframe
df = pandas_gbq.read_gbq(sql_calcs, project_id=project_id, progress_bar_type=None)
type_trans_lst = ["new_sale", "renewal", "refund"]

parse_dates = ["term_start", "term_end"]
mt_df = df[df["type_of_transaction"].isin(type_trans_lst)].copy()

for date in parse_dates:
    mt_df[date] = pd.to_datetime(mt_df[date]).dt.tz_convert(None)

In [5]:
## calculate total_booking_net columns
mt_df["total_booking_net_CHF"] = (
    mt_df["new_booking_net_CHF"] + mt_df["renewal_booking_net_CHF"]
)
mt_df["total_booking_net_EUR"] = (
    mt_df["new_booking_net_EUR"] + mt_df["renewal_booking_net_EUR"]
)

In [6]:
## drop unnecessary columns
mt_df.drop(
    [
        "new_booking_net_EUR",
        "new_booking_net_CHF",
        "renewal_booking_net_CHF",
        "renewal_booking_net_EUR",
    ],
    axis=1,
    inplace=True,
)

In [7]:
## calculate product_term_length_months
mt_df["product_term_length_months"] = (
    (mt_df["term_end"].dt.year - mt_df["term_start"].dt.year) * 12
    + (mt_df["term_end"].dt.month - mt_df["term_start"].dt.month)
    + 1
)

mt_df["product_term_length_months"][mt_df["product_term_length_months"] < 0] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mt_df["product_term_length_months"][mt_df["product_term_length_months"] < 0] = 0


In [8]:
## reseting index
mt_df = mt_df.reindex(mt_df.index.repeat(mt_df["product_term_length_months"]))

In [9]:
## add revenue_month_number
mt_df["revenue_month_number"] = mt_df.groupby(["transaction_id"]).cumcount() + 1

In [10]:
## add revenue_month_date
mt_df["revenue_month_date"] = mt_df["term_start"].to_numpy().astype("datetime64[M]")
mt_df["revenue_month_date"][mt_df["type_of_transaction"] == "refund"] = (
    mt_df["transaction_date"].to_numpy().astype("datetime64[M]")
)

mt_df["revenue_month_date"] = mt_df.apply(
    lambda x: x["revenue_month_date"]
    + pd.offsets.MonthEnd(x["revenue_month_number"])
    + pd.offsets.MonthBegin(-1),
    axis=1,
)

mt_df["product_group_finance"] = np.nan

mt_df["product_term_length"] = mt_df["revenue_month_date"].apply(
    lambda t: pd.Period(t, freq="S").days_in_month
)

mt_df["product_term_length"][mt_df["term_start"] > mt_df["revenue_month_date"]] = (
    mt_df["term_start"].dt.daysinmonth - mt_df["term_start"].dt.day
) + 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mt_df["revenue_month_date"][mt_df["type_of_transaction"] == "refund"] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mt_df["product_term_length"][mt_df["term_start"] > mt_df["revenue_month_date"]] = (


In [11]:
## reseting index
mt_df.reset_index(drop=True, inplace=True)

In [12]:
## set active_sub_content = 1 by default
mt_df["active_sub_content"] = 1

## get last indices of each transaction_id group
last_idxs = (
    len(mt_df)
    - np.unique(
        mt_df.transaction_id.values[::-1],
        return_index=1,
    )[1]
    - 1
)

mt_df["active_sub_content"].iloc[last_idxs] = 0

## mark all one transaction subscriptions as 1
mt_df["active_sub_content"][
    (mt_df["revenue_month_number"] <= 1) & (mt_df["product_term_length_months"] <= 1)
] = 1

## mark all refund transactions as -1
mt_df["active_sub_content"][mt_df["type_of_transaction"] == "refund"] = -1

## mark all subscriptions with only 1 transaction as i
mt_df["active_sub_content"][mt_df["type_of_transaction"] == "refund"] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mt_df["active_sub_content"][
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mt_df["active_sub_content"][mt_df["type_of_transaction"] == "refund"] = -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mt_df["active_s

In [13]:
## fix last position of product_term_length per transaction_id
mt_df["product_term_length"].iloc[last_idxs] = (
    mt_df["term_end"].iloc[last_idxs] - mt_df["revenue_month_date"].iloc[last_idxs]
).dt.days + 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [14]:
## TEMP FIX to tackle dynamic term_end for refunds
mt_df["product_term_length"][mt_df["product_term_length"] < 0] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mt_df["product_term_length"][mt_df["product_term_length"] < 0] = 0


In [15]:
## total_days of product_term_length per transaction_id
mt_df["total_days"] = mt_df.groupby("transaction_id")["product_term_length"].transform(
    "sum"
)

In [16]:
## calculate total_revenue_net fields
mt_df["total_revenue_net_EUR"] = (
    mt_df["total_booking_net_EUR"] / mt_df["total_days"] * mt_df["product_term_length"]
)

mt_df["total_revenue_net_CHF"] = (
    mt_df["total_booking_net_CHF"] / mt_df["total_days"] * mt_df["product_term_length"]
)

## remove total_booking values from all lines of group except first
mt_df.loc[
    mt_df["revenue_month_number"] > 1,
    ["total_booking_net_CHF", "total_booking_net_EUR"],
] = 0.0

## remove VAT values from all lines of group except first
vat_cols = ["vat_CHF", "vat_EUR"]

for col in vat_cols:
    mt_df.loc[
        mt_df["revenue_month_number"] > 1,
        [col, "total_booking_net_EUR"],
    ] = 0.0

In [17]:
## drop total_days column
mt_df.drop(
    ["total_days"],
    axis=1,
    inplace=True,
)

## reorder dataframe
mt_df = mt_df[
    [
        "transaction_id",
        "zuid",
        "payment_method",
        "sku",
        "type_of_transaction",
        "country_name",
        "transaction_date",
        "total_booking_net_CHF",
        "total_booking_net_EUR",
        "vat_CHF",
        "vat_EUR",
        "term_start",
        "term_end",
        "product_term_length",
        #        "total_days",
        "product_term_length_months",
        "product_group_finance",
        "revenue_month_number",
        "revenue_month_date",
        "total_revenue_net_EUR",
        "total_revenue_net_CHF",
        "active_sub_content",
    ]
]

In [18]:
## prepare df_nocalcs for free trials and full discounts
mt_df_nocalcs = df[~df["type_of_transaction"].isin(type_trans_lst)].copy()

for date in parse_dates:
    mt_df_nocalcs[date] = pd.to_datetime(mt_df_nocalcs[date]).dt.tz_convert(None)

In [19]:
## calculate total_booking_net columns
mt_df_nocalcs["total_booking_net_CHF"] = (
    mt_df_nocalcs["new_booking_net_CHF"] + mt_df_nocalcs["renewal_booking_net_CHF"]
)
mt_df_nocalcs["total_booking_net_EUR"] = (
    mt_df_nocalcs["new_booking_net_EUR"] + mt_df_nocalcs["renewal_booking_net_EUR"]
)

In [20]:
## drop unnecessary columns
mt_df_nocalcs.drop(
    [
        "new_booking_net_EUR",
        "new_booking_net_CHF",
        "renewal_booking_net_CHF",
        "renewal_booking_net_EUR",
    ],
    axis=1,
    inplace=True,
)

In [21]:
## append dataframes
mt_df_final = mt_df.append(mt_df_nocalcs).reset_index(drop=True)

## convert revenue_month_date to date
mt_df_final["revenue_month_date"] = pd.to_datetime(
    mt_df_final["revenue_month_date"]
).dt.date

In [22]:
## define table schema
bq_schema = [
    {"name": "transaction_id", "type": "STRING"},
    {"name": "zuid", "type": "INTEGER"},
    {"name": "payment_method", "type": "STRING"},
    {"name": "sku", "type": "STRING"},
    {"name": "type_of_transaction", "type": "STRING"},
    {"name": "transaction_date", "type": "TIMESTAMP"},
    {"name": "country_name", "type": "STRING"},
    {"name": "total_booking_net_CHF", "type": "FLOAT"},
    {"name": "total_booking_net_EUR", "type": "FLOAT"},
    {"name": "vat_CHF", "type": "FLOAT"},
    {"name": "vat_EUR", "type": "FLOAT"},
    {"name": "term_start", "type": "TIMESTAMP"},
    {"name": "term_end", "type": "TIMESTAMP"},
    {"name": "product_term_length", "type": "INTEGER"},
    {"name": "product_term_length_months", "type": "INTEGER"},
    {"name": "product_group_finance", "type": "STRING"},
    {"name": "revenue_month_number", "type": "INTEGER"},
    {"name": "revenue_month_date", "type": "DATE"},
    {"name": "total_revenue_net_EUR", "type": "FLOAT"},
    {"name": "total_revenue_net_CHF", "type": "FLOAT"},
    {"name": "active_sub_content", "type": "INTEGER"},
]

In [23]:
## export to BQ table
pandas_gbq.to_gbq(
    dataframe=mt_df_final,
    destination_table="temp.pypayment_v2_1_032021_2",
    project_id="zattoo-dataeng",
    if_exists="replace",
    progress_bar=None,
    table_schema=bq_schema,
)

In [24]:
## export to csv
path = r"/Users/miguelcouto/Desktop/"

mt_df_final.to_csv(os.path.join(path, r"pypayment_v2_1_full.csv"))