In [None]:
# MIT License
# 
# Copyright (c) 2018 Michael DeFelice
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import numpy as np, pandas as pd
import matplotlib.pyplot as plt

# Silence library warnings so they do not clutter the rendered output.
# NOTE(review): this also hides deprecation warnings from pandas/numpy.
import warnings
warnings.filterwarnings("ignore")

# Seed NumPy's global generator so that Restart & Run All regenerates the same
# data set. np.random.multinomial draws from it, and so does pandas' .sample()
# whenever no random_state is passed.
SEED = 123
np.random.seed(SEED)

plt.style.use('ggplot')

#### Subscription Prices:
Note that the first price, 0, is the free trial. An upgrade is modeled as moving one position forward in this list, so the prices must increase from the first entry to the last.

In [None]:
# Monthly price of each subscription tier, indexed by tier number.
# Tier 0 is the free trial; prices rise with the tier index.
subscription_prices = [0, 5, 10, 15, 20]

#### Subscription Probabilities
These must sum to 1. There is one probability per entry in the subscription price list, in the same order.

In [None]:
# Probability that a new customer starts on each tier (same order as the
# price list).
subscription_pvals = [0.80, 0.10, 0.06, 0.02, 0.02]

# Fail loudly if the probabilities do not add up: a typo here would otherwise
# skew the generated tier mix without any visible error.
assert abs(sum(subscription_pvals) - 1.0) < 1e-9, "subscription_pvals must sum to 1"

# Number of tiers; the highest tier index is sub_n - 1.
sub_n = len(subscription_pvals)
sub_n

#### History Time Frame

In [None]:
# First date of the invoice history and the number of monthly periods to
# generate.
start_date, periods = '2014-01-01', 48

#### Set Parameters for Random Data Generation

We may want to tweak these: 

- start number of subscriptions
- monthly upgrade percent
- monthly downgrade percent
- monthly new subscriptions
- monthly churn percent

In [None]:
# Population sizes
start_n = 20000      # customers present in the first month
new_n = 200          # new signups added in every later month

# Monthly transition fractions, each applied through .sample(frac=...)
upgrade_p = 0.02     # moved one tier up
downgrade_p = 0.02   # moved one tier down
churn_p = 0.02       # marked as churned

#### Initial Subscriptions

In [None]:
# Empty grid: one row per starting customer, one column per month.
df = pd.DataFrame(index=range(start_n), columns=range(periods))

# Each multinomial draw with a single trial is a one-hot row, so argmax gives
# the index of the tier that was drawn for that customer.
initial_tiers = np.random.multinomial(1, subscription_pvals, start_n).argmax(axis=1)
df.iloc[:, 0] = initial_tiers

# Show how many customers start on each tier.
df.iloc[:, 0].value_counts()

#### New Subscriptions

In [None]:
# Draw `new_n` signups for every month after the first. Each month's batch is a
# single-column frame labelled with that month's position, so after the concat
# these customers are NaN in every other month.
new_signups = [
    pd.DataFrame(
        np.random.multinomial(1, subscription_pvals, new_n).argmax(axis=1),
        columns=[month],
    )
    for month in range(1, periods)
]

# Concatenate once: calling pd.concat inside the loop re-copies the whole
# (growing) frame every month. Row order is unchanged: the existing customers
# first, then each month's batch in chronological order.
df = pd.concat([df] + new_signups, axis=0)

In [None]:
# for i in df.columns.values:
#    print(df.iloc[:, i].value_counts())

In [None]:
# The concatenated signup batches each brought their own 0-based labels, so the
# row index contains duplicates; replace it with a fresh 0..n-1 range.
df.index = pd.RangeIndex(len(df))

#### Upgrades and Downgrades

In [None]:
# Upgrades & downgrades
#
# For every month after the first: carry each existing customer's tier forward
# from the previous month, then move a random fraction one tier up and a random
# fraction of the remaining customers one tier down.
#
# The month's column is edited as a standalone Series and written back
# explicitly. Calling .update() on `df.iloc[:, i]` directly only mutates a
# temporary object and leaves `df` untouched when pandas Copy-on-Write is on.
for month in range(1, periods):
    current = df.iloc[:, month].copy()

    # Rows that are NaN in the previous month (not signed up yet) are skipped
    # by dropna(), so this month's fresh signups keep their drawn tier.
    current.update(df.iloc[:, month - 1].dropna())

    # Only customers below the top tier can move up.
    upgrade_candidates = current.dropna()
    upgrade_candidates = upgrade_candidates[upgrade_candidates < sub_n - 1]
    upgrades = upgrade_candidates.sample(frac=upgrade_p) + 1
    current.update(upgrades)

    # Customers just upgraded are excluded, and tier 0 cannot move down.
    downgrade_candidates = current[~current.index.isin(upgrades.index)].dropna()
    downgrade_candidates = downgrade_candidates[downgrade_candidates > 0]
    downgrades = downgrade_candidates.sample(frac=downgrade_p) - 1
    current.update(downgrades)

    df.iloc[:, month] = current

In [None]:
# Preview the first five customers after upgrades and downgrades.
df.head(n=5)

#### Churn

In [None]:
# Each month, sample a fraction of the customers that have a value in the
# previous month and mark them -1 from this month through the end of the
# history. dropna() keeps rows already marked -1, so a customer can be drawn
# again; rewriting -1 over -1 changes nothing.
for month in range(1, periods):
    previous_month = df.iloc[:, month - 1].dropna()
    churned_ids = previous_month.sample(frac=churn_p).index

    # Frame of -1 covering [month, periods) for the churned customers only;
    # df.update() copies just these non-NaN cells into df.
    churn = pd.DataFrame(index=churned_ids, columns=range(month, periods)).fillna(-1)
    df.update(churn)

In [None]:
# Tier counts in the first and in the last month (-1 marks churned customers).
# The last column is addressed as -1 rather than a fixed position so that this
# cell keeps working when `periods` is changed.
print(df.iloc[:, 0].value_counts())
print(df.iloc[:, -1].value_counts())

#### Add invoice date (column names) and customer id (row names)

In [None]:
# Churned months were marked -1; turn them into missing values so they drop out
# of the invoice table built later.
df = df.replace(-1, np.nan)

# Label the columns with month-end dates. pd.date_range replaces the removed
# pd.DatetimeIndex(start=..., periods=..., freq=...) constructor form, and the
# MonthEnd offset object is accepted by every pandas version (the 'M' alias is
# deprecated in recent releases).
df.columns = pd.date_range(start=start_date, periods=periods, freq=pd.offsets.MonthEnd())
df.columns.name = 'Month_Invoiced'

# Customer ids start at 1,000,000 for nicer presentation.
df.index.name = 'Customer_Id'
df.index += 1000000

In [None]:
# Preview the first five customers with dated columns and customer ids.
df.head(n=5)

#### Add MRR

In [None]:
def _tier_price(tier):
    """Return the monthly price for a tier index.

    Values that are not >= 0 (NaN for months without a subscription) are
    returned unchanged.
    """
    return subscription_prices[int(tier)] if tier >= 0 else tier


# Translate every tier index into its monthly price. Column-wise Series.map
# applies the same element function as DataFrame.applymap, which is deprecated
# since pandas 2.1.
cust_mrr = df.apply(lambda column: column.map(_tier_price))

In [None]:
# Preview the last five customers' monthly amounts.
cust_mrr.tail(n=5)

In [None]:
# Reshape both wide tables (customers x months) into long Series and place them
# side by side; the `keys` become the column names.
subscription_long = df.unstack()
amount_long = cust_mrr.unstack()

cust_invoice = pd.concat(
    [subscription_long, amount_long],
    axis=1,
    keys=['Subscription_Type', 'Amount'],
)

# Rows with a missing value are months without an invoice for that customer.
cust_invoice = cust_invoice.dropna()

In [None]:
# Preview the first five invoice rows (still indexed by the unstacked levels).
cust_invoice.head(n=5)

In [None]:
# For better presentation, customer ids start at 1,000,000 and invoice ids at
# 5,000,000; invoice ids are consecutive in row order.
first_invoice_id = 5000000
invoice_ids = range(first_invoice_id, first_invoice_id + len(cust_invoice))
cust_invoice.insert(0, 'Invoice_Id', invoice_ids)

# Move the index levels produced by unstack() into ordinary columns.
cust_invoice = cust_invoice.reset_index()

In [None]:
# Preview the first five invoice rows with invoice ids and a flat index.
cust_invoice.head(n=5)

In [None]:
# Do stuff now with cust_invoice
# ...

# Example plot: invoice amounts summed per invoicing month.
monthly_mrr = cust_invoice.groupby(['Month_Invoiced']).agg({'Amount': np.sum})

fig, ax = plt.subplots(1, 1, figsize=(11, 8.5))
monthly_mrr.plot(ax=ax, kind='line', legend=False, title='Total MRR')
plt.show()
plt.close('all')

#### Create a DataFrame with Signup Date

In [None]:
# A customer's signup month is the earliest month in which they were invoiced.
first_invoice_month = cust_invoice.groupby('Customer_Id')['Month_Invoiced'].min()
cust_signup = first_invoice_month.rename('Signup_Month').reset_index()

#### Create a DataFrame with Churn Date

In [None]:
# Final month of the generated history, derived from the same parameters that
# label the invoice months, so the filter below stays correct when start_date
# or periods change. With the defaults this is 2017-12-31.
last_invoice_month = pd.date_range(
    start=start_date, periods=periods, freq=pd.offsets.MonthEnd()
)[-1]

# A customer's churn month is the latest month in which they were invoiced.
cust_churn = cust_invoice.groupby('Customer_Id')['Month_Invoiced'].max().reset_index()
cust_churn = cust_churn.rename({'Month_Invoiced': 'Churn_Month'}, axis='columns')

# Customers still invoiced in the final month have not churned; drop them.
cust_churn = cust_churn[cust_churn['Churn_Month'] != last_invoice_month]

#### Subscription Services DataFrame

In [None]:
# Lookup table of subscription tiers, one (id, price, description) per row.
# Id -1 is the 'discontinued' entry.
tier_rows = [
    (-1, 0, 'discontinued'),
    (0, 0, 'free_tier'),
    (1, 5, 'basic'),
    (2, 10, 'personal'),
    (3, 15, 'professional'),
    (4, 20, 'premium'),
]
tier_ids, tier_prices, tier_descriptions = zip(*tier_rows)

d = {
    'subscription_id': list(tier_ids),
    'subscription_price': list(tier_prices),
    'subscription_dsc': list(tier_descriptions),
}
df_sub = pd.DataFrame(data=d)

#### Customer DataFrame

In [None]:
# Load the address file and draw one distinct address row per customer.
# NOTE(review): this rebinds `df`, which held the subscription matrix until now.
df = pd.read_csv('./OpenAddress/ma.csv')

# Sampling without replacement needs at least as many address rows as there
# are customers; the fixed random_state keeps the draw repeatable.
n_customers = len(cust_signup)
df_address = df.sample(n=n_customers, replace=False, random_state=123)

In [None]:
# Pair each customer id with one sampled address. reset_index() renumbers the
# sampled rows from 0 (keeping the old labels in an 'index' column) so the two
# pieces line up row by row.
customer_ids = cust_signup.iloc[:, 0]
sampled_addresses = df_address.reset_index()
cust = pd.concat([customer_ids, sampled_addresses], axis=1)


In [None]:
# Drop the bookkeeping columns and rename REGION to STATE.
unused_columns = ['index', 'HASH', 'ID']
cust = cust.drop(columns=unused_columns).rename(columns={'REGION': 'STATE'})

In [None]:
# Preview the first five customer records.
cust.head(n=5)

#### TODO: Add a customer_NPS table (planned, not yet implemented)

The NPS score (0-10) will be correlated with churn.  
The table will have the columns customer_id, NPS, survey_sent, survey_response_date, and comments.

survey_response_date will be null for customers who have not replied.  
Some rows will have an NPS score but no comments.

#### Write DataFrames to CSV

In [None]:
# Write every generated table to a CSV file in the working directory, in this
# order. Each frame's index is written too (to_csv default).
exports = [
    ('customer_churn.csv', cust_churn),
    ('customer_signup.csv', cust_signup),
    ('customer_invoice.csv', cust_invoice),
    ('customer_mrr.csv', cust_mrr),
    ('customer.csv', cust),
    ('subscription_services.csv', df_sub),
]
for filename, frame in exports:
    frame.to_csv(filename)