<a href="https://colab.research.google.com/github/MWFK/PySaprk_Data_Engineering_Pipeline/blob/main/1_VIP_Promotion_Python_Version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objectives

**Build** a small data pipeline that outputs **an overview of VIP customers**, **ordered by the total sales value of their purchases**. 

---

For this promotion, the marketing department is only interested in **VIPs currently located in The Netherlands.** 


---


**VIPs that have not purchased any products are still eligible** – they are VIP, after all. 


---


This ask is not a one-off, as Marketing is already thinking about running this promotion again next year if it is successful.


---


In the overview, Marketing is looking for:

  The name of the VIP

  The email address of the VIP

  The total sales value of the VIP's purchases

# Load Libs

In [None]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

# Import Data

In [None]:
# --- Sales domain: one month of transactions --------------------------------
path_transactions = '/content/transactions.parquet'
df_transactions   = pq.read_table(path_transactions).to_pandas()

# --- Customer domain: VIP snapshot files -------------------------------------
# The date in each file name uses the yyyy-mm-dd format.
path_vips_1101 = '/content/vips_2020-11-01.csv'
df_vips_1101   = pd.read_csv(path_vips_1101)

path_vips_1115 = '/content/vips_2020-11-15.csv'
df_vips_1115   = pd.read_csv(path_vips_1115)

path_vips_1125 = '/content/vips_2020-11-25.csv'
df_vips_1125   = pd.read_csv(path_vips_1125)

# --- Marketing domain: VIP -> profile mapping (one month of data) ------------
# Dates inside this file use the dd-mm-yyyy format.
# The 'meta_change_date' values (04-01-2020 and 25-01-2020) predate the VIP
# snapshots (01-11-2020, 15-11-2020, 25-11-2020), which means every VIP file
# has to be updated according to the mapping file before it is used.
# The mapping file is ';'-separated.
path_vips_mapping = '/content/umd_vip_to_profile_mapping.csv'
df_vips_mapping   = pd.read_csv(path_vips_mapping, sep=';')

# VIP Data Preprocessing

In [None]:
def vip_process(df_vips_mapping, start_date, data_dir='/content'):
  """Return the active, Netherlands-based VIPs from one VIP snapshot file.

  Reads ``<data_dir>/vips_<start_date>.csv``, keeps the rows whose ``country``
  is 'The Netherlands', attaches the profile mapping on ``vip_id`` and keeps
  only the rows whose mapped ``active`` value is 'yes'.

  Parameters
  ----------
  df_vips_mapping : pandas.DataFrame
      VIP-to-profile mapping; must contain ``vip_id``, ``profile_id`` and
      ``active``. It is NOT modified by this function.
  start_date : str
      Date part of the snapshot file name (e.g. '2020-11-25'). It is also
      written to the ``start_date`` column of the result.
  data_dir : str, default '/content'
      Directory that holds the ``vips_<start_date>.csv`` files.

  Returns
  -------
  pandas.DataFrame
      Columns ``profile_id``, ``first_name``, ``email``, ``start_date``.
  """
  # Date based data loading
  df_vips = pd.read_csv(data_dir + '/vips_' + start_date + '.csv')

  # Only keep VIPs currently based in the Netherlands
  df_vips = df_vips.loc[df_vips['country'] == 'The Netherlands'].reset_index(drop=True)

  # One VIP appears twice in the mapping, and the duplicate row carries
  # active == 'no'. Nothing else in the mapping suggests the VIP is inactive,
  # so only the first row per vip_id is kept.
  # NOTE(review): this relies on the active row being listed first - confirm.
  # A de-duplicated copy is used so the caller's mapping frame stays untouched
  # (the previous inplace=True silently changed the notebook-level frame).
  mapping_unique = df_vips_mapping.drop_duplicates(subset='vip_id', keep='first')

  # Attach profile_id / active to every remaining VIP
  df_vips_mapped = pd.merge(df_vips, mapping_unique, on='vip_id', how='left').reset_index(drop=True)

  # Only keep active VIPs (VIPs without a mapping row have NaN here and drop out)
  df_vips_mapped = df_vips_mapped.loc[df_vips_mapped['active'] == 'yes'].reset_index(drop=True)

  df_vips_mapped['start_date'] = start_date

  return df_vips_mapped[['profile_id', 'first_name', 'email', 'start_date']]

In [None]:
# Preview the processed VIP list built from the 2020-11-25 snapshot file.
start_date = '2020-11-25'
vip_process(df_vips_mapping, start_date=start_date)

Unnamed: 0,profile_id,first_name,email,start_date
0,1-5151-4642-1,Brandon,Brandon.Ritter@nike.com,2020-11-25
1,1-9860-3438-0,Kristin,Kristin@Brooks.com,2020-11-25
2,1-59971-953-3,Seth,Seth@Lee.com,2020-11-25


# Transaction Data Preprocessing

In [None]:
def transaction_process(df_transactions, start_date, end_date):
  """Total the discounted sales per profile for one period.

  Keeps the transactions with ``start_date <= transaction_date < end_date``
  whose ``cancellation_flag`` is not 'yes', turns ``discount_amount_per_unit``
  into a price factor, and sums ``abs(price) * factor * quantity`` per
  ``profile_id``. The sign of ``quantity`` is kept, so a negative quantity
  lowers the total.

  Price factor derived from ``discount_amount_per_unit``:
    * 100            -> 0
    * other values >= 0 -> value / 100
    * everything else (including NaN) -> 1

  NOTE(review): mapping 100 to 0 reads the column as "percent taken off",
  while value / 100 reads it as "percent of the price paid" - confirm which
  meaning the data actually has.

  Returns a DataFrame indexed by ``profile_id`` with the columns
  ``period_sales_after_discount`` and ``start_date`` (the latter is needed
  for the monthly overview). The input frame is not modified.
  """
  # Rows inside the half-open period [start_date, end_date) that were not cancelled
  tx_dates      = df_transactions['transaction_date']
  in_period     = (tx_dates >= start_date) & (tx_dates < end_date)
  not_cancelled = df_transactions['cancellation_flag'] != 'yes'
  period_tx     = df_transactions[in_period & not_cancelled].reset_index(drop=True)

  # Price factor per row (see the table in the docstring)
  raw_discount = period_tx['discount_amount_per_unit']
  price_factor = np.where(
      raw_discount == 100,
      0.0,
      np.where(raw_discount >= 0, raw_discount / 100, 1),
  )

  # Value of each transaction line after applying the factor
  line_totals = period_tx['recommended_retail_price_per_unit'].abs() * price_factor * period_tx['quantity']

  # Total sales for each customer
  totals = (
      period_tx[['profile_id']]
        .assign(period_sales_after_discount=line_totals)
        .groupby(['profile_id'])
        .sum()
  )

  # We need this for the monthly overview
  totals['start_date'] = start_date

  return totals

In [None]:
# Preview the per-profile sales totals for 2020-11-15 (inclusive) to 2020-11-25 (exclusive).
start_date, end_date = '2020-11-15', '2020-11-25'
transaction_process(df_transactions, start_date, end_date)

Unnamed: 0_level_0,period_sales_after_discount,start_date
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1-4786-9297-9,644.0,2020-11-15
1-5151-4642-1,1550.75,2020-11-15
1-57006-547-0,755.0,2020-11-15


# Weekly Overview

In [None]:
def vip_sales_overview(df_vips_mapping, df_transactions, start_date, end_date):
  """Combine the VIP list and the sales totals for a single period.

  The VIPs come from ``vip_process`` (snapshot file named after
  ``start_date``) and the totals from ``transaction_process`` for
  ``start_date <= transaction_date < end_date``. The join is a left join on
  ``profile_id``, so a VIP without transactions in the period is kept with a
  NaN ``period_sales_after_discount``.

  Returns a DataFrame with the columns ``profile_id``, ``first_name``,
  ``email``, ``period_sales_after_discount`` and ``period_start``.
  """
  vips  = vip_process(df_vips_mapping, start_date)
  sales = transaction_process(df_transactions, start_date, end_date)

  # Both inputs carry a 'start_date' column; the merge suffixes them with
  # _x / _y and the VIP-side one becomes 'period_start'.
  overview = (
      pd.merge(vips, sales, on='profile_id', how='left')
        .reset_index(drop=True)
        .rename(columns={'start_date_x': 'period_start'})
  )

  output_columns = ['profile_id', 'first_name', 'email', 'period_sales_after_discount', 'period_start']
  return overview[output_columns]

In [None]:
# Overview for the first period: 2020-11-01 (inclusive) to 2020-11-15 (exclusive).
start_date, end_date = '2020-11-01', '2020-11-15'
vip_sales_overview(df_vips_mapping, df_transactions, start_date, end_date)

Unnamed: 0,profile_id,first_name,email,period_sales_after_discount,period_start
0,1-5151-4642-1,Brandon,Brandon.Ritter@nike.com,,2020-11-01
1,1-9860-3438-0,Kristin,Kristin@Brooks.com,362.0,2020-11-01


Interpretations
Brandon did not make a purchase during the first two weeks.
Kristin bought two units of the same item during the first two weeks,
then returned one of them during the last week.
Therefore, we should keep the negative values as an indication of returned units.

In [None]:
# Overview for the second period: 2020-11-15 (inclusive) to 2020-11-25 (exclusive).
start_date, end_date = '2020-11-15', '2020-11-25'
vip_sales_overview(df_vips_mapping, df_transactions, start_date, end_date)

Unnamed: 0,profile_id,first_name,email,period_sales_after_discount,period_start
0,1-5151-4642-1,Brandon,Brandon.Ritter@nike.com,1550.75,2020-11-15
1,1-9860-3438-0,Kristin,Kristin@Brooks.com,,2020-11-15
2,1-59971-953-3,Seth,Seth@Lee.com,,2020-11-15


Seth did not make any transactions during this period.

Kristin did not make any transactions during this period.

In [None]:
# Overview for the last period: 2020-11-25 (inclusive) up to the end of November.
# The exclusive upper bound is '2020-12-01'. The previous '2020-11-31' is not
# a real calendar date and only worked because the bound is compared with
# transaction_date as a plain string.
start_date = '2020-11-25'
end_date   = '2020-12-01'
vip_sales_overview(df_vips_mapping, df_transactions, start_date, end_date)

Unnamed: 0,profile_id,first_name,email,period_sales_after_discount,period_start
0,1-5151-4642-1,Brandon,Brandon.Ritter@nike.com,372.0,2020-11-25
1,1-9860-3438-0,Kristin,Kristin@Brooks.com,-181.0,2020-11-25
2,1-59971-953-3,Seth,Seth@Lee.com,,2020-11-25


Seth did not make any transactions during this period.

Kristin has returned one of the two items purchased in the first week of November.


# Monthly Overview

In [None]:
def monthly_overview(df_vips_mapping, df_transactions, start_date, end_date):
  """Stack the per-period VIP sales overviews into one frame.

  Parameters
  ----------
  df_vips_mapping : pandas.DataFrame
      VIP-to-profile mapping, forwarded to ``vip_sales_overview``.
  df_transactions : pandas.DataFrame
      Transactions, forwarded to ``vip_sales_overview``.
  start_date, end_date : sequence of str
      Matching period start dates and (exclusive) end dates, one pair per
      period, e.g. ``('2020-11-01', '2020-11-15')`` and
      ``('2020-11-15', '2020-11-25')``.

      Backward compatibility: earlier versions ignored these two arguments
      and always read the notebook-level ``start_dates`` / ``end_dates``.
      When either argument is a plain string (the old call style), that
      fallback is still used so existing cells keep producing the same
      result. In that case a NameError is raised if those globals do not exist.

  Returns
  -------
  pandas.DataFrame
      The rows of every period's ``vip_sales_overview`` result, in period
      order, with a fresh 0..n-1 index. Empty when there are no periods.

  Raises
  ------
  ValueError
      If the number of start dates differs from the number of end dates.
  """
  if isinstance(start_date, str) or isinstance(end_date, str):
    # Legacy call style: the period lists live at notebook level.
    period_starts, period_ends = list(start_dates), list(end_dates)
  else:
    period_starts, period_ends = list(start_date), list(end_date)

  if len(period_starts) != len(period_ends):
    raise ValueError('start and end date sequences must have the same length')

  # Build every period first and concatenate once. DataFrame.append was
  # removed in pandas 2.0, and growing a frame inside a loop is quadratic.
  period_frames = [
      vip_sales_overview(df_vips_mapping, df_transactions, period_start, period_end)
      for period_start, period_end in zip(period_starts, period_ends)
  ]
  if not period_frames:
    return pd.DataFrame()
  return pd.concat(period_frames, ignore_index=True)

In [None]:
# Period boundaries for November 2020. Each end date is exclusive.
# The last bound is '2020-12-01' because '2020-11-31' is not a real calendar date.
start_dates = ('2020-11-01', '2020-11-15', '2020-11-25')
end_dates   = ('2020-11-15', '2020-11-25', '2020-12-01')

# Pass the period lists themselves, not the scalar start_date / end_date
# left over from an earlier cell.
monthly_overview(df_vips_mapping, df_transactions, start_dates, end_dates)

Unnamed: 0,profile_id,first_name,email,period_sales_after_discount,period_start
0,1-5151-4642-1,Brandon,Brandon.Ritter@nike.com,,2020-11-01
1,1-9860-3438-0,Kristin,Kristin@Brooks.com,362.0,2020-11-01
2,1-5151-4642-1,Brandon,Brandon.Ritter@nike.com,1550.75,2020-11-15
3,1-9860-3438-0,Kristin,Kristin@Brooks.com,,2020-11-15
4,1-59971-953-3,Seth,Seth@Lee.com,,2020-11-15
5,1-5151-4642-1,Brandon,Brandon.Ritter@nike.com,372.0,2020-11-25
6,1-9860-3438-0,Kristin,Kristin@Brooks.com,-181.0,2020-11-25
7,1-59971-953-3,Seth,Seth@Lee.com,,2020-11-25


In [None]:
# Total sales for each customer during one month, highest total first
# (Marketing asked for the overview ordered by total sales value).
# Only the sales column is summed: on recent pandas versions a bare .sum()
# would also concatenate the string columns profile_id and period_start.
# VIPs without any purchase end up with a total of 0.0.
(
    monthly_overview(df_vips_mapping, df_transactions, start_dates, end_dates)
      .groupby(['first_name', 'email'])[['period_sales_after_discount']]
      .sum()
      .sort_values('period_sales_after_discount', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,period_sales_after_discount
first_name,email,Unnamed: 2_level_1
Brandon,Brandon.Ritter@nike.com,1922.75
Kristin,Kristin@Brooks.com,181.0
Seth,Seth@Lee.com,0.0
