In [None]:
import lib._util.visualplot as vp
import lib._util.fileproc as fp
import lib._util.dimreduce as dr

In [None]:
import pandas as pd
import numpy as np
import copy

# Plotly
import plotly.express as px
import plotly.graph_objects as go

# Time measurement
import time
from datetime import timedelta, datetime

# Sound notification
import winsound

# Useful Functions

In [None]:
# Directory holding the input CSV files. The trailing slash is required because
# file names are appended by plain string concatenation.
SOURCE_PATH_DATA = 'resources/data/'
# Directory handed to the plotting helpers as `out_path`.
OUT_PATH_GRAPH = 'resources/output/eda/graph/'

def time_taken(seconds):
    """Report an elapsed duration and signal completion with two beeps.

    Args:
        seconds: Elapsed time in seconds; rendered through ``timedelta``.

    Note:
        Uses ``winsound``, which is only available on Windows.
    """
    elapsed = timedelta(seconds=seconds)
    print(f'\nTime Taken: {elapsed}')

    # Two tones: 1000 Hz for 100 ms, then 1500 Hz for 50 ms.
    for tone_hz, length_ms in ((1000, 100), (1500, 50)):
        winsound.Beep(frequency=tone_hz, duration=length_ms)

# Phase 1 - Data Loading

In [None]:
def load_data(filename):
    """Load a ';'-separated results file and split it by company.

    Args:
        filename: File name relative to ``SOURCE_PATH_DATA``.

    Returns:
        dict mapping each distinct ``company_code`` value to an independent
        copy of the rows carrying that code.

    Raises:
        ValueError: If a ``draw_date`` value does not match ``%Y-%m-%d``.
    """
    source_file = f'{SOURCE_PATH_DATA}{filename}'

    # Read in 50,000-row chunks and stitch them together; 'number' stays a
    # string so leading zeros are not lost.
    df_chunks = pd.read_csv(source_file, sep=';', dtype={'number': str},
                            chunksize=50_000)
    df = pd.concat(df_chunks)

    # Parse the dates explicitly after loading. read_csv's `date_parser`
    # argument is deprecated since pandas 2.0, while to_datetime with an
    # explicit format behaves the same on old and new versions.
    df['draw_date'] = pd.to_datetime(df['draw_date'], format='%Y-%m-%d')

    # Separate by company
    return {
        company: df[df['company_code'] == company].copy()
        for company in df['company_code'].unique()
    }

In [None]:
# Load the results file into one frame per company and time the operation.
EXEC_START = time.time()

df_dict = load_data('4D_result_1985-04-25_2019-12-31.csv')
# Show which company codes were found in the file.
print(df_dict.keys())

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

In [None]:
def faststat(df_dict):
    """Print each dictionary key followed by ``vp.faststat`` output for its frame.

    Args:
        df_dict: Mapping of label -> DataFrame.
    """
    for label, frame in df_dict.items():
        print(label)
        vp.faststat(frame)
        # Blank line separates consecutive reports.
        print()

In [None]:
# Summarise every company's frame right after loading.
faststat(df_dict)

In [None]:
def histogram(df_dict, columns=None, subtitle='Histogram', layout_height=None):
    """Send a copy of each frame (optionally column-restricted) to ``vp.histogram``.

    Args:
        df_dict: Mapping of label -> DataFrame.
        columns: Optional column selection; ``None`` keeps every column.
        subtitle: Text appended to the label to form the plot title.
        layout_height: Forwarded unchanged to ``vp.histogram``.
    """
    for label, frame in df_dict.items():
        # Always pass a copy so the plotting helper cannot touch the caller's data.
        selected = frame if columns is None else frame[columns]
        plot_df = selected.copy()

        vp.histogram(plot_df, title=f'{label} - {subtitle}', out_path=OUT_PATH_GRAPH, layout_height=layout_height)

In [None]:
# Histogram of the raw, freshly loaded columns for each company.
histogram(df_dict, subtitle='Phase 1 - Histogram')

# Phase 2 - Data Preparation

- Drop the non-informative `company_code` field
- Remove rows whose number is the placeholder `----`
- Handle invalid positions (DMC only)

In [None]:
# Clean every company frame. The frames are replaced rather than mutated in
# place, and the drop tolerates an already-removed column, so this cell can be
# re-run without raising KeyError.
for key, df in df_dict.items():
    # Drop company column (redundant once the data is split per company)
    df = df.drop(columns=['company_code'], errors='ignore')

    # Remove invalid number
    df = df[df['number'] != '----'].copy()

    df_dict[key] = df

In [None]:
# Handle invalid position on DMC due to duplication
tmp_df = df_dict['DMC'].copy()
tmp_df = tmp_df[tmp_df['position'] <= 10]
df_dict['DMC'] = tmp_df.copy()

del tmp_df

In [None]:
# Histogram of the cleaned frames, for comparison with the Phase 1 plots.
histogram(df_dict, subtitle='Phase 2 - Histogram')

# Phase 3 - Feature Engineering

- Populate number features

In [None]:
def number_feature(df_dict):
    """Left-join the number category features onto every frame.

    The features are read from ``number_category.csv`` under
    ``SOURCE_PATH_DATA`` and matched on the ``number`` column.

    Args:
        df_dict: Mapping of label -> DataFrame containing a ``number`` column.

    Returns:
        A new dict with the same keys holding the merged frames. The input
        frames are not modified.
    """
    # Load feature data ('number' stays a string so it matches the draw data)
    source_file = f'{SOURCE_PATH_DATA}number_category.csv'
    feature_df  = pd.read_csv(source_file, sep=';', dtype={'number': str})

    # merge() already returns a fresh frame, so deep-copying the whole input
    # first only doubled peak memory.
    # NOTE(review): duplicate numbers in the feature file would multiply rows; confirm they are unique.
    return {
        key: df.merge(feature_df, on='number', how='left')
        for key, df in df_dict.items()
    }

def price_feature(df_dict):
    """Add a ``price`` column derived from each row's ``category``.

    Args:
        df_dict: Mapping of label -> DataFrame containing a ``category`` column.

    Returns:
        A new dict with the same keys; every value is a copy of the input frame
        with ``price`` added. Categories missing from the table become NaN.
    """
    # Reference: https://www.magnum4d.my/en/4d-game
    prize_by_category = {
        'FST': 2500,
        'SCD': 1000,
        'TRD': 500,
        'SP': 180,
        'CONS': 60
    }

    # assign() works on a copy, so the caller's frames stay untouched.
    return {
        label: frame.assign(price=frame['category'].map(prize_by_category))
        for label, frame in df_dict.items()
    }

In [None]:
# Enrich every company frame with the number features, then the prize amount.
# NOTE: df_dict is overwritten, so re-running this cell merges the feature
# columns a second time; re-run from the data-loading cell instead.
df_dict = number_feature(df_dict)
df_dict = price_feature(df_dict)

In [None]:
# Summarise the frames again now that the feature columns are present.
faststat(df_dict)

In [None]:
# Histogram including the engineered columns; taller layout for the extra panels.
histogram(df_dict, subtitle='Phase 3 - Histogram', layout_height=1024)

# Phase 4 - RFM Analysis

- Calculate recency, frequency and monetary values for each lottery number
- Assign RFM quantiles
- Assign RFM segments
- Calculate RFM scores
- Assign RFM levels

In [None]:
def generate_rfm(df_dict, groupby, target_date=None):
    """Run the full RFM pipeline on every frame.

    Steps per frame: RFM values -> quantiles -> segment -> score -> level.

    Args:
        df_dict: Mapping of label -> DataFrame with ``draw_date``,
            ``category`` and ``price`` columns plus the ``groupby`` column(s).
        groupby: Column (or columns) that identify the entity being scored.
        target_date: Reference date for recency, forwarded to ``rfm_value``.
            ``None`` (default) keeps the previous behaviour of using today.

    Returns:
        A new dict with the same keys holding one RFM row per group. The input
        frames are not modified.
    """
    # rfm_value() aggregates into a brand-new frame, so the inputs never get
    # mutated and no deep copy of the (large) source data is needed.
    newdf_dict = {}
    for key, df in df_dict.items():
        # Calculate RFM values
        rfm_df = rfm_value(df, groupby, target_date=target_date)

        # Assign RFM quantiles (recency is inverted: smaller values get higher labels)
        rfm_df = rfm_quantitle(rfm_df, 'recency', inverse=True)
        rfm_df = rfm_quantitle(rfm_df, 'frequency')
        rfm_df = rfm_quantitle(rfm_df, 'monetary')

        # Assign RFM segments
        rfm_df = rfm_segment(rfm_df)

        # Calculate RFM scores
        rfm_df = rfm_score(rfm_df)

        # Assign RFM levels
        rfm_df = rfm_level(rfm_df)

        newdf_dict[key] = rfm_df

    return newdf_dict

def rfm_value(df, groupby, target_date=None):
    """Aggregate recency, frequency and monetary values per group.

    Args:
        df: Frame with ``draw_date`` (datetime), ``category`` and ``price``.
        groupby: Column name or list of column names to group on.
        target_date: Reference date for recency; anything ``pd.Timestamp``
            accepts. Defaults to today at midnight.

    Returns:
        DataFrame with the group column(s) followed by:
        ``recency`` (whole days between ``target_date`` and the group's latest
        ``draw_date``), ``frequency`` (count of non-null ``category`` values)
        and ``monetary`` (sum of ``price``).
    """
    if target_date is None:
        target_date = pd.Timestamp.now().normalize()
    else:
        target_date = pd.Timestamp(target_date)

    # Collect the latest draw date per group first; the day difference is then
    # computed once for the whole column instead of via a Python lambda that
    # runs separately for every group.
    rfm_df = df.groupby(groupby).agg(
        recency=('draw_date', 'max'),
        frequency=('category', 'count'),
        monetary=('price', 'sum')
    ).reset_index()
    rfm_df['recency'] = (target_date - rfm_df['recency']).dt.days

    return rfm_df

def rfm_quantitle(df, column, n_group=4, inverse=False):
    """Add ``<column>_quantile`` holding the quantile label of each value.

    Quantiles are computed over the column's distinct values (not its rows), so
    heavily repeated values do not collapse the bin edges.

    Args:
        df: Frame to annotate; it is modified in place and also returned.
        column: Name of the numeric column to bin.
        n_group: Number of quantile groups.
        inverse: When True, labels run ``n_group..1`` so the smallest values
            receive the highest label.

    Returns:
        ``df`` with the new ``<column>_quantile`` column.
    """
    labels  = list(range(n_group, 0, -1)) if inverse else list(range(1, n_group + 1))
    uniques = df[column].unique()

    if len(uniques) <= 1:
        # Nothing to split: a single distinct value is labelled 1, and an empty
        # column yields an empty mapping instead of being handed to qcut.
        quantile_dict = {x: 1 for x in uniques}
    else:
        quantile_dict = dict(zip(uniques, pd.qcut(uniques, q=n_group, labels=labels)))
    df[f'{column}_quantile'] = df[column].map(quantile_dict)

    return df

def rfm_segment(df):
    """Add ``rfm_segment``: the R, F and M quantile labels joined as ``R_F_M``.

    The frame is modified in place and also returned.
    """
    parts = [
        df[f'{metric}_quantile'].astype(str)
        for metric in ('recency', 'frequency', 'monetary')
    ]

    # Chain the pieces left to right with '_' between them.
    segment = parts[0]
    for part in parts[1:]:
        segment = segment + '_' + part

    df['rfm_segment'] = segment
    return df

def rfm_score(df):
    """Add ``rfm_score``: the row-wise sum of the three quantile labels.

    The frame is modified in place and also returned.
    """
    quantile_columns = [
        'recency_quantile',
        'frequency_quantile',
        'monetary_quantile',
    ]
    row_totals = df[quantile_columns].sum(axis=1)
    df['rfm_score'] = row_totals
    return df

def rfm_level(df):
    """Add ``rfm_level``: a named tier derived from ``rfm_score`` and ``rfm_segment``.

    The tier is first chosen from the score band, then overridden for a few
    specific segments. The column is always recomputed from scratch, so calling
    the function again on a frame that already has ``rfm_level`` cannot leave
    stale labels behind.

    Args:
        df: Frame with ``rfm_score`` and ``rfm_segment`` columns; modified in
            place and also returned.

    Returns:
        ``df`` with the ``rfm_level`` column. Scores above 12 (or missing)
        get NaN unless a segment override applies.
    """
    # Reference: https://towardsdatascience.com/recency-frequency-monetary-model-with-python-and-how-sephora-uses-it-to-optimize-their-google-d6a0707c5f17
    # Right-closed score bands: (-inf, 3], (3, 5], (5, 7], (7, 9], (9, 11], (11, 12]
    band_edges = [float('-inf'), 3, 5, 7, 9, 11, 12]
    band_names = [
        'Activation Required (6)',
        'Needs Attention (5)',
        'Promising (4)',
        'Potential (3)',
        'Champions (2)',
        'Best of the Best (1)',
    ]
    # Cast to object so the override labels below (not part of the band
    # categories) can be written into the column.
    df['rfm_level'] = pd.cut(df['rfm_score'], bins=band_edges, labels=band_names).astype(object)

    # Reference: https://www.optimove.com/resources/learning-center/rfm-segmentation
    segment_overrides = [
        # RFM score: 8 - 9
        ('High Profit New Number (3)', ['4_1_4', '4_1_3']),
        # RFM score: 7 - 9
        ('Lowest Profit Active Number (3.5)', ['3_4_1', '3_3_1', '4_4_1', '4_3_1']),
        # RFM score: 7 - 9
        ('Best Number At Risk (3.5)', ['1_3_3', '1_3_4', '1_4_3', '1_4_4']),
    ]
    for level_name, segments in segment_overrides:
        df.loc[df['rfm_segment'].isin(segments), 'rfm_level'] = level_name

    return df

### Number RFM

In [None]:
# RFM table with one row per lottery number, for each company.
rfmdf_dict = generate_rfm(df_dict, groupby='number')

In [None]:
# Histogram of the per-number RFM columns.
histogram(rfmdf_dict, subtitle='Phase 4 - Histogram', layout_height=1024)

In [None]:
# Split each number string into its first two and next two characters; these
# become the x/y axes of the number heatmaps below. Frames are modified in place.
for key, df in rfmdf_dict.items():
    df['left_digits']  = df['number'].str.slice(stop=2)
    df['right_digits'] = df['number'].str.slice(start=2, stop=4)

In [None]:
def number_heatmap(df_dict, z_col, subtitle='Heatmap', layout_height=None):
    """Draw one ``vp.heatmap`` per frame with left digits on x and right digits on y.

    Args:
        df_dict: Mapping of label -> DataFrame with ``left_digits``,
            ``right_digits``, ``number`` and ``rfm_segment`` columns.
        z_col: Column supplying the heatmap values.
        subtitle: Text appended to the label to form the plot title.
        layout_height: Forwarded unchanged to ``vp.heatmap``.
    """
    for label, frame in df_dict.items():
        # Hover text shows the full number and its RFM segment.
        hover_text = 'Number: ' + frame['number'] + '<br>RFM Segment: ' + frame['rfm_segment']

        vp.heatmap(
            frame,
            x_col='left_digits',
            y_col='right_digits',
            z_col=z_col,
            colorscale='Viridis',
            text=hover_text,
            hoverinfo='text',
            title=f'{label} - {subtitle}',
            out_path=OUT_PATH_GRAPH,
            layout_height=layout_height,
        )

In [None]:
# Heatmap coloured by the combined RFM score.
number_heatmap(rfmdf_dict, z_col='rfm_score', subtitle='Phase 4 - Heatmap - RFM Score')

In [None]:
# Heatmap coloured by the recency quantile.
number_heatmap(rfmdf_dict, z_col='recency_quantile', subtitle='Phase 4 - Heatmap - Recency Quantile')

In [None]:
# Heatmap coloured by the frequency quantile.
number_heatmap(rfmdf_dict, z_col='frequency_quantile', subtitle='Phase 4 - Heatmap - Frequency Quantile')

In [None]:
# Heatmap coloured by the monetary quantile.
number_heatmap(rfmdf_dict, z_col='monetary_quantile', subtitle='Phase 4 - Heatmap - Monetary Quantile')

### Pattern RFM

In [None]:
# RFM table grouped by 'pattern'.
# NOTE: rfmdf_dict is reused for every grouping, so the cells must run in order.
rfmdf_dict = generate_rfm(df_dict, groupby='pattern')

In [None]:
def bar(df_dict, x_col, y_cols, subtitle='Bar', layout_height=None):
    """Draw one ``vp.bar`` chart per frame.

    Args:
        df_dict: Mapping of label -> DataFrame.
        x_col: Column used for the x axis.
        y_cols: Columns to plot against ``x_col``.
        subtitle: Text appended to the label to form the plot title.
        layout_height: Forwarded unchanged to ``vp.bar``.
    """
    for label, frame in df_dict.items():
        chart_title = f'{label} - {subtitle}'
        vp.bar(
            frame,
            x_col=x_col,
            y_cols=y_cols,
            title=chart_title,
            out_path=OUT_PATH_GRAPH,
            layout_height=layout_height,
        )

In [None]:
# Raw RFM values and their quantile labels, plus the total score, per pattern.
y_cols = ['recency_quantile', 'recency',
          'frequency_quantile', 'frequency',
          'monetary_quantile', 'monetary',
          'rfm_score']
bar(rfmdf_dict, x_col='pattern', y_cols=y_cols, subtitle='Phase 4 - Bar - Pattern RFM', layout_height=1024)

### Group RFM

In [None]:
# RFM table grouped by 'group_4'.
rfmdf_dict = generate_rfm(df_dict, groupby='group_4')

In [None]:
# Same metric list as the other bar cells; redefined so this cell runs on its own.
y_cols = ['recency_quantile', 'recency',
          'frequency_quantile', 'frequency',
          'monetary_quantile', 'monetary',
          'rfm_score']
bar(rfmdf_dict, x_col='group_4', y_cols=y_cols, subtitle='Phase 4 - Bar - Group-4 RFM', layout_height=1024)

In [None]:
# RFM table grouped by 'group_3'.
rfmdf_dict = generate_rfm(df_dict, groupby='group_3')

In [None]:
# Same metric list as the other bar cells; redefined so this cell runs on its own.
y_cols = ['recency_quantile', 'recency',
          'frequency_quantile', 'frequency',
          'monetary_quantile', 'monetary',
          'rfm_score']
bar(rfmdf_dict, x_col='group_3', y_cols=y_cols, subtitle='Phase 4 - Bar - Group-3 RFM', layout_height=1024)

In [None]:
# RFM table grouped by 'group_2'.
rfmdf_dict = generate_rfm(df_dict, groupby='group_2')

In [None]:
# Same metric list as the other bar cells; redefined so this cell runs on its own.
y_cols = ['recency_quantile', 'recency',
          'frequency_quantile', 'frequency',
          'monetary_quantile', 'monetary',
          'rfm_score']
bar(rfmdf_dict, x_col='group_2', y_cols=y_cols, subtitle='Phase 4 - Bar - Group-2 RFM', layout_height=1024)

In [None]:
# RFM table grouped by 'group_1'.
rfmdf_dict = generate_rfm(df_dict, groupby='group_1')

In [None]:
# Same metric list as the other bar cells; redefined so this cell runs on its own.
y_cols = ['recency_quantile', 'recency',
          'frequency_quantile', 'frequency',
          'monetary_quantile', 'monetary',
          'rfm_score']
bar(rfmdf_dict, x_col='group_1', y_cols=y_cols, subtitle='Phase 4 - Bar - Group-1 RFM', layout_height=1024)

### Odd Even RFM

In [None]:
# RFM table grouped by 'odd_even'.
rfmdf_dict = generate_rfm(df_dict, groupby='odd_even')

In [None]:
# Same metric list as the other bar cells; redefined so this cell runs on its own.
y_cols = ['recency_quantile', 'recency',
          'frequency_quantile', 'frequency',
          'monetary_quantile', 'monetary',
          'rfm_score']
bar(rfmdf_dict, x_col='odd_even', y_cols=y_cols, subtitle='Phase 4 - Bar - Odd-Even RFM', layout_height=1024)

### Big Small RFM

In [None]:
# RFM table grouped by 'big_small'.
rfmdf_dict = generate_rfm(df_dict, groupby='big_small')

In [None]:
# Same metric list as the other bar cells; redefined so this cell runs on its own.
y_cols = ['recency_quantile', 'recency',
          'frequency_quantile', 'frequency',
          'monetary_quantile', 'monetary',
          'rfm_score']
bar(rfmdf_dict, x_col='big_small', y_cols=y_cols, subtitle='Phase 4 - Bar - Big-Small RFM', layout_height=1024)

# Phase 5 - Moving RFM Analysis

- Calculate RFM values for each period (cumulatively, using all draws up to each draw date)

In [None]:
def generate_moving_rfm(df_dict, groupby, observe, n_top=10, verbose=True):
    """Compute a cumulative RFM snapshot at every draw date and keep the top rows.

    For each distinct ``draw_date`` the RFM pipeline is run on all rows up to
    and including that date, with the date itself as the recency reference.
    The ``n_top`` rows with the largest ``observe`` value are kept per date.

    Args:
        df_dict: Mapping of label -> DataFrame with ``draw_date``,
            ``category`` and ``price`` columns plus the ``groupby`` column(s).
        groupby: Column (or columns) identifying the entity being scored.
        observe: RFM column used to rank rows (sorted descending).
        n_top: Number of rows kept per date.
        verbose: When True (default), print each label and date as progress.

    Returns:
        dict mapping each label to the concatenated per-date top rows, with a
        ``date`` column added and a fresh 0..n-1 index. A frame without any
        valid draw date yields an empty DataFrame.

    Note:
        Every date re-aggregates the whole history, so the cost grows roughly
        with (number of dates) x (number of rows).
    """
    newdf_dict = {}
    for key, df in df_dict.items():
        if verbose:
            print(key)

        # Gather dates in ascending order; missing dates are skipped because
        # they cannot serve as a snapshot boundary.
        dates = sorted(pd.Timestamp(x) for x in df['draw_date'].dropna().unique())

        # Perform RFM analysis for each date
        top_frames = []
        for date in dates:
            if verbose:
                print(date)

            # Date filtering (no copy needed: the aggregation builds a new frame)
            filter_df = df[df['draw_date'] <= date]

            # Calculate RFM values
            filter_df = rfm_value(filter_df, groupby=groupby, target_date=date)

            # Assign RFM quantiles
            filter_df = rfm_quantitle(filter_df, 'recency', inverse=True)
            filter_df = rfm_quantitle(filter_df, 'frequency')
            filter_df = rfm_quantitle(filter_df, 'monetary')

            # Assign RFM segments
            filter_df = rfm_segment(filter_df)

            # Calculate RFM scores
            filter_df = rfm_score(filter_df)

            # Assign RFM levels
            filter_df = rfm_level(filter_df)

            # Collect top N data. The frames are kept as-is: converting them
            # to dicts and back was pure overhead.
            filter_df['date'] = date
            top_frames.append(filter_df.sort_values(by=observe, ascending=False).head(n_top))

        if verbose:
            print()

        # pd.concat raises on an empty list, so fall back to an empty frame.
        if top_frames:
            newdf_dict[key] = pd.concat(top_frames).reset_index(drop=True)
        else:
            newdf_dict[key] = pd.DataFrame()

    return newdf_dict

In [None]:
# Build the per-date top-50 numbers by RFM score and time the run.
EXEC_START = time.time()

rfmdf_dict = generate_moving_rfm(df_dict, groupby='number', observe='rfm_score', n_top=50)

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

In [None]:
# Prepare the moving-RFM frames for plotting (modified in place).
for key, df in rfmdf_dict.items():
    # Dates become plain strings for use as animation frame labels.
    df['date']   = df['date'].astype(str)
    # Wrap numbers in brackets. Existing brackets are stripped first so that
    # re-running this cell does not nest them ('[[0001]]').
    # NOTE(review): the brackets presumably keep the plot from treating numbers as numeric; confirm.
    df['number'] = '[' + df['number'].str.strip('[]') + ']'

In [None]:
def animated_bar(df_dict, target, observe, subtitle=None, frame_duration=500):
    """Write one animated bar chart per frame, with one animation step per ``date``.

    Args:
        df_dict: Mapping of label -> DataFrame with a ``date`` column plus the
            ``target`` and ``observe`` columns.
        target: Column on the x axis; also used as the animation group.
        observe: Column on the y axis.
        subtitle: Text appended to the label to form the output file name.
        frame_duration: Duration written into the first update-menu button's
            frame settings (default 500, the previously hard-coded value).
    """
    for key, df in df_dict.items():
        # Fix the y range (padded by 1) so the axis does not rescale between frames.
        fig = px.bar(df, x=target, y=observe,
                     animation_frame='date', animation_group=target,
                     range_y=[df[observe].min() -1, df[observe].max() +1])

        # NOTE(review): buttons[0] is presumably the play button; confirm against the plotly figure layout.
        fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = frame_duration
        vp.generate_plot(fig, out_path=OUT_PATH_GRAPH, out_filename=f'{key} - {subtitle}.html')

In [None]:
# Render the animated top-N RFM score chart for each company and time the run.
EXEC_START = time.time()

# The subtitle names Phase 5 (this section); it previously said "Phase 4",
# which mislabelled the output file.
animated_bar(rfmdf_dict, target='number', observe='rfm_score', subtitle='Phase 5 - Animated Bar - RFM Score')

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)