In [None]:
import lib._util.visualplot as vp
import lib._util.fileproc as fp
import lib._util.dimreduce as dr

In [None]:
import pandas as pd
import copy

# Plotly
import plotly.express as px
import plotly.graph_objects as go

# Time measurement
import time
from datetime import timedelta, datetime

# Sound notification
import winsound

# Useful Functions

In [None]:
# Directory holding the input CSV files. It is joined to file names by plain
# string concatenation, so the trailing slash is required.
SOURCE_PATH_DATA = 'resources/data/'
# Directory handed to the plotting helpers as their `out_path` argument.
OUT_PATH_GRAPH = 'resources/output/eda/graph/'

def time_taken(seconds):
    """Print an elapsed duration, given in seconds, in `timedelta` text form.

    The message is preceded by a blank line so it stands apart from any
    output printed by the timed step.
    """
    elapsed = timedelta(seconds=seconds)
    print(f'\nTime Taken: {elapsed}')
    # Optional audible notification (Windows only), currently disabled:
#     winsound.Beep(frequency=1000, duration=100)
#     winsound.Beep(frequency=1500, duration=50)

# Phase 1 - Data Loading

In [None]:
def load_data(filename):
    """Load a ';'-separated draw-result CSV and split it by company.

    Parameters
    ----------
    filename : str
        File name, resolved relative to ``SOURCE_PATH_DATA``.

    Returns
    -------
    dict
        Maps each value of the ``company_code`` column (in order of first
        appearance) to an independent DataFrame holding that company's rows.
        Rows whose ``company_code`` is missing are not assigned to any key.

    Notes
    -----
    ``number`` is read as a string so that leading zeros survive, and
    ``draw_date`` is parsed as ``%Y-%m-%d``.
    """
    source_file = f'{SOURCE_PATH_DATA}{filename}'

    # Read in chunks to bound peak parser memory. Dates are read as text and
    # converted afterwards: the `date_parser` argument of read_csv is
    # deprecated, while an explicit to_datetime works on every pandas version.
    df_chunks = pd.read_csv(source_file, sep=';',
                            dtype={'number': str, 'draw_date': str},
                            chunksize=50_000)
    df = pd.concat(df_chunks)
    df['draw_date'] = pd.to_datetime(df['draw_date'], format='%Y-%m-%d')

    # Separate by company in a single pass; sort=False keeps the companies in
    # the order they first appear in the file.
    return {
        company: company_df.copy()
        for company, company_df in df.groupby('company_code', sort=False)
    }

In [None]:
# Wall-clock start of the load step.
EXEC_START = time.time()

# Load the draw history; load_data returns a dict with one frame per company code.
df_dict = load_data('4D_result_1985-04-25_2019-12-31.csv')
print(df_dict.keys())

# Report how long the load took.
EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

In [None]:
def faststat(df_dict):
    """Print each key of `df_dict` followed by `vp.faststat` of its frame.

    A blank line is printed after every frame to separate the sections.
    """
    for name in df_dict:
        print(name)
        vp.faststat(df_dict[name])
        print()

In [None]:
# Run vp.faststat on every company frame exactly as loaded.
faststat(df_dict)

In [None]:
def histogram(df_dict, columns=None, subtitle='Histogram', layout_height=None):
    """Call `vp.histogram` once per entry of `df_dict`.

    Parameters
    ----------
    df_dict : dict
        Maps a label to a DataFrame; the label becomes the title prefix.
    columns : optional
        Column selection applied to each frame; ``None`` keeps all columns.
    subtitle : str
        Text appended to the label in the title (``'<label> - <subtitle>'``).
    layout_height : optional
        Forwarded unchanged to `vp.histogram`.

    The plotting helper always receives a copy, never the caller's frame.
    """
    for name, frame in df_dict.items():
        selected = frame if columns is None else frame[columns]
        vp.histogram(selected.copy(), title=f'{name} - {subtitle}',
                     out_path=OUT_PATH_GRAPH, layout_height=layout_height)

In [None]:
# Plot each company's full frame (no column filter) right after loading.
histogram(df_dict, subtitle='Phase 1 - Histogram')

# Phase 2 - Data Preparation

- Drop the non-informative field (`company_code`)
- Remove invalid numbers (`----`)
- Handle invalid positions (DMC rows with `position` above 10)

In [None]:
# Clean every company frame without mutating it in place, so this cell can be
# re-run safely: errors='ignore' makes the drop a no-op once `company_code`
# has already been removed by an earlier run.
for key, df in df_dict.items():
    df = (
        df
        # Drop company column (redundant: the dict key already identifies the company)
        .drop(columns=['company_code'], errors='ignore')
        # Remove invalid number
        .loc[lambda frame: frame['number'] != '----']
        .copy()
    )

    # Re-binding an existing key does not change the dict's size, so it is
    # safe while iterating over items().
    df_dict[key] = df

In [None]:
# Handle invalid position on DMC due to duplication:
# keep only the rows whose position is at most 10 and store an independent copy.
df_dict['DMC'] = df_dict['DMC'].loc[lambda dmc: dmc['position'] <= 10].copy()

In [None]:
# Re-plot every company frame after the Phase 2 clean-up.
histogram(df_dict, subtitle='Phase 2 - Histogram')

# Phase 3 - Feature Engineering

- Populate number features (merged from `number_category.csv`)
- Populate the `price` feature from each row's `category`

In [None]:
def number_feature(df_dict):
    """Attach the per-number feature columns to every company frame.

    The feature table is read from ``number_category.csv`` (``;``-separated,
    located under ``SOURCE_PATH_DATA``) with ``number`` kept as a string, and
    left-joined onto each frame on ``number``. Rows whose number is absent
    from the feature table keep all original columns and get missing values
    in the feature columns.

    Parameters
    ----------
    df_dict : dict
        Maps a label to a DataFrame that has a ``number`` column.

    Returns
    -------
    dict
        New dict with the same keys; every value is a new DataFrame. The
        input dict and its frames are left untouched.
    """
    # Load feature data
    source_file = f'{SOURCE_PATH_DATA}number_category.csv'
    feature_df = pd.read_csv(source_file, sep=';', dtype={'number': str})

    # merge() already returns a fresh frame, so no defensive deep copy of the
    # inputs is needed.
    return {
        key: df.merge(feature_df, on='number', how='left')
        for key, df in df_dict.items()
    }

def price_feature(df_dict):
    """Return a deep copy of `df_dict` whose frames carry a `price` column.

    `price` is looked up from each row's `category`; categories outside the
    table below produce a missing value. The input frames are not modified.
    """
    # Reference: https://www.magnum4d.my/en/4d-game
    category_prices = dict(FST=2500, SCD=1000, TRD=500, SP=180, CONS=60)

    priced_dict = copy.deepcopy(df_dict)
    for frame in priced_dict.values():
        frame['price'] = frame['category'].map(category_prices)

    return priced_dict

In [None]:
# Enrich every company frame with the number features, then with `price`.
# NOTE: both calls rebind `df_dict`, so re-running this cell merges the feature
# table into frames that already carry those columns. Re-run from the loading
# cell instead of re-executing this cell on its own.
df_dict = number_feature(df_dict)
df_dict = price_feature(df_dict)

In [None]:
# Run vp.faststat again, now on the feature-enriched frames.
faststat(df_dict)

In [None]:
# Plot the enriched frames; an explicit layout height of 1024 is passed through.
histogram(df_dict, subtitle='Phase 3 - Histogram', layout_height=1024)

# Phase 4 - RFM Analysis

- Calculate recency, frequency and monetary values for each lottery number
- Assign RFM quantiles
- Assign RFM segments
- Calculate RFM scores
- Assign RFM levels

In [None]:
def generate_rfm(df_dict, groupby, target_date=None):
    """Build an RFM (recency / frequency / monetary) table for every frame.

    Parameters
    ----------
    df_dict : dict
        Maps a label to a DataFrame with ``draw_date`` (datetime),
        ``category`` and ``price`` columns plus the `groupby` column(s).
    groupby : str or list of str
        Column(s) identifying the entity being scored (e.g. ``'number'``).
    target_date : optional
        Date that recency is measured against; anything ``pd.Timestamp``
        accepts. Defaults to today at midnight. Pass a fixed date to make the
        ``recency`` values reproducible between runs.

    Returns
    -------
    dict
        Same keys as `df_dict`. Each value is a new DataFrame with one row
        per group and the columns:

        * ``recency``   - whole days between `target_date` and the latest draw
        * ``frequency`` - count of non-null ``category`` values
        * ``monetary``  - sum of ``price``
        * ``*_quantile`` - 1..4 quantile label per metric (4 = most recent /
          most frequent / highest sum)
        * ``rfm_segment`` - the three labels joined as ``'R_F_M'``
        * ``rfm_score``   - sum of the three labels
        * ``rfm_level``   - descriptive level derived from score and segment

        The input frames are not modified.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when a metric has too few distinct values
        to form unique quantile edges.
    """
    # Resolve the reference date once so every frame is measured against the same day.
    if target_date is None:
        reference_date = pd.Timestamp.now().normalize()
    else:
        reference_date = pd.Timestamp(target_date)

    quantile_columns = ['recency_quantile', 'frequency_quantile', 'monetary_quantile']

    # Reference: https://towardsdatascience.com/recency-frequency-monetary-model-with-python-and-how-sephora-uses-it-to-optimize-their-google-d6a0707c5f17
    # Right-closed score intervals: (-inf, 3], (3, 5], (5, 7], (7, 9], (9, 11], (11, 12].
    score_bins = [float('-inf'), 3, 5, 7, 9, 11, 12]
    score_levels = [
        'Activation Required (6)',
        'Needs Attention (5)',
        'Promising (4)',
        'Potential (3)',
        'Champions (2)',
        'Best of the Best (1)',
    ]

    # Reference: https://www.optimove.com/resources/learning-center/rfm-segmentation
    # Segment-specific levels; these take precedence over the score-based level.
    segment_overrides = [
        # RFM score: 8 - 9
        (['4_1_4', '4_1_3'], 'High Profit New Number (3)'),
        # RFM score: 7 - 9
        (['3_4_1', '3_3_1', '4_4_1', '4_3_1'], 'Lowest Profit Active Number (3.5)'),
        # RFM score: 7 - 9
        (['1_3_3', '1_3_4', '1_4_3', '1_4_4'], 'Best Number At Risk (3.5)'),
    ]

    def rfm_value(df):
        # Aggregation returns a brand-new frame, so the caller's data is never touched.
        return df.groupby(groupby).agg(
            recency=('draw_date', lambda dates: (reference_date - dates.max()).days),
            frequency=('category', 'count'),
            monetary=('price', 'sum'),
        ).reset_index()

    def rfm_quantile(df, column, n_group=4, inverse=False):
        # inverse=True gives the smallest values the highest label (used for
        # recency, where fewer days since the last draw is better).
        labels = list(range(n_group, 0, -1)) if inverse else list(range(1, n_group + 1))

        # Quantiles are cut over the distinct values, not over the rows, so a
        # heavily repeated value cannot collapse several bin edges into one.
        uniques = df[column].unique()
        label_by_value = dict(zip(uniques, pd.qcut(uniques, q=n_group, labels=labels)))
        df[f'{column}_quantile'] = df[column].map(label_by_value)
        return df

    def rfm_segment(df):
        df['rfm_segment'] = (df['recency_quantile'].astype(str) + '_'
                             + df['frequency_quantile'].astype(str) + '_'
                             + df['monetary_quantile'].astype(str))
        return df

    def rfm_score(df):
        df['rfm_score'] = df[quantile_columns].sum(axis=1)
        return df

    def rfm_level(df):
        # Build the column as object dtype from the start; scores above the
        # last bin edge stay missing.
        df['rfm_level'] = pd.cut(df['rfm_score'], bins=score_bins,
                                 labels=score_levels).astype(object)
        for segments, level in segment_overrides:
            df.loc[df['rfm_segment'].isin(segments), 'rfm_level'] = level
        return df

    newdf_dict = {}
    for key, df in df_dict.items():
        # Calculate RFM values
        rfm_df = rfm_value(df)

        # Assign RFM quantiles
        rfm_df = rfm_quantile(rfm_df, 'recency', inverse=True)
        rfm_df = rfm_quantile(rfm_df, 'frequency')
        rfm_df = rfm_quantile(rfm_df, 'monetary')

        # Assign RFM segments
        rfm_df = rfm_segment(rfm_df)

        # Calculate RFM scores
        rfm_df = rfm_score(rfm_df)

        # Assign RFM levels
        rfm_df = rfm_level(rfm_df)

        newdf_dict[key] = rfm_df

    return newdf_dict

In [None]:
# Build one RFM table per company, aggregating the draws by lottery number.
rfmdf_dict = generate_rfm(df_dict, groupby='number')

In [None]:
# Plot every column of the RFM tables, one figure set per company.
histogram(rfmdf_dict, subtitle='Phase 4 - Histogram', layout_height=1024)

In [None]:
# Split each number into its first two and next two characters; the heatmaps
# use these as their axes. The RFM frames are updated in place, and the cell
# is safe to re-run because it only overwrites the two derived columns.
for key, df in rfmdf_dict.items():
    number_text = df['number'].str
    df['left_digits'] = number_text[:2]
    df['right_digits'] = number_text[2:4]

In [None]:
def number_heatmap(df_dict, z_col, subtitle='Heatmap', layout_height=None):
    """Call `vp.heatmap` once per entry of `df_dict`.

    Each frame is plotted with `left_digits` on x and `right_digits` on y,
    using `z_col` as the z value. The hover text shows the number and its RFM
    segment, so every frame needs `number` and `rfm_segment` string columns.
    The title is ``'<label> - <subtitle>'``; `layout_height` is forwarded
    unchanged.
    """
    for name, frame in df_dict.items():
        hover_text = ('Number: ' + frame['number']
                      + '<br>RFM Segment: ' + frame['rfm_segment'])
        plot_options = dict(
            x_col='left_digits',
            y_col='right_digits',
            z_col=z_col,
            colorscale='Viridis',
            text=hover_text,
            hoverinfo='text',
            title=f'{name} - {subtitle}',
            out_path=OUT_PATH_GRAPH,
            layout_height=layout_height,
        )
        vp.heatmap(frame, **plot_options)

In [None]:
# Heatmap per company with the combined RFM score as the z value.
number_heatmap(rfmdf_dict, z_col='rfm_score', subtitle='Phase 4 - Heatmap - RFM Score')

In [None]:
# Heatmap per company with the recency quantile label as the z value.
number_heatmap(rfmdf_dict, z_col='recency_quantile', subtitle='Phase 4 - Heatmap - Recency Quantile')

In [None]:
# Heatmap per company with the frequency quantile label as the z value.
number_heatmap(rfmdf_dict, z_col='frequency_quantile', subtitle='Phase 4 - Heatmap - Frequency Quantile')

In [None]:
# Heatmap per company with the monetary quantile label as the z value.
number_heatmap(rfmdf_dict, z_col='monetary_quantile', subtitle='Phase 4 - Heatmap - Monetary Quantile')