In [None]:
import lib._util.visualplot as vp
import lib._util.fileproc as fp

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np
import copy
import glob

# Plotly
import plotly.express as px

# Time measurement
import time
from datetime import timedelta, datetime

# Sound notification
import winsound

# Useful Functions

In [None]:
# Relative locations used throughout the notebook (each ends with a slash so that
# file names can be appended directly).
# Folder holding the source CSV files read by the load_* helpers.
SOURCE_PATH_DATA = 'resources/data/'
# Folder passed as `out_path` to the plotting helpers.
OUT_PATH_GRAPH   = 'resources/output/eda/graph/'
# Folder under which the moving-RFM CSV files are exported and later re-read.
OUT_PATH_FILE    = 'resources/output/eda/file/'

In [None]:
def time_taken(seconds):
    """Print an elapsed duration as H:MM:SS and play a two-tone beep.

    Parameters
    ----------
    seconds : int or float
        Elapsed time in seconds.

    Notes
    -----
    Relies on ``winsound``, so the notification only works on Windows.
    """
    elapsed = timedelta(seconds=seconds)
    print(f'\nTime Taken: {elapsed}')

    # (frequency in Hz, duration in ms) of the two notification tones.
    for tone_frequency, tone_duration in ((1000, 100), (1500, 50)):
        winsound.Beep(frequency=tone_frequency, duration=tone_duration)

# Phase 1 - Data Loading
- Load number categories

In [None]:
def load_categories():
    """Read ``number_category.csv`` from ``SOURCE_PATH_DATA``.

    The file is ';'-separated. ``number`` is read as text so that leading
    zeros are preserved.

    Returns
    -------
    pandas.DataFrame
        The category table exactly as stored in the file.
    """
    return pd.read_csv(
        SOURCE_PATH_DATA + 'number_category.csv',
        sep=';',
        dtype={'number': str},
    )

In [None]:
# Load the number-category table and summarise it with the project helper vp.faststat.
df = load_categories()

vp.faststat(df)

In [None]:
# Plot the category table via the project helper vp.histogram, laid out with
# max_col=4; OUT_PATH_GRAPH is handed over as the output folder.
vp.histogram(df,
             bin_algo='count',
             title='Phase 1 - Histogram - Number Category',
             out_path=f'{OUT_PATH_GRAPH}',
             max_col=4)

# Phase 2 - Data Loading
- Load draw dates

In [None]:
def load_dates():
    """Read ``4D_dates.csv`` from ``SOURCE_PATH_DATA``.

    The file is ';'-separated and must contain a ``draw_date`` column in
    ``YYYY-MM-DD`` format.

    Returns
    -------
    pandas.DataFrame
        The file content with ``draw_date`` converted to ``datetime64``.

    Raises
    ------
    ValueError
        If a ``draw_date`` value does not match ``%Y-%m-%d``.
    """
    source_file = f'{SOURCE_PATH_DATA}4D_dates.csv'
    df          = pd.read_csv(source_file, sep=';')

    # `date_parser` is deprecated since pandas 2.0; converting the column
    # after reading gives the same result on old and new pandas versions.
    df['draw_date'] = pd.to_datetime(df['draw_date'], format='%Y-%m-%d')

    return df

In [None]:
# NOTE: this rebinds `df`; from here on it holds the draw-date table
# instead of the number-category table loaded in Phase 1.
df = load_dates()

vp.faststat(df)

In [None]:
# Plot the draw-date table via the project helper vp.histogram.
vp.histogram(df,
             bin_algo='count',
             title='Phase 2 - Histogram - Draw Dates',
             out_path=f'{OUT_PATH_GRAPH}')

In [None]:
# Tag every draw with its calendar month ('YYYY-MM'); the column stays on `df`.
df['year_month'] = df['draw_date'].dt.to_period('M').astype(str)

# Number of draws per company and month.
monthly_draws = (
    df.groupby(['company_code', 'year_month'])['draw_date']
      .count()
      .reset_index(name='count')
)

# One bar-chart row per company, handed to the project plot helper.
fig = px.bar(monthly_draws, x='year_month', y='count', facet_row='company_code')
vp.generate_plot(fig,
                 out_path=OUT_PATH_GRAPH,
                 out_filename='Phase 2 - Histogram - Company Draw Dates')

# The aggregate is only needed for the plot.
del monthly_draws

# Phase 3 - Data Loading
- Load timeseries result

In [None]:
def load_data(filename):
    """Read a ';'-separated result file and split it by company.

    Parameters
    ----------
    filename : str
        File name inside ``SOURCE_PATH_DATA``. The file must contain the
        columns ``number``, ``draw_date`` (``YYYY-MM-DD``) and
        ``company_code``.

    Returns
    -------
    dict
        ``{company_code: DataFrame}``, in order of first appearance of each
        company. Every frame is an independent copy that keeps the row
        labels of the combined file. Rows without a company code are not
        part of any frame.
    """
    source_file = f'{SOURCE_PATH_DATA}{filename}'

    # Read in chunks of 50,000 rows; `number` stays text so that leading
    # zeros survive.
    df_chunks = pd.read_csv(source_file, sep=';',
                            dtype={'number': str},
                            chunksize=50_000)
    df = pd.concat(df_chunks)

    # `date_parser` is deprecated since pandas 2.0; converting the column
    # after reading works on old and new pandas versions alike.
    df['draw_date'] = pd.to_datetime(df['draw_date'], format='%Y-%m-%d')

    # Separate by company in a single pass (sort=False keeps file order).
    return {
        company: company_df.copy()
        for company, company_df in df.groupby('company_code', sort=False)
    }

In [None]:
# Load the training set as {company_code: DataFrame} and show which companies were found.
df_dict = load_data('trainset.csv')
print(df_dict.keys())

In [None]:
def faststat(df_dict):
    """Print each dictionary key followed by ``vp.faststat`` of its frame.

    Parameters
    ----------
    df_dict : dict
        ``{key: DataFrame}``; every frame is passed to the project helper
        ``vp.faststat``. A blank line separates the entries.
    """
    for label in df_dict:
        print(label)
        vp.faststat(df_dict[label])
        print()

In [None]:
# Per-company summary of the freshly loaded training data.
faststat(df_dict)

In [None]:
def histogram(df_dict, title, columns=None,
              max_col=2, layout_kwargs={}, to_image=True):
    """Call ``vp.histogram`` once per entry of ``df_dict``.

    Parameters
    ----------
    df_dict : dict
        ``{key: DataFrame}``. The key prefixes the plot title and names the
        sub-folder of ``OUT_PATH_GRAPH`` used as output path.
    title : str
        Title suffix; the full title is ``'<key> - <title>'``.
    columns : list-like, optional
        Columns to plot. ``None`` means all columns of each individual frame.
    max_col, layout_kwargs, to_image
        Forwarded to ``vp.histogram``.
    """
    for key, df in df_dict.items():
        # Resolve the column selection per frame. Reassigning `columns`
        # itself would make every later frame reuse the first frame's columns.
        plot_columns = df.columns if columns is None else columns

        vp.histogram(df[plot_columns],
                     bin_algo='count',
                     title=f'{key} - {title}',
                     out_path=f'{OUT_PATH_GRAPH}{key}/',
                     max_col=max_col,
                     # Hand over a copy so that neither the caller's dict nor
                     # the shared default can be modified by the helper.
                     layout_kwargs=dict(layout_kwargs),
                     to_image=to_image)

In [None]:
# Histograms of all raw columns, one call of vp.histogram per company.
histogram(df_dict,
          title='Phase 3 - Histogram',
          max_col=3)

# Phase 4 - Data Preparation
- Drop non-informative field
- Handle invalid position

In [None]:
# Clean every company frame without mutating it in place, so that the cell
# can be re-run safely.
for key in list(df_dict):
    df_dict[key] = (
        df_dict[key]
        # Drop company column (already encoded by the dict key);
        # errors='ignore' keeps a second run from failing with KeyError.
        .drop(columns=['company_code'], errors='ignore')
        # Remove invalid number
        .loc[lambda frame: frame['number'] != '----']
        .reset_index(drop=True)
    )

In [None]:
# Handle invalid position on DMC due to duplication:
# keep only rows whose position is at most 10 and renumber the rows.
dmc_df = df_dict['DMC']
df_dict['DMC'] = dmc_df[dmc_df['position'] <= 10].reset_index(drop=True)

del dmc_df

In [None]:
# Re-plot all columns after cleaning (default layout of two plots per row).
histogram(df_dict,
          title='Phase 4 - Histogram')

# Phase 5 - Feature Engineering
- Populate number, price & period features

In [None]:
def number_feature(df_dict):
    """Attach the number-category attributes to every company frame.

    The category table from ``load_categories()`` has its ``group_*``
    columns renamed to ``digit_*`` and their ``*`` characters removed, and
    is then left-joined on ``number``.

    Parameters
    ----------
    df_dict : dict
        ``{company_code: DataFrame}``; each frame needs a ``number`` column.

    Returns
    -------
    dict
        New dictionary with the merged frames; the input is not modified.
    """
    # Load feature data
    feature_df  = load_categories()
    rename_dict = {x: x.replace('group', 'digit') for x in feature_df.columns if x.startswith('group_')}
    feature_df  = feature_df.rename(columns=rename_dict)

    for column in rename_dict.values():
        # '*' must be treated literally. The default of `regex` differs
        # between pandas versions, so state it explicitly.
        feature_df[column] = feature_df[column].str.replace('*', '', regex=False)

    # `merge` already returns a new frame, so no deep copy of the input is needed.
    return {
        key: df.merge(feature_df, on='number', how='left')
        for key, df in df_dict.items()
    }

def price_feature(df_dict):
    """Add a ``price`` column derived from the prize ``category``.

    Parameters
    ----------
    df_dict : dict
        ``{company_code: DataFrame}``; each frame needs a ``category`` column.

    Returns
    -------
    dict
        Deep copy of ``df_dict`` whose frames carry the extra ``price``
        column. Categories missing from the price table map to NaN.
    """
    # Reference: https://www.magnum4d.my/en/4d-game
    category_codes  = ('FST', 'SCD', 'TRD', 'SP', 'CONS')
    category_prices = (2500, 1000, 500, 180, 60)
    price_lookup    = dict(zip(category_codes, category_prices))

    priced_dict = copy.deepcopy(df_dict)
    for frame in priced_dict.values():
        frame['price'] = frame['category'].map(price_lookup)

    return priced_dict

def period_feature(df_dict):
    """Attach a running draw number (``draw_period``) to every company frame.

    For each company the draw dates from ``load_dates()`` are sorted
    chronologically and numbered 1, 2, 3, ...; the number is then
    left-joined on ``draw_date``.

    Parameters
    ----------
    df_dict : dict
        ``{company_code: DataFrame}``; each frame needs a ``draw_date`` column.

    Returns
    -------
    dict
        New dictionary with the merged frames; the input is not modified.
        Draw dates missing from the dates file get NaN as ``draw_period``.
    """
    # Load feature data
    period_df = load_dates()

    newdf_dict = {}
    for key, df in df_dict.items():
        # Sort first, THEN renumber: resetting the index before sorting
        # would number the draws in file order instead of date order.
        company_dates = (
            period_df.loc[period_df['company_code'] == key, ['draw_date']]
            .sort_values(by='draw_date')
            .reset_index(drop=True)
        )
        company_dates['draw_period'] = company_dates.index + 1

        newdf_dict[key] = df.merge(company_dates, on='draw_date', how='left')

    return newdf_dict

In [None]:
# Apply the three feature steps in sequence. Each helper returns a new
# dictionary, so `df_dict` is rebound after every step.
df_dict = number_feature(df_dict)
df_dict = price_feature(df_dict)
df_dict = period_feature(df_dict)

In [None]:
# Per-company summary after feature engineering.
faststat(df_dict)

In [None]:
# Histograms restricted to the listed feature columns
# (assumes the category file provides pattern, odd_even and big_small; price comes from price_feature).
histogram(df_dict,
          title='Phase 5 - Histogram',
          columns=['pattern', 'digit_4', 'digit_3', 'digit_2', 'digit_1', 'odd_even', 'big_small', 'price'],
          max_col=2,
          layout_kwargs={'height': 1000})

# Phase 6 - Data Preparation
- Convert to transactional format

In [None]:
def to_transaction(df_dict, trans_value):
    """Pivot the long result frames into one row per draw.

    Every draw must consist of exactly 23 rows, in the order 1st, 2nd, 3rd,
    Sp1..Sp10, Cons1..Cons10. The rows of one draw are assumed to be stored
    contiguously, with the draws appearing in the order of
    ``df['draw_date'].unique()``.

    Parameters
    ----------
    df_dict : dict
        ``{company_code: DataFrame}``; each frame needs ``draw_date``,
        ``draw_period`` and the column named by ``trans_value``.
    trans_value : str
        Column whose values fill the 23 position columns.

    Returns
    -------
    dict
        ``{company_code: DataFrame}`` with the columns ``draw_date``,
        ``draw_period`` and the 23 position columns.

    Raises
    ------
    AssertionError
        If any draw date does not have exactly 23 values.
    """
    columns = ['1st', '2nd', '3rd'] + [f'Sp{x +1}' for x in range(10)] + [f'Cons{x +1}' for x in range(10)]

    transdf_dict = {}
    for key, df in df_dict.items():
        # Every draw must be complete. Checking only the minimum would let
        # an oversized draw through and silently misalign the reshape below.
        counts = df.groupby('draw_date')[trans_value].count()
        assert counts.min() == len(columns) and counts.max() == len(columns), f'{key} - having invalid count'

        # Map draw date: one row per draw, labelled in order of appearance.
        trans_df = pd.DataFrame(df[trans_value].values.reshape(-1, len(columns)), columns=columns)
        trans_df.insert(0, 'draw_date', df['draw_date'].unique())

        # Map draw period
        period_df = df.groupby('draw_date').agg(
            draw_period=('draw_period', 'max')
        ).reset_index()
        trans_df = trans_df.merge(period_df, on='draw_date', how='left')

        # Re-order columns: date information first, then the positions.
        transdf_dict[key] = trans_df[['draw_date', 'draw_period'] + columns]

    return transdf_dict

In [None]:
# Number transaction: one row per draw, one column per prize position,
# each cell holding the drawn number.
transdf_dict = to_transaction(df_dict, trans_value='number')

faststat(transdf_dict)

In [None]:
def transaction_heatmap(df_dict, title,
                        heatmap_kwargs={}, layout_kwargs={}, to_image=True):
    """Draw one heatmap per digit position of the drawn numbers.

    For every company four heatmaps are produced through ``vp.heatmap``
    (x: prize position, y: ``draw_period``, z: digit value). 'Digit 4' is
    the first character of the number and 'Digit 1' the last.

    Parameters
    ----------
    df_dict : dict
        ``{company_code: DataFrame}`` in the transactional layout produced by
        ``to_transaction`` (numbers stored as strings of digits).
    title : str
        Title part; the full title is ``'<key> - <title> - <digit label>'``.
    heatmap_kwargs, layout_kwargs, to_image
        Forwarded to ``vp.heatmap``.
    """
    categories = ['1st', '2nd', '3rd'] + [f'Sp{x +1}' for x in range(10)] + [f'Cons{x +1}' for x in range(10)]

    # Character index within the number -> label used in the plot title.
    category_dict = {
        0: 'Digit 4',
        1: 'Digit 3',
        2: 'Digit 2',
        3: 'Digit 1'
    }

    for key, df in df_dict.items():
        # Heatmap for each digits
        for digit_key, digit_value in category_dict.items():
            # Pick the digit and convert it to int so that z is numeric.
            # Previously the split digits were computed into a temporary
            # frame that was never used, and z held one-character strings.
            digit_values = df[categories].apply(
                lambda column: column.str[digit_key].astype(int)
            ).values

            vp.heatmap(
                x=categories,
                y=df['draw_period'],
                z=digit_values,
                title=f'{key} - {title} - {digit_value}',
                out_path=f'{OUT_PATH_GRAPH}{key}/',
                layout_kwargs=layout_kwargs,
                to_image=to_image,
                heatmap_kwargs=heatmap_kwargs
            )
        print()

In [None]:
# Digit heatmaps of the transactional frames, four per company.
# to_image=False is forwarded to vp.heatmap (presumably skips the static image export; confirm in vp).
transaction_heatmap(transdf_dict,
                    title=f'Phase 6 - Heatmap - Transaction',
                    heatmap_kwargs={'colorscale': 'RdYlGn'},
                    to_image=False)

# Phase 7 - RFM Analysis
- Calculate recency, frequency and monetary values for each lottery number
- Assign RFM quantiles
- Assign RFM segments
- Calculate RFM scores
- Assign RFM levels

In [None]:
def generate_rfm(df_dict, groupby, n_group=4):
    """Build the RFM table (values, quantiles, segment, score) per company.

    Parameters
    ----------
    df_dict : dict
        ``{company_code: DataFrame}`` with the columns required by
        ``rfm_value``.
    groupby : str
        Column identifying the entity to score, e.g. ``'number'``.
    n_group : int, default 4
        Number of quantile groups used for recency, frequency and monetary.

    Returns
    -------
    dict
        ``{company_code: DataFrame}`` with one row per ``groupby`` value.
        The input frames are not modified.
    """
    newdf_dict = {}
    for key, df in df_dict.items():
        # Calculate RFM values. This returns a new frame, so the input
        # does not have to be deep-copied beforehand.
        rfm_df = rfm_value(df, groupby)

        # Assign RFM quantiles (recency is inverted: most recent = highest label)
        rfm_df = rfm_quantitle(rfm_df, 'recency', n_group=n_group, inverse=True)
        rfm_df = rfm_quantitle(rfm_df, 'frequency', n_group=n_group)
        rfm_df = rfm_quantitle(rfm_df, 'monetary', n_group=n_group)

        # Assign RFM segments
        rfm_df = rfm_segment(rfm_df)

        # Calculate RFM scores
        rfm_df = rfm_score(rfm_df)

        # Assign RFM levels - intentionally disabled
        # rfm_df = rfm_level(rfm_df)

        newdf_dict[key] = rfm_df

    return newdf_dict

def rfm_value(df, groupby, target_period=None):
    """Compute recency, frequency and monetary values per ``groupby`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``draw_period``, ``category``, ``price`` and the
        column named by ``groupby``.
    groupby : str
        Column identifying the entity to score, e.g. ``'number'``.
    target_period : number, optional
        Reference period for recency. Defaults to the latest
        ``draw_period`` in ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per ``groupby`` value (sorted by it) with the columns
        ``recency`` (periods since the last occurrence), ``frequency``
        (number of occurrences), ``monetary`` (sum of ``price``), one
        ``<category>_frequency`` column per category in order of first
        appearance, and ``avg_monetary`` (monetary / frequency, 2 decimals).
        ``df`` itself is not modified.
    """
    if target_period is None:
        target_period = df['draw_period'].max()

    # Standard RFM: aggregate with built-ins, then derive recency in one
    # vectorised step instead of calling a Python lambda per group.
    rfm_df = df.groupby(groupby).agg(
        recency=('draw_period', 'max'),
        frequency=('category', 'count'),
        monetary=('price', 'sum')
    ).reset_index()
    rfm_df['recency'] = target_period - rfm_df['recency']

    # Category Frequency: occurrences per (groupby value, category), spread
    # into one column per category. Keyed on `groupby` rather than on a
    # hard-coded 'number' column, so any grouping column works.
    categories   = df['category'].dropna().unique()
    freq_columns = [f'{category}_frequency' for category in categories]
    if freq_columns:
        freq_df = (
            df.groupby([groupby, 'category'])['draw_period']
            .count()
            .unstack('category', fill_value=0)
            .reindex(columns=categories, fill_value=0)
        )
        freq_df.columns = freq_columns

        rfm_df = rfm_df.join(freq_df, on=groupby)
        rfm_df[freq_columns] = rfm_df[freq_columns].fillna(0).astype(int)

    # Average Monetary
    rfm_df['avg_monetary'] = rfm_df['monetary'] / rfm_df['frequency']
    rfm_df['avg_monetary'] = np.round(rfm_df['avg_monetary'], 2)

    return rfm_df

def rfm_quantitle(df, column, n_group=4, inverse=False):
    """Add ``<column>_quantile`` holding the quantile group of each value.

    The groups are computed with ``pd.qcut`` over the distinct values of
    ``column`` (not over all rows) and stored as strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    column : str
        Column to bin.
    n_group : int, default 4
        Number of quantile groups; labels run from 1 to ``n_group``.
    inverse : bool, default False
        If True the labels are reversed, so the smallest values receive
        ``n_group``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. A column with a single distinct value gets
        the label ``'1'`` throughout.
    """
    distinct_values = df[column].unique()

    if len(distinct_values) == 1:
        # Nothing to bin: everything falls into group 1.
        group_of = {distinct_values[0]: 1}
    else:
        group_labels = list(range(1, n_group + 1))
        if inverse:
            group_labels.reverse()
        binned   = pd.qcut(distinct_values, q=n_group, labels=group_labels)
        group_of = dict(zip(distinct_values, binned))

    quantile_column = f'{column}_quantile'
    df[quantile_column] = df[column].map(group_of)
    df[quantile_column] = df[quantile_column].astype(str)

    return df

def rfm_segment(df):
    """Add ``rfm_segment``: the three quantile labels joined as ``'R_F_M'``.

    ``df`` is modified in place and returned. The quantile columns must
    hold strings.
    """
    recency_q, frequency_q, monetary_q = (
        df[f'{metric}_quantile'] for metric in ('recency', 'frequency', 'monetary')
    )
    df['rfm_segment'] = recency_q + '_' + frequency_q + '_' + monetary_q
    return df

def rfm_score(df):
    """Add ``rfm_score``: the sum of the three quantile labels as integers.

    ``df`` is modified in place and returned.
    """
    quantile_columns = [f'{metric}_quantile' for metric in ('recency', 'frequency', 'monetary')]
    quantile_values  = df[quantile_columns].astype(int)
    df['rfm_score']  = quantile_values.sum(axis=1)
    return df

def rfm_level(df):
    """Add ``rfm_level``: a named level derived from score and segment.

    The level is first taken from ``rfm_score`` (thresholds up to 12, the
    maximum score with 4 quantile groups) and then overridden for a few
    specific ``rfm_segment`` values. Scores above 12 get no score-based
    level (NaN).

    ``df`` is modified in place and returned. The column is always rebuilt
    from scratch, so a pre-existing ``rfm_level`` column does not influence
    the result.
    """
    # Reference: https://towardsdatascience.com/recency-frequency-monetary-model-with-python-and-how-sephora-uses-it-to-optimize-their-google-d6a0707c5f17
    # Upper bounds are inclusive: <=3, <=5, <=7, <=9, <=11, <=12.
    score_bins   = [float('-inf'), 3, 5, 7, 9, 11, 12]
    score_levels = [
        'Activation Required (6)',
        'Needs Attention (5)',
        'Promising (4)',
        'Potential (3)',
        'Champions (2)',
        'Best of the Best (1)',
    ]
    # astype(object) turns the categorical result into plain labels so that
    # the segment overrides below can assign names outside `score_levels`.
    df['rfm_level'] = pd.cut(df['rfm_score'], bins=score_bins, labels=score_levels).astype(object)

    # Reference: https://www.optimove.com/resources/learning-center/rfm-segmentation
    segment_levels = {
        # RFM score: 8 - 9
        'High Profit New Number (3)': ['4_1_4', '4_1_3'],
        # RFM score: 7 - 9
        'Lowest Profit Active Number (3.5)': ['3_4_1', '3_3_1', '4_4_1', '4_3_1'],
        # RFM score: 7 - 9
        'Best Number At Risk (3.5)': ['1_3_3', '1_3_4', '1_4_3', '1_4_4'],
    }
    for level, segments in segment_levels.items():
        df.loc[df['rfm_segment'].isin(segments), 'rfm_level'] = level

    return df

In [None]:
# Number RFM: score every number per company, using 10 quantile groups
# (so each quantile label ranges from 1 to 10).
rfmdf_dict = generate_rfm(df_dict, groupby='number', n_group=10)

faststat(rfmdf_dict)

In [None]:
# Histograms of all RFM columns per company.
histogram(rfmdf_dict,
          title='Phase 7 - Histogram - Number RFM',
          max_col=3,
          layout_kwargs={'height': 1000})

In [None]:
def box(df_dict, title, color=None,
        max_col=2, layout_kwargs={}, to_image=True):
    """Call ``vp.box`` once per entry of ``df_dict``.

    Parameters
    ----------
    df_dict : dict
        ``{key: DataFrame}``. The key prefixes the plot title and names the
        sub-folder of ``OUT_PATH_GRAPH`` used as output path.
    title : str
        Title suffix; the full title is ``'<key> - <title>'``.
    color, max_col, layout_kwargs, to_image
        Forwarded to ``vp.box``.
    """
    for company, company_df in df_dict.items():
        plot_title  = f'{company} - {title}'
        plot_folder = f'{OUT_PATH_GRAPH}{company}/'

        vp.box(company_df,
               color=color,
               title=plot_title,
               out_path=plot_folder,
               max_col=max_col,
               layout_kwargs=layout_kwargs,
               to_image=to_image)

In [None]:
# Box plots of the RFM columns per company.
box(rfmdf_dict,
    title='Phase 7 - Box - Number RFM',
    max_col=3)

In [None]:
def box_rfm(df_dict, title,
            max_col=2, layout_kwargs={}, to_image=True):
    """Plot each RFM value against its quantile label, per company.

    For every frame three plotly box plots are built (recency, frequency
    and monetary, each grouped by its ``*_quantile`` column) and their
    traces are handed to ``vp.datagroups_subplots``.

    Parameters
    ----------
    df_dict : dict
        ``{key: DataFrame}`` containing the RFM value and quantile columns.
    title : str
        Title suffix; the full title is ``'<key> - <title>'``.
    max_col, layout_kwargs, to_image
        Forwarded to ``vp.datagroups_subplots``.
    """
    metrics = ('recency', 'frequency', 'monetary')

    for company, company_df in df_dict.items():
        # One trace group per metric, in the same order as the axis titles below.
        trace_groups = [
            px.box(company_df, x=f'{metric}_quantile', y=metric)['data']
            for metric in metrics
        ]

        vp.datagroups_subplots(
            data_groups=trace_groups,
            xaxis_titles=['Recency Quantile', 'Frequency Quantile', 'Monetary Quantile'],
            yaxis_titles=['Recency', 'Frequency', 'Monetary'],
            title=f'{company} - {title}',
            out_path=f'{OUT_PATH_GRAPH}{company}/',
            max_col=max_col,
            layout_kwargs=layout_kwargs,
            to_image=to_image
        )

In [None]:
# Distribution of each RFM value within its quantile group, per company.
box_rfm(rfmdf_dict,
        title='Phase 7 - Box - RFM Quantitle',
        to_image=False)

In [None]:
def rfm_heatmap(df_dict, z, title,
                heatmap_kwargs={}, layout_kwargs={}, to_image=True):
    """Draw a heatmap of column ``z`` over the number grid, per company.

    x is ``left_digits``, y is ``right_digits``. The hover text shows the
    number, its RFM segment and the ``z`` value.

    Parameters
    ----------
    df_dict : dict
        ``{key: DataFrame}``; each frame needs ``number``, ``rfm_segment``,
        ``left_digits``, ``right_digits`` and the column named by ``z``.
    z : str
        Column to visualise, e.g. ``'rfm_score'``.
    title : str
        Title suffix; the full title is ``'<key> - <title>'``.
    heatmap_kwargs : dict
        Extra options forwarded to ``vp.heatmap``. ``text`` and
        ``hoverinfo`` are always overridden. The passed dict itself is left
        untouched.
    layout_kwargs, to_image
        Forwarded to ``vp.heatmap``.
    """
    # 'avg_monetary' -> 'Avg Monetary'
    z_label = ' '.join(z.split('_')).title()

    for key, df in df_dict.items():
        # Work on a copy. Writing into `heatmap_kwargs` would change the
        # caller's dict and the shared default, leaking one frame's hover
        # text into later calls.
        trace_kwargs = dict(heatmap_kwargs)
        trace_kwargs['text']      = 'Number: ' + df['number'] + '<br>RFM Segment: ' + df['rfm_segment'] + f'<br>{z_label}: ' + df[z].astype(str)
        trace_kwargs['hoverinfo'] = 'text'

        vp.heatmap(
            x=df['left_digits'],
            y=df['right_digits'],
            z=df[z].values,
            title=f'{key} - {title}',
            out_path=f'{OUT_PATH_GRAPH}{key}/',
            layout_kwargs=layout_kwargs,
            to_image=to_image,
            heatmap_kwargs=trace_kwargs
        )

In [None]:
# Split each number into its first two and last two characters; rfm_heatmap
# uses them as the x and y axes. The columns are added in place to the
# frames held in rfmdf_dict.
for key, df in rfmdf_dict.items():
    df['left_digits']  = df['number'].str.slice(stop=2)
    df['right_digits'] = df['number'].str.slice(start=2, stop=4)

In [None]:
# RFM Score: sum of the three quantile labels, shown on the
# left-digits x right-digits grid.
rfm_heatmap(rfmdf_dict,
            z='rfm_score',
            title='Phase 7 - Heatmap - RFM Score',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Recency Quantile (inverted labels: the most recent numbers carry the highest label).
# NOTE(review): the *_quantile columns are stored as strings by rfm_quantitle;
# confirm vp.heatmap handles a non-numeric z as intended.
rfm_heatmap(rfmdf_dict,
            z='recency_quantile',
            title='Phase 7 - Heatmap - Recency Quantile',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Frequency Quantile
# NOTE(review): the *_quantile columns are stored as strings by rfm_quantitle;
# confirm vp.heatmap handles a non-numeric z as intended.
rfm_heatmap(rfmdf_dict,
            z='frequency_quantile',
            title='Phase 7 - Heatmap - Frequency Quantile',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Monetary Quantile
# NOTE(review): the *_quantile columns are stored as strings by rfm_quantitle;
# confirm vp.heatmap handles a non-numeric z as intended.
rfm_heatmap(rfmdf_dict,
            z='monetary_quantile',
            title='Phase 7 - Heatmap - Monetary Quantile',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Recency: number of draw periods since the number last occurred.
# The colour scale is reversed for this plot only, since a low recency is the "good" end.
rfm_heatmap(rfmdf_dict,
            z='recency',
            title='Phase 7 - Heatmap - Recency',
            heatmap_kwargs={
                'colorscale': 'RdYlGn',
                'reversescale': True
            },
            to_image=False)

In [None]:
# Frequency: how often each number occurred.
rfm_heatmap(rfmdf_dict,
            z='frequency',
            title='Phase 7 - Heatmap - Frequency',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Monetary: sum of the prize values attached to each number.
rfm_heatmap(rfmdf_dict,
            z='monetary',
            title='Phase 7 - Heatmap - Monetary',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Category Frequency: one heatmap per prize category.
# Maps the category code used in the '<code>_frequency' column names to the
# label shown in the plot title.
category_dict = {
    'FST': '1st',
    'SCD': '2nd',
    'TRD': '3rd',
    'SP': 'Special',
    'CONS': 'Consolation'
}

for key, value in category_dict.items():
    rfm_heatmap(rfmdf_dict,
                z=f'{key}_frequency',
                title=f'Phase 7 - Heatmap - Frequency - {value}',
                heatmap_kwargs={'colorscale': 'RdYlGn'},
                to_image=False)
    print()

In [None]:
# Average Monetary: monetary divided by frequency, rounded to 2 decimals.
rfm_heatmap(rfmdf_dict,
            z='avg_monetary',
            title='Phase 7 - Heatmap - Monetary - Average',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

# Phase 8 - Moving RFM Analysis
- Calculate RFM values for each draw period

In [None]:
def generate_moving_rfm(df_dict, groupby, n_group=4, subtitle=None):
    """Export the cumulative RFM values as of every draw date.

    For each company and each draw date, ``rfm_value`` is computed on all
    rows up to and including that date. The results of one calendar year
    are stacked (with the draw date in a ``date`` column) and written via
    ``fp.generate_csv`` to
    ``<OUT_PATH_FILE><company>/Moving RFM/<company> - <year>.csv``.

    Parameters
    ----------
    df_dict : dict
        ``{company_code: DataFrame}`` with ``draw_date``, ``draw_period``
        and the columns required by ``rfm_value``.
    groupby : str
        Column identifying the entity to score, e.g. ``'number'``.
    n_group : int, default 4
        Accepted for interface compatibility; not used.
    subtitle : optional
        Accepted for interface compatibility; not used.
    """
    for key, df in df_dict.items():
        print(key)

        # Gather dates (ascending)
        dates = [pd.Timestamp(x) for x in np.sort(df['draw_date'].unique())]

        # Gather years, sorted so that files are written in chronological order
        years = sorted({x.year for x in dates})

        for year in years:
            # Perform RFM analysis on each date that falls in this year
            yearly_frames = []
            for date in (x for x in dates if x.year == year):
                # Everything drawn up to and including this date. No copy
                # is needed because rfm_value does not modify its input.
                history_df = df[df['draw_date'] <= date]
                period     = history_df['draw_period'].max()

                # Calculate RFM values
                snapshot_df = rfm_value(history_df, groupby=groupby, target_period=period)

                # Collect data; the frames are kept as they are instead of
                # being converted to dicts and back.
                snapshot_df['date'] = date
                yearly_frames.append(snapshot_df)

            # Compile data for each dates
            rfm_df = pd.concat(yearly_frames).reset_index(drop=True)

            # Export moving RFM
            fp.generate_csv(rfm_df,
                            out_path=f'{OUT_PATH_FILE}{key}/Moving RFM/',
                            out_filename=f'{key} - {year}.csv',
                            export_index=False)

In [None]:
# Time the export of the moving RFM files; time_taken prints the duration and beeps.
# NOTE: generate_moving_rfm accepts n_group but does not use it - only raw
# RFM values are exported, quantiles are assigned later.
EXEC_START = time.time()

generate_moving_rfm(df_dict, groupby='number', n_group=10)

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

# Phase 9 - Complete Set
- Load the moving RFM result for the date on which every number has occurred at least once

In [None]:
def load_moving_rfm(company_code, start_year=None, end_year=None):
    """Load the exported moving-RFM files of one company.

    Files are looked up as
    ``<OUT_PATH_FILE><company_code>/Moving RFM/<company_code> - <year>.csv``.
    The year is taken from the last four characters of the file name.

    Parameters
    ----------
    company_code : str
        Company whose files are loaded, e.g. ``'MAG'``.
    start_year, end_year : int, optional
        Inclusive year range; ``None`` leaves that side unbounded.

    Returns
    -------
    pandas.DataFrame
        All selected files stacked in file-name order (i.e. by year). The
        row labels of the individual files are kept, so they repeat.

    Raises
    ------
    ValueError
        If no file matches the company and year range.
    """
    import os

    pattern = f'{OUT_PATH_FILE}{company_code}/Moving RFM/{company_code} - *.csv'

    # Sort the glob result: its order is otherwise file-system dependent.
    files = []
    for file in sorted(glob.glob(pattern)):
        year = int(os.path.splitext(os.path.basename(file))[0][-4:])
        if start_year is not None and year < start_year:
            continue
        if end_year is not None and year > end_year:
            continue
        files.append(file)

    if not files:
        # pd.concat([]) would raise a ValueError as well, but without
        # saying what was searched for.
        raise ValueError(f'No moving RFM file found for pattern "{pattern}" '
                         f'(start_year={start_year}, end_year={end_year})')

    dfs = []
    for file in files:
        print(file)
        df_chunks = pd.read_csv(file, sep=';', dtype={'number': str},
                                chunksize=50_000)
        df = pd.concat(df_chunks)

        # `date_parser` is deprecated since pandas 2.0; converting the column
        # after reading works on old and new pandas versions alike.
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
        dfs.append(df)

    return pd.concat(dfs)

In [None]:
# Company selection: exactly one of the three blocks below should be active.
# Magnum
# - Operating since 1985-04-25; 2013-11-17 is the date by which every number had occurred at least once
# - The last number to occur was 6962
company_code = 'MAG'
rfm_df = load_moving_rfm(company_code, start_year=2013, end_year=2013)
rfm_df = rfm_df[rfm_df['date'] == '2013-11-17'].reset_index(drop=True).copy()

# # Da Ma Cai
# # - Operating since 1990-01-06; 2016-11-27 is the date by which every number had occurred at least once
# # - The last number to occur was 6771
# company_code = 'DMC'
# rfm_df = load_moving_rfm(company_code, start_year=2016, end_year=2016)
# rfm_df = rfm_df[rfm_df['date'] == '2016-11-27'].reset_index(drop=True).copy()

# # Sports Toto
# # - Operating since 1992-05-06; 2015-05-27 is the date by which every number had occurred at least once
# # - The last number to occur was 5488
# company_code = 'ST'
# rfm_df = load_moving_rfm(company_code, start_year=2015, end_year=2015)
# rfm_df = rfm_df[rfm_df['date'] == '2015-05-27'].reset_index(drop=True).copy()

In [None]:
# Assign RFM quantiles (10 groups; recency labels are inverted so that the
# most recently drawn numbers get the highest label)
n_group = 10
rfm_df  = rfm_quantitle(rfm_df, 'recency', n_group=n_group, inverse=True)
rfm_df  = rfm_quantitle(rfm_df, 'frequency', n_group=n_group)
rfm_df  = rfm_quantitle(rfm_df, 'monetary', n_group=n_group)

# Assign RFM segments ('R_F_M' label built from the three quantiles)
rfm_df = rfm_segment(rfm_df)

# Calculate RFM scores (sum of the three quantile labels)
rfm_df = rfm_score(rfm_df)

In [None]:
# Histograms of the RFM columns for the selected company. The title now says
# "Phase 9" like every other plot in this section; it used to say "Phase 8".
histogram({company_code: rfm_df},
          title='Phase 9 - Histogram - Number RFM',
          columns=['number', 'recency', 'frequency', 'monetary',
                   'FST_frequency', 'SCD_frequency', 'TRD_frequency', 'SP_frequency', 'CONS_frequency',
                   'avg_monetary', 'recency_quantile', 'frequency_quantile', 'monetary_quantile',
                   'rfm_segment', 'rfm_score'],
          max_col=3,
          layout_kwargs={'height': 1000})

In [None]:
# Box plots of the RFM columns for the selected company
# (wrapped in a one-entry dict because box() expects {key: DataFrame}).
box({company_code: rfm_df},
    title='Phase 9 - Box - Number RFM',
    max_col=3)

In [None]:
# Distribution of each RFM value within its quantile group for the selected company.
box_rfm({company_code: rfm_df},
        title='Phase 9 - Box - RFM Quantitle',
        to_image=False)

In [None]:
# Split each number into its first two and last two characters; rfm_heatmap
# uses them as the x and y axes.
rfm_df['left_digits']  = rfm_df['number'].str.slice(stop=2)
rfm_df['right_digits'] = rfm_df['number'].str.slice(start=2, stop=4)

In [None]:
# RFM Score: sum of the three quantile labels, shown on the
# left-digits x right-digits grid.
rfm_heatmap({company_code: rfm_df},
            z='rfm_score',
            title='Phase 9 - Heatmap - RFM Score',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Recency Quantile (inverted labels: the most recent numbers carry the highest label).
# NOTE(review): the *_quantile columns are stored as strings by rfm_quantitle;
# confirm vp.heatmap handles a non-numeric z as intended.
rfm_heatmap({company_code: rfm_df},
            z='recency_quantile',
            title='Phase 9 - Heatmap - Recency Quantile',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Frequency Quantile
# NOTE(review): the *_quantile columns are stored as strings by rfm_quantitle;
# confirm vp.heatmap handles a non-numeric z as intended.
rfm_heatmap({company_code: rfm_df},
            z='frequency_quantile',
            title='Phase 9 - Heatmap - Frequency Quantile',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Monetary Quantile
# NOTE(review): the *_quantile columns are stored as strings by rfm_quantitle;
# confirm vp.heatmap handles a non-numeric z as intended.
rfm_heatmap({company_code: rfm_df},
            z='monetary_quantile',
            title='Phase 9 - Heatmap - Monetary Quantile',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Recency: number of draw periods since the number last occurred.
# The colour scale is reversed for this plot only, since a low recency is the "good" end.
rfm_heatmap({company_code: rfm_df},
            z='recency',
            title='Phase 9 - Heatmap - Recency',
            heatmap_kwargs={
                'colorscale': 'RdYlGn',
                'reversescale': True
            },
            to_image=False)

In [None]:
# Frequency: how often each number occurred up to the selected date.
rfm_heatmap({company_code: rfm_df},
            z='frequency',
            title='Phase 9 - Heatmap - Frequency',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Monetary: sum of the prize values attached to each number up to the selected date.
rfm_heatmap({company_code: rfm_df},
            z='monetary',
            title='Phase 9 - Heatmap - Monetary',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

In [None]:
# Category Frequency: one heatmap per prize category.
# Maps the category code used in the '<code>_frequency' column names to the
# label shown in the plot title.
category_dict = {
    'FST': '1st',
    'SCD': '2nd',
    'TRD': '3rd',
    'SP': 'Special',
    'CONS': 'Consolation'
}

for key, value in category_dict.items():
    rfm_heatmap({company_code: rfm_df},
                z=f'{key}_frequency',
                title=f'Phase 9 - Heatmap - Frequency - {value}',
                heatmap_kwargs={'colorscale': 'RdYlGn'},
                to_image=False)

In [None]:
# Average Monetary: monetary divided by frequency, rounded to 2 decimals.
rfm_heatmap({company_code: rfm_df},
            z='avg_monetary',
            title='Phase 9 - Heatmap - Monetary - Average',
            heatmap_kwargs={'colorscale': 'RdYlGn'},
            to_image=False)

# Phase 10 - Feature Engineering
- Load the moving RFM results starting from the date on which every number has occurred at least once

In [None]:
# Company selection: exactly one of the three blocks below should be active.
# Each block loads the yearly moving-RFM files up to 2019 and keeps the rows
# from the company's "complete set" date (see Phase 9) onwards.
# Magnum
company_code = 'MAG'
data_df = load_moving_rfm(company_code, start_year=2013, end_year=2019)
data_df = data_df[data_df['date'] >= '2013-11-17'].reset_index(drop=True).copy()

# # Da Ma Cai
# company_code = 'DMC'
# data_df = load_moving_rfm(company_code, start_year=2016, end_year=2019)
# data_df = data_df[data_df['date'] >= '2016-11-27'].reset_index(drop=True).copy()

# # Sports Toto
# company_code = 'ST'
# data_df = load_moving_rfm(company_code, start_year=2015, end_year=2019)
# data_df = data_df[data_df['date'] >= '2015-05-27'].reset_index(drop=True).copy()

In [None]:
# TODO - create target class