# Packages

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import Birch
from sklearn import metrics

from tqdm import tqdm
import pandas as pd
import numpy as np
import itertools

# Read file


In [2]:
# Load the feature table used by every later cell; the path is relative to
# the notebook's working directory.
# The original note here read "308891" -- presumably the expected row count
# of this file; TODO confirm.
data = pd.read_csv(filepath_or_buffer='../feature_data/HappyGo_NOV_DEC_0509_FV.csv')

In [3]:
# Feature set 1 -- all numeric features (44 columns).
feature_cols1 = [
    # INV_* columns
    "INV_total_amount", "INV_avg_amount", "INV_count", "INV_seller_count",
    "INV_industry_count",
    "INV_industry_per_catering", "INV_industry_per_entertainment",
    "INV_industry_per_financial_insurance", "INV_industry_per_health_and_beauty",
    "INV_industry_per_manufacture", "INV_industry_per_other",
    "INV_industry_per_professional_services", "INV_industry_per_retail_service",
    "INV_industry_per_transportation", "INV_industry_per_wholesale",
    "INV_time_avg_period",
    # APP_* columns
    "APP_participate_count", "APP_bnnr_count", "APP_game_count",
    "APP_bnnr_per", "APP_game_per", "APP_time_avg_period",
    # TXN_* columns
    "TXN_total_count", "TXN_earn_count", "TXN_redeem_count",
    "TXN_earn_count_per", "TXN_redeem_count_per",
    "TXN_total_point", "TXN_earn_point", "TXN_redeem_point",
    "TXN_earn_point_per", "TXN_redeem_point_per",
    "TXN_industry_count",
    "TXN_industry_per_retail", "TXN_industry_per_other",
    "TXN_industry_per_dingding", "TXN_industry_per_financial",
    "TXN_industry_per_digit", "TXN_industry_per_transportation",
    "TXN_industry_per_leisure", "TXN_industry_per_health",
    "TXN_industry_per_professional", "TXN_industry_per_food",
    "TXN_time_avg_period",
]

# Feature set 2 -- basic spending and interaction information (8 columns).
feature_cols2 = [
    "INV_total_amount", "INV_count", "INV_time_avg_period",
    "APP_participate_count", "APP_time_avg_period",
    "TXN_total_count", "TXN_total_point", "TXN_time_avg_period",
]

# Feature set 3 -- based on spending channels (19 columns).
feature_cols3 = [
    "INV_avg_amount", "INV_count", "INV_industry_count",
    "INV_industry_per_retail_service", "INV_industry_per_wholesale",
    "INV_industry_per_professional_services", "INV_industry_per_catering",
    "INV_industry_per_entertainment", "INV_industry_per_manufacture",
    "INV_industry_per_other", "INV_industry_per_transportation",
    "INV_industry_per_health_and_beauty", "INV_industry_per_financial_insurance",
    "INV_time_avg_period",
    "APP_participate_count", "APP_time_avg_period",
    "TXN_total_count", "TXN_total_point", "TXN_time_avg_period",
]

# Feature set 4 -- based on HappyGo platform interaction channels (25 columns).
feature_cols4 = [
    "INV_total_amount", "INV_count", "INV_time_avg_period",
    "APP_participate_count", "APP_bnnr_per", "APP_game_per", "APP_time_avg_period",
    "TXN_total_count", "TXN_earn_count_per", "TXN_redeem_count_per",
    "TXN_total_point", "TXN_earn_point_per", "TXN_redeem_point_per",
    "TXN_industry_count",
    "TXN_industry_per_retail", "TXN_industry_per_other",
    "TXN_industry_per_dingding", "TXN_industry_per_financial",
    "TXN_industry_per_digit", "TXN_industry_per_transportation",
    "TXN_industry_per_leisure", "TXN_industry_per_health",
    "TXN_industry_per_professional", "TXN_industry_per_food",
    "TXN_time_avg_period",
]

# Standardization & PCA

In [4]:
# Standardize each active feature set. A single scaler instance is re-fitted
# for every set (in the order 1, 3, 4), so after this cell `scaler` holds the
# statistics of feature_cols4 only. Feature set 2 is not standardized here.
scaler = StandardScaler()
data_std1, data_std3, data_std4 = [
    scaler.fit_transform(data[cols])
    for cols in (feature_cols1, feature_cols3, feature_cols4)
]

In [5]:
def _full_pca_summary(data_std):
    """Fit a full PCA and summarize its cumulative explained variance.

    Returns a tuple ``(pca, cumulative, counts)`` where ``pca`` is the fitted
    estimator, ``cumulative`` is the running sum of
    ``explained_variance_ratio_`` and ``counts`` holds every 1-based component
    count whose cumulative ratio is still below 0.9.
    """
    fitted = PCA()
    fitted.fit(data_std)
    cumulative = fitted.explained_variance_ratio_.cumsum()
    # np.where gives 0-based positions; +1 turns them into component counts.
    counts = np.where(cumulative < 0.9)[0] + 1
    return fitted, cumulative, counts


# Feature set 2 is skipped here, matching the standardization cell.
pca1, pca_cumsum1, pca_num1 = _full_pca_summary(data_std1)
pca3, pca_cumsum3, pca_num3 = _full_pca_summary(data_std3)
pca4, pca_cumsum4, pca_num4 = _full_pca_summary(data_std4)

# Parameter combinations (參數排列組合)

In [None]:
# Hyper-parameter grid: len(pca_num) x 27 combinations (3 cluster counts x
# 3 thresholds x 3 branching factors), i.e. 108 runs per feature set.
pca_num = [1, 2, 3, 4]
cluster_num = [10, 20, 30]
max_radius = [0.5, 0.7, 0.9]
max_subcluster_num = [10000, 15000, 20000]

# Tuples are ordered (pca components, n_clusters, threshold, branching factor)
# with the right-most value varying fastest.
parameter_set1 = [
    (n_comp, n_clust, radius, branching)
    for n_comp in pca_num
    for n_clust in cluster_num
    for radius in max_radius
    for branching in max_subcluster_num
]
# Feature sets 3 and 4 use the same grid (independent list copies);
# feature set 2 has no grid because it is not evaluated.
parameter_set3 = list(parameter_set1)
parameter_set4 = list(parameter_set1)

# Build the Birch model 

In [None]:
# parameter setting

def run_model(set, data_std):
    """Run PCA + Birch for one hyper-parameter combination and score it.

    Parameters
    ----------
    set : sequence of four values
        ``(pn, cn, T, b)`` -- number of PCA components, Birch ``n_clusters``,
        Birch ``threshold`` and Birch ``branching_factor``. The name shadows
        the ``set`` builtin inside this function; it is kept unchanged so
        existing callers keep working.
    data_std : array-like
        Feature matrix handed to PCA (the notebook passes standardized data).

    Returns
    -------
    tuple
        ``(pn, cn, T, b, CH_score, labels)`` where ``CH_score`` is the
        Calinski-Harabasz score of the clustering rounded to 4 decimals and
        ``labels`` is the value returned by ``Birch.fit_predict``.
    """
    # get the parameter
    pn, cn, T, b = set

    # PCA: project the data onto the first `pn` components
    pca = PCA(n_components=pn)
    pca.fit(data_std)
    data_pca = pca.transform(data_std)

    # build the model and evaluate
    model = Birch(threshold=T, branching_factor=b, n_clusters=cn)
    # fit_predict() fits the model itself, so a separate fit() call beforehand
    # only repeated the (expensive) clustering and doubled the run time.
    labels = model.fit_predict(data_pca)
    CH_score = round(metrics.calinski_harabasz_score(data_pca, labels), 4)

    return pn, cn, T, b, CH_score, labels

In [None]:
def _run_grid(parameter_set, data_std):
    """Call run_model for every parameter tuple, showing a tqdm progress bar."""
    return [run_model(params, data_std) for params in tqdm(parameter_set)]


# One list of (pn, cn, T, b, CH_score, labels) tuples per feature set.
# The three runs stay separate statements so results finished before a
# failure remain available.
result1 = _run_grid(parameter_set1, data_std1)
result3 = _run_grid(parameter_set3, data_std3)
result4 = _run_grid(parameter_set4, data_std4)

 51%|█████     | 51/100 [1:42:03<1:38:02, 120.06s/it]
  0%|          | 0/100 [1:43:04<?, ?it/s]
100%|██████████| 50/50 [1:40:33<00:00, 120.66s/it]


In [None]:
# Column names for the tuples returned by run_model, in the same order.
result_columns = ['pca_num', 'n', 'T', 'b', 'CH_score', 'labels']

# One result table per evaluated feature set.
df1 = pd.DataFrame(result1, columns=result_columns)
df3 = pd.DataFrame(result3, columns=result_columns)
df4 = pd.DataFrame(result4, columns=result_columns)

In [None]:
def _labels_as_text(results_df):
    """Return a copy of ``results_df`` with each ``labels`` entry as one string.

    ``to_csv`` writes object cells via ``str()``, and NumPy abbreviates the
    text form of large arrays with ``...``, so writing the label arrays
    directly would store only their first and last few values. Joining the
    labels with single spaces keeps every value; read one back with
    ``np.array(text.split(), dtype=int)``.
    """
    return results_df.assign(
        labels=results_df['labels'].map(
            lambda cluster_ids: ' '.join(map(str, cluster_ids))
        )
    )


# Same file names and columns as before; only the `labels` cells change from
# an abbreviated array repr to the complete space-separated label list.
_labels_as_text(df1).to_csv("all_numeric_features.csv", index=False)
_labels_as_text(df3).to_csv("buy_way_features.csv", index=False)
_labels_as_text(df4).to_csv("happy_way_features.csv", index=False)