In [None]:
import pandas as pd

# Load the three source tables.
# NOTE: "your_path" is a placeholder directory; point it at the real data folder before running.
orders_df = pd.read_csv("your_path/orders.csv")
member_df = pd.read_csv("your_path/member.csv")
behavior_202207_df = pd.read_csv("your_path/behavior_202207.csv")

# Preview each table. A notebook cell only auto-renders its LAST expression, and this
# cell ends with an assignment, so bare `.head()` calls would show nothing; use display().
display(orders_df.head())
display(member_df.head())
display(behavior_202207_df.head())

# All three tables share the 'customer_id' column; inner-join them on it.
# The suffixes are the pandas defaults, spelled out because later cells reference
# the suffixed columns 'created_by_x' and 'created_by_y'.
merged_df = pd.merge(orders_df, member_df, on='customer_id', how='inner', suffixes=('_x', '_y'))
final_df = pd.merge(merged_df, behavior_202207_df, on='customer_id', how='inner', suffixes=('_x', '_y'))

In [None]:
# Count the distinct values in each object-dtype column of final_df
categorical_columns = []
for column_name, column_dtype in final_df.dtypes.items():
    if column_dtype == "object":
        categorical_columns.append(column_name)

for column_name in categorical_columns:
    # dropna=False keeps a missing value counted as one category, like len(unique())
    distinct_count = final_df[column_name].nunique(dropna=False)
    print("# unique values in '{0}': {1}".format(column_name, distinct_count))

In [None]:
#各欄位敘述統計量
import matplotlib.pyplot as plt
import seaborn as sns
# Use one shared three-colour palette for the charts below
colors = ["#4492C0", "#EDB120", "#78C3D0"]
sns.set_palette(sns.color_palette(colors))
# Columns whose value frequencies are charted and printed
columns_to_plot = ['created_by_x', 'created_by_y', 'gender', 'event_name']
# One figure per column: a count plot plus a printed frequency table
for col in columns_to_plot:
    # Human-readable label, computed once and reused for axis, title and printout
    col_label = col.replace('_', ' ').title()
    fig, ax = plt.subplots(figsize=(8, 6), dpi=100)
    # Draw the count plot on the explicit Axes
    sns.countplot(data=final_df, x=col, ax=ax)
    # Axis labels and title
    ax.set_xlabel(col_label, fontsize=12, fontweight='bold')
    ax.set_ylabel('Count', fontsize=12, fontweight='bold')
    ax.set_title(f'Frequency of {col_label}', fontsize=14, fontweight='bold')
    # Print the number of rows for each value of the column
    # (value_counts leaves out missing values by default)
    col_counts = final_df[col].value_counts()
    for value, count in col_counts.items():
        print(f'{col_label} {value} - {count}')
    # Render the figure
    plt.show()

In [None]:
# Count plots of event_name, split by created_by_x, created_by_y and gender (row counts, not sales amounts)
import matplotlib.pyplot as plt
import seaborn as sns
# Shared palette and seaborn theme for every chart in this cell
colors = ["#4492C0", "#EDB120", "#78C3D0"]
sns.set(style="whitegrid", palette=sns.color_palette(colors))


def plot_event_counts(data, hue_col, hue_label):
    """Draw a count plot of `event_name` with the bars split by `hue_col`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an 'event_name' column and the `hue_col` column.
    hue_col : str
        Name of the column used to split (colour) the bars.
    hue_label : str
        Human-readable name of `hue_col`, used in the title and legend title.

    The figure is shown immediately; nothing is returned.
    """
    # NOTE(review): the palette has three colours; a hue column with more
    # levels than that may not get distinct colours -- confirm the level counts.
    plt.figure(figsize=(10, 6), dpi=100)
    sns.countplot(data=data, x='event_name', hue=hue_col, palette=sns.color_palette(colors))
    plt.xlabel('Event Name', fontsize=12, fontweight='bold')
    plt.ylabel('Count', fontsize=12, fontweight='bold')
    plt.title(f'Sales of Event Name Based on {hue_label}', fontsize=14, fontweight='bold')
    # Place the legend outside the axes, to the right, so it does not cover the bars
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), title=hue_label, title_fontsize='12', fontsize='10')
    plt.xticks(rotation=45)
    plt.show()


# One chart per grouping column: (column name, label shown in title/legend)
for hue_col, hue_label in [
    ('created_by_x', 'Created By X'),
    ('created_by_y', 'Created By Y'),
    ('gender', 'Gender'),
]:
    plot_event_counts(final_df, hue_col, hue_label)

In [None]:
#檢查欄位相關性
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# Assumes final_df already exists and contains the columns listed below
categorical_vars = [ 'created_by_x', 'channel_created_by_channel_name', 'order_payment_type', 'sku',
                    'item_variation_id', 'created_by_y', 'gender', 'membership_tier', 'device_category',
                    'device_mobile_brand_name', 'device_browser', 'device_language', 'country', 'source', 'medium',
                    'event_name', 'item_id', 'partition_date']
# Pre-allocate a float matrix filled with NaN; the diagonal is intentionally left as NaN
corr_matrix = pd.DataFrame(np.nan, index=categorical_vars, columns=categorical_vars, dtype=float)
# Chi-square statistic for every pair of categorical variables.
# Both mirrored cells are filled from one computation, so each unordered pair
# is evaluated only once instead of once per ordering.
for position, var1 in enumerate(categorical_vars):
    for var2 in categorical_vars[position + 1:]:
        contingency_table = pd.crosstab(final_df[var1], final_df[var2])
        chi2, p, dof, ex = chi2_contingency(contingency_table)
        corr_matrix.loc[var1, var2] = round(chi2, 2)
        corr_matrix.loc[var2, var1] = round(chi2, 2)
# Draw the heatmap.
# NOTE(review): the cells hold raw chi-square statistics, which grow with sample
# size and table dimensions, so magnitudes are not directly comparable between pairs.
plt.figure(figsize=(16, 14))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', cbar_kws={'label': 'Chi2 Statistic'})
plt.title('Categorical Variables Correlation Heatmap', fontsize=16, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.show()


In [None]:
#分類別及連續資料，再次繪製相關性圖
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
# Assumes final_df already exists and contains the columns listed below
continuous_vars = ['created_at', 'updated_at', 'registered_at', 'event_timestamp', 'birthday']
# NOTE(review): customer_id and transaction_id are presumably near-unique identifiers;
# their contingency tables may be very large -- confirm they belong in this list.
categorical_vars = ['customer_id', 'created_by_x', 'channel_created_by_channel_name', 'order_payment_type', 'sku',
                    'item_variation_id', 'created_by_y', 'gender', 'membership_tier', 'device_category',
                    'device_mobile_brand_name', 'device_browser', 'device_language', 'country', 'source', 'medium',
                    'event_name', 'item_id', 'transaction_id', 'partition_date']


def _to_unix_seconds(series):
    """Return `series` as numbers usable for correlation and quantile binning.

    Numeric input is returned unchanged. Anything else (datetime64 values or
    date-like strings) is parsed with pd.to_datetime, where unparseable entries
    become NaT, and expressed as float seconds since 1970-01-01 UTC (NaT -> NaN).
    """
    if pd.api.types.is_numeric_dtype(series):
        return series
    parsed = pd.to_datetime(series, errors="coerce", utc=True)
    return (parsed - pd.Timestamp("1970-01-01", tz="UTC")).dt.total_seconds()


# Build the numeric time columns in a separate frame so final_df is never modified
# and this cell gives the same result every time it is re-run.
continuous_df = pd.DataFrame({var: _to_unix_seconds(final_df[var]) for var in continuous_vars})

# Pre-allocate the full matrix (continuous first, then categorical), filled with NaN
all_vars = continuous_vars + categorical_vars
corr_matrix = pd.DataFrame(np.nan, index=all_vars, columns=all_vars, dtype=float)

# Continuous vs continuous: pairwise correlation from DataFrame.corr()
pearson = continuous_df.corr().reindex(index=continuous_vars, columns=continuous_vars)
corr_matrix.loc[continuous_vars, continuous_vars] = pearson.to_numpy()

# Categorical vs categorical: chi-square statistic, each unordered pair computed once
# and mirrored; the categorical diagonal stays NaN.
for position, var1 in enumerate(categorical_vars):
    for var2 in categorical_vars[position + 1:]:
        contingency_table = pd.crosstab(final_df[var1], final_df[var2])
        chi2, p, dof, ex = chi2_contingency(contingency_table)
        corr_matrix.loc[var1, var2] = chi2
        corr_matrix.loc[var2, var1] = chi2

# Continuous vs categorical: bin the continuous variable into (up to) 10 quantile
# buckets, then use the chi-square statistic. The binning depends only on the
# continuous variable, so it is done once per variable rather than once per pair.
for var1 in continuous_vars:
    binned = pd.qcut(continuous_df[var1], q=10, duplicates='drop')
    for var2 in categorical_vars:
        contingency_table = pd.crosstab(final_df[var2], binned)
        chi2, p, dof, ex = chi2_contingency(contingency_table)
        corr_matrix.loc[var1, var2] = chi2
        corr_matrix.loc[var2, var1] = chi2

# Draw the heatmap.
# NOTE(review): correlation coefficients and raw chi-square statistics share one
# colour scale here although they are on very different numeric scales.
plt.figure(figsize=(16, 14))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', cbar_kws={'label': 'Correlation Coefficient / Chi2 Statistic'})
plt.title('Correlation Heatmap', fontsize=16, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.show()