In [4]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [5]:
# Load the payment table from a CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, and the folder name
# "fianl_project" looks like a typo of "final_project" — confirm the location
# before running this notebook on another machine.
data = pd.read_csv("C:/Users/KK Chan/Desktop/作業/fianl_project/payment.csv")

In [8]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102925 entries, 0 to 102924
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   customer_unique_id  102925 non-null  object
 1   payment_value       102925 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


In [18]:
# Show the dtype of each column.
data.dtypes

customer_unique_id    object
payment_value         object
dtype: object

In [6]:
# Cast every payment_value entry to str so the column holds text only.
data['payment_value'] = data['payment_value'].astype(str)

In [7]:
# Parse payment_value into numbers; entries that cannot be parsed become NaN.
payment_text = data['payment_value']
data['payment_value'] = pd.to_numeric(payment_text, errors='coerce')

In [29]:
# Count the payment_value entries that are NaN.
data['payment_value'].isna().sum()

1

In [8]:
# Drop, in place, every row that has a missing value in any column.
data.dropna(inplace=True)

In [40]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102924 entries, 0 to 102924
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   customer_unique_id  102924 non-null  object 
 1   payment_value       102924 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.4+ MB


In [9]:
# Feature matrix: the payment amount as a single-column 2-D array.
X = data[['payment_value']].to_numpy()

In [10]:
# Standardize the feature: fit the scaler on X, then rescale X with it.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [11]:
# Cluster the standardized payments into five groups with K-means.
# random_state is fixed so repeated runs start from the same initialisation.
# NOTE(review): n_init is left at the library default, which is not the same
# in every scikit-learn release — consider pinning it explicitly.
kmeans = KMeans(random_state=42, n_clusters=5)
kmeans.fit(X=X_scaled)

In [12]:
# Cluster index that the fitted model assigned to each sample.
labels = kmeans.labels_

In [13]:
# Map each cluster index to a spending-tier name.
#
# K-means numbers its clusters arbitrarily, so a hard-coded index -> tier table
# is only valid for one particular fit. The mapping is instead derived from the
# fitted centroids: clusters are ranked by centroid value (standardization is
# monotonic, so this is also the ranking by payment amount) and the tier names
# are handed out from the lowest-paying cluster to the highest-paying one.
tier_names = ['Normal', 'High', 'VIP', 'SVIP', 'SSVIP']  # ascending payment

# Cluster indices sorted by ascending centroid (one feature -> flatten to 1-D).
centroid_order = kmeans.cluster_centers_.ravel().argsort()

# zip() would silently truncate, so fail loudly if the cluster count changes.
if len(centroid_order) != len(tier_names):
    raise ValueError(
        f"expected {len(tier_names)} clusters, got {len(centroid_order)}"
    )

label_map = {
    int(cluster_id): tier
    for cluster_id, tier in zip(centroid_order, tier_names)
}

In [14]:
# Translate each sample's cluster index into its tier name and store the
# result as a new column.
data['level_label'] = list(map(label_map.__getitem__, labels))

In [15]:
# Print how many rows fall into each tier, sorted by tier name in
# descending order.
tier_sizes = data['level_label'].value_counts()
level_counts = tier_sizes.sort_index(ascending=False)
print(level_counts)

level_label
VIP        4805
SVIP       1163
SSVIP       241
Normal    70733
High      25982
Name: count, dtype: int64


In [16]:
# Print the smallest and largest payment observed within each tier.
payments_by_level = data.groupby('level_label')['payment_value']
level_ranges = payments_by_level.agg(['min', 'max'])
print(level_ranges)

                 min       max
level_label                   
High          149.40    394.85
Normal          0.00    149.39
SSVIP        1925.81  13664.08
SVIP          898.51   1916.96
VIP           394.94    896.37


In [17]:
# Display the frame, which now includes the level_label column.
data

Unnamed: 0,customer_unique_id,payment_value,level_label
0,ffffd2657e2aad2907e67c3e9daecbeb,71.56,Normal
1,ffff5962728ec6157033ef9805bacc48,133.69,Normal
2,ffff371b4d645b6ecea244b27531430a,112.46,Normal
3,fffea47cd6d3cc0a88bd621562a9d061,84.58,Normal
4,fffcf5a5ff07b0908bd4e2dbc735a684,2067.42,SSVIP
...,...,...,...
102920,0004aac84e0df4da2b147fca70cf8255,196.89,High
102921,0000f6ccb0745a6a4b88665a16c9f078,43.62,Normal
102922,0000f46a3911fa3c0805444483337064,86.22,Normal
102923,0000b849f77a49e4a4ce2b2a4ca5be3f,27.19,Normal


In [64]:
# Save the labelled frame, row index included, to a new CSV file.
output_csv = 'payment_clustered.csv'
data.to_csv(output_csv, index=True)