# 数据准备

In [1]:
import pandas as pd

# Load both input tables. The files may be UTF-8 (optionally with a BOM) or GBK,
# so each read tries UTF-8 first and only falls back to GBK on a decode error.
def _read_csv_utf8_or_gbk(path):
    """Read a CSV as 'utf-8-sig'; retry with 'gbk' if decoding fails.

    A UnicodeDecodeError raised by the GBK attempt is not caught and propagates.
    """
    try:
        return pd.read_csv(path, encoding='utf-8-sig')
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='gbk')


mobile_services = _read_csv_utf8_or_gbk('mobile_services.csv')
mobile_trans_level = _read_csv_utf8_or_gbk('mobile_trans_level.csv')




In [2]:
# Preview the first five rows of the raw service records.
mobile_services.head(n=5)

Unnamed: 0,CUSTID(客户编号),userType(客户类型),datetime(交易日期),NUMSRVID(产品编号),srvClass(产品类型)
0,685521,全球通,1451210488,,G
1,697155,全球通,1451223882,,G
2,704827,动感地,1451210519,,G
3,809077,动感地,1450295753,0.0,G
4,818875,动感地,1450449626,0.0,G


In [3]:
# Strip the Chinese annotation from the headers, e.g. 'CUSTID(客户编号)' -> 'CUSTID'.
expected_columns = ['CUSTID', 'userType', 'datetime', 'NUMSRVID', 'srvClass']

# Take the text before the first parenthesis of every current header (full-width
# parentheses are normalised first). Already-renamed headers pass through
# unchanged, so re-running this cell is safe.
ascii_prefixes = [
    str(col).replace('（', '(').split('(')[0].strip()
    for col in mobile_services.columns
]

# Assigning names purely by position would silently mislabel the data if the CSV
# layout ever changed, so fail loudly instead.
if ascii_prefixes != expected_columns:
    raise ValueError(
        f"Unexpected columns in mobile_services: {list(mobile_services.columns)}"
    )

mobile_services.columns = expected_columns

In [4]:
# Preview the first five rows of the customer-by-item transaction table.
mobile_trans_level.head(n=5)

Unnamed: 0,CUSTID,0,216,170,232,159,3,1,230,155,...,455,130,181,G,D,A,B,C,E,F
0,809077,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,818875,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,818961,1,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,818989,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,847000,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


# 挖掘客户类型与具体产品之间的多维关联规则

In [5]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Multi-dimensional rules: customer type (userType) combined with products (NUMSRVID).

# One-hot encode both dimensions. Rows whose NUMSRVID is missing get no product
# flag, because get_dummies ignores NaN by default.
df = pd.get_dummies(
    mobile_services[['CUSTID', 'userType', 'NUMSRVID']],
    columns=['userType', 'NUMSRVID'],
)

# Collapse to one row per customer: a flag is True if the customer has at least
# one record with that value (equivalent to summing and capping at 1).
# apriori expects a boolean frame; 0/1 integer input takes a slower path and
# triggers a deprecation warning in mlxtend.
grouped = df.groupby('CUSTID').max().astype(bool)

# Frequent itemsets with support of at least 2% of customers.
frequent_itemsets = apriori(grouped, min_support=0.02, use_colnames=True)

# Keep rules whose confidence is at least 0.5.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules




Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(userType_全球通),(NUMSRVID_159.0),0.333814,0.952582,0.318682,0.954668,1.002189,0.000696,1.045998,0.003279
1,(userType_动感地),(NUMSRVID_159.0),0.330448,0.952582,0.31445,0.951586,0.998954,-0.000329,0.979424,-0.001561
2,(userType_神州行),(NUMSRVID_159.0),0.335738,0.952582,0.319451,0.95149,0.998853,-0.000367,0.977473,-0.001726
3,(NUMSRVID_153.0),(NUMSRVID_159.0),0.055144,0.952582,0.054888,0.995349,1.044895,0.002358,10.194768,0.045474
4,(NUMSRVID_229.0),(NUMSRVID_159.0),0.022186,0.952582,0.021545,0.971098,1.019437,0.000411,1.640646,0.019499
5,(NUMSRVID_232.0),(NUMSRVID_159.0),0.082043,0.952582,0.079863,0.973427,1.021882,0.00171,1.784433,0.023328
6,(NUMSRVID_229.0),(NUMSRVID_232.0),0.022186,0.082043,0.02209,0.995665,12.135904,0.02027,211.742105,0.93842
7,"(NUMSRVID_232.0, userType_全球通)",(NUMSRVID_159.0),0.027636,0.952582,0.026899,0.973318,1.021768,0.000573,1.777126,0.021909
8,"(NUMSRVID_232.0, userType_动感地)",(NUMSRVID_159.0),0.026418,0.952582,0.025648,0.970874,1.019202,0.000483,1.628002,0.019351
9,"(userType_神州行, NUMSRVID_232.0)",(NUMSRVID_159.0),0.027989,0.952582,0.027316,0.975945,1.024525,0.000654,1.971214,0.024628


# 挖掘产品之间的一维关联规则

In [6]:
# Single-dimensional rules: associations between products (NUMSRVID) only.

# One-hot encode the product id. Rows with a missing NUMSRVID contribute no flag,
# because get_dummies ignores NaN by default.
df = pd.get_dummies(mobile_services[['CUSTID', 'NUMSRVID']], columns=['NUMSRVID'])

# One row per customer; a flag is True if the customer has that product at least
# once (equivalent to summing and capping at 1). apriori expects a boolean frame;
# 0/1 integer input takes a slower path and triggers a deprecation warning.
grouped = df.groupby('CUSTID').max().astype(bool)

# Frequent itemsets with support of at least 2% of customers.
frequent_itemsets = apriori(grouped, min_support=0.02, use_colnames=True)

# Keep rules whose confidence is at least 0.5.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules




Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(NUMSRVID_153.0),(NUMSRVID_159.0),0.055144,0.952582,0.054888,0.995349,1.044895,0.002358,10.194768,0.045474
1,(NUMSRVID_229.0),(NUMSRVID_159.0),0.022186,0.952582,0.021545,0.971098,1.019437,0.000411,1.640646,0.019499
2,(NUMSRVID_232.0),(NUMSRVID_159.0),0.082043,0.952582,0.079863,0.973427,1.021882,0.00171,1.784433,0.023328
3,(NUMSRVID_229.0),(NUMSRVID_232.0),0.022186,0.082043,0.02209,0.995665,12.135904,0.02027,211.742105,0.93842
4,"(NUMSRVID_229.0, NUMSRVID_159.0)",(NUMSRVID_232.0),0.021545,0.082043,0.021513,0.998512,12.170608,0.019745,616.867173,0.938045
5,"(NUMSRVID_229.0, NUMSRVID_232.0)",(NUMSRVID_159.0),0.02209,0.952582,0.021513,0.973875,1.022353,0.00047,1.815037,0.022358
6,(NUMSRVID_229.0),"(NUMSRVID_159.0, NUMSRVID_232.0)",0.022186,0.079863,0.021513,0.969653,12.14149,0.019741,30.320712,0.938458


# 挖掘层次关联规则

In [7]:
# Hierarchical rules over the customer-by-item table (product ids and class letters).

# Reuse the frame loaded in the data-preparation cell instead of reading
# 'mobile_trans_level.csv' a second time with another copy of the encoding fallback.
trans_data = mobile_trans_level

# One row per customer, one column per item.
flags = trans_data.set_index('CUSTID')

# apriori only accepts 0/1 (or boolean) values. Check this explicitly so that the
# boolean cast below cannot hide unexpected values.
if not flags.isin([0, 1]).all().all():
    raise ValueError("mobile_trans_level contains values other than 0 and 1")

# A boolean frame avoids mlxtend's slower path and deprecation warning for
# integer input.
basket = flags.astype(bool)

# Frequent itemsets with support of at least 6% of customers.
frequent_itemsets = apriori(basket, min_support=0.06, use_colnames=True)

# Keep rules whose confidence is at least 0.5.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules




Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(232),(159),0.082198,0.954388,0.080014,0.973427,1.019949,0.001565,1.716496,0.021311
1,(232),(A),0.082198,0.96139,0.080078,0.974209,1.013333,0.001054,1.497007,0.014336
2,(B),(232),0.083387,0.082198,0.082198,0.985747,11.992296,0.075344,64.394946,1.0
3,(232),(B),0.082198,0.083387,0.082198,1.0,11.992296,0.075344,inf,0.998705
4,(A),(159),0.96139,0.954388,0.954388,0.992716,1.04016,0.036849,6.262267,1.0
5,(159),(A),0.954388,0.96139,0.954388,1.0,1.04016,0.036849,inf,0.846479
6,(B),(159),0.083387,0.954388,0.080881,0.969954,1.01631,0.001298,1.518069,0.017508
7,(B),(A),0.083387,0.96139,0.081042,0.97188,1.010911,0.000875,1.373028,0.011775
8,"(A, 232)",(159),0.080078,0.954388,0.080014,0.999198,1.046952,0.003588,56.855647,0.04875
9,"(159, 232)",(A),0.080014,0.96139,0.080014,1.0,1.04016,0.003089,inf,0.041968


# 评估和优化关联规则

In [8]:
# Keep only the rules that have both high support and high confidence.
# Author's note (unverified): confidence thresholds from 0.5 to 0.98 all kept 34 rules.
has_high_support = rules['support'] >= 0.08
has_high_confidence = rules['confidence'] >= 0.99
filtered_rules = rules[has_high_support & has_high_confidence]

# Show the surviving rules.
filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
3,(232),(B),0.082198,0.083387,0.082198,1.0,11.992296,0.075344,inf,0.998705
4,(A),(159),0.96139,0.954388,0.954388,0.992716,1.04016,0.036849,6.262267,1.0
5,(159),(A),0.954388,0.96139,0.954388,1.0,1.04016,0.036849,inf,0.846479
8,"(A, 232)",(159),0.080078,0.954388,0.080014,0.999198,1.046952,0.003588,56.855647,0.04875
9,"(159, 232)",(A),0.080014,0.96139,0.080014,1.0,1.04016,0.003089,inf,0.041968
12,"(159, 232)",(B),0.080014,0.083387,0.080014,1.0,11.992296,0.073342,inf,0.996334
17,"(A, 232)",(B),0.080078,0.083387,0.080078,1.0,11.992296,0.073401,inf,0.996404
21,"(A, B)",(159),0.081042,0.954388,0.080881,0.998018,1.045716,0.003536,23.015932,0.047572
22,"(159, B)",(A),0.080881,0.96139,0.080881,1.0,1.04016,0.003123,inf,0.042007
25,"(A, 159, 232)",(B),0.080014,0.083387,0.080014,1.0,11.992296,0.073342,inf,0.996334


In [9]:
# Rank the filtered rules from the highest lift to the lowest.
filtered_rules = filtered_rules.sort_values(by=['lift'], ascending=[False])
filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
29,"(A, 232)","(159, B)",0.080078,0.080881,0.080014,0.999198,12.353862,0.073537,1145.681325,0.999056
31,"(159, 232)","(A, B)",0.080014,0.081042,0.080014,1.0,12.339279,0.07353,inf,0.998883
3,(232),(B),0.082198,0.083387,0.082198,1.0,11.992296,0.075344,inf,0.998705
12,"(159, 232)",(B),0.080014,0.083387,0.080014,1.0,11.992296,0.073342,inf,0.996334
17,"(A, 232)",(B),0.080078,0.083387,0.080078,1.0,11.992296,0.073401,inf,0.996404
25,"(A, 159, 232)",(B),0.080014,0.083387,0.080014,1.0,11.992296,0.073342,inf,0.996334
8,"(A, 232)",(159),0.080078,0.954388,0.080014,0.999198,1.046952,0.003588,56.855647,0.04875
26,"(A, B, 232)",(159),0.080078,0.954388,0.080014,0.999198,1.046952,0.003588,56.855647,0.04875
21,"(A, B)",(159),0.081042,0.954388,0.080881,0.998018,1.045716,0.003536,23.015932,0.047572
4,(A),(159),0.96139,0.954388,0.954388,0.992716,1.04016,0.036849,6.262267,1.0


In [17]:
import pandas as pd
import networkx as nx
from pyecharts import options as opts
from pyecharts.charts import Graph



# Build the srvClass -> NUMSRVID hierarchy and render it as an interactive graph.

# Derive the (product, class) pairs from complete rows only, without rebinding
# `mobile_services`. Overwriting the shared frame here would change the results
# of the rule-mining cells above if they were re-run afterwards.
# drop_duplicates keeps the first occurrence, so pair order follows the data.
classification_relations = (
    mobile_services
    .dropna()[['NUMSRVID', 'srvClass']]
    .drop_duplicates()
)

# Directed graph with one edge per unique pair, pointing from class to product.
# Labels are the plain str() of each value, so float product ids render as
# e.g. '216.0'.
G = nx.DiGraph()
G.add_edges_from(
    (str(srv_class), str(product_id))
    for product_id, srv_class in classification_relations.itertuples(index=False, name=None)
)

# Convert the graph into the node/link dictionaries that pyecharts expects.
nodes = [{"name": str(node)} for node in G.nodes]
links = [{"source": str(source), "target": str(target)} for source, target in G.edges]

# Debug output of the nodes and edges.
print("Nodes:", nodes)
print("Edges:", links)

# Draw the graph with pyecharts.
graph = (
    Graph()
    .add(
        "",
        nodes,
        links,
        repulsion=8000,
        edge_symbol=['circle', 'arrow'],
        edge_symbol_size=[4, 10],
        linestyle_opts=opts.LineStyleOpts(color="source", curve=0.3),
        label_opts=opts.LabelOpts(is_show=True, position="right"),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Classification Relationship Graph"),
        toolbox_opts=opts.ToolboxOpts(),
        tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b}"),
    )
)

# Write the chart to an HTML file; render() returns the path of the written file.
graph.render("classification_relationship.html")



Nodes: [{'name': 'G'}, {'name': '0.0'}, {'name': 'D'}, {'name': '216.0'}, {'name': 'A'}, {'name': '170.0'}, {'name': 'B'}, {'name': '232.0'}, {'name': '159.0'}, {'name': '3.0'}, {'name': '1.0'}, {'name': '230.0'}, {'name': '155.0'}, {'name': '213.0'}, {'name': 'C'}, {'name': '229.0'}, {'name': '153.0'}, {'name': 'E'}, {'name': '597.0'}, {'name': 'F'}, {'name': '542.0'}, {'name': '544.0'}, {'name': '596.0'}, {'name': '119.0'}, {'name': '548.0'}, {'name': '227.0'}, {'name': '225.0'}, {'name': '245.0'}, {'name': '543.0'}, {'name': '545.0'}, {'name': '121.0'}, {'name': '553.0'}, {'name': '589.0'}, {'name': '555.0'}, {'name': '226.0'}, {'name': '573.0'}, {'name': '560.0'}, {'name': '540.0'}, {'name': '215.0'}, {'name': '575.0'}, {'name': '455.0'}, {'name': '130.0'}, {'name': '181.0'}]
Edges: [{'source': 'G', 'target': '0.0'}, {'source': 'G', 'target': '3.0'}, {'source': 'G', 'target': '455.0'}, {'source': 'D', 'target': '216.0'}, {'source': 'D', 'target': '213.0'}, {'source': 'D', 'target':

'/Users/L.J.Wang/Desktop/Assignment2/classification_relationship.html'