CPD builder

In [None]:
import pandas as pd
from pgmpy.factors.discrete import TabularCPD

class BayesianCPTBuilder:
    """Build pgmpy ``TabularCPD`` tables from a CSV of proportions.

    The CSV is read with ``pandas.read_csv`` and its ``gender``, ``feature``,
    ``median_income`` and ``PROPORTION`` columns are renamed to ``Gender``,
    ``Feature``, ``IncomeLevel`` and ``Proportion``.

    Attributes set by ``__init__``:
        df: the renamed DataFrame.
        genders: sorted unique values of the ``Gender`` column.
        feature_groups: sorted unique values of the ``Feature`` column.
        income_levels: ordered list of IncomeLevel state names.
    """

    # Income brackets used as the IncomeLevel states when the caller gives none.
    # NOTE(review): '130-150k' lacks the 'k' after 130 that the other brackets
    # carry -- confirm it matches the labels actually present in the CSV.
    DEFAULT_INCOME_LEVELS = (
        '<10k', '10k-20k', '20k-30k', '30k-50k', '50k-70k',
        '70k-90k', '90k-110k', '110k-130k', '130-150k', '>150k',
    )

    def __init__(self, filepath, income_levels=None):
        """Load the CSV and record the state names of each variable.

        Args:
            filepath: path (or file-like object) handed to ``pandas.read_csv``.
            income_levels: optional ordered iterable of IncomeLevel state
                names. Defaults to ``DEFAULT_INCOME_LEVELS``.
        """
        self.df = pd.read_csv(filepath).rename(columns={
            'gender': 'Gender',
            'feature': 'Feature',
            'median_income': 'IncomeLevel',
            'PROPORTION': 'Proportion'
        })
        self.genders = sorted(self.df['Gender'].unique())
        self.feature_groups = sorted(self.df['Feature'].unique())
        if income_levels is None:
            income_levels = self.DEFAULT_INCOME_LEVELS
        # Copy so later mutation by the caller cannot change the state order.
        self.income_levels = list(income_levels)

    def build_feature_given_gender_cpd(self, feature_name):
        """Return the CPD of ``feature_name`` conditioned on ``Gender``.

        Rows follow ``self.feature_groups`` and columns follow
        ``self.genders``. Each cell is the ``Proportion`` of the first row
        matching that (gender, feature) pair, or 0.0 when none matches.
        The values are not normalised here.

        Args:
            feature_name: variable name to give the feature node.

        Returns:
            A ``TabularCPD`` with ``Gender`` as its only evidence variable.
        """
        # NOTE(review): only the first matching row is used. If the CSV holds
        # several rows per (gender, feature) pair, e.g. one per income level,
        # a sum may be what is intended -- confirm against the data.
        cpt_matrix = []
        for feature in self.feature_groups:
            row = []
            for gender in self.genders:
                matches = self.df.loc[
                    (self.df['Gender'] == gender) & (self.df['Feature'] == feature),
                    'Proportion',
                ].values
                row.append(matches[0] if len(matches) > 0 else 0.0)
            cpt_matrix.append(row)

        return TabularCPD(
            variable=feature_name,
            variable_card=len(self.feature_groups),
            values=cpt_matrix,
            evidence=['Gender'],
            evidence_card=[len(self.genders)],
            state_names={
                feature_name: self.feature_groups,
                'Gender': self.genders
            }
        )

    def build_income_given_feature_gender_cpd(self, feature_name):
        """Return the CPD of ``IncomeLevel`` given ``feature_name`` and ``Gender``.

        Each cell is the summed ``Proportion`` of the rows matching the
        (feature, gender, income) triple. Every column is then normalised to
        sum to 1; a column with no mass becomes a uniform distribution.

        Args:
            feature_name: variable name of the feature parent node.

        Returns:
            A ``TabularCPD`` with evidence ``[feature_name, 'Gender']``.
        """
        import numpy as np

        # pgmpy lays out columns with the FIRST evidence variable varying
        # slowest, so the feature must be the outer loop and gender the inner
        # one to match evidence=[feature_name, 'Gender'].
        evidence_columns = [
            (feature, gender)
            for feature in self.feature_groups
            for gender in self.genders
        ]
        cpt_array = np.array(
            [
                [
                    self.df.loc[
                        (self.df['Gender'] == gender)
                        & (self.df['Feature'] == feature)
                        & (self.df['IncomeLevel'] == income),
                        'Proportion',
                    ].sum()
                    for feature, gender in evidence_columns
                ]
                for income in self.income_levels
            ],
            dtype=float,
        )

        col_sums = cpt_array.sum(axis=0)
        # A parent combination with no observed mass would divide 0 by 0 and
        # put NaN in the CPD; use a uniform column for it instead.
        empty = col_sums == 0
        cpt_array[:, empty] = 1.0 / len(self.income_levels)
        col_sums[empty] = 1.0
        normalized_matrix = (cpt_array / col_sums).tolist()

        return TabularCPD(
            variable='IncomeLevel',
            variable_card=len(self.income_levels),
            values=normalized_matrix,
            evidence=[feature_name, 'Gender'],
            evidence_card=[len(self.feature_groups), len(self.genders)],
            state_names={
                'IncomeLevel': self.income_levels,
                feature_name: self.feature_groups,
                'Gender': self.genders
            }
        )




In [2]:
## CPT heatmap visualisation (TODO)

# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Load the pre-processed CPT data
# df = pd.read_csv("Age_group_gender_transformed.csv")

# # Rename the columns for easier handling
# df = df.rename(columns={'feature': 'Feature', 'gender': 'Gender', 'PROPORTION': 'Probability'})

# # Build a pivot table: rows are age groups, columns are genders, values are probabilities
# pivot_df = df.pivot(index='Feature', columns='Gender', values='Probability')

# # Set the figure size
# plt.figure(figsize=(8, 10))

# # Draw the heatmap
# sns.heatmap(
#     pivot_df,
#     annot=True, fmt=".2f", cmap="YlGnBu",
#     cbar_kws={'label': 'P(Feature | Gender)'}
# )

# # Set the title and axis labels
# plt.title("Conditional Probability: P(Feature | Gender)", fontsize=14)
# plt.xlabel("Gender")
# plt.ylabel("Age Group")

# # Show the figure
# plt.tight_layout()
# plt.show()


Constructing the CPDs and the corresponding Bayesian network

In [3]:
# Build the Bayesian-network skeleton.
from pgmpy.models import DiscreteBayesianNetwork

# TODO: make the input file paths configurable.
# Age-group table; its per-gender Proportion totals are kept for the Gender prior.
builder = BayesianCPTBuilder("age_gender_transformed_v3.csv")
gender_proportions = builder.df.groupby('Gender')['Proportion'].sum()

# Industry table.
builder2 = BayesianCPTBuilder("industry_gender_transformed_v3.csv")

# P(AgeGroup | Gender) and P(Industry | Gender), each normalised in place.
# The pairwise income CPDs (build_income_given_feature_gender_cpd) are
# currently disabled and not attached to the model here.
cpd_age_gender = builder.build_feature_given_gender_cpd('AgeGroup')
cpd_industry_gender = builder2.build_feature_given_gender_cpd('Industry')
for feature_cpd in (cpd_age_gender, cpd_industry_gender):
    feature_cpd.normalize()

# Structure: Gender -> each feature, and Gender plus both features -> IncomeLevel.
network_edges = [
    ('Gender', 'AgeGroup'),
    ('Gender', 'Industry'),
    ('Gender', 'IncomeLevel'),
    ('AgeGroup', 'IncomeLevel'),
    ('Industry', 'IncomeLevel'),
]
model = DiscreteBayesianNetwork()
model.add_edges_from(network_edges)

# Attach the two feature CPDs.
model.add_cpds(cpd_age_gender, cpd_industry_gender)


Construct the joint distribution over all features

In [4]:
# Build a synthetic joint table with one row per
# (AgeGroup, Industry, Gender, IncomeLevel) combination.
import itertools

# State lists: age groups, genders and income levels come from the age table,
# industries from the industry table.
age_groups = sorted(builder.df['Feature'].unique())
industry_groups = sorted(builder2.df['Feature'].unique())
genders = sorted(builder.df['Gender'].unique())
income_levels = sorted(builder.df['IncomeLevel'].unique())

# Full Cartesian grid of states, with Proportion starting at 0.0 in every row.
combinations = [*itertools.product(age_groups, industry_groups, genders, income_levels)]
df_joint = pd.DataFrame(
    combinations,
    columns=['AgeGroup', 'Industry', 'Gender', 'IncomeLevel'],
).assign(Proportion=0.0)

def get_prop(df, feature, gender, income):
    """Look up the ``Proportion`` for one (feature, gender, income) triple.

    Args:
        df: DataFrame with ``Feature``, ``Gender``, ``IncomeLevel`` and
            ``Proportion`` columns.
        feature: value to match in ``Feature``.
        gender: value to match in ``Gender``.
        income: value to match in ``IncomeLevel``.

    Returns:
        The ``Proportion`` of the first matching row, or 0.0 if no row matches.
    """
    mask = (
        (df['Feature'] == feature)
        & (df['Gender'] == gender)
        & (df['IncomeLevel'] == income)
    )
    hits = df.loc[mask, 'Proportion']
    if hits.empty:
        return 0.0
    return hits.values[0]

def _joint_weight(row):
    """Multiply the age-table and industry-table proportions for one grid row."""
    age_part = get_prop(builder.df, row['AgeGroup'], row['Gender'], row['IncomeLevel'])
    industry_part = get_prop(builder2.df, row['Industry'], row['Gender'], row['IncomeLevel'])
    return age_part * industry_part


# Unnormalised weight of every (AgeGroup, Industry, Gender, IncomeLevel) row.
df_joint['Proportion'] = df_joint.apply(_joint_weight, axis=1)

# Rescale within each (AgeGroup, Industry, Gender) group so the income levels
# of that group sum to 1. A group whose total is 0 yields NaN here.
_evidence_totals = df_joint.groupby(['AgeGroup', 'Industry', 'Gender'])['Proportion'].transform('sum')
df_joint['Proportion'] = df_joint['Proportion'] / _evidence_totals

def build_income_given_age_industry_gender_cpd(df_joint):
    """Return the CPD of ``IncomeLevel`` given ``AgeGroup``, ``Industry`` and ``Gender``.

    Reads the module-level ``income_levels``, ``age_groups``,
    ``industry_groups`` and ``genders`` lists for the state order.

    Args:
        df_joint: DataFrame with ``AgeGroup``, ``Industry``, ``Gender``,
            ``IncomeLevel`` and ``Proportion`` columns.

    Returns:
        A ``TabularCPD`` with evidence ``['AgeGroup', 'Industry', 'Gender']``
        whose columns each sum to 1. A parent combination with no mass gets
        a uniform distribution over the income levels.
    """
    import itertools

    import numpy as np

    # Sum Proportion once per state combination instead of re-filtering the
    # whole frame for every cell. The sum skips NaN, so groups that were 0/0
    # upstream count as zero mass.
    mass = (
        df_joint
        .groupby(['AgeGroup', 'Industry', 'Gender', 'IncomeLevel'])['Proportion']
        .sum()
        .to_dict()
    )

    # pgmpy lays out columns with the FIRST evidence variable varying slowest,
    # so columns must be enumerated in (AgeGroup, Industry, Gender) order to
    # match the evidence list passed below.
    evidence_columns = list(itertools.product(age_groups, industry_groups, genders))
    cpt_array = np.array(
        [
            [mass.get((age, industry, gender, income), 0.0)
             for age, industry, gender in evidence_columns]
            for income in income_levels
        ],
        dtype=float,
    )

    col_sums = cpt_array.sum(axis=0)
    # Columns with no mass become uniform; all others are rescaled to sum to 1.
    empty = col_sums == 0
    cpt_array[:, empty] = 1.0 / len(income_levels)
    col_sums[empty] = 1.0
    normalized_matrix = (cpt_array / col_sums).tolist()

    return TabularCPD(
        variable='IncomeLevel',
        variable_card=len(income_levels),
        values=normalized_matrix,
        evidence=['AgeGroup', 'Industry', 'Gender'],
        evidence_card=[len(age_groups), len(industry_groups), len(genders)],
        state_names={
            'IncomeLevel': income_levels,
            'AgeGroup': age_groups,
            'Industry': industry_groups,
            'Gender': genders
        }
    )
    
# Build the IncomeLevel CPD from the joint table and attach it to the model.
joint_cpd = build_income_given_age_industry_gender_cpd(df_joint)
model.add_cpds(joint_cpd)

Add the prior probability for Gender

In [5]:
# Add the prior distribution over Gender.
from pgmpy.factors.discrete import TabularCPD

# gender_proportions holds raw per-gender sums of Proportion, which are not
# guaranteed to add up to 1. Rescale over the two declared states so the
# prior is a valid distribution.
gender_mass = gender_proportions.loc[['FEMALES', 'MALES']]
gender_prior = gender_mass / gender_mass.sum()

cpd_gender = TabularCPD(
    variable='Gender',
    variable_card=2,
    values=[[float(gender_prior['FEMALES'])], [float(gender_prior['MALES'])]],
    state_names={'Gender': ['FEMALES', 'MALES']}
)
model.add_cpds(cpd_gender)
# Validate the model (every node has a CPD and the structure is acyclic).
assert model.check_model()

In [6]:
# # cpd = builder.build_feature_given_gender_cpd('Industry')
# # cpd.normalize()
# print(joint_cpd)

# # Get the CPT value matrix (each row is a Feature, each column is a Gender)
# values = cpd.get_values()

# # Sum each column (each column corresponds to one Gender)
# import numpy as np
# col_sums = np.sum(values, axis=0)

# # Print the total for each Gender
# for gender, total in zip(cpd.state_names['Gender'], col_sums):
#     print(f"Sum of P(Feature | Gender={gender}): {total:.6f}")
# print("total", np.sum(col_sums))  # print the grand total


In [None]:
from pgmpy.sampling import BayesianModelSampling

# Build a forward sampler over the network held in `model`.
sampling_inference = BayesianModelSampling(model)

# Draw 100000 samples. This call defines `samples`; with it commented out the
# lines below raise NameError on a fresh kernel.
samples = sampling_inference.forward_sample(size=100000)

# `samples` is a pandas DataFrame with one sampled assignment per row.
print(samples.head())
samples.to_csv("test_samples_100000.csv", index=False)


  0%|          | 0/4 [00:00<?, ?it/s]





    Gender        AgeGroup                                         Industry  \
0    MALES  25 to 29 years                                     Construction   
1  FEMALES  50 to 54 years                Health care and social assistance   
2    MALES  50 to 54 years  Professional, scientific and technical services   
3  FEMALES  21 to 24 years                  Accommodation and food services   
4    MALES  25 to 29 years                   Finance and insurance services   

  IncomeLevel  
0     20k-30k  
1        <10k  
2     20k-30k  
3     50k-70k  
4     50k-70k  
