In [3]:
import pandas as pd
import os

In [12]:
def get_feature_stats(df, feature_name, group_col='feature'):
    """
    Summarise one feature: job-count share and job-weighted mean income per group.

    Rows with a missing value in ``group_col``, ``'number_of_jobs'`` or
    ``'median_income'`` are dropped before either statistic is computed, so
    ``prob`` is the share of jobs among the rows that have complete data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``group_col``, ``'number_of_jobs'`` and
        ``'median_income'``. The caller's frame is not modified.
    feature_name : str
        Not used by the computation; kept so existing callers that pass it
        keep working.
    group_col : str, default 'feature'
        Column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of ``group_col`` with two columns:
        ``'prob'`` (the group's ``number_of_jobs`` total divided by the grand
        total) and ``'mean_income'`` (``median_income`` averaged with
        ``number_of_jobs`` as the weight). A group whose weights sum to zero
        gets NaN for ``mean_income``. An input with no complete rows yields an
        empty frame that still has both columns.
    """
    weight_col = 'number_of_jobs'
    income_col = 'median_income'

    # Keep only rows where the group key, the weight and the income are all present.
    complete = df.dropna(subset=[group_col, weight_col, income_col])
    group_keys = complete[group_col]
    weights = complete[weight_col]

    # Probability distribution: each group's share of the total job count.
    weight_per_group = weights.groupby(group_keys).sum()
    prob = weight_per_group / weight_per_group.sum()

    # Weighted mean income: sum(weight * income) / sum(weight) per group.
    # Two vectorised grouped sums replace groupby(...).apply(...), which is
    # slower, warns on recent pandas when the callback sees the grouping
    # column, and does not return a Series for an empty input.
    weighted_income_per_group = (weights * complete[income_col]).groupby(group_keys).sum()
    mean_income = weighted_income_per_group / weight_per_group

    # Combine both statistics, aligned on the group index.
    stats = pd.DataFrame({
        'prob': prob,
        'mean_income': mean_income
    })
    return stats

# csv_path = r"E:\git\DataScience_Project_Team_25\data\pre_transform\Age group_gender.csv"
# df = pd.read_csv(csv_path)
# stats = get_feature_stats(df)
# print(stats)

In [14]:
# Directory that is scanned for per-feature CSV files.
folder = r"E:\git\DataScience_Project_Team_25\data\pre_transform"
# Only files ending in this suffix are summarised; the text before the suffix
# is used as the feature name.
gender_suffix = '_gender.csv'
# Feature names that are skipped by this report.
excluded_features = ('Employment size', 'Job duration')

# os.listdir returns entries in arbitrary order, so sort the listing to keep
# the printed report in the same order on every run.
for fname in sorted(os.listdir(folder)):
    if not fname.endswith(gender_suffix):
        continue
    # Strip the suffix by its actual length rather than a hard-coded count.
    feature_name = fname[:-len(gender_suffix)]
    if feature_name in excluded_features:
        continue
    fpath = os.path.join(folder, fname)
    df = pd.read_csv(fpath)
    stats = get_feature_stats(df, feature_name=feature_name, group_col='feature')
    print(f"==== {feature_name} ====")
    print(stats)
    print()

==== Age group ====
                        prob   mean_income
feature                                   
14 years and under  0.001448   1137.289386
15 to 17 years      0.022079   2828.853417
18 to 20 years      0.059226   6568.220902
21 to 24 years      0.099093  12854.342791
25 to 29 years      0.133123  21219.066763
30 to 34 years      0.122570  31899.392273
35 to 39 years      0.109689  40428.532942
40 to 44 years      0.094967  45027.931578
45 to 49 years      0.093249  46873.064480
50 to 54 years      0.084626  47077.209990
55 to 59 years      0.075155  45722.038404
60 to 64 years      0.055522  40140.383193
65 to 69 years      0.028512  27427.184164
70 to 74 years      0.012088  15091.587712
75 to 79 years      0.004795   9660.065858
80 to 84 years      0.002230   7137.806440
85 years and over   0.001630   7241.402214

==== Arrival Group ====
                prob   mean_income
feature                           
0-5 years   0.185226  29606.286945
11+ years   0.497538  36925.99074