In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Load the synthetic dataset (object-typed attribute columns plus `median_income`).
df = pd.read_csv('assuming_independent_initial_synthetic.csv')

print("\n数据列信息:")
# DataFrame.info() prints by itself and returns None, so it is not wrapped in print().
df.info()

# Check every column for missing values.
print("\n各列缺失值数量:")
print(df.isnull().sum())

# Turn the continuous income into a categorical variable.
print("\n收入描述性统计:")
print(df['median_income'].describe())

# Quintile binning. The interval bins are computed once, shown, and then renamed
# to readable labels (lowest interval -> 'very_low_income', and so on upward).
income_labels = ['very_low_income', 'low_income', 'medium_income', 'high_income', 'very_high_income']
income_bins = pd.qcut(df['median_income'], q=len(income_labels))
print(income_bins.unique())
df['income_level'] = income_bins.cat.rename_categories(income_labels)

# Show the first rows with the new column to confirm the binning worked.
print("\n分箱后数据前5行:")
print(df.head())
print("\n分箱后各收入组的样本数量:")
print(df['income_level'].value_counts())

features = ['gender', 'Age', 'Arrival_group', 'Business_size', 'Industry', 'Institutional_sector', 'Job_duration', 'Visa_group']
target = 'income_level'

# Keep only the modelling columns and drop rows with missing values.
# `df` itself is left untouched (original string labels intact) so that later
# cells can still derive new targets from `income_level`.
model_df = df[features + [target]].dropna()

# Integer-encode every categorical column into a separate frame.
# One encoder per feature, so each column's code -> label mapping stays available.
encoded_df = model_df.copy()
feature_encoders = {}
for col in features:
    feature_encoders[col] = LabelEncoder()
    encoded_df[col] = feature_encoders[col].fit_transform(model_df[col])

# `le` ends up fitted on the target, so le.classes_ maps the class indices used
# in downstream reports back to income labels. LabelEncoder orders classes by
# sorted label name, not by income rank.
le = LabelEncoder()
encoded_df[target] = le.fit_transform(model_df[target])
print("\n目标变量编码 (编码 -> 收入等级):")
print(dict(enumerate(le.classes_)))

# Train / test split (70 / 30, fixed seed).
X = encoded_df[features]
y = encoded_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


数据列信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98993 entries, 0 to 98992
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gender                98993 non-null  object 
 1   median_income         98993 non-null  float64
 2   Age                   98993 non-null  object 
 3   Arrival_group         98993 non-null  object 
 4   Business_size         98993 non-null  object 
 5   Industry              98991 non-null  object 
 6   Institutional_sector  98993 non-null  object 
 7   Job_duration          98993 non-null  object 
 8   Visa_group            98993 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.8+ MB
None

各列缺失值数量:
gender                  0
median_income           0
Age                     0
Arrival_group           0
Business_size           0
Industry                2
Institutional_sector    0
Job_duration            0
Visa_group              0
dtype: int64

收入描述性统计:

In [7]:
# Fit a seeded 100-tree random forest on the training split.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Score the held-out split.
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"随机森林模型的准确率: {accuracy_rf:.2f}")

rf_report = classification_report(y_test, y_pred_rf)
print("\n随机森林分类报告:", rf_report, sep="\n")

# Feature importances: the forest's per-feature importance scores, labelled with
# the training column names and ordered from largest to smallest.
feature_importances = pd.Series(data=rf.feature_importances_, index=X.columns)
sorted_importances = feature_importances.sort_values(ascending=False)

print("\n随机森林的特征重要性:", sorted_importances, sep="\n")

随机森林模型的准确率: 0.96

随机森林分类报告:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      5983
           1       0.99      0.92      0.95      5854
           2       0.98      0.96      0.97      5956
           3       0.98      0.94      0.96      5941
           4       0.93      0.99      0.96      5964

    accuracy                           0.96     29698
   macro avg       0.96      0.96      0.96     29698
weighted avg       0.96      0.96      0.96     29698


随机森林的特征重要性:
Age                     0.207447
Industry                0.187997
Job_duration            0.168295
Institutional_sector    0.140164
Visa_group              0.112606
Arrival_group           0.102541
Business_size           0.062281
gender                  0.018670
dtype: float64


In [None]:
# Binary target: high income (top two quintiles) vs. not high income.
#
# The frame is rebuilt from the CSV so this cell does not depend on whatever
# state earlier cells left `df` in; the mapping below needs the original string
# labels of `income_level`, and string keys match nothing once that column has
# been integer-encoded.
df = pd.read_csv('assuming_independent_initial_synthetic.csv')
df['income_level'] = pd.qcut(df['median_income'], q=5, labels=['very_low_income', 'low_income', 'medium_income', 'high_income', 'very_high_income'])

income_group_map = {
    'very_low_income': 'not_high_income',
    'low_income': 'not_high_income',
    'medium_income': 'not_high_income',
    'high_income': 'high_income',
    'very_high_income': 'high_income'
}
# Map through plain objects rather than Series.replace: replace() on a
# categorical column is deprecated when it merges categories.
df['income_group'] = df['income_level'].astype(object).map(income_group_map)

# Original (un-engineered) features.
features = ['gender', 'Age', 'Arrival_group', 'Business_size', 'Industry', 'Institutional_sector', 'Job_duration', 'Visa_group']
new_target = 'income_group'

# Drop rows with missing values in any modelling column.
df = df[features + [new_target]].dropna().copy()

# Integer-encode the categorical features, one encoder per column so each
# column's mapping remains recoverable.
feature_encoders = {col: LabelEncoder() for col in features}
for col, encoder in feature_encoders.items():
    df[col] = encoder.fit_transform(df[col])

# Encode the target with its own encoder and keep the fitted class order:
# position i of `classes_` is the label behind the integer class i.
le_target = LabelEncoder()
df[new_target] = le_target.fit_transform(df[new_target])
actual_class_names = le_target.classes_
print("目标变量的实际编码顺序:", actual_class_names)

# Train / test split (70 / 30, fixed seed).
X = df[features]
y = df[new_target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Shallow entropy-based tree (depth capped at 5).
dt_model = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)

# Predict and evaluate; the report is labelled with class names instead of 0/1.
y_pred_dt = dt_model.predict(X_test)
print(f"决策树模型的准确率: {accuracy_score(y_test, y_pred_dt):.2f}")
print("\n决策树分类报告:")
print(classification_report(y_test, y_pred_dt, target_names=actual_class_names))

# Visualise the fitted tree.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    dt_model,
    filled=True,
    feature_names=list(X.columns),
    class_names=list(actual_class_names),
    fontsize=8,
    ax=ax,
)
ax.set_title("Decision Tree: High vs Not High Income")
plt.show()

In [None]:
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Exhaustive search over every non-empty subset of the eight features
# (2**8 - 1 = 255 random forests), ranked by accuracy on the held-out split.
df = pd.read_csv('assuming_independent_initial_synthetic.csv')

df['income_level'] = pd.qcut(df['median_income'], q=5, labels=['very_low_income', 'low_income', 'medium_income', 'high_income', 'very_high_income'])

income_group_map = {
    'very_low_income': 'not_high_income',
    'low_income': 'not_high_income',
    'medium_income': 'not_high_income',
    'high_income': 'high_income',
    'very_high_income': 'high_income'
}
# Map through plain objects rather than Series.replace: replace() on a
# categorical column is deprecated when it merges categories.
df['income_group'] = df['income_level'].astype(object).map(income_group_map)

all_features = ['gender', 'Age', 'Arrival_group', 'Business_size', 'Industry', 'Institutional_sector', 'Job_duration', 'Visa_group']
target = 'income_group'

df = df[all_features + [target]].dropna().copy()

# Integer-encode each column with its own encoder; `le` stays fitted on the
# target so le.classes_ gives the label behind each class index.
for col in all_features:
    df[col] = LabelEncoder().fit_transform(df[col])
le = LabelEncoder()
df[target] = le.fit_transform(df[target])

# Split once, outside the loops: with a fixed random_state the row partition
# does not depend on which columns are selected, so every subset is evaluated
# on the same train / test rows as before.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    df[all_features], df[target], test_size=0.3, random_state=42
)

# Collected (feature list, accuracy) pairs, plus the subsets that failed.
results = []
failed_combos = []

# All feature combinations - 255 in total.
# NOTE(review): subsets are ranked on the same test split that scores them, so
# the top accuracies are optimistically biased; use cross-validation or a
# separate validation split before reporting them.
for r in range(1, len(all_features) + 1):
    for combo in itertools.combinations(all_features, r):
        cols = list(combo)
        try:
            # n_jobs=-1 trains the trees on all cores; the fixed random_state
            # keeps the fitted forest deterministic.
            model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
            model.fit(X_train_all[cols], y_train)
            acc = accuracy_score(y_test, model.predict(X_test_all[cols]))
        except Exception as e:
            # Best effort: one failing subset must not abort the whole sweep,
            # but it is recorded so the failure is not lost in the output.
            print(f"combination {combo} error: {e}")
            failed_combos.append(combo)
        else:
            results.append((cols, acc))


top_10 = sorted(results, key=lambda x: x[1], reverse=True)[:10]
print("\n Top 10 combination:")
for i, (combo, acc) in enumerate(top_10, 1):
    print(f"{i}. {combo} → accuracy: {acc:.4f}")


  df['income_group'] = df['income_level'].replace({
