In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# 데이터셋 로드
file_path = 'income.csv'
data = pd.read_csv(file_path)

# 나이 계산 및 비현실적 나이 값 제거
data['age'] = data['year'] - data['year_born']
data = data[(data['age'] >= 0) & (data['age'] <= 120)]

# 음수 수입 값 처리
data['income'] = data['income'].apply(lambda x: x if x >= 0 else None)

# 필수 컬럼에서 NaN 값 제거
data.dropna(subset=['region', 'income', 'family_member', 'gender', 'education_level', 'marriage', 'religion', 'age', 'company_size'], inplace=True)

# 지역 이동 확인 함수 정의 및 적용 (원-핫 인코딩 전에 적용)
def check_region_movement(df):
    df.sort_values('year', inplace=True)
    df['previous_region'] = df['region'].shift(1)
    # 이전 지역이 NaN인 경우 현재 지역으로 설정
    df['previous_region'] = df['previous_region'].fillna(df['region'])
    df['region_movement'] = df.apply(lambda row: row['region'] if row['region'] != row['previous_region'] else None, axis=1)
    return df

data = data.groupby('id').apply(check_region_movement).reset_index(drop=True)

# 필요없는 컬럼 제거
data.drop(columns=['year_born'], inplace=True)

# OneHotEncoder 설정
encoder = OneHotEncoder()
categorical_columns = ['region', 'gender', 'marriage', 'religion']
data_encoded = encoder.fit_transform(data[categorical_columns])
column_names = encoder.get_feature_names_out(categorical_columns)

# 원-핫 인코딩 데이터를 DataFrame으로 변환
data_encoded = pd.DataFrame(data_encoded.toarray(), columns=column_names)

# 인코딩되지 않은 데이터와 병합
data = pd.concat([data.reset_index(drop=True), data_encoded.reset_index(drop=True)], axis=1)
data.drop(columns=categorical_columns, inplace=True)  # 원본 범주형 컬럼 삭제

# 수정된 데이터를 CSV 파일로 저장
output_file_path = 'Processed_Korea_Income_and_Welfare.csv'
data.to_csv(output_file_path, index=False)

print("데이터 처리가 완료되었으며, 다음 파일에 저장되었습니다:", output_file_path)


데이터 처리가 완료되었으며, 다음 파일에 저장되었습니다: Processed_Korea_Income_and_Welfare.csv


In [2]:
#시각화

In [3]:
#위와 동일하되 SMOTE사용

In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, make_scorer, f1_score
from imblearn.over_sampling import SMOTE

# 수치형 데이터 정규화
scaler = MinMaxScaler()
numeric_columns = ['income', 'family_member', 'age']
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# 타겟 변수 설정
data['region_movement'] = data['region_movement'].apply(lambda x: 1 if pd.notna(x) else 0)

# 학습에 사용할 피처 선택
feature_columns = numeric_columns + [col for col in data.columns if 'region_' in col and col != 'region_movement']
X = data[feature_columns]
y = data['region_movement']

# 데이터 분할
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# SMOTE 적용
smote = SMOTE(random_state=random_state)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# 커스텀 스코어로 F1 스코어 사용
f1_scorer = make_scorer(f1_score, pos_label=1)

# 1. 로지스틱 회귀
log_reg_model = LogisticRegression(random_state=random_state, max_iter=1000, class_weight='balanced')
log_reg_scores = cross_val_score(log_reg_model, X_train_res, y_train_res, cv=5, scoring=f1_scorer)
log_reg_model.fit(X_train_res, y_train_res)
y_pred_log_reg = log_reg_model.predict(X_test)
print(f"Logistic Regression Cross-Validation F1 Scores: {log_reg_scores}")
print(f"Logistic Regression Average Cross-Validation F1 Score: {log_reg_scores.mean()}")
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log_reg, zero_division=1))

# # 2. 서포트 벡터 머신
# svm_model = SVC(random_state=random_state, class_weight='balanced')
# svm_scores = cross_val_score(svm_model, X_train_res, y_train_res, cv=5, scoring=f1_scorer)
# svm_model.fit(X_train_res, y_train_res)
# y_pred_svm = svm_model.predict(X_test)
# print(f"SVM Cross-Validation F1 Scores: {svm_scores}")
# print(f"SVM Average Cross-Validation F1 Score: {svm_scores.mean()}")
# print("SVM Classification Report:")
# print(classification_report(y_test, y_pred_svm, zero_division=1))

# 3. 그래디언트 부스팅
gb_model = GradientBoostingClassifier(random_state=random_state)
gb_scores = cross_val_score(gb_model, X_train_res, y_train_res, cv=5, scoring=f1_scorer)
gb_model.fit(X_train_res, y_train_res)
y_pred_gb = gb_model.predict(X_test)
print(f"Gradient Boosting Cross-Validation F1 Scores: {gb_scores}")
print(f"Gradient Boosting Average Cross-Validation F1 Score: {gb_scores.mean()}")
print("Gradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb, zero_division=1))

# 4. 랜덤 포레스트
rf_model = RandomForestClassifier(n_estimators=100, random_state=random_state, class_weight='balanced')
rf_scores = cross_val_score(rf_model, X_train_res, y_train_res, cv=5, scoring=f1_scorer)
rf_model.fit(X_train_res, y_train_res)
y_pred_rf = rf_model.predict(X_test)
print(f"Random Forest Cross-Validation F1 Scores: {rf_scores}")
print(f"Random Forest Average Cross-Validation F1 Score: {rf_scores.mean()}")
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, zero_division=1))

# 5. 의사결정트리
dt_model = DecisionTreeClassifier(random_state=random_state, class_weight='balanced')
dt_scores = cross_val_score(dt_model, X_train_res, y_train_res, cv=5, scoring=f1_scorer)
dt_model.fit(X_train_res, y_train_res)
y_pred_dt = dt_model.predict(X_test)
print(f"Decision Tree Cross-Validation F1 Scores: {dt_scores}")
print(f"Decision Tree Average Cross-Validation F1 Score: {dt_scores.mean()}")
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt, zero_division=1))




Logistic Regression Cross-Validation F1 Scores: [0.65494374 0.65896301 0.6612722  0.66379246 0.65760585]
Logistic Regression Average Cross-Validation F1 Score: 0.6593154531262624
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.65      0.78      8192
           1       0.03      0.66      0.05       124

    accuracy                           0.65      8316
   macro avg       0.51      0.65      0.42      8316
weighted avg       0.98      0.65      0.77      8316

Gradient Boosting Cross-Validation F1 Scores: [0.75694077 0.77646528 0.77976012 0.77194293 0.76708653]
Gradient Boosting Average Cross-Validation F1 Score: 0.7704391251985119
Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.75      0.85      8192
           1       0.03      0.49      0.06       124

    accuracy                           0.75      8316
   macro avg     

In [6]:

# 테스트 데이터 셋 중 100개의 샘플 출력 및 예측 실행
num_samples = 100
test_samples = X_test.iloc[:num_samples]
true_labels = y_test.iloc[:num_samples]

print("\nTest Samples:")
print(test_samples)

# 각 모델에 대해 예측 실행 및 출력
models = {
    "Logistic Regression": log_reg_model,
    "Gradient Boosting": gb_model,
    "Random Forest": rf_model,
    "Decision Tree": dt_model,
}

predictions = {model_name: model.predict(test_samples) for model_name, model in models.items()}

# 예측값과 실제값 비교 출력
for i in range(num_samples):
    print(f"\nSample {i+1}:")
    print(f"True Label: {true_labels.iloc[i]}")
    for model_name, preds in predictions.items():
        print(f"{model_name} Prediction: {preds[i]}")


Test Samples:
         income  family_member       age  region_Chung-nam  \
4283   0.015055          0.375  0.278481               0.0   
32678  0.012883          0.125  0.708861               1.0   
38163  0.016256          0.250  0.202532               0.0   
20909  0.009493          0.000  0.848101               0.0   
216    0.011637          0.125  0.544304               0.0   
...         ...            ...       ...               ...   
9103   0.039063          0.375  0.329114               0.0   
34009  0.004435          0.000  0.379747               0.0   
5186   0.003927          0.000  0.772152               0.0   
17722  0.009574          0.250  0.430380               0.0   
3000   0.012549          0.375  0.240506               0.0   

       region_Gang-won & Chung-buk  region_Jeolla & Jeju  region_Kyeong-gi  \
4283                           0.0                   0.0               0.0   
32678                          0.0                   0.0               0.0   
38163 