In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, Binarizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

# 1. 데이터 로드
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country', 'income']

# na_values = ? 는 ?로 되어있는 값들을 None값으로 처리한다는 의미
data = pd.read_csv(url, header=None, names=columns, na_values='?', skipinitialspace=True)

# 결측치 제거
data.dropna(inplace=True)

# 이상치 제거
Q1 = data['fnlwgt'].quantile(0.25)
Q3 = data['fnlwgt'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
capital_fnlwgt_outliers = data[(data['fnlwgt'] < lower_bound)]
data = data.drop(capital_fnlwgt_outliers.index)

Q1 = data['capital-gain'].quantile(0.25)
Q3 = data['capital-gain'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
capital_gain_outliers = data[(data['capital-gain'] < lower_bound) | (data['capital-gain'] > upper_bound)]
capital_loss_outliers = data[(data['capital-loss'] < lower_bound) | (data['capital-loss'] > upper_bound)]
data = data.drop(capital_gain_outliers.index)
data = data.drop(capital_loss_outliers.index)

# native-country 지역별로 그룹화
regions = {
  'North America': ['United-States', 'Cuba', 'Jamaica', 'Mexico', 'Puerto-Rico', 'Canada',
                    'Dominican-Republic', 'El-Salvador', 'Guatemala', 'Honduras', 'Haiti',
                    'Trinadad&Tobago', 'Outlying-US(Guam-USVI-etc)'],
  'South America': ['Colombia', 'Ecuador', 'Peru'],
  'Europe': ['England', 'Germany', 'Poland', 'Portugal', 'France', 'Italy', 'Scotland',
              'Greece', 'Ireland', 'Hungary', 'Yugoslavia'],
  'Asia': ['India', 'Iran', 'Philippines', 'Cambodia', 'Thailand', 'Laos', 'Taiwan',
            'Japan', 'China', 'Vietnam', 'Hong']
}

# 순수익
data['capital_gain_loss'] = data['capital-gain'] - data['capital-loss']

# 데이터 프레임에 새로운 'Region' 열 추가
data['region'] = None
for region, countries in regions.items():
  data.loc[data['native-country'].isin(countries), 'region'] = region

# Marital-status 결혼여부 추가
data['marital_group'] = data['marital-status'].apply(lambda x: 'Married' if x.startswith('Married') else 'Not Married')

# 가족 여부 추가
data['family_group'] = data['relationship'].apply(lambda x: 'Not Family' if x.startswith('Not') or x.startswith('Un') else 'Family')

# 범주형 변수 인코딩
categorical_features = ['race', 'sex', 'workclass', 'marital-status',
                        'occupation', 'relationship', 'native-country', 'education','marital_group',
                        'family_group', 'region', 'capital-gain']
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

data['income'] = data['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)

# 불필요 변수
data.drop(['fnlwgt'], axis=1, inplace=True)

# 학습용과 테스트용 데이터셋으로 나누기
X = data.drop('income', axis=1)
y = data['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# 데이터 표준화
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

models = {
  'XGBoost': (XGBClassifier(), {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 4, 5]
  }),
  'LightGBM': (LGBMClassifier(verbose=-1), {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 4, 5]
  }),
}

random_states = np.arange(1,21)

results = []

# 하이퍼파라미터 튜닝 및 모델 학습
for random_state in random_states:
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  for model_name, (model, params) in models.items():
    grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
    results.append({
        'Model': model_name,
        'Random State': random_state,
        'Best Parameters': grid_search.best_params_,
        'Accuracy': accuracy,
        'ROC AUC': roc_auc
    })

# 가장 성능이 좋은 모델 찾기
best_result = max(results, key=lambda x: x['Accuracy'])

# 결과 출력
for result in results:
     print(f'Model: {result["Model"]}, Random State: {result["Random State"]}, Best Parameters: {result["Best Parameters"]}, Accuracy: {result["Accuracy"]}, ROC AUC: {result["ROC AUC"]}')

print("\nBest Model Configuration:")
print(f'Model: {best_result["Model"]}, Random State: {best_result["Random State"]}, Best Parameters: {best_result["Best Parameters"]}, Accuracy: {best_result["Accuracy"]}, ROC AUC: {best_result["ROC AUC"]}')

# Model: XGBoost, Random State: 14, Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}, Accuracy: 0.8650763358778626, ROC AUC: 0.8984273438004782

Model: XGBoost, Random State: 1, Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}, Accuracy: 0.8606870229007634, ROC AUC: 0.8954398565603452
Model: LightGBM, Random State: 1, Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}, Accuracy: 0.8587786259541985, ROC AUC: 0.8954743720667477
Model: XGBoost, Random State: 2, Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}, Accuracy: 0.8488549618320611, ROC AUC: 0.8935740916517508
Model: LightGBM, Random State: 2, Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}, Accuracy: 0.8479007633587786, ROC AUC: 0.8945640017114163
Model: XGBoost, Random State: 3, Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}, Accuracy: 0.857442748091603, ROC AUC: 0.897487868544282
Model: LightGBM, Random State: 3, Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}, Accuracy: 0.8561068702290077, ROC AUC: 0.89

In [30]:
# 1. 데이터 로드
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country', 'income']

# na_values = ? 는 ?로 되어있는 값들을 None값으로 처리한다는 의미
data = pd.read_csv(url, header=None, names=columns, na_values='?', skipinitialspace=True)

# 결측치 제거
data.dropna(inplace=True)

print(data.income.unique())


['<=50K' '>50K']
