In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, message=".*xgboost.*")

In [2]:
import pandas as pd

df = pd.read_csv('titanic_cleaned.csv')


In [3]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.2500,False,False,True,False,False,False,False,True
1,1,1,1,38.0,1,0,71.2833,False,False,False,True,False,True,False,False
2,1,3,1,26.0,0,0,7.9250,False,True,False,False,False,False,False,True
3,1,1,1,35.0,1,0,53.1000,False,False,False,True,False,False,False,True
4,0,3,0,35.0,0,0,8.0500,False,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,False,False,False,False,True,False,False,True
887,1,1,1,19.0,0,0,30.0000,False,True,False,False,False,False,False,True
888,0,3,1,18.0,1,2,23.4500,False,True,False,False,False,False,False,True
889,1,1,0,26.0,0,0,30.0000,False,False,True,False,False,True,False,False


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop('Survived', axis=1)
y = df['Survived']

# 스탠다드스케일러 적용
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 스케일링된 데이터를 사용하여 train_test_split 적용
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [5]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (712, 14)
X_test shape: (179, 14)
y_train shape: (712,)
y_test shape: (179,)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

def train_and_evaluate_logistic_regression(X_train, X_test, y_train, y_test, model=None):
    # 모델이 주어지지 않으면 기본 로지스틱 리그레이션 모델 생성
    if model is None:
        model = LogisticRegression(max_iter=100)
    
    # 모델 학습
    model.fit(X_train, y_train)
    
    # 예측
    y_pred = model.predict(X_test)
    # predict_proba는 각 클래스에 속할 확률을 반환하며, predict는 가장 높은 확률의 클래스를 반환
    # 예시: predict_proba = [[0.2, 0.8], [0.6, 0.4]], predict = [1, 0]
    # 여기서는 양성 클래스(1)의 확률을 추출
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # 평가지표 계산
    acc = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auroc = roc_auc_score(y_test, y_pred_proba)
    
    return model, acc, recall, precision, f1, auroc


In [7]:
def add_evaluation_to_df(results_df, test_name, acc, recall, precision, f1, auroc):
    # 평가지표들을 데이터프레임에 하나의 row로 추가
    new_row = {
        'test_name': test_name,
        'accuracy': acc,
        'recall': recall,
        'precision': precision,
        'f1_score': f1,
        'roc_auc': auroc
    }
    return pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)

In [8]:
# 함수 호출
model, acc, recall, precision, f1, auroc = train_and_evaluate_logistic_regression(X_train, X_test, y_train, y_test)

In [9]:
# 초기 결과 데이터프레임 생성
results_df = pd.DataFrame(columns=['test_name', 'accuracy', 'recall', 'precision', 'f1_score', 'roc_auc'])

In [10]:
# 함수 호출하여 결과 추가
results_df = add_evaluation_to_df(results_df, 'base line', acc, recall, precision, f1, auroc)

# 결과 출력
results_df

Unnamed: 0,test_name,accuracy,recall,precision,f1_score,roc_auc
0,base line,0.826816,0.783784,0.794521,0.789116,0.895367


In [11]:
# 모델 리스트와 이름 정의
models = []
max_iter_list = [50, 200, 400, 1000]
for max_iter in max_iter_list:
    models.append((LogisticRegression(max_iter=max_iter), f'lr_max_iter_{max_iter}'))

for model, name in models:
    model, acc, recall, precision, f1, auroc = train_and_evaluate_logistic_regression(X_train, X_test, y_train, y_test, model)
    results_df = add_evaluation_to_df(results_df, name, acc, recall, precision, f1, auroc)

results_df

Unnamed: 0,test_name,accuracy,recall,precision,f1_score,roc_auc
0,base line,0.826816,0.783784,0.794521,0.789116,0.895367
1,lr_max_iter_50,0.826816,0.783784,0.794521,0.789116,0.895367
2,lr_max_iter_200,0.826816,0.783784,0.794521,0.789116,0.895367
3,lr_max_iter_400,0.826816,0.783784,0.794521,0.789116,0.895367
4,lr_max_iter_1000,0.826816,0.783784,0.794521,0.789116,0.895367


In [12]:
# 다른 트리 모델들을 사용하여 성능 테스트

# 필요한 라이브러리 임포트
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# 모델 리스트와 이름 정의
tree_models = [
    (DecisionTreeClassifier(), 'decision_tree'),
    (RandomForestClassifier(), 'random_forest'),
    (GradientBoostingClassifier(), 'gradient_boosting'),
    (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 'xgboost')
]

for model, name in tree_models:
    model, acc, recall, precision, f1, auroc = train_and_evaluate_logistic_regression(X_train, X_test, y_train, y_test, model)
    results_df = add_evaluation_to_df(results_df, name, acc, recall, precision, f1, auroc)





In [13]:
results_df.sort_values(by='recall', ascending=False)

Unnamed: 0,test_name,accuracy,recall,precision,f1_score,roc_auc
7,gradient_boosting,0.865922,0.824324,0.847222,0.835616,0.907658
8,xgboost,0.843575,0.824324,0.802632,0.813333,0.914414
5,decision_tree,0.843575,0.810811,0.810811,0.810811,0.842021
6,random_forest,0.843575,0.810811,0.810811,0.810811,0.918919
0,base line,0.826816,0.783784,0.794521,0.789116,0.895367
4,lr_max_iter_1000,0.826816,0.783784,0.794521,0.789116,0.895367
3,lr_max_iter_400,0.826816,0.783784,0.794521,0.789116,0.895367
2,lr_max_iter_200,0.826816,0.783784,0.794521,0.789116,0.895367
1,lr_max_iter_50,0.826816,0.783784,0.794521,0.789116,0.895367
