# 기초 데이터 세팅

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

df = pd.read_csv("./titanic.csv")

df['Initial'] = 0

for index, row in df.iterrows():
    initial_search = row['Name'].split(',')[1].split('.')[0].strip() # Name 컬럼에서 .(dot)을 기준으로 알파벳 문자열 추출
    df.at[index, 'Initial'] = initial_search
    
df['Initial'].replace([
    'Mlle', 'Mme', 'Ms', 'Dr', 'Major', 'Lady', 'Countess', 'Jonkheer', 'Col',
    'Rev', 'Capt', 'Sir', 'Don','the Countess' 
], [
    'Miss', 'Miss', 'Miss', 'Mr', 'Mr', 'Mrs', 'Mrs', 'Other', 'Other',
    'Other', 'Mr', 'Mr', 'Mr', 'Other'
],
    inplace=True)

df.loc[(df['Age'].isnull()) & (df.Initial == 'Mr'), 'Age'] = 33
df.loc[(df['Age'].isnull()) & (df.Initial == 'Mrs'), 'Age'] = 36
df.loc[(df['Age'].isnull()) & (df.Initial == 'Master'), 'Age'] = 5
df.loc[(df['Age'].isnull()) & (df.Initial == 'Miss'), 'Age'] = 22
df.loc[(df['Age'].isnull()) & (df.Initial == 'Other'), 'Age'] = 46

df.dropna(subset='Embarked', inplace=True)
df.drop(['Cabin', 'Name', 'PassengerId', 'Ticket'], axis=1, inplace=True)

df_org = df.copy()
df['Relatives'] = df["SibSp"] + df["Parch"]

df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Age'] = (df['Age'] // 10).astype(int)
df['Fare'] = pd.qcut(df['Fare'], q=9, labels=range(9))
df['Embarked'] = df['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
initial_mapping = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Other':4}
df['Initial'] = df['Initial'].map(initial_mapping)

  df.at[index, 'Initial'] = initial_search
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Initial'].replace([


In [2]:
def predict_survival(model, scaler, survived, pclass, sex, age, sibsp, parch, fare, initial):
    input_data = pd.DataFrame({
        'Survived': [survived],
        'Pclass': [pclass],
        'Sex': [0 if sex == 'male' else 1],
        'Age': [age // 10],
        'SibSp': [sibsp],
        'Parch': [parch],
        'Fare': [fare],
        'Initial': [0 if initial == 'Mr' else (1 if initial == 'Miss' else (2 if initial == 'Mrs' else (3 if initial == 'Master' else 4)))],
        'Relatives': [sibsp + parch],
    })
    
    fare_bins = pd.qcut(df_org['Fare'], 9, retbins=True)[1]
    input_data['Fare'] = pd.cut(input_data['Fare'], bins=fare_bins, labels=False, include_lowest=True)

    # 코드 수정
    input_data_for_scaling = input_data.drop(['SibSp', 'Parch'], axis=1)
    input_data_scaled = scaler.transform(input_data_for_scaling)
    # input_data_scaled = scaler.transform(input_data)
    
    prediction = model.predict(input_data_scaled)
    prediction_proba = model.predict_proba(input_data_scaled)

    result = "S" if prediction == 1 else ("C" if prediction == 2 else "Q")
    probability = prediction_proba[0][int(prediction)]
    
    return result, probability

In [3]:
X=df.drop('Embarked', axis=1)
y = df['Embarked']

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 풀이 과정

우선, 데이터의 결측값과 다중공선성을 확인해준다.

In [6]:
df.isnull().sum()

Survived     0
Pclass       0
Sex          0
Age          0
SibSp        0
Parch        0
Fare         0
Embarked     0
Initial      0
Relatives    0
dtype: int64

In [7]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

df_vif = pd.DataFrame()
df_vif["VIF"] = np.round([variance_inflation_factor(X.values, i) for i in range(X.shape[1])], 2)

df_vif["features"] = X.columns
df_vif.sort_values(by='VIF', ascending=False)

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,VIF,features
4,inf,SibSp
8,inf,Relatives
5,inf,Parch
6,4.93,Fare
3,4.48,Age
1,3.28,Pclass
2,2.75,Sex
0,2.68,Survived
7,2.65,Initial


결측치가 없음을 확인하였고, SibSp와 Parch를 사용하여 Relatives 변수를 만들었으므로 SibSp와 Parch 변수를 제거해준다.

In [8]:
df.drop(labels=['SibSp', 'Parch'], axis=1, inplace=True)

In [9]:
X = df.drop('Embarked', axis=1)
df_vif = pd.DataFrame()
df_vif["VIF"] = np.round([variance_inflation_factor(X.values, i) for i in range(X.shape[1])], 2)

df_vif["features"] = X.columns
df_vif.sort_values(by='VIF', ascending=False)

Unnamed: 0,VIF,features
4,4.92,Fare
3,4.48,Age
1,3.27,Pclass
2,2.73,Sex
0,2.68,Survived
5,2.65,Initial
6,2.33,Relatives


모든 독립변수의 VIF가 5를 넘지 않음을 확인하였다.

In [10]:
# 데이터 셋 쪼개기
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# 데이터 스케일링
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 로지스틱 회귀

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)

lr_accuracy = accuracy_score(y_test, lr_pred)
print("로지스틱 회귀 모델의 정확도:", lr_accuracy)

lr_report = classification_report(y_test, lr_pred)
print(lr_report)

model = lr_model
result, probability = predict_survival(model, scaler, survived=1, pclass=2, sex='female', age=32, sibsp=1, parch=2, fare=60, initial='Mrs')
print("\n[예측 결과]")
print("예측 결과:", result)
print("생존 확률:", probability)


로지스틱 회귀 모델의 정확도: 0.7303370786516854
              precision    recall  f1-score   support

           1       0.73      1.00      0.84       130
           2       0.00      0.00      0.00        36
           3       0.00      0.00      0.00        12

    accuracy                           0.73       178
   macro avg       0.24      0.33      0.28       178
weighted avg       0.53      0.73      0.62       178


[예측 결과]
예측 결과: S
생존 확률: 0.2334355912695269


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  probability = prediction_proba[0][int(prediction)]


## 로지스틱 회귀 Grid Search

In [12]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

lr_model = LogisticRegression(random_state=42)
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}

grid_search = GridSearchCV(estimator=lr_model, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)
print(f"최고 교차검증 정확도: {grid_search.best_score_:.4f}")

best_lr_model = grid_search.best_estimator_
lr_pred = best_lr_model.predict(X_test_scaled)

lr_report = classification_report(y_test, lr_pred)
print("\n[튜닝된 모델의 분류 보고서]")
print(lr_report)

result, probability = predict_survival(best_lr_model, scaler, survived=1, pclass=2, sex='female', age=32, sibsp=1, parch=2, fare=60, initial='Mrs')
print("\n[예측 결과]")
print("예측 결과:", result)
print("생존 확률:", probability)

lr_accuracy = accuracy_score(y_test, lr_pred)
print("\n튜닝된 Logistic Regression 모델의 정확도:", lr_accuracy)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
최적 하이퍼파라미터: {'C': 0.01, 'solver': 'liblinear'}
최고 교차검증 정확도: 0.7229

[튜닝된 모델의 분류 보고서]
              precision    recall  f1-score   support

           1       0.73      1.00      0.84       130
           2       0.00      0.00      0.00        36
           3       0.00      0.00      0.00        12

    accuracy                           0.73       178
   macro avg       0.24      0.33      0.28       178
weighted avg       0.53      0.73      0.62       178


[예측 결과]
예측 결과: S
생존 확률: 0.2352350714193948

튜닝된 Logistic Regression 모델의 정확도: 0.7303370786516854


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  probability = prediction_proba[0][int(prediction)]


## 의사결정나무

In [13]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_scaled, y_train)
tree_pred = tree_model.predict(X_test_scaled)

tree_accuracy = accuracy_score(y_test, tree_pred)
print("Decision Tree 모델의 정확도:", tree_accuracy, "\n")

tree_report = classification_report(y_test, tree_pred)
print(tree_report)

model = tree_model
result, probability = predict_survival(model, scaler, survived=1, pclass=2, sex='female', age=32, sibsp=1, parch=2, fare=60, initial='Mrs')
print("\n[예측 결과]")
print("예측 결과:", result)
print("생존 확률:", probability)

Decision Tree 모델의 정확도: 0.702247191011236 

              precision    recall  f1-score   support

           1       0.76      0.88      0.82       130
           2       0.35      0.19      0.25        36
           3       0.43      0.25      0.32        12

    accuracy                           0.70       178
   macro avg       0.51      0.44      0.46       178
weighted avg       0.66      0.70      0.67       178


[예측 결과]
예측 결과: S
생존 확률: 0.0


  probability = prediction_proba[0][int(prediction)]


## 의사결정나무 Grid Search

In [14]:
tree_model = DecisionTreeClassifier(random_state=42)

param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=tree_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

grid_search.fit(X_train_scaled, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)
print(f"최고 교차검증 정확도: {grid_search.best_score_:.4f}")

best_tree_model = grid_search.best_estimator_
tree_pred = best_tree_model.predict(X_test_scaled)

tree_report = classification_report(y_test, tree_pred)
print("\n[튜닝된 모델의 분류 보고서]")
print(tree_report)

model = best_tree_model
result, probability = predict_survival(model, scaler, survived=1, pclass=2, sex='female', age=32, sibsp=1, parch=2, fare=60, initial='Mrs')

print("\n[예측 결과]")
print("예측 결과:", result)
print("생존 확률:", probability)

tree_accuracy = accuracy_score(y_test, tree_pred)
print("\n튜닝된 Decision Tree 모델의 정확도:", tree_accuracy)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
최적 하이퍼파라미터: {'max_depth': 3, 'min_samples_split': 2}
최고 교차검증 정확도: 0.7271

[튜닝된 모델의 분류 보고서]
              precision    recall  f1-score   support

           1       0.80      0.93      0.86       130
           2       0.63      0.33      0.44        36
           3       0.57      0.33      0.42        12

    accuracy                           0.77       178
   macro avg       0.67      0.53      0.57       178
weighted avg       0.75      0.77      0.74       178


[예측 결과]
예측 결과: S
생존 확률: 0.20469798657718122

튜닝된 Decision Tree 모델의 정확도: 0.7696629213483146


  probability = prediction_proba[0][int(prediction)]


## SVM

In [15]:
from sklearn.svm import SVC

svm_model = SVC(random_state=42, probability=True)
svm_model.fit(X_train_scaled, y_train)
svm_pred = svm_model.predict(X_test_scaled)

svm_accuracy = accuracy_score(y_test, svm_pred)
print("SVM 모델의 정확도:", svm_accuracy)

svm_report = classification_report(y_test, svm_pred)
print(svm_report)

model = svm_model
result, probability = predict_survival(model, scaler, survived=1, pclass=2, sex='female', age=32, sibsp=1, parch=2, fare=60, initial='Mrs')
print("\n[예측 결과]")
print("예측 결과:", result)
print("생존 확률:", probability)

SVM 모델의 정확도: 0.7359550561797753
              precision    recall  f1-score   support

           1       0.74      0.99      0.85       130
           2       0.00      0.00      0.00        36
           3       0.67      0.17      0.27        12

    accuracy                           0.74       178
   macro avg       0.47      0.39      0.37       178
weighted avg       0.58      0.74      0.64       178


[예측 결과]
예측 결과: S
생존 확률: 0.17476624504555993


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  probability = prediction_proba[0][int(prediction)]


정확도 떨어짐. 생존확률은 증가

## Grid Search SVM

In [16]:
svm_model = SVC(random_state=42)

param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto'],
    'probability': [True]
}

grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_scaled, y_train)
print("최적 하이퍼파라미터:", grid_search.best_params_)
print(f"최고 교차검증 정확도: {grid_search.best_score_:.4f}")
best_svm_model = grid_search.best_estimator_
svm_pred = best_svm_model.predict(X_test_scaled)
svm_report = classification_report(y_test, svm_pred)
print("\n[튜닝된 모델의 분류 보고서]")
print(svm_report)
model = best_svm_model
result, probability = predict_survival(model, scaler, survived=1, pclass=2, sex='female', age=32, sibsp=1, parch=2, fare=60, initial='Mrs')
print("\n[예측 결과]")
print("예측 결과:", result)
print("생존 확률:", probability)
svm_accuracy = accuracy_score(y_test, svm_pred)
print("\n튜닝된 SVM 모델의 정확도:", svm_accuracy)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
최적 하이퍼파라미터: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True}
최고 교차검증 정확도: 0.7257

[튜닝된 모델의 분류 보고서]
              precision    recall  f1-score   support

           1       0.74      0.98      0.84       130
           2       0.00      0.00      0.00        36
           3       0.43      0.25      0.32        12

    accuracy                           0.73       178
   macro avg       0.39      0.41      0.39       178
weighted avg       0.57      0.73      0.64       178


[예측 결과]
예측 결과: S
생존 확률: 0.16571127595132099

튜닝된 SVM 모델의 정확도: 0.7303370786516854


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  probability = prediction_proba[0][int(prediction)]


## kNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
knn_pred = knn_model.predict(X_test_scaled)

knn_accuracy = accuracy_score(y_test, knn_pred)
print("kNN 모델의 정확도 : ", knn_accuracy)

knn_report = classification_report(y_test, knn_pred)
print(knn_report)

model = knn_model
result, probability = predict_survival(model, scaler, survived=1, pclass=2, sex='female', age=32, sibsp=1, parch=2, fare=60, initial='Mrs')
print("\n[예측 결과]")
print("예측 결과:", result)
print("생존 확률:", probability)

kNN 모델의 정확도 :  0.6966292134831461
              precision    recall  f1-score   support

           1       0.76      0.86      0.81       130
           2       0.35      0.25      0.29        36
           3       0.60      0.25      0.35        12

    accuracy                           0.70       178
   macro avg       0.57      0.45      0.48       178
weighted avg       0.67      0.70      0.67       178


[예측 결과]
예측 결과: S
생존 확률: 0.2


  probability = prediction_proba[0][int(prediction)]


## Grid Search kNN

In [18]:
knn_model = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance']
}
grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_scaled, y_train)
print("최적 하이퍼파라미터:", grid_search.best_params_)
print(f"최고 교차검증 정확도: {grid_search.best_score_:.4f}")
best_knn_model = grid_search.best_estimator_
knn_pred = best_knn_model.predict(X_test_scaled)
knn_report = classification_report(y_test, knn_pred)
print("\n[튜닝된 모델의 분류 보고서]")
print(knn_report)
model = best_knn_model
result, probability = predict_survival(model, scaler, survived=1, pclass=2, sex='female', age=32, sibsp=1, parch=2, fare=60, initial='Mrs')
print("\n[예측 결과]")
print("예측 결과:", result)
print("생존 확률:", probability)
knn_accuracy = accuracy_score(y_test, knn_pred)
print("\n튜닝된 kNN 모델의 정확도:", knn_accuracy)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
최적 하이퍼파라미터: {'n_neighbors': 9, 'weights': 'uniform'}
최고 교차검증 정확도: 0.7244

[튜닝된 모델의 분류 보고서]
              precision    recall  f1-score   support

           1       0.76      0.93      0.83       130
           2       0.42      0.14      0.21        36
           3       0.50      0.25      0.33        12

    accuracy                           0.72       178
   macro avg       0.56      0.44      0.46       178
weighted avg       0.67      0.72      0.67       178


[예측 결과]
예측 결과: S
생존 확률: 0.1111111111111111

튜닝된 kNN 모델의 정확도: 0.7247191011235955


  probability = prediction_proba[0][int(prediction)]


전처리 안 한 거
--- 최종 모델별 정확도 비교 ---
                     Accuracy
Decision Tree        0.769663
kNN                  0.747191
SVM                  0.730337
Logistic Regression  0.730337

# 최종
## 튜닝된 정확도
Logistic Regression     0.7303370786516854<br/>
Decision Tree:          0.7696629213483146<br/>
SVM:                    0.7303370786516854<br/>
kNN:                    0.7247191011235955<br/>


### (참고: Sirsp, parch 변수를 제거하지 않았을 때의 정확도)
Logistic Regression:  0.730337<br/>
Decision Tree:        0.769663<br/>
SVM:                  0.730337<br/>
kNN:                  0.747191<br/>

## 결론
하이퍼파라미터: {'max_depth': 3, 'min_samples_split': 2}로 튜닝한 의사결정나무의 정확도가 약 0.7697로 가장 높게 나왔다.