In [29]:
import pandas as pd
import csv
import numpy as np

In [30]:
train_df = pd.read_csv("./Data/titanic_train.csv")
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [31]:
# 計算各變數的缺失值比例
print(train_df.isnull().mean())

# 增加 Has_Age 和 Has_Cabin 欄位
train_df['Has_Age'] = train_df['Age'].notnull().astype(int)
train_df['Has_Cabin'] = train_df['Cabin'].notnull().astype(int)

# 填補缺失值
train_df.fillna({'Embarked': 'No', 'Age': 0}, inplace=True)
print("----------------------------")

# 再次顯示各變數的缺失值比例
print(train_df.isnull().mean())

# train_df.info

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64
----------------------------
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.000000
Has_Age        0.000000
Has_Cabin      0.000000
dtype: float64


In [32]:
# 資料 EDA
eda_features = ["Pclass", "Sex", "Has_Age", "Has_Cabin", "Embarked"]
for feature in eda_features:
    display(train_df.groupby(feature, as_index=False)["Survived"].mean().round(3).sort_values("Survived", ascending=False))

# 使用 pivot_table 計算數值型變數與 Survived 的關係
num_features = ["Age", "SibSp", "Parch", "Fare"]
display(train_df.pivot_table(values=num_features, columns="Survived", aggfunc="mean").round(3))


Unnamed: 0,Pclass,Survived
0,1,0.63
1,2,0.473
2,3,0.242


Unnamed: 0,Sex,Survived
0,female,0.742
1,male,0.189


Unnamed: 0,Has_Age,Survived
1,1,0.406
0,0,0.294


Unnamed: 0,Has_Cabin,Survived
1,1,0.667
0,0,0.3


Unnamed: 0,Embarked,Survived
1,No,1.0
0,C,0.554
2,Q,0.39
3,S,0.337


Survived,0,1
Age,23.653,24.034
Fare,22.118,48.395
Parch,0.33,0.465
SibSp,0.554,0.474


In [102]:
from sklearn.preprocessing import MinMaxScaler

# 設定 y 變數
train_df_y = train_df["Survived"].astype(int).values  # 直接轉 int，並轉為陣列格式

# 建立 X 變數
fewvar_train_df_x = train_df.copy()

# 類別變數處理
fewvar_train_df_x["Sex"] = (fewvar_train_df_x["Sex"] == "male").astype(int)  # 直接轉 0/1
fewvar_train_df_x = pd.get_dummies(fewvar_train_df_x, columns=["Pclass", "Embarked"], drop_first=False)  # One-Hot Encoding

# 選擇要標準化的變數
features_to_scale = ["Age", "SibSp", "Parch", "Fare"]
scaler = MinMaxScaler()
fewvar_train_df_x[features_to_scale] = scaler.fit_transform(fewvar_train_df_x[features_to_scale])

# 選取最終變數
train_df_x = fewvar_train_df_x[["Sex", "Age", "SibSp", "Parch", "Fare", "Has_Age", "Has_Cabin"] + 
                               list(fewvar_train_df_x.filter(like="Pclass_")) + 
                               list(fewvar_train_df_x.filter(like="Embarked_"))]

columns_order = [col for col in train_df_x.columns if col != "Embarked_No"] + ["Embarked_No"]

# 重新排列 train_df_x 的欄位
train_df_x = train_df_x[columns_order]

# 檢查是否成功調整
train_df_x.head()


Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Has_Age,Has_Cabin,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Embarked_No
0,1,0.275,0.125,0.0,0.014151,1,0,False,False,True,False,False,True,False
1,0,0.475,0.125,0.0,0.139136,1,1,True,False,False,True,False,False,False
2,0,0.325,0.0,0.0,0.015469,1,0,False,False,True,False,False,True,False
3,0,0.4375,0.125,0.0,0.103644,1,1,True,False,False,False,False,True,False
4,1,0.4375,0.0,0.0,0.015713,1,0,False,False,True,False,False,True,False


In [103]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier


def tune_random_forest(train_df_x, train_df_y, n_estimators_range=range(50, 301, 50), cv=5, scoring='accuracy'):
    results = []
    
    for i in n_estimators_range:
        scores = cross_val_score(RandomForestClassifier(n_estimators=i), train_df_x, train_df_y, cv=cv, scoring=scoring)
        mean_score = scores.mean()
        results.append((i, mean_score))
        
        print(f"樹木數量 (n_estimators) = {i}, 交叉驗證準確率 = {mean_score:.4f}")
    
    best_n_estimators, best_score = max(results, key=lambda x: x[1])  # 找到準確率最高的 n_estimators
    
    print("--------------------------------------------------")
    print(f"最佳樹木數量 (n_estimators) = {best_n_estimators}")
    print(f"最佳交叉驗證準確率 = {best_score:.4f}")
    
    return best_n_estimators, best_score

best_n_estimators, best_score = tune_random_forest(train_df_x, train_df_y)


def tune_rf_max_depth(train_x, train_y, rf_tree_num, max_depth_range=range(3, 11), cv=5, scoring='accuracy'):

    results = []
    
    for i in max_depth_range:
        forest = RandomForestClassifier(n_estimators = rf_tree_num, max_depth = i)  # 使用不同的 max_depth
        scores = cross_val_score(forest, train_x, train_y, cv=cv, scoring=scoring)
        mean_score = scores.mean()
        results.append((i, mean_score))
        
        print(f"最大深度 (max_depth) = {i}, 準確率 = {mean_score:.4f}")  # 顯示每次的準確率
    
    best_max_depth, best_score = max(results, key=lambda x: x[1])  # 找到準確率最高的 max_depth
    
    print("--------------------------------------------------")
    print(f"最佳最大深度 (max_depth) = {best_max_depth}")
    print(f"最佳交叉驗證準確率 = {best_score:.4f}")

    return best_max_depth, best_score

best_max_depth, best_score = tune_rf_max_depth(train_df_x, train_df_y, best_n_estimators)

樹木數量 (n_estimators) = 50, 交叉驗證準確率 = 0.7992
樹木數量 (n_estimators) = 100, 交叉驗證準確率 = 0.7947
樹木數量 (n_estimators) = 150, 交叉驗證準確率 = 0.7924
樹木數量 (n_estimators) = 200, 交叉驗證準確率 = 0.7913
樹木數量 (n_estimators) = 250, 交叉驗證準確率 = 0.7958
樹木數量 (n_estimators) = 300, 交叉驗證準確率 = 0.7946
--------------------------------------------------
最佳樹木數量 (n_estimators) = 50
最佳交叉驗證準確率 = 0.7992
最大深度 (max_depth) = 3, 準確率 = 0.7823
最大深度 (max_depth) = 4, 準確率 = 0.7891
最大深度 (max_depth) = 5, 準確率 = 0.8070
最大深度 (max_depth) = 6, 準確率 = 0.8171
最大深度 (max_depth) = 7, 準確率 = 0.8149
最大深度 (max_depth) = 8, 準確率 = 0.8216
最大深度 (max_depth) = 9, 準確率 = 0.8126
最大深度 (max_depth) = 10, 準確率 = 0.8070
--------------------------------------------------
最佳最大深度 (max_depth) = 8
最佳交叉驗證準確率 = 0.8216


In [104]:
from sklearn.neighbors import KNeighborsClassifier

def tune_knn_manual(train_df_x, train_df_y, k_range=range(3, 22, 2), cv=5, scoring='accuracy'):
    knn_temp_score = 0
    knn_n_num = 0
    for i in k_range:
        knn = KNeighborsClassifier(n_neighbors=i)
        scores = cross_val_score(knn, train_df_x, train_df_y, cv=cv, scoring=scoring)
        mean_score = scores.mean()
        if mean_score > knn_temp_score:
            knn_temp_score = mean_score
            knn_n_num = i
        print(f"KNN (k={i}): {scores} | Mean accuracy: {mean_score}")
    
    print("-----------------------------------------------------------------")
    print(f"Final Best k: {knn_n_num} with accuracy: {knn_temp_score}")
    return knn_n_num, knn_temp_score

best_knn_k, best_knn_score = tune_knn_manual(train_df_x, train_df_y)


KNN (k=3): [0.78212291 0.73595506 0.79775281 0.78089888 0.79775281] | Mean accuracy: 0.7788964911179461
KNN (k=5): [0.75977654 0.7752809  0.81460674 0.80337079 0.78651685] | Mean accuracy: 0.7879103634423451
KNN (k=7): [0.78212291 0.78089888 0.83707865 0.80337079 0.80898876] | Mean accuracy: 0.8024919967359236
KNN (k=9): [0.78212291 0.81460674 0.81460674 0.79213483 0.81460674] | Mean accuracy: 0.8036155922415416
KNN (k=11): [0.75977654 0.80898876 0.80898876 0.79775281 0.80337079] | Mean accuracy: 0.795775531981671
KNN (k=13): [0.74860335 0.80898876 0.82022472 0.80898876 0.83707865] | Mean accuracy: 0.8047768501663425
KNN (k=15): [0.75977654 0.80898876 0.81460674 0.80337079 0.83707865] | Mean accuracy: 0.8047642960266147
KNN (k=17): [0.73743017 0.79775281 0.80337079 0.79775281 0.83146067] | Mean accuracy: 0.7935534492498901
KNN (k=19): [0.74860335 0.78651685 0.80337079 0.78089888 0.82022472] | Mean accuracy: 0.7879229175820728
KNN (k=21): [0.74860335 0.79213483 0.80898876 0.78651685 0.8

In [105]:
from xgboost import XGBClassifier

def tune_xgb_n_estimators(train_x, train_y, cv=5, scoring='accuracy', estimator_range=range(50, 301, 50)):
    best_score = 0
    best_n_estimators = 0
    results = []
    
    for n in estimator_range:
        xgbc = XGBClassifier(eval_metric=['logloss', 'auc', 'error'], n_estimators=n)
        scores = cross_val_score(xgbc, train_x, train_y, cv=cv, scoring=scoring)
        mean_score = scores.mean()
        results.append({'n_estimators': n, 'accuracy': mean_score})
        
        if mean_score > best_score:
            best_score = mean_score
            best_n_estimators = n
        
        print(f"xgb n_estimators={n}")
        print(scores)
        print(f"accuracy: {mean_score}")
        print("-----------------------")
    
    print("---------------------------------------------------------------")
    print(f"final_xgbc_n_estimators={best_n_estimators}")
    print("---------------------------------------------------------------")
    
    results_df = pd.DataFrame(results)
    return best_n_estimators, results_df

def tune_xgb_max_depth(train_x, train_y, best_n_estimators, cv=5, scoring='accuracy', depth_range=range(3, 11)):
    best_score = 0
    best_max_depth = 0
    results = []
    
    for depth in depth_range:
        xgbc = XGBClassifier(eval_metric=['logloss', 'auc', 'error'], n_estimators=best_n_estimators, max_depth=depth)
        scores = cross_val_score(xgbc, train_x, train_y, cv=cv, scoring=scoring)
        mean_score = scores.mean()
        results.append({'max_depth': depth, 'accuracy': mean_score})
        
        if mean_score > best_score:
            best_score = mean_score
            best_max_depth = depth
        
        print(f"xgb max_depth={depth}")
        print(scores)
        print(f"accuracy: {mean_score}")
    
    
    print("---------------------------------------------------------------")
    print(f"final_xgbc_max_depth={best_max_depth}")
    print("---------------------------------------------------------------")
    
    results_df = pd.DataFrame(results)
    return best_max_depth, results_df

best_n_estimators, estimator_results = tune_xgb_n_estimators(train_df_x, train_df_y)
best_max_depth, depth_results = tune_xgb_max_depth(train_df_x, train_df_y, best_n_estimators)

xgb n_estimators=50
[0.79888268 0.80898876 0.85955056 0.79775281 0.85955056]
accuracy: 0.8249450756386919
-----------------------
xgb n_estimators=100
[0.78212291 0.8258427  0.85955056 0.79775281 0.84831461]
accuracy: 0.8227167158370472
-----------------------
xgb n_estimators=150
[0.78212291 0.82022472 0.86516854 0.79775281 0.83707865]
accuracy: 0.8204695248258114
-----------------------
xgb n_estimators=200
[0.75977654 0.82022472 0.85393258 0.78651685 0.82022472]
accuracy: 0.8081350825434687
-----------------------
xgb n_estimators=250
[0.75418994 0.8258427  0.85393258 0.78089888 0.81460674]
accuracy: 0.8058941686020965
-----------------------
xgb n_estimators=300
[0.75977654 0.8258427  0.85955056 0.78089888 0.8258427 ]
accuracy: 0.8103822735547046
-----------------------
---------------------------------------------------------------
final_xgbc_n_estimators=50
---------------------------------------------------------------
xgb max_depth=3
[0.80446927 0.79775281 0.88202247 0.79775281

In [106]:
#Build Model 
forest = RandomForestClassifier(n_estimators = best_n_estimators, max_depth = best_max_depth)
forest_fit = forest.fit(train_df_x, train_df_y)

knn = KNeighborsClassifier(n_neighbors = best_knn_k)
knn_fit = knn.fit(train_df_x, train_df_y)

xgb = XGBClassifier(eval_metric=['logloss','auc','error'], n_estimators = best_n_estimators, max_depth = best_max_depth)
xgb_fit = xgb.fit(train_df_x, train_df_y)

In [107]:

test_df = pd.read_csv("./Data/titanic_test.csv")

test_df


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [108]:
# 增加 Has_Age 與 Has_Cabin 欄位
test_df['Has_Age'] = test_df['Age'].notnull().astype(int)
test_df['Has_Cabin'] = test_df['Cabin'].notnull().astype(int)

# 補值處理
test_df['Embarked'] = test_df['Embarked'].fillna("No")
test_df['Age'] = test_df['Age'].fillna(0)

# 選擇需要的變數，並進行轉換
fewvar_test_df_x = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Has_Age', 'Has_Cabin']].copy()

# 性別轉換成數值 (1: 男, 0: 女)
fewvar_test_df_x['Sex'] = (fewvar_test_df_x['Sex'] == 'male').astype(int)

# Pclass 轉為字串
fewvar_test_df_x['Pclass'] = fewvar_test_df_x['Pclass'].astype(str)

# 轉換為 One-Hot Encoding (Dummy Variables)
test_df_x = pd.get_dummies(fewvar_test_df_x)

test_df_x["Embarked_No"] = 0
#測試集Fare平均插值
test_df_x['Fare'] = test_df_x['Fare'].fillna(test_df_x['Fare'].mean())

test_df_x

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Has_Age,Has_Cabin,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Embarked_No
0,1,34.5,0,0,7.8292,1,0,False,False,True,False,True,False,0
1,0,47.0,1,0,7.0000,1,0,False,False,True,False,False,True,0
2,1,62.0,0,0,9.6875,1,0,False,True,False,False,True,False,0
3,1,27.0,0,0,8.6625,1,0,False,False,True,False,False,True,0
4,0,22.0,1,1,12.2875,1,0,False,False,True,False,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1,0.0,0,0,8.0500,0,0,False,False,True,False,False,True,0
414,0,39.0,0,0,108.9000,1,1,True,False,False,True,False,False,0
415,1,38.5,0,0,7.2500,1,0,False,False,True,False,False,True,0
416,1,0.0,0,0,8.0500,0,0,False,False,True,False,False,True,0


In [110]:
# 進行預測
submit_rf = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': forest_fit.predict(test_df_x).astype(int)
})

# 輸出 CSV
submit_rf.to_csv('submit_rf.csv', index=False)



In [111]:
# 進行預測
submit_xgb = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': xgb_fit.predict(test_df_x).astype(int)
})

# 輸出 CSV
submit_xgb.to_csv('submit_xgb.csv', index=False)



In [114]:
submit_rf

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [115]:
submit_xgb

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
