In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [4]:
# 匯入所需的套件
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.metrics import confusion_matrix, accuracy_score
import optuna

# 1. 載入數據
train_url = "https://raw.githubusercontent.com/LonelyCaesar/-Titanic-Survival-Prediction/main/train.csv"
test_url = "https://raw.githubusercontent.com/LonelyCaesar/-Titanic-Survival-Prediction/main/test.csv"

train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

# 2. 數據前處理
# 填補缺失值
train["Age"].fillna(train["Age"].median(), inplace=True)
test["Age"].fillna(test["Age"].median(), inplace=True)
test["Fare"].fillna(test["Fare"].median(), inplace=True)

# 類別特徵編碼
train["Sex"] = train["Sex"].map({"male": 0, "female": 1})
test["Sex"] = test["Sex"].map({"male": 0, "female": 1})
train["Embarked"].fillna("S", inplace=True)
train["Embarked"] = train["Embarked"].map({"C": 0, "Q": 1, "S": 2})
test["Embarked"] = test["Embarked"].map({"C": 0, "Q": 1, "S": 2})

# 選擇特徵
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X = train[features]
y = train["Survived"]
X_test = test[features]

# 分割數據集
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. 特徵選擇
# 使用 SelectKBest
skb = SelectKBest(score_func=f_classif, k=5)
X_train_skb = skb.fit_transform(X_train, y_train)
X_valid_skb = skb.transform(X_valid)
X_test_skb = skb.transform(X_test)

# 使用 RFE
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=5)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_valid_rfe = rfe.transform(X_valid)
X_test_rfe = rfe.transform(X_test)

# 4. Optuna 超參數調優
def objective(trial):
    n_features = trial.suggest_int("n_features", 3, len(features))
    model = RandomForestClassifier(random_state=42)

    skb = SelectKBest(score_func=f_classif, k=n_features)
    X_train_selected = skb.fit_transform(X_train, y_train)
    X_valid_selected = skb.transform(X_valid)

    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_valid_selected)
    return accuracy_score(y_valid, y_pred)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# 使用最佳特徵數
best_k = study.best_params["n_features"]
skb = SelectKBest(score_func=f_classif, k=best_k)
X_train_best = skb.fit_transform(X_train, y_train)
X_test_best = skb.transform(X_test)

# 5. 模型訓練與評估
model = RandomForestClassifier(random_state=42)

# 對驗證集和測試集進行相同的特徵選擇處理
X_valid_best = skb.transform(X_valid)
X_test_best = skb.transform(X_test)

# 訓練模型
model.fit(X_train_best, y_train)

# 預測驗證集與測試集
y_pred_valid = model.predict(X_valid_best)
y_pred_test = model.predict(X_test_best)

# 混淆矩陣與準確率
conf_matrix = confusion_matrix(y_valid, y_pred_valid)
accuracy = accuracy_score(y_valid, y_pred_valid)

print("混淆矩陣:\n", conf_matrix)
print("準確率: {:.2f}%".format(accuracy * 100))

# 6. 儲存預測結果
submission = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": y_pred_test})
submission.to_csv("submission.csv", index=False)

print("結果已儲存至 submission.csv")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Age"].fillna(train["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["Age"].fillna(test["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

混淆矩陣:
 [[91 14]
 [19 55]]
準確率: 81.56%
結果已儲存至 submission.csv
