In [2]:
# 載入資料與必要的套件
import pandas as pd
from pycaret.classification import *

# 讀取資料
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
train.head()

# 查看訓練資料集的前幾行
print("Train Dataset:")
print(train.head())

# 查看訓練資料集的前幾行
print("Train Dataset:")
print(train.head())

# 簡單處理缺失值與特徵工程
train['Age'] = train['Age'].fillna(train['Age'].median())
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
train['Fare'] = train['Fare'].fillna(train['Fare'].median())

# 處理測試集缺失值
test['Age'] = test['Age'].fillna(test['Age'].median())
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# 建立新特徵
train['FamilySize'] = train['SibSp'] + train['Parch']
test['FamilySize'] = test['SibSp'] + test['Parch']
train['IsAlone'] = (train['FamilySize'] == 1).astype(int)
test['IsAlone'] = (test['FamilySize'] == 1).astype(int)
# 刪除無用欄位
train.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# 將類別變數轉換為類別型
train['Sex'] = train['Sex'].astype('category')
train['Embarked'] = train['Embarked'].astype('category')
test['Sex'] = test['Sex'].astype('category')
test['Embarked'] = test['Embarked'].astype('category')

# 設定 PyCaret 環境
clf = setup(data=train,
            target='Survived',
            categorical_features=['Sex', 'Embarked'],
            session_id=123)

# 新增所需的庫
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

# 已存在的模型列表
models = [
    # 目前已使用的15個模型
]

# 添加新的模型到已存在的模型列表中
models.append(GradientBoostingClassifier())
models.append(AdaBoostClassifier())

# 比較所有模型
best_model = compare_models(n_select=16)  # 比較 16 種模型


Train Dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   Na

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 11)"
4,Transformed data shape,"(891, 13)"
5,Transformed train set shape,"(623, 13)"
6,Transformed test set shape,"(268, 13)"
7,Numeric features,8
8,Categorical features,2
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8153,0.8413,0.7239,0.7807,0.7499,0.6041,0.6063,0.06
gbc,Gradient Boosting Classifier,0.8105,0.837,0.6986,0.7846,0.7362,0.5897,0.5943,0.044
catboost,CatBoost Classifier,0.8073,0.8567,0.6861,0.7879,0.7292,0.5816,0.5881,0.439
ada,Ada Boost Classifier,0.7976,0.8298,0.7366,0.7415,0.7341,0.5715,0.5764,0.039
lr,Logistic Regression,0.796,0.8519,0.7072,0.7506,0.7235,0.563,0.5679,0.044
ridge,Ridge Classifier,0.7912,0.8537,0.6989,0.7429,0.717,0.5525,0.5558,0.018
lda,Linear Discriminant Analysis,0.7912,0.8534,0.6989,0.7429,0.717,0.5525,0.5558,0.017
et,Extra Trees Classifier,0.7897,0.8254,0.6864,0.7438,0.7102,0.5467,0.5505,0.051
lightgbm,Light Gradient Boosting Machine,0.7864,0.8315,0.6696,0.7541,0.7031,0.5388,0.5454,0.089
xgboost,Extreme Gradient Boosting,0.7816,0.8108,0.6736,0.7448,0.703,0.5314,0.5368,0.03


In [None]:
# 2. 模型選擇與比較
# 使用 PyCaret 比較多個模型並選擇表現最好的幾個模型
top_models = compare_models(n_select=5)

# 集成多模型
blended_model = blend_models(top_models)

# 3. 超參數優化
tuned_model = tune_model(blended_model, optimize='Accuracy')

# 評估模型
evaluate_model(tuned_model)

# 最終化模型
final_model = finalize_model(tuned_model)

# 預測測試集
predictions = predict_model(final_model, data=test)

# 修復預測結果輸出
output_column = 'Label' if 'Label' in predictions.columns else 'prediction_label'


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8153,0.8413,0.7239,0.7807,0.7499,0.6041,0.6063,0.06
gbc,Gradient Boosting Classifier,0.8105,0.837,0.6986,0.7846,0.7362,0.5897,0.5943,0.044
catboost,CatBoost Classifier,0.8073,0.8567,0.6861,0.7879,0.7292,0.5816,0.5881,0.473
ada,Ada Boost Classifier,0.7976,0.8298,0.7366,0.7415,0.7341,0.5715,0.5764,0.035
lr,Logistic Regression,0.796,0.8519,0.7072,0.7506,0.7235,0.563,0.5679,0.044
ridge,Ridge Classifier,0.7912,0.8537,0.6989,0.7429,0.717,0.5525,0.5558,0.017
lda,Linear Discriminant Analysis,0.7912,0.8534,0.6989,0.7429,0.717,0.5525,0.5558,0.017
et,Extra Trees Classifier,0.7897,0.8254,0.6864,0.7438,0.7102,0.5467,0.5505,0.055
lightgbm,Light Gradient Boosting Machine,0.7864,0.8315,0.6696,0.7541,0.7031,0.5388,0.5454,0.065
xgboost,Extreme Gradient Boosting,0.7816,0.8108,0.6736,0.7448,0.703,0.5314,0.5368,0.027


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8889,0.8868,0.7917,0.9048,0.8444,0.7586,0.7627
1,0.8254,0.8066,0.75,0.7826,0.766,0.6268,0.6272
2,0.8095,0.9316,0.7083,0.7727,0.7391,0.5896,0.591
3,0.8387,0.8829,0.6957,0.8421,0.7619,0.6416,0.6483
4,0.7581,0.7763,0.5833,0.7368,0.6512,0.4698,0.4773
5,0.7742,0.8289,0.625,0.75,0.6818,0.509,0.5141
6,0.9032,0.8849,0.875,0.875,0.875,0.7961,0.7961
7,0.7742,0.8257,0.7917,0.6786,0.7308,0.5383,0.543
8,0.7258,0.8476,0.5417,0.6842,0.6047,0.3991,0.4055
9,0.8548,0.932,0.7917,0.8261,0.8085,0.6917,0.6921


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8889,0.8932,0.7917,0.9048,0.8444,0.7586,0.7627
1,0.8413,0.8205,0.7917,0.7917,0.7917,0.6635,0.6635
2,0.8095,0.9113,0.7083,0.7727,0.7391,0.5896,0.591
3,0.8387,0.874,0.6957,0.8421,0.7619,0.6416,0.6483
4,0.7581,0.7522,0.5417,0.7647,0.6341,0.4612,0.4765
5,0.8226,0.8355,0.7083,0.8095,0.7556,0.6173,0.6207
6,0.9032,0.8728,0.875,0.875,0.875,0.7961,0.7961
7,0.7742,0.8235,0.7917,0.6786,0.7308,0.5383,0.543
8,0.7419,0.841,0.5417,0.7222,0.619,0.4299,0.4401
9,0.8548,0.9057,0.7917,0.8261,0.8085,0.6917,0.6921


Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Submission file saved as 'submission.csv'
