In [1]:
# 載入資料與必要的套件
import pandas as pd
from pycaret.classification import *

# Read the training and test splits from the current working directory.
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

# Preview the first rows of the training set.
# (The bare `train.head()` that used to sit here was removed: it was not the
# last expression of the cell, so its result was silently discarded.)
print("Train Dataset:")
print(train.head())

# Missing-value handling and simple feature engineering.
#
# The fill values are computed once, from the training set only, and reused for
# the test set so that both frames go through exactly the same transformation
# (previously the test set was imputed with its own medians and its 'Embarked'
# column was not filled at all).
age_fill = train['Age'].median()
fare_fill = train['Fare'].median()
embarked_fill = train['Embarked'].mode()[0]


def _prepare_features(frame, age_fill, fare_fill, embarked_fill):
    """Return a cleaned copy of ``frame`` ready to be handed to PyCaret.

    Steps, applied in this order:
      1. Fill missing 'Age', 'Fare' and 'Embarked' with the supplied values.
      2. Add 'FamilySize' = 'SibSp' + 'Parch'.
      3. Drop the 'Name', 'Ticket' and 'Cabin' columns.
      4. Cast 'Sex' and 'Embarked' to the pandas ``category`` dtype.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing the columns listed above.
    age_fill, fare_fill, embarked_fill
        Replacement values for missing entries in the respective columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.
    """
    return (
        frame
        .assign(
            Age=frame['Age'].fillna(age_fill),
            Embarked=frame['Embarked'].fillna(embarked_fill),
            Fare=frame['Fare'].fillna(fare_fill),
            FamilySize=frame['SibSp'] + frame['Parch'],
        )
        .drop(columns=['Name', 'Ticket', 'Cabin'])
        .astype({'Sex': 'category', 'Embarked': 'category'})
    )


train = _prepare_features(train, age_fill, fare_fill, embarked_fill)
test = _prepare_features(test, age_fill, fare_fill, embarked_fill)

# Configure the PyCaret classification experiment.
#
# session_id pins the random seed so the train/hold-out split and the CV folds
# are the same on every Restart & Run All; 4503 is the id PyCaret drew at
# random in the previously recorded run of this cell.
#
# NOTE(review): 'PassengerId' is still part of `train`, so it is fed to the
# models as a numeric feature (the recorded setup summary reports 7 numeric
# features). Consider `ignore_features=['PassengerId']` -- confirm first that
# no later cell relies on that column being present in predict_model output.
clf = setup(data=train,
            target='Survived',
            categorical_features=['Sex', 'Embarked'],
            session_id=4503)

# Extra scikit-learn estimators collected in a standalone list.
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

# NOTE(review): `models` is not passed to compare_models below, so it has no
# influence on the comparison in this cell; it is kept in case another cell
# reads it. The recorded leaderboard already lists 'gbc' and 'ada' from
# PyCaret's own model library.
models = [GradientBoostingClassifier(), AdaBoostClassifier()]


# Cross-validate PyCaret's model library and keep the top-ranked models.
# n_select=16 controls how many ranked models are returned, so `best_model`
# is a list (best first), not a single estimator.
best_model = compare_models(n_select=16)


Train Dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   Na

Unnamed: 0,Description,Value
0,Session id,4503
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 10)"
4,Transformed data shape,"(891, 12)"
5,Transformed train set shape,"(623, 12)"
6,Transformed test set shape,"(268, 12)"
7,Numeric features,7
8,Categorical features,2
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8234,0.8493,0.7203,0.8017,0.7554,0.6185,0.6236,0.579
gbc,Gradient Boosting Classifier,0.8203,0.8459,0.7034,0.8039,0.7482,0.6099,0.6149,0.051
rf,Random Forest Classifier,0.8121,0.8487,0.6953,0.7914,0.7374,0.5928,0.5982,0.069
lda,Linear Discriminant Analysis,0.8105,0.8551,0.6911,0.7931,0.7325,0.5884,0.5969,0.017
ada,Ada Boost Classifier,0.809,0.8339,0.7203,0.7696,0.7405,0.5904,0.5946,0.039
lr,Logistic Regression,0.8089,0.857,0.6995,0.7841,0.7342,0.5869,0.5938,0.436
ridge,Ridge Classifier,0.8089,0.8553,0.687,0.792,0.7297,0.5845,0.5932,0.019
lightgbm,Light Gradient Boosting Machine,0.8025,0.8478,0.7034,0.7669,0.7305,0.5756,0.5801,0.064
et,Extra Trees Classifier,0.7945,0.8241,0.6828,0.7567,0.7166,0.5564,0.5593,0.062
xgboost,Extreme Gradient Boosting,0.7898,0.8382,0.6995,0.7403,0.7158,0.5499,0.5536,0.081


In [2]:
# Show the ranked list of models returned by compare_models.
print("Best Model:")
print(best_model)

# Run a hyper-parameter search on the top-ranked entry of that list.
tuned_model = tune_model(best_model[0])

# Open PyCaret's interactive evaluation widget for the tuned model.
evaluate_model(tuned_model)

# Finalize the tuned model (per the PyCaret docs this refits it on the full
# dataset, hold-out included) and score the prepared test frame.
final_model = finalize_model(tuned_model)
predictions = predict_model(final_model, data=test)

# Pick whichever prediction column this PyCaret version produced: older
# releases name it 'Label', newer ones 'prediction_label'.
# NOTE(review): output_column is not read anywhere in the visible cells.
output_column = 'Label' if 'Label' in predictions.columns else 'prediction_label'

Best Model:
[<catboost.core.CatBoostClassifier object at 0x000001E4CA9A05E0>, GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=100, n_iter_no_change=None,
                           random_state=4503, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False), RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
               

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7302,0.844,0.5,0.7059,0.5854,0.3939,0.4068
1,0.8571,0.8462,0.7083,0.8947,0.7907,0.6845,0.6952
2,0.8413,0.9081,0.8333,0.7692,0.8,0.6688,0.6702
3,0.9032,0.903,0.8696,0.8696,0.8696,0.7926,0.7926
4,0.7419,0.7522,0.5417,0.7222,0.619,0.4299,0.4401
5,0.8226,0.8607,0.7083,0.8095,0.7556,0.6173,0.6207
6,0.7419,0.7829,0.5417,0.7222,0.619,0.4299,0.4401
7,0.871,0.8838,0.7083,0.9444,0.8095,0.7149,0.7319
8,0.8065,0.8443,0.75,0.75,0.75,0.5921,0.5921
9,0.9032,0.9342,0.7917,0.95,0.8636,0.7896,0.7975


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…