In [1]:
import seaborn as sns
data = sns.load_dataset('titanic')

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [3]:
data = data[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'alone']].copy()


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   alone     891 non-null    bool   
dtypes: bool(1), float64(2), int64(4), object(2)
memory usage: 56.7+ KB


In [5]:
data.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
alone         0
dtype: int64

In [10]:
data['age'].fillna(data['age'].median(), inplace=True)

In [13]:
data['embarked'].fillna(data['embarked'].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['embarked'].fillna(data['embarked'].mode()[0],inplace=True)


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       891 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  891 non-null    object 
 8   alone     891 non-null    bool   
dtypes: bool(1), float64(2), int64(4), object(2)
memory usage: 56.7+ KB


In [18]:
data['sex'] = data['sex'].map({'male':0,'female':1})
data['embarked'] = data['embarked'].map({'S':0, 'C':1, 'Q':2})
data['alone'] = data['alone'].astype(int)

In [19]:
data['embarked'].value_counts()

embarked
0    646
1    168
2     77
Name: count, dtype: int64

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    int64  
 3   age       891 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  891 non-null    int64  
 8   alone     891 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 62.8 KB


In [22]:
from sklearn.model_selection import train_test_split

X=data.drop(['survived'], axis=1)
y=data['survived']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

In [24]:
X_train.shape

(712, 8)

In [27]:
import xgboost as xgb

In [29]:
xgb_model = xgb.XGBClassifier(
    n_estimators = 1000,
    max_depth=6,
    learning_rate = 0.2,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha = 0.1, 
    n_jobs=-1,
    random_state=42,
    eval_metric='logloss'
)

In [30]:
xgb_model.fit(X_train, y_train, verbose=False)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [31]:
y_pred = xgb_model.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score

print(f'XGBoost acc: {accuracy_score(y_test,y_pred):.2f}')

XGBoost acc: 0.81


In [None]:
# 2 XGBoost (next 100 min)
import xgboost as xgb
xgb_model = xgb.XGBClassifier(n_estimators=1000, max_depth=6, learning_rate=0.02,
                              subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, n_jobs=-1,
                              random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train, verbose=False)
print("XGBoost:", accuracy_score(y_test, xgb_model.predict(X_test)))

In [35]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m1.6 MB/s[0m  [33m0:00:00[0mm [31m1.7 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [36]:
import lightgbm as lgb

In [38]:
lboost = lgb.LGBMClassifier(
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.02,
    num_leaves=40,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1, random_state=42
)

In [40]:
lboost.fit(X_train,y_train)



[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000380 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 196
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


0,1,2
,boosting_type,'gbdt'
,num_leaves,40
,max_depth,7
,learning_rate,0.02
,n_estimators,1000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [41]:
y_pred = lboost.predict(X_test)

In [42]:
from sklearn.metrics import accuracy_score
print(f'lightboost acc: {accuracy_score(y_test,y_pred)}')

lightboost acc: 0.8268156424581006


In [46]:
from catboost import CatBoostClassifier
cat = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.02, verbose=False, random_state=42)
cat.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x1199bcc20>

In [45]:
# 4 FINAL VOTING ENSEMBLE (the one that gets you top 3 %)
from sklearn.ensemble import VotingClassifier
final_ensemble = VotingClassifier(
    estimators=[('xgb', xgb_model), ('lgb', lgb_model), ('rf', rf), ('cat', cat)],
    voting='soft', n_jobs=-1
)
final_ensemble.fit(X_train, y_train)
final_score = accuracy_score(y_test, final_ensemble.predict(X_test))
print(f"FINAL SCORE: {final_score:.5f}")   # ← you want ≥ 0.86000

[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.8/27.8 MB[0m [31m1.5 MB/s[0m  [33m0:00:18[0m[0m eta [36m0:00:01[0m0:01[0m:02[0m
[?25hDownloading graphviz-0.21-py3-none-any.whl (47 kB)
Downloading plotly-6.5.0-py3-none-any.whl (9.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m1.5 MB/s[0m  [33m0:00:06[0m1.5 MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hInstalling collected packages: plotly, graphviz, catboost
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [catboost]
[1A[2KSuccessfully installed catboost-1.2.8 graphviz-0.21 plotly-6.5.0
Note: you may need to restart the kernel to use updated packages.


In [47]:
from sklearn.ensemble import VotingClassifier

final_ensemble = VotingClassifier(
    estimators = [('xgb', xgb_model), ('cat', cat), ('lgb', lboost)],
                 voting='soft', n_jobs = -1)

In [48]:
final_ensemble.fit(X_train, y_train)
final_score = accuracy_score(y_test, final_ensemble.predict(X_test))
print(f"FINAL SCORE: {final_score:.5f}")   # ← you want ≥ 0.86000

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 196
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
