In [23]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.1.1
Note: you may need to restart the kernel to use updated packages.


In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [9]:
df = pd.read_csv(r'~\OneDrive\Desktop\Titanic\notebooks\transformed_df.csv')

In [10]:
X = df.drop('Survived', axis = 1)
y = df['Survived']

In [11]:
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 1)

In [12]:
model = RandomForestClassifier(n_estimators = 100, random_state= 42)

In [20]:
cv_scores = []
feature_importances = []
all_true_vals = []
all_pred_vals = []
test_prediction = np.zeros(len(X))


for fold, (train_idx, val_idx) in enumerate(skf.split(X,y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    val_preds = model.predict(X_val)
    all_true_vals.extend(y_val)
    all_pred_vals.extend(val_preds)

    test_prediction[val_idx] = val_preds

    fold_accuracy = accuracy_score(y_val, val_preds)
    cv_scores.append(fold_accuracy)

    print(f'\nClassification Report (Fold {fold + 1}):')
    fold_report = classification_report(y_val, val_preds, target_names = ['Died', 'Survived'])
    print(fold_report)

    print(f'\nConfusion Matrix Report (Fold {fold + 1}):')
    fold_report = confusion_matrix(y_val, val_preds)
    print(fold_report)
    
    feature_importances.append(model.feature_importances_)

print(f'Cross-Validation Results:')
print(f'Mean Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})')
print(f'Individual Fold Scores: {[f"{score:.4f}" for score in cv_scores]}')

print('\nClassification report')
overall_report  = classification_report(all_true_vals, all_pred_vals, target_names = ['Died', 'Survived'])
print(overall_report)


print('\nConfusion Matrix report')
overall_cm = confusion_matrix(all_true_vals, all_pred_vals)
print(overall_cm)


Classification Report (Fold 1):
              precision    recall  f1-score   support

        Died       0.82      0.84      0.83       110
    Survived       0.73      0.71      0.72        69

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179


Confusion Matrix Report (Fold 1):
[[92 18]
 [20 49]]

Classification Report (Fold 2):
              precision    recall  f1-score   support

        Died       0.85      0.91      0.88       110
    Survived       0.84      0.75      0.79        68

    accuracy                           0.85       178
   macro avg       0.85      0.83      0.84       178
weighted avg       0.85      0.85      0.85       178


Confusion Matrix Report (Fold 2):
[[100  10]
 [ 17  51]]

Classification Report (Fold 3):
              precision    recall  f1-score   support

        Died       0.81      0.85      0.83       110
    Survived       0.73   

In [21]:
feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': np.mean(feature_importances, axis = 0)
}).sort_values('importance', ascending = False)

feature_importance_df.head(10)

Unnamed: 0,feature,importance
1,Age,0.196016
12,Gender_oh,0.175033
7,FarePerPerson,0.148802
4,Fare,0.146891
10,Title_le,0.112109
0,Pclass,0.055955
5,FamilySize,0.040896
2,SibSp,0.028402
11,FareGroup_le,0.025968
8,Embarked_le,0.025841


In [26]:
cv_scores = []
feature_importances = []
all_true_vals = []
all_pred_vals = []
test_prediction = np.zeros(len(X))

model = XGBClassifier(objective = 'binary:logistic', eval_metric='logloss', use_label_encoder = False, random_state= 42)
for fold, (train_idx, val_idx) in enumerate(skf.split(X,y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    val_preds = model.predict(X_val)
    all_true_vals.extend(y_val)
    all_pred_vals.extend(val_preds)

    test_prediction[val_idx] = val_preds

    fold_accuracy = accuracy_score(y_val, val_preds)
    cv_scores.append(fold_accuracy)

    print(f'\nClassification Report (Fold {fold + 1}):')
    fold_report = classification_report(y_val, val_preds, target_names = ['Died', 'Survived'])
    print(fold_report)

    print(f'\nConfusion Matrix Report (Fold {fold + 1}):')
    fold_report = confusion_matrix(y_val, val_preds)
    print(fold_report)
    
    feature_importances.append(model.feature_importances_)

print(f'Cross-Validation Results:')
print(f'Mean Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})')
print(f'Individual Fold Scores: {[f"{score:.4f}" for score in cv_scores]}')

print('\nClassification report')
overall_report  = classification_report(all_true_vals, all_pred_vals, target_names = ['Died', 'Survived'])
print(overall_report)


print('\nConfusion Matrix report')
overall_cm = confusion_matrix(all_true_vals, all_pred_vals)
print(overall_cm)


Classification Report (Fold 1):
              precision    recall  f1-score   support

        Died       0.81      0.85      0.83       110
    Survived       0.73      0.68      0.71        69

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.77       179
weighted avg       0.78      0.78      0.78       179


Confusion Matrix Report (Fold 1):
[[93 17]
 [22 47]]

Classification Report (Fold 2):
              precision    recall  f1-score   support

        Died       0.84      0.92      0.88       110
    Survived       0.84      0.72      0.78        68

    accuracy                           0.84       178
   macro avg       0.84      0.82      0.83       178
weighted avg       0.84      0.84      0.84       178


Confusion Matrix Report (Fold 2):
[[101   9]
 [ 19  49]]

Classification Report (Fold 3):
              precision    recall  f1-score   support

        Died       0.78      0.84      0.81       110
    Survived       0.70   

In [27]:
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'Importance': model.feature_importances_}).sort_values('Importance', ascending = False)

In [28]:
feature_importance.head(10)

Unnamed: 0,feature,Importance
12,Gender_oh,0.513222
0,Pclass,0.117324
2,SibSp,0.111474
10,Title_le,0.049995
5,FamilySize,0.041069
7,FarePerPerson,0.03231
4,Fare,0.026106
1,Age,0.023948
3,Parch,0.022841
9,AgeGroup_le,0.021755
