# Titanic v2 – Feature engineering avancé + XGBoost

Features ajoutées : Title, AgeBand, FareBand.


## 1. Imports

In [9]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


## 2. Chargement des données brutes

In [10]:
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 3. Feature engineering avancé + nettoyage

In [11]:
os.makedirs('../data/clean', exist_ok=True)
os.makedirs('../data/result', exist_ok=True)

test['Survived'] = np.nan
train['is_train'] = 1
test['is_train'] = 0

full = pd.concat([train, test], ignore_index=True)

full['Title'] = full['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
title_map = {
    'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare', 'Mlle': 'Miss',
    'Mme': 'Mrs', 'Lady': 'Rare', 'Countess': 'Rare', 'Jonkheer': 'Rare',
    'Sir': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Capt': 'Rare'
}
full['Title'] = full['Title'].map(title_map).fillna('Rare')

full = full.drop(columns=['Ticket', 'Cabin'])

full['Age'] = full['Age'].fillna(full['Age'].median())
full['Fare'] = full['Fare'].fillna(full['Fare'].median())
full['Embarked'] = full['Embarked'].fillna(full['Embarked'].mode()[0])

full['AgeBand'] = pd.cut(full['Age'], 5, labels=False)
full['FareBand'] = pd.qcut(full['Fare'], 4, labels=False, duplicates='drop')

full['Sex'] = full['Sex'].map({'male': 0, 'female': 1})
full['FamilySize'] = full['SibSp'] + full['Parch'] + 1
full['IsAlone'] = (full['FamilySize'] == 1).astype(int)

full = pd.get_dummies(full, columns=['Embarked', 'Pclass', 'Title'], drop_first=True)

bool_cols = full.select_dtypes(include=['bool']).columns
full[bool_cols] = full[bool_cols].astype(int)

if 'Name' in full.columns:
    full = full.drop(columns=['Name'])

full.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,is_train,AgeBand,FareBand,FamilySize,IsAlone,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,1,0.0,0,22.0,1,0,7.25,1,1,0,2,0,0,1,0,1,0,1,0,0
1,2,1.0,1,38.0,1,0,71.2833,1,2,3,2,0,0,0,0,0,0,0,1,0
2,3,1.0,1,26.0,0,0,7.925,1,1,1,1,1,0,1,0,1,1,0,0,0
3,4,1.0,1,35.0,1,0,53.1,1,2,3,2,0,0,1,0,0,0,0,1,0
4,5,0.0,0,35.0,0,0,8.05,1,2,1,1,1,0,1,0,1,0,1,0,0


## 4. Séparation train_clean_v2 / test_clean_v2 et sauvegarde CSV

In [12]:
train_clean_v2 = full[full['is_train'] == 1].drop(columns=['is_train'])
test_clean_v2 = full[full['is_train'] == 0].drop(columns=['is_train', 'Survived'])

train_clean_v2.to_csv('../data/clean/train_clean_v2.csv', index=False)
test_clean_v2.to_csv('../data/clean/test_clean_v2.csv', index=False)

train_clean_v2.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,AgeBand,FareBand,FamilySize,IsAlone,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,1,0.0,0,22.0,1,0,7.25,1,0,2,0,0,1,0,1,0,1,0,0
1,2,1.0,1,38.0,1,0,71.2833,2,3,2,0,0,0,0,0,0,0,1,0
2,3,1.0,1,26.0,0,0,7.925,1,1,1,1,0,1,0,1,1,0,0,0
3,4,1.0,1,35.0,1,0,53.1,2,3,2,0,0,1,0,0,0,0,1,0
4,5,0.0,0,35.0,0,0,8.05,2,1,1,1,0,1,0,1,0,1,0,0


In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

logreg = LogisticRegression(max_iter=5000)

logreg.fit(X_train_scaled, y_train)
logreg_pred = logreg.predict(X_val_scaled)

print("Accuracy LogReg :", accuracy_score(y_val, logreg_pred))
print(classification_report(y_val, logreg_pred))
print(confusion_matrix(y_val, logreg_pred))

X_scaled = scaler.fit_transform(X)

X_test_final_scaled = scaler.transform(
    test_clean_v2.drop(columns=["PassengerId"])
)

pred_test = logreg.predict(X_test_final_scaled).astype(int)

submission = pd.DataFrame({
    "PassengerId": test_clean_v2["PassengerId"],
    "Survived": pred_test
})

submission.to_csv("../data/result/submission_logreg.csv", index=False)
submission.head()


Accuracy LogReg : 0.8491620111731844
              precision    recall  f1-score   support

         0.0       0.86      0.90      0.88       110
         1.0       0.83      0.77      0.80        69

    accuracy                           0.85       179
   macro avg       0.84      0.83      0.84       179
weighted avg       0.85      0.85      0.85       179

[[99 11]
 [16 53]]


Unnamed: 0,PassengerId,Survived
891,892,0
892,893,0
893,894,0
894,895,0
895,896,1
