In [5]:
# 📦 Importation des bibliothèques
import os 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Working directory for the notebook; later cells open their CSV files with
# relative paths. The location can be overridden through the
# TITANIC_PROJECT_DIR environment variable so the notebook is not tied to one
# machine; when the variable is unset the original path is used unchanged.
PROJECT_DIR = os.environ.get(
    "TITANIC_PROJECT_DIR",
    "C:/Users/LENOVO/OneDrive/Bureau/projet/titanic-project",
)
os.chdir(PROJECT_DIR)  # raises if the directory does not exist


In [6]:
# 📂 Load the training and test sets (paths are relative to the working directory)
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Keep the test passenger IDs aside; they are needed for the submission file
test_ids = test_df["PassengerId"]


In [7]:
# 🛠 Derived features added in place to both frames: FamilySize, IsAlone, Title

# Titles collapsed into the single category 'Rare'
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
# Variant titles mapped onto the title they are merged with
TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for df in [train_df, test_df]:
    # The passenger counts as a member of their own family, hence the +1
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # The title is the run of letters that follows a space and precedes a dot.
    # The pattern is a raw string so that `\.` reaches the regex engine as
    # written instead of being an invalid Python string escape.
    df['Title'] = (
        df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        .replace(RARE_TITLES, 'Rare')
        .replace(TITLE_SYNONYMS)
    )


In [8]:
# 🧹 Drop the columns that are not kept for modelling
unused_cols = ['Cabin', 'Ticket', 'Name']
train_df = train_df.drop(columns=unused_cols)
test_df = test_df.drop(columns=unused_cols)

# 🧼 Missing-value imputation. Every fill value comes from the training set
# and is computed once, then applied to the frames below.
age_fill = train_df['Age'].median()
embarked_fill = train_df['Embarked'].mode()[0]
fare_fill = train_df['Fare'].median()

train_df['Age'] = train_df['Age'].fillna(age_fill)
test_df['Age'] = test_df['Age'].fillna(age_fill)

train_df['Embarked'] = train_df['Embarked'].fillna(embarked_fill)
test_df['Embarked'] = test_df['Embarked'].fillna(embarked_fill)

# Fare is only filled on the test frame
test_df['Fare'] = test_df['Fare'].fillna(fare_fill)




In [9]:
# 🔢 Encode Sex as an integer (female -> 0, male -> 1).
# Series.map turns any value that is not a key of the dict into NaN, so
# re-running this cell on already-encoded frames would blank the column.
SEX_CODES = {'female': 0, 'male': 1}
for df in [train_df, test_df]:
    df['Sex'] = df['Sex'].map(SEX_CODES)

# 🧠 One-hot encoding of Embarked and Title.
# Both frames are cast to a categorical dtype built from the *training* levels
# before encoding. This guarantees that train and test produce the same dummy
# columns in the same order and that drop_first removes the same level in
# both, even when a level is absent from (or only present in) the test set.
# A test value unseen in training becomes NaN and is encoded as all zeros.
ONE_HOT_COLS = ['Embarked', 'Title']
shared_dtypes = {
    col: pd.CategoricalDtype(categories=sorted(train_df[col].dropna().unique()))
    for col in ONE_HOT_COLS
}
train_df = pd.get_dummies(train_df.astype(shared_dtypes),
                          columns=ONE_HOT_COLS, drop_first=True)
test_df = pd.get_dummies(test_df.astype(shared_dtypes),
                         columns=ONE_HOT_COLS, drop_first=True)


In [10]:
# 🔁 Restrict the test frame to the training feature columns, in the same order.
# A training column missing from the test frame raises a KeyError here.
train_cols = train_df.drop(columns='Survived').columns
test_df = test_df.loc[:, train_cols]


In [11]:
# 🎯 Split the training frame into the target vector y and the feature matrix X
# NOTE(review): no earlier step drops PassengerId, so it appears to still be
# among the features — confirm that is intended.
y = train_df['Survived']
X = train_df.drop(columns='Survived')


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the training rows for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest classifier: 200 trees, depth capped at 8, fixed seed
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    random_state=42,
)
model.fit(X_train, y_train)

# Predictions on the held-out validation rows
y_pred = model.predict(X_val)

# Report accuracy and the per-class precision / recall / F1 table
print(
    "🔍 Accuracy sur validation :",
    accuracy_score(y_val, y_pred),
)
print(
    classification_report(y_val, y_pred)
)


🔍 Accuracy sur validation : 0.8212290502793296
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       105
           1       0.82      0.73      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [13]:
# 🔮 Predict on the prepared test frame with the fitted model
final_preds = model.predict(X=test_df)


In [14]:
# 💾 Build the submission table (passenger id + predicted label) and write it
# to submission.csv without the index column
submission = (
    test_ids
    .to_frame(name="PassengerId")
    .assign(Survived=final_preds)
)

submission.to_csv("submission.csv", index=False)
print("✅ Fichier submission.csv généré avec succès.")


✅ Fichier submission.csv généré avec succès.
