In [306]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv("../Titanic project/input/train.csv")
X = df.drop('Survived', axis=1)
y = df['Survived']
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# print(X,y)

In [307]:
X_train.isnull().sum()
# X_valid.isnull().sum()
# print(X_train)

# df[['Age', 'Cabin', 'Embarked']].isnull().sum()


PassengerId      0
Pclass           0
Name             0
Sex              0
Age            137
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          552
Embarked         2
dtype: int64

In [308]:
X_valid.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             40
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          135
Embarked         0
dtype: int64

In [309]:
import pandas as pd
import numpy as np

def detect_outliers_iqr(df, col, factor=1.5):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    return len(outliers), lower_bound, upper_bound

# Lấy danh sách các cột số
num_cols = X_train.select_dtypes(include=[np.number]).columns

# Tạo DataFrame để thống kê outlier
outlier_summary = []

for col in num_cols:
    n_outliers, lower, upper = detect_outliers_iqr(df, col)
    outlier_summary.append({
        'Column': col,
        'Outlier Count': n_outliers,
        'Lower Bound': round(lower, 2),
        'Upper Bound': round(upper, 2),
        'Outlier %': round(n_outliers / len(df) * 100, 2)
    })

outlier_df = pd.DataFrame(outlier_summary)
print(outlier_df)


        Column  Outlier Count  Lower Bound  Upper Bound  Outlier %
0  PassengerId              0      -444.00      1336.00       0.00
1       Pclass              0         0.50         4.50       0.00
2          Age             11        -6.69        64.81       1.23
3        SibSp             46        -1.50         2.50       5.16
4        Parch            213         0.00         0.00      23.91
5         Fare            116       -26.72        65.63      13.02


In [310]:
import numpy as np
import pandas as pd

def preprocess(df, mean_age=None, mode_embarked=None, fit_encoders=False, encoders=None):
    df = df.copy()

    # Nếu chưa truyền mean/mode, tính từ df (thường là train)
    if mean_age is None:
        mean_age = df['Age'].mean()
    if mode_embarked is None:
        mode_embarked = df['Embarked'].mode()[0]

    # Điền giá trị thiếu
    df['Age'] = df['Age'].fillna(mean_age)
    df['Embarked'] = df['Embarked'].fillna(mode_embarked)

    # Xử lý outlier
    df['Age'] = df['Age'].clip(0, 65)
    df['SibSp'] = df['SibSp'].clip(0, 5)
    df['Parch'] = df['Parch'].clip(0, 4)
    df['Fare'] = np.log1p(df['Fare'])

    # --- MÃ HÓA ---

    # 1️Phân nhóm tuổi (Age bins)
    bins = [0, 12, 18, 35, 60, 100]
    labels = ['Child', 'Teen', 'Adult', 'MidAge', 'Senior']
    df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

    # 2️One-hot encode cho biến phân loại
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'AgeGroup'], drop_first=True)
    df = df.drop(['Cabin', 'Age','PassengerId', 'Name', 'Ticket','SibSp', 'Parch'], axis=1)

    return df, mean_age, mode_embarked


In [311]:
X_train_prep, mean_age, mode_embarked = preprocess(X_train)
X_valid_prep, _, _ = preprocess(X_valid, mean_age, mode_embarked)


In [312]:
print(X_valid_prep)

     Pclass      Fare  Sex_male  Embarked_Q  Embarked_S  AgeGroup_Teen  \
565       3  3.224858      True       False        True          False   
160       3  2.839078      True       False        True          False   
553       3  2.107178      True       False       False          False   
860       3  2.715244      True       False        True          False   
241       3  2.803360     False        True       False          False   
..      ...       ...       ...         ...         ...            ...   
880       2  3.295837     False       False        True          False   
91        3  2.180892      True       False        True          False   
883       2  2.442347      True       False        True          False   
473       2  2.694066     False       False       False          False   
637       2  3.305054      True       False        True          False   

     AgeGroup_Adult  AgeGroup_MidAge  AgeGroup_Senior  
565            True            False            False  

In [313]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import pandas as pd

# --- Tách dữ liệu nếu chưa có ---
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Danh sách mô hình ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# --- Huấn luyện và đánh giá ---
results = []

for name, model in models.items():
    model.fit(X_train_prep, y_train)
    preds = model.predict(X_valid_prep)
    
    acc = accuracy_score(y_valid, preds)
    prec = precision_score(y_valid, preds)
    rec = recall_score(y_valid, preds)
    f1 = f1_score(y_valid, preds)
    
    results.append((name, acc, prec, rec, f1))
    print(f"📘 {name} Report:\n{classification_report(y_valid, preds)}")
    print("-" * 60)

# --- Tổng hợp kết quả ---
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])
results_df = results_df.sort_values(by="F1-Score", ascending=False)
# print("\n📊 Tổng hợp kết quả:")
print(results_df)


📘 Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       110
           1       0.71      0.68      0.70        69

    accuracy                           0.77       179
   macro avg       0.76      0.75      0.76       179
weighted avg       0.77      0.77      0.77       179

------------------------------------------------------------
📘 Decision Tree Report:
              precision    recall  f1-score   support

           0       0.80      0.85      0.83       110
           1       0.74      0.67      0.70        69

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.77       179
weighted avg       0.78      0.78      0.78       179

------------------------------------------------------------
📘 Random Forest Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       110
           1       0.81      0.72    

In [314]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Bộ tham số cần thử
param_grid = {
    'max_depth': [3, 5, 7, 9, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Tạo model và grid search
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1',   # có thể đổi thành 'accuracy' hoặc 'recall'
    cv=5,           # 5-fold cross-validation
    n_jobs=-1,      # tận dụng toàn bộ CPU
    verbose=1
)

grid_search.fit(X_train_prep, y_train)

# In ra kết quả tốt nhất
print("✅ Best Params:", grid_search.best_params_)
print("✅ Best F1-score:", grid_search.best_score_)


Fitting 5 folds for each of 90 candidates, totalling 450 fits
✅ Best Params: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
✅ Best F1-score: 0.7390793424690789


In [315]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import numpy as np

# === 1️⃣ Định nghĩa grid cần thử ===
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],       # số cây
    'max_depth': [None, 5, 8, 12, 15, 20],          # độ sâu cây
    'min_samples_split': [2, 5, 10],                # min mẫu để chia node
    'min_samples_leaf': [1, 2, 4],                  # min mẫu ở lá
    'max_features': ['sqrt', 'log2'],               # số feature xét khi chia
    'bootstrap': [True, False]                      # có dùng bootstrap không
}

# === 2️⃣ Tạo model gốc ===
rf = RandomForestClassifier(random_state=42)

# === 3️⃣ Dùng RandomizedSearchCV ===
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=30,             # thử 30 tổ hợp ngẫu nhiên
    scoring='f1',          # tối ưu theo F1
    cv=5,                  # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1              # tận dụng toàn bộ CPU
)

# === 4️⃣ Train ===
random_search.fit(X_train_prep, y_train)

# === 5️⃣ Kết quả ===
print("✅ Best Params:", random_search.best_params_)
print("✅ Best F1-score (CV):", random_search.best_score_)

# === 6️⃣ Đánh giá lại trên tập validation ===
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_valid_prep)

print("\n📊 Classification Report (Validation set):")
print(classification_report(y_valid, y_pred))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
✅ Best Params: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': False}
✅ Best F1-score (CV): 0.7566083203212021

📊 Classification Report (Validation set):
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       110
           1       0.78      0.77      0.77        69

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

