In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv("../Titanic project/input/train.csv")
X = df.drop('Survived', axis=1)
y = df['Survived']
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# print(X,y)

In [93]:
X_train.isnull().sum()
# X_valid.isnull().sum()
# print(X_train)

# df[['Age', 'Cabin', 'Embarked']].isnull().sum()


PassengerId      0
Pclass           0
Name             0
Sex              0
Age            137
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          552
Embarked         2
dtype: int64

In [94]:
X_valid.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             40
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          135
Embarked         0
dtype: int64

In [95]:
import pandas as pd
import numpy as np

def detect_outliers_iqr(df, col, factor=1.5):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    return len(outliers), lower_bound, upper_bound

# Lấy danh sách các cột số
num_cols = X_train.select_dtypes(include=[np.number]).columns

# Tạo DataFrame để thống kê outlier
outlier_summary = []

for col in num_cols:
    n_outliers, lower, upper = detect_outliers_iqr(df, col)
    outlier_summary.append({
        'Column': col,
        'Outlier Count': n_outliers,
        'Lower Bound': round(lower, 2),
        'Upper Bound': round(upper, 2),
        'Outlier %': round(n_outliers / len(df) * 100, 2)
    })

outlier_df = pd.DataFrame(outlier_summary)
print(outlier_df)


        Column  Outlier Count  Lower Bound  Upper Bound  Outlier %
0  PassengerId              0      -444.00      1336.00       0.00
1       Pclass              0         0.50         4.50       0.00
2          Age             11        -6.69        64.81       1.23
3        SibSp             46        -1.50         2.50       5.16
4        Parch            213         0.00         0.00      23.91
5         Fare            116       -26.72        65.63      13.02


In [96]:
# import numpy as np
# import pandas as pd

# def preprocess(df, mean_age=None, mode_embarked=None, fit_encoders=False, encoders=None):
#     df = df.copy()

#     # Nếu chưa truyền mean/mode, tính từ df (thường là train)
#     if mean_age is None:
#         mean_age = df['Age'].mean()
#     if mode_embarked is None:
#         mode_embarked = df['Embarked'].mode()[0]

#     # Điền giá trị thiếu
#     df['Age'] = df['Age'].fillna(mean_age)
#     df['Embarked'] = df['Embarked'].fillna(mode_embarked)

#     # Xử lý outlier
#     df['Age'] = df['Age'].clip(0, 65)
#     df['SibSp'] = df['SibSp'].clip(0, 5)
#     df['Parch'] = df['Parch'].clip(0, 4)
#     df['Fare'] = np.log1p(df['Fare'])

#     # --- MÃ HÓA ---

#     # 1️Phân nhóm tuổi (Age bins)
#     bins = [0, 12, 18, 35, 60, 100]
#     labels = ['Child', 'Teen', 'Adult', 'MidAge', 'Senior']
#     df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

#     # 2️One-hot encode cho biến phân loại
#     df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'AgeGroup'], drop_first=True)
#     df = df.drop(['Cabin', 'Age','PassengerId', 'Name', 'Ticket','SibSp', 'Parch'], axis=1)

#     return df, mean_age, mode_embarked


import numpy as np
import pandas as pd

def preprocess(df, mean_age=None, mode_embarked=None):
    df = df.copy()

    if mean_age is None:
        mean_age = df['Age'].mean()
    if mode_embarked is None:
        mode_embarked = df['Embarked'].mode()[0]

    df['Age'] = df['Age'].fillna(mean_age)
    df['Embarked'] = df['Embarked'].fillna(mode_embarked)
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Age'] = df['Age'].clip(0,65)
    df['SibSp'] = df['SibSp'].clip(0,5)
    df['Parch'] = df['Parch'].clip(0,4)
    df['Fare'] = np.log1p(df['Fare'])

    df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(
        ['Lady', 'Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    df['Ticket_prefix'] = df['Ticket'].str.extract('([A-Za-z./]+)', expand=False)
    df['Ticket_prefix'] = df['Ticket_prefix'].fillna('NONE')
    rare_prefix = df['Ticket_prefix'].value_counts()[df['Ticket_prefix'].value_counts() < 10].index
    df['Ticket_prefix'] = df['Ticket_prefix'].replace(rare_prefix, 'Rare')
    df['Ticket_number'] = df['Ticket'].str.extract('(\d+)', expand=False)
    df['Ticket_number'] = df['Ticket_number'].fillna(0).astype(int)
    df['Ticket_number'] = np.log1p(df['Ticket_number'])

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize']==1).astype(int)

    df = pd.get_dummies(df, columns=['Sex','Embarked','Title','Ticket_prefix'], drop_first=False)
    df = df.drop(['PassengerId','Cabin','Name','Ticket'], axis=1)

    return df, mean_age, mode_embarked


  df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=False)
  df['Ticket_number'] = df['Ticket'].str.extract('(\d+)', expand=False)


In [97]:
X_train_prep, mean_age, mode_embarked = preprocess(X_train)
X_valid_prep, _, _ = preprocess(X_valid, mean_age, mode_embarked)
columns_train = X_train_prep.columns
for col in columns_train:
    if col not in X_valid_prep.columns:
        X_valid_prep[col] = 0
X_valid_prep = X_valid_prep[columns_train]

In [98]:
print(X_train_prep)

     Pclass        Age  SibSp  Parch      Fare  Ticket_number  FamilySize  \
692       3  29.807687      0      0  4.051712       7.379008           1   
481       2  29.807687      0      0  0.000000      12.387790           1   
527       1  29.807687      0      0  5.406181       9.769041           1   
855       3  18.000000      0      1  2.336987      12.879252           2   
801       2  31.000000      1      1  3.305054      10.371051           3   
..      ...        ...    ...    ...       ...            ...         ...   
359       3  29.807687      0      0  2.183711      12.709816           1   
258       1  35.000000      0      0  6.240917       9.784479           1   
736       3  48.000000      1      3  3.566005       8.796188           5   
462       1  47.000000      0      0  3.676301      11.620173           1   
507       1  29.807687      0      0  3.316003      11.621134           1   

     IsAlone  Sex_female  Sex_male  ...  Title_Miss  Title_Mr  Title_Mrs  \

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import pandas as pd

# --- Tách dữ liệu nếu chưa có ---
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Danh sách mô hình ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# --- Huấn luyện và đánh giá ---
results = []

for name, model in models.items():
    model.fit(X_train_prep, y_train)
    preds = model.predict(X_valid_prep)
    
    acc = accuracy_score(y_valid, preds)
    prec = precision_score(y_valid, preds)
    rec = recall_score(y_valid, preds)
    f1 = f1_score(y_valid, preds)
    
    results.append((name, acc, prec, rec, f1))
    print(f"📘 {name} Report:\n{classification_report(y_valid, preds)}")
    print("-" * 60)

# --- Tổng hợp kết quả ---
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])
results_df = results_df.sort_values(by="F1-Score", ascending=False)
# print("\n📊 Tổng hợp kết quả:")
print(results_df)


📘 Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.85      0.90      0.88       110
           1       0.83      0.75      0.79        69

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179

------------------------------------------------------------
📘 Decision Tree Report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       110
           1       0.68      0.65      0.67        69

    accuracy                           0.75       179
   macro avg       0.73      0.73      0.73       179
weighted avg       0.75      0.75      0.75       179

------------------------------------------------------------
📘 Random Forest Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       110
           1       0.77      0.74    

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📘 XGBoost Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       110
           1       0.77      0.74      0.76        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179

------------------------------------------------------------
                 Model  Accuracy  Precision    Recall  F1-Score
0  Logistic Regression  0.843575   0.825397  0.753623  0.787879
2        Random Forest  0.815642   0.772727  0.739130  0.755556
4              XGBoost  0.815642   0.772727  0.739130  0.755556
1        Decision Tree  0.748603   0.681818  0.652174  0.666667
3                  KNN  0.759777   0.732143  0.594203  0.656000


In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

# --- Logistic Regression ---
logreg = LogisticRegression(random_state=42, max_iter=5000)

# --- Grid / distribution cho RandomizedSearch ---
param_dist = {
    'C': np.logspace(-3, 3, 20),          # thử C từ 0.001 → 1000
    'penalty': ['l1', 'l2', 'elasticnet'], # loại regularization
    'solver': ['saga'],                    # saga hỗ trợ l1/l2/elasticnet
    'class_weight': [None, 'balanced'],
    'l1_ratio': np.linspace(0,1,10)       # chỉ dùng nếu penalty='elasticnet'
}

# --- RandomizedSearchCV ---
random_search_lr = RandomizedSearchCV(
    estimator=logreg,
    param_distributions=param_dist,
    n_iter=30,
    scoring='f1',       # tối ưu theo F1
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)


# --- Train ---
random_search_lr.fit(X_train_prep, y_train)

# --- Kết quả ---
print("✅ Best Params:", random_search_lr.best_params_)
print("✅ Best F1-score (CV):", random_search_lr.best_score_)

# --- Validation ---
best_lr = random_search_lr.best_estimator_
y_pred = best_lr.predict(X_valid_prep)
from sklearn.metrics import classification_report
print(classification_report(y_valid, y_pred))


Fitting 5 folds for each of 30 candidates, totalling 150 fits




✅ Best Params: {'solver': 'saga', 'penalty': 'l1', 'l1_ratio': np.float64(0.5555555555555556), 'class_weight': 'balanced', 'C': np.float64(6.158482110660261)}
✅ Best F1-score (CV): 0.7572208490349419
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       110
           1       0.76      0.83      0.79        69

    accuracy                           0.83       179
   macro avg       0.82      0.83      0.83       179
weighted avg       0.84      0.83      0.83       179



In [101]:
import pandas as pd

test_df = pd.read_csv("../Titanic project/input/test.csv")

# 2️⃣ Preprocess train
Df_Final, mean_age, mode_embarked = preprocess(X)

# 3️⃣ Preprocess test (dùng mean và mode từ train)
X_test_prep, _, _ = preprocess(test_df, mean_age, mode_embarked)

# 4️⃣ Đồng bộ cột
for col in columns_train:
    if col not in X_test_prep.columns:
        X_test_prep[col] = 0
X_test_prep = X_test_prep[columns_train]

best_params = random_search_lr.best_params_
best_lr_final =  LogisticRegression(
    **best_params,
    random_state=42,
    max_iter=10000,
)
best_lr_final.fit(Df_Final, y)
# 4️⃣ Dự đoán trên test set đã preprocess
y_test_pred = best_lr_final.predict(X_test_prep)

# 5️⃣ Xuất file submission
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_test_pred
})
submission.to_csv('submission.csv', index=False)


