In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

# ------------------- Preprocess -------------------
def preprocess(df, mean_age=None, mode_embarked=None):
    df = df.copy()

    if mean_age is None:
        mean_age = df['Age'].mean()
    if mode_embarked is None:
        mode_embarked = df['Embarked'].mode()[0]

    df['Age'] = df['Age'].fillna(mean_age)
    df['Embarked'] = df['Embarked'].fillna(mode_embarked)
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Age'] = df['Age'].clip(0,65)
    df['SibSp'] = df['SibSp'].clip(0,5)
    df['Parch'] = df['Parch'].clip(0,4)
    df['Fare'] = np.log1p(df['Fare'])

    df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(
        ['Lady', 'Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    df['Ticket_prefix'] = df['Ticket'].str.extract('([A-Za-z./]+)', expand=False)
    df['Ticket_prefix'] = df['Ticket_prefix'].fillna('NONE')
    rare_prefix = df['Ticket_prefix'].value_counts()[df['Ticket_prefix'].value_counts() < 10].index
    df['Ticket_prefix'] = df['Ticket_prefix'].replace(rare_prefix, 'Rare')
    df['Ticket_number'] = df['Ticket'].str.extract('(\d+)', expand=False)
    df['Ticket_number'] = df['Ticket_number'].fillna(0).astype(int)
    df['Ticket_number'] = np.log1p(df['Ticket_number'])

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize']==1).astype(int)

    df = pd.get_dummies(df, columns=['Sex','Embarked','Title','Ticket_prefix'], drop_first=False)
    df = df.drop(['PassengerId','Cabin','Name','Ticket'], axis=1)

    return df, mean_age, mode_embarked

# ------------------- Load data -------------------
train_df = pd.read_csv("../Titanic project/input/train.csv")
y = train_df['Survived']
X = train_df.drop('Survived', axis=1)
# X_prep, mean_age, mode_embarked = preprocess(X)


X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, mean_age, mode_embarked = preprocess(X_train_raw)

X_valid, _, _ = preprocess(X_valid_raw, mean_age=mean_age, mode_embarked=mode_embarked)

# ⚠ Fix cột thiếu ở validation (one-hot)
for col in X_train.columns:
    if col not in X_valid.columns:
        X_valid[col] = 0
X_valid = X_valid[X_train.columns]  # sắp xếp theo train
# ------------------- Models -------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=3000, solver='saga', penalty='l2'),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# ------------------- Train & Evaluate -------------------
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
results = []

for name, model in models.items():
    # Fit trên train set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)    


    if name == "XGBoost":
        skf = StratifiedKFold(n_splits=5)
        f1_scores = []
        for train_idx, val_idx in skf.split(X_train, y_train):
            X_tr, X_va = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_va = y_train.iloc[train_idx], y_train.iloc[val_idx]
            model.fit(X_tr, y_tr)
            y_pred_va = model.predict(X_va)
            f1_scores.append(f1_score(y_va, y_pred_va))
        mean_cv_f1 = np.mean(f1_scores)
        acc = accuracy_score(y_valid, y_pred)
        f1_val = f1_score(y_valid, y_pred)    
        report = classification_report(y_valid, y_pred, output_dict=True)
        results.append({
            "Model": name,
            "Accuracy": acc,
            "Precision": report['1']['precision'],
            "Recall": report['1']['recall'],
            "F1-Score": f1_val,
            "Mean_CV_F1": mean_cv_f1
        })
    else:
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
        mean_cv_f1 = cv_scores.mean()
        acc = accuracy_score(y_valid, y_pred)
        f1_val = f1_score(y_valid, y_pred)    
        report = classification_report(y_valid, y_pred, output_dict=True)
        results.append({
            "Model": name,
            "Accuracy": acc,
            "Precision": report['1']['precision'],
            "Recall": report['1']['recall'],
            "F1-Score": f1_val,
            "Mean_CV_F1": mean_cv_f1
        })

results_df = pd.DataFrame(results).sort_values(by='F1-Score', ascending=False)
print(results_df)


from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
def optimize_logistic_regression(X_train, y_train, X_valid, y_valid, use_polynomial=True, verbose=True):
    """
    Optimize Logistic Regression với:
    - Scaling
    - L1 feature selection
    - Polynomial features (tùy chọn)
    - GridSearchCV (tuning C, penalty, l1_ratio)
    
    Trả về dict với kết quả Best Params, CV F1, Validation Accuracy/F1/Precision/Recall
    """

    if verbose: print("Step 1: Scaling features...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_valid_scaled = scaler.transform(X_valid)

    if verbose: print("Step 2: Feature selection (L1 penalty)...")
    selector = SelectFromModel(
        LogisticRegression(penalty='l1', solver='saga', C=1, max_iter=5000, random_state=42)
    )
    selector.fit(X_train_scaled, y_train)
    X_train_sel = selector.transform(X_train_scaled)
    X_valid_sel = selector.transform(X_valid_scaled)
    if verbose: print(f"Selected {X_train_sel.shape[1]} features from {X_train_scaled.shape[1]} original features.")

    if use_polynomial:
        if verbose: print("Step 3: Generating Polynomial Features (degree=2)...")
        poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
        X_train_poly = poly.fit_transform(X_train_sel)
        X_valid_poly = poly.transform(X_valid_sel)
        if verbose: print(f"Polynomial features expanded to {X_train_poly.shape[1]} features.")
    else:
        X_train_poly, X_valid_poly = X_train_sel, X_valid_sel

    if verbose: print("Step 4: GridSearchCV for Logistic Regression hyperparameters...")
    param_grid = {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'l1_ratio': [0, 0.5, 1]  # chỉ dùng khi penalty='elasticnet'
    }
    logreg = LogisticRegression(max_iter=5000, solver='saga', random_state=42)
    grid = GridSearchCV(logreg, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
    grid.fit(X_train_poly, y_train)

    if verbose: print("Step 5: Evaluating on validation set...")
    y_pred = grid.predict(X_valid_poly)
    acc = accuracy_score(y_valid, y_pred)
    f1_val = f1_score(y_valid, y_pred)
    report = classification_report(y_valid, y_pred, output_dict=True)

    results = {
        'Best Params': grid.best_params_,
        'Best CV F1': grid.best_score_,
        'Validation Accuracy': acc,
        'Validation F1': f1_val,
        'Precision': report['1']['precision'],
        'Recall': report['1']['recall']
    }

    if verbose:
        print("\n=== Logistic Regression Optimization Results ===")
        for k, v in results.items():
            print(f"{k}: {v}")

    return results

# Giả sử bạn đã preprocess X_train và X_valid
logreg_results = optimize_logistic_regression(X_train, y_train, X_valid, y_valid, use_polynomial=True)
print(logreg_results)



                 Model  Accuracy  Precision    Recall  F1-Score  Mean_CV_F1
4              XGBoost  0.860335   0.845070  0.810811  0.827586    0.725880
1        Random Forest  0.826816   0.811594  0.756757  0.783217    0.735135
0  Logistic Regression  0.810056   0.777778  0.756757  0.767123    0.760594
2        Decision Tree  0.782123   0.733333  0.743243  0.738255    0.721761
3                  KNN  0.748603   0.723077  0.635135  0.676259    0.633657
Step 1: Scaling features...
Step 2: Feature selection (L1 penalty)...
Selected 19 features from 24 original features.
Step 3: Generating Polynomial Features (degree=2)...
Polynomial features expanded to 209 features.
Step 4: GridSearchCV for Logistic Regression hyperparameters...
Fitting 5 folds for each of 36 candidates, totalling 180 fits




Step 5: Evaluating on validation set...

=== Logistic Regression Optimization Results ===
Best Params: {'C': 0.01, 'l1_ratio': 0, 'penalty': 'l2'}
Best CV F1: 0.7661766834393332
Validation Accuracy: 0.8044692737430168
Validation F1: 0.75177304964539
Precision: 0.7910447761194029
Recall: 0.7162162162162162
{'Best Params': {'C': 0.01, 'l1_ratio': 0, 'penalty': 'l2'}, 'Best CV F1': np.float64(0.7661766834393332), 'Validation Accuracy': 0.8044692737430168, 'Validation F1': 0.75177304964539, 'Precision': 0.7910447761194029, 'Recall': 0.7162162162162162}
