In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv("../Titanic project/input/train.csv")
X = df.drop('Survived', axis=1)
y = df['Survived']
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# print(X,y)

In [2]:
X_train.isnull().sum()
# X_valid.isnull().sum()
# print(X_train)

# df[['Age', 'Cabin', 'Embarked']].isnull().sum()


PassengerId      0
Pclass           0
Name             0
Sex              0
Age            137
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          552
Embarked         2
dtype: int64

In [3]:
X_valid.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             40
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          135
Embarked         0
dtype: int64

In [4]:
import pandas as pd
import numpy as np

def detect_outliers_iqr(df, col, factor=1.5):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    return len(outliers), lower_bound, upper_bound

# L·∫•y danh s√°ch c√°c c·ªôt s·ªë
num_cols = X_train.select_dtypes(include=[np.number]).columns

# T·∫°o DataFrame ƒë·ªÉ th·ªëng k√™ outlier
outlier_summary = []

for col in num_cols:
    n_outliers, lower, upper = detect_outliers_iqr(df, col)
    outlier_summary.append({
        'Column': col,
        'Outlier Count': n_outliers,
        'Lower Bound': round(lower, 2),
        'Upper Bound': round(upper, 2),
        'Outlier %': round(n_outliers / len(df) * 100, 2)
    })

outlier_df = pd.DataFrame(outlier_summary)
print(outlier_df)


        Column  Outlier Count  Lower Bound  Upper Bound  Outlier %
0  PassengerId              0      -444.00      1336.00       0.00
1       Pclass              0         0.50         4.50       0.00
2          Age             11        -6.69        64.81       1.23
3        SibSp             46        -1.50         2.50       5.16
4        Parch            213         0.00         0.00      23.91
5         Fare            116       -26.72        65.63      13.02


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

# ------------------- Preprocess -------------------
def preprocess(df, mean_age=None, mode_embarked=None):
    df = df.copy()

    if mean_age is None:
        mean_age = df['Age'].mean()
    if mode_embarked is None:
        mode_embarked = df['Embarked'].mode()[0]

    df['Age'] = df['Age'].fillna(mean_age)
    df['Embarked'] = df['Embarked'].fillna(mode_embarked)
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Age'] = df['Age'].clip(0,65)
    df['SibSp'] = df['SibSp'].clip(0,5)
    df['Parch'] = df['Parch'].clip(0,4)
    df['Fare'] = np.log1p(df['Fare'])

    df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(
        ['Lady', 'Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    df['Ticket_prefix'] = df['Ticket'].str.extract('([A-Za-z./]+)', expand=False)
    df['Ticket_prefix'] = df['Ticket_prefix'].fillna('NONE')
    rare_prefix = df['Ticket_prefix'].value_counts()[df['Ticket_prefix'].value_counts() < 10].index
    df['Ticket_prefix'] = df['Ticket_prefix'].replace(rare_prefix, 'Rare')
    df['Ticket_number'] = df['Ticket'].str.extract('(\d+)', expand=False)
    df['Ticket_number'] = df['Ticket_number'].fillna(0).astype(int)
    df['Ticket_number'] = np.log1p(df['Ticket_number'])

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize']==1).astype(int)

    df = pd.get_dummies(df, columns=['Sex','Embarked','Title','Ticket_prefix'], drop_first=False)
    df = df.drop(['PassengerId','Cabin','Name','Ticket'], axis=1)

    return df, mean_age, mode_embarked

# ------------------- Load data -------------------
train_df = pd.read_csv("../Titanic project/input/train.csv")
y = train_df['Survived']
X = train_df.drop('Survived', axis=1)
X_prep, mean_age, mode_embarked = preprocess(X)

# Split train/validation
X_train, X_valid, y_train, y_valid = train_test_split(X_prep, y, test_size=0.2, random_state=42)

# ------------------- Models -------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=3000, solver='saga', penalty='l2'),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# ------------------- Train & Evaluate -------------------
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    acc = accuracy_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred)
    report = classification_report(y_valid, y_pred, output_dict=True)
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": report['1']['precision'],
        "Recall": report['1']['recall'],
        "F1-Score": report['1']['f1-score']
    })

results_df = pd.DataFrame(results).sort_values(by='F1-Score', ascending=False)
print(results_df)


                 Model  Accuracy  Precision    Recall  F1-Score
4              XGBoost  0.860335   0.845070  0.810811  0.827586
1        Random Forest  0.832402   0.814286  0.770270  0.791667
0  Logistic Regression  0.810056   0.777778  0.756757  0.767123
2        Decision Tree  0.798883   0.756757  0.756757  0.756757
3                  KNN  0.748603   0.723077  0.635135  0.676259


In [6]:
X_train_prep, mean_age, mode_embarked = preprocess(X_train)
X_valid_prep, _, _ = preprocess(X_valid, mean_age, mode_embarked)


KeyError: 'Embarked'

In [None]:
print(X_valid_prep)

     Pclass      Fare  Sex_male  Embarked_Q  Embarked_S  AgeGroup_Teen  \
565       3  3.224858      True       False        True          False   
160       3  2.839078      True       False        True          False   
553       3  2.107178      True       False       False          False   
860       3  2.715244      True       False        True          False   
241       3  2.803360     False        True       False          False   
..      ...       ...       ...         ...         ...            ...   
880       2  3.295837     False       False        True          False   
91        3  2.180892      True       False        True          False   
883       2  2.442347      True       False        True          False   
473       2  2.694066     False       False       False          False   
637       2  3.305054      True       False        True          False   

     AgeGroup_Adult  AgeGroup_MidAge  AgeGroup_Senior  
565            True            False            False  

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import pandas as pd

# --- T√°ch d·ªØ li·ªáu n·∫øu ch∆∞a c√≥ ---
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Danh s√°ch m√¥ h√¨nh ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# --- Hu·∫•n luy·ªán v√† ƒë√°nh gi√° ---
results = []

for name, model in models.items():
    model.fit(X_train_prep, y_train)
    preds = model.predict(X_valid_prep)
    
    acc = accuracy_score(y_valid, preds)
    prec = precision_score(y_valid, preds)
    rec = recall_score(y_valid, preds)
    f1 = f1_score(y_valid, preds)
    
    results.append((name, acc, prec, rec, f1))
    print(f"üìò {name} Report:\n{classification_report(y_valid, preds)}")
    print("-" * 60)

# --- T·ªïng h·ª£p k·∫øt qu·∫£ ---
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])
results_df = results_df.sort_values(by="F1-Score", ascending=False)
# print("\nüìä T·ªïng h·ª£p k·∫øt qu·∫£:")
print(results_df)


üìò Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       110
           1       0.71      0.68      0.70        69

    accuracy                           0.77       179
   macro avg       0.76      0.75      0.76       179
weighted avg       0.77      0.77      0.77       179

------------------------------------------------------------
üìò Decision Tree Report:
              precision    recall  f1-score   support

           0       0.80      0.85      0.83       110
           1       0.74      0.67      0.70        69

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.77       179
weighted avg       0.78      0.78      0.78       179

------------------------------------------------------------
üìò Random Forest Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       110
           1       0.81     

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# B·ªô tham s·ªë c·∫ßn th·ª≠
param_grid = {
    'max_depth': [3, 5, 7, 9, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# T·∫°o model v√† grid search
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1',   # c√≥ th·ªÉ ƒë·ªïi th√†nh 'accuracy' ho·∫∑c 'recall'
    cv=5,           # 5-fold cross-validation
    n_jobs=-1,      # t·∫≠n d·ª•ng to√†n b·ªô CPU
    verbose=1
)

grid_search.fit(X_train_prep, y_train)

# In ra k·∫øt qu·∫£ t·ªët nh·∫•t
print("‚úÖ Best Params:", grid_search.best_params_)
print("‚úÖ Best F1-score:", grid_search.best_score_)


Fitting 5 folds for each of 90 candidates, totalling 450 fits
‚úÖ Best Params: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
‚úÖ Best F1-score: 0.7390793424690789


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import numpy as np

# === 1Ô∏è‚É£ ƒê·ªãnh nghƒ©a grid c·∫ßn th·ª≠ ===
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],       # s·ªë c√¢y
    'max_depth': [None, 5, 8, 12, 15, 20],          # ƒë·ªô s√¢u c√¢y
    'min_samples_split': [2, 5, 10],                # min m·∫´u ƒë·ªÉ chia node
    'min_samples_leaf': [1, 2, 4],                  # min m·∫´u ·ªü l√°
    'max_features': ['sqrt', 'log2'],               # s·ªë feature x√©t khi chia
    'bootstrap': [True, False]                      # c√≥ d√πng bootstrap kh√¥ng
}

# === 2Ô∏è‚É£ T·∫°o model g·ªëc ===
rf = RandomForestClassifier(random_state=42)

# === 3Ô∏è‚É£ D√πng RandomizedSearchCV ===
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=30,             # th·ª≠ 30 t·ªï h·ª£p ng·∫´u nhi√™n
    scoring='f1',          # t·ªëi ∆∞u theo F1
    cv=5,                  # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1              # t·∫≠n d·ª•ng to√†n b·ªô CPU
)

# === 4Ô∏è‚É£ Train ===
random_search.fit(X_train_prep, y_train)

# === 5Ô∏è‚É£ K·∫øt qu·∫£ ===
print("‚úÖ Best Params:", random_search.best_params_)
print("‚úÖ Best F1-score (CV):", random_search.best_score_)

# === 6Ô∏è‚É£ ƒê√°nh gi√° l·∫°i tr√™n t·∫≠p validation ===
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_valid_prep)

print("\nüìä Classification Report (Validation set):")
print(classification_report(y_valid, y_pred))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
‚úÖ Best Params: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': False}
‚úÖ Best F1-score (CV): 0.7566083203212021

üìä Classification Report (Validation set):
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       110
           1       0.78      0.77      0.77        69

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179



In [None]:
# Preprocessing to√†n b·ªô train
X_prep, mean_age, mode_embarked = preprocess(X)

# T·∫°o RandomForest v·ªõi best_params t√¨m ƒë∆∞·ª£c
best_rf_final = RandomForestClassifier(
    n_estimators=random_search.best_params_['n_estimators'],
    max_depth=random_search.best_params_['max_depth'],
    min_samples_split=random_search.best_params_['min_samples_split'],
    min_samples_leaf=random_search.best_params_['min_samples_leaf'],
    max_features=random_search.best_params_['max_features'],
    bootstrap=random_search.best_params_['bootstrap'],
    random_state=42
)

# Train model tr√™n to√†n b·ªô train
best_rf_final.fit(X_prep, y)


In [None]:
test_df = pd.read_csv("../Titanic project/input/test.csv")
X_test_prep, _, _ = preprocess(test_df, mean_age, mode_embarked)

y_test_pred = best_rf_final.predict(X_test_prep)
import pandas as pd

submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_test_pred
})

submission.to_csv('submission.csv', index=False)
