### Import necessary libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Load train.csv file

In [2]:
# titanic = pd.read_csv('/kaggle/input/titanic/train.csv')
titanic = pd.read_csv('../input/train.csv')
df = titanic.copy()
test_dataset = pd.read_csv('../input/test.csv')
df_test = test_dataset.copy()
print("Successfully load training data.")

Successfully load training data.


## Handle missing value

In [3]:
# Age
master = df['Name'].str.contains(r',\s*Master.', regex=True)
df_master = df[master].copy()

mean_age = df['Age'].mean()
mean_age_master = df_master['Age'].mean()

df_master['Age'] = df_master['Age'].fillna(mean_age_master)
df[master] = df_master

df['Age'] = df['Age'].fillna(mean_age)
df_test['Age'] = df_test['Age'].fillna(mean_age)

# Cabin
df = df.drop(['Cabin'], axis=1)
df_test = df_test.drop(['Cabin'], axis=1)

# Embarked and Fare
df['Embarked'] = df['Embarked'].fillna("S")
df_test['Embarked'] = df_test['Embarked'].fillna("S")

fare_mean = df['Fare'].mean()
df_test['Fare'] = df_test['Fare'].fillna(fare_mean)

## Feature Engineering

In [4]:
# Age_bin
df['Age_bin'] = pd.cut(df['Age'], bins=[-0.1, 12, 19, 26, 33, 46, 61, 100], labels=False)
df['Age_bin'] = df['Age_bin'].astype('category')
df_test['Age_bin'] = pd.cut(df_test['Age'], bins=[-0.1, 12, 19, 26, 33, 46, 61, 100], labels=False)
df_test['Age_bin'] = df_test['Age_bin'].astype('category')

# Fare_bin
df['Fare_bin'] = pd.cut(df['Fare'], bins=[0, 7, 8, 10, 14.45, 20, 31, 50, 100, 1000000], labels=False)
df_test['Fare_bin'] = pd.cut(df_test['Fare'], bins=[0, 7, 8, 10, 14.45, 20, 31, 50, 100, 1000000], labels=False)

missing_fare_bin = df[df['Fare_bin'].isnull() & (df['Fare'] == 0.0)]
df.loc[missing_fare_bin.index, 'Fare_bin'] = 0

missing_fare_bin_test = df_test[df_test['Fare_bin'].isnull() & (df_test['Fare'] == 0.0)]
df_test.loc[missing_fare_bin_test.index, 'Fare_bin'] = 0

df['Fare_bin'] = df['Fare_bin'].astype('category')
df_test['Fare_bin'] = df_test['Fare_bin'].astype('category')

# FamilySize
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1

# FamilyGroup
df['FamilyGroup'] = pd.cut(df['FamilySize'], bins=[0, 1, 4, 7, 20], labels=False)
df['FamilyGroup'] = df['FamilyGroup'].astype('category')

df_test['FamilyGroup'] = pd.cut(df_test['FamilySize'], bins=[0, 1, 4, 7, 20], labels=False)
df_test['FamilyGroup'] = df_test['FamilyGroup'].astype('category')

In [5]:
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    int64   
 3   Name         891 non-null    object  
 4   Sex          891 non-null    object  
 5   Age          891 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int64   
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Embarked     891 non-null    object  
 11  Age_bin      891 non-null    category
 12  Fare_bin     891 non-null    category
 13  FamilySize   891 non-null    int64   
 14  FamilyGroup  891 non-null    category
dtypes: category(3), float64(2), int64(6), object(4)
memory usage: 87.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (to

In [6]:
df = df.drop(['PassengerId', 'Name', 'Ticket', 'FamilyGroup', 'SibSp', 'Parch'], axis=1) # remove these feature because 1. not analyze it yet and 2. have other things to show that features (Pclass instead of fare and cabin and ticket)
df_test = df_test.drop(['PassengerId', 'Name', 'Ticket', 'FamilyGroup', 'SibSp', 'Parch'], axis=1)

df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df_test['Sex'] = df_test['Sex'].map({'male': 0, 'female': 1})

df['Embarked'] = df['Embarked'].map({'C': 0, 'S': 1, 'Q': 2})
df_test['Embarked'] = df_test['Embarked'].map({'C': 0, 'S': 1, 'Q': 2})
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Survived    891 non-null    int64   
 1   Pclass      891 non-null    int64   
 2   Sex         891 non-null    int64   
 3   Age         891 non-null    float64 
 4   Fare        891 non-null    float64 
 5   Embarked    891 non-null    int64   
 6   Age_bin     891 non-null    category
 7   Fare_bin    891 non-null    category
 8   FamilySize  891 non-null    int64   
dtypes: category(2), float64(2), int64(5)
memory usage: 51.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Pclass      418 non-null    int64   
 1   Sex         418 non-null    int64   
 2   Age         418 non-null    float64 
 3   Fare        418 non-null    float64 
 4   Embarked 

## Model (AI generated)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

df_input = df.drop(['Survived'], axis=1)
df_outcome = df['Survived']

# Prepare input and output
X = df_input
y = df_outcome.astype(int)  # Ensure target is integer

# Split data
# Try multiple models and compare their performance

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "SVC": SVC(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

results = {}
for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name}: Validation Accuracy = {acc:.4f}")

best_model_name = max(results, key=results.get)
print(f"\nBest model: {best_model_name} with accuracy {results[best_model_name]:.4f}")

# Use the best model for further prediction
model = models[best_model_name]

# Train model
# model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Save model to file
joblib.dump(model, "feature_titanic_random_forest_model.joblib")
print("Model saved to feature_titanic_random_forest_model.joblib")

Logistic Regression: Validation Accuracy = 0.8097
Decision Tree: Validation Accuracy = 0.7799
Random Forest: Validation Accuracy = 0.8134
Gradient Boosting: Validation Accuracy = 0.8022
AdaBoost: Validation Accuracy = 0.7985
SVC: Validation Accuracy = 0.6567
KNN: Validation Accuracy = 0.7052
Naive Bayes: Validation Accuracy = 0.7799

Best model: Random Forest with accuracy 0.8134
Validation Accuracy: 0.8134
Model saved to feature_titanic_random_forest_model.joblib


## Hyperparameter Tunning

In [8]:
# Add these cells after your model selection code

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Assuming 'model' is the best model from your selection
best_model = model

# Define param_grids for each possible model
param_grids = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    },
    "Decision Tree": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.8, 0.9, 1.0]
    },
    "AdaBoost": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    },
    "SVC": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    },
    "KNN": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    },
    "Naive Bayes": {}  # No hyperparameters to tune for GaussianNB
}

# Get the param_grid for the best model
model_name = best_model_name  # From your earlier code
if model_name in param_grids:
    param_grid = param_grids[model_name]
    if param_grid:  # Only tune if there are params
        grid_search = RandomizedSearchCV(
            estimator=best_model,
            param_distributions=param_grid,
            cv=10,
            scoring='accuracy',
            n_jobs=-1,
            verbose=2
        )
        grid_search.fit(X_train, y_train)
        tuned_model = grid_search.best_estimator_
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best CV score: {grid_search.best_score_:.4f}")
    else:
        tuned_model = best_model  # No tuning needed
        print("No hyperparameters to tune for this model.")
else:
    tuned_model = best_model
    print("Param grid not defined for this model.")

# Evaluate tuned model
y_pred_tuned = tuned_model.predict(X_test)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
print(f"Tuned Model Validation Accuracy: {accuracy_tuned:.4f}")

# Save tuned model
joblib.dump(tuned_model, f"feature_titanic_tuned_{model_name.lower().replace(' ', '_')}_model.joblib")
print(f"Tuned model saved as feature_titanic_tuned_{model_name.lower().replace(' ', '_')}_model.joblib")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best parameters: {'n_estimators': 100, 'min_samples_split': 10, 'max_depth': 10}
Best CV score: 0.8345
Tuned Model Validation Accuracy: 0.8284
Tuned model saved as feature_titanic_tuned_random_forest_model.joblib


## Evaluate performance (AI Generated)

In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate model performance on test set
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[136  21]
 [ 29  82]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       157
           1       0.80      0.74      0.77       111

    accuracy                           0.81       268
   macro avg       0.81      0.80      0.81       268
weighted avg       0.81      0.81      0.81       268



## Prediction on test sets (AI Generated)

In [10]:
# Drop columns not used in training
X_submit = df_test

submit_pred = model.predict(X_submit)

# Prepare submission DataFrame
submission = pd.DataFrame({
    'PassengerId': test_dataset['PassengerId'],
    'Survived': submit_pred.astype(int)
})

# Export to CSV
submission.to_csv('feature_submit_prediction.csv', index=False)
print("Prediction file saved as feature_submit_prediction.csv")

Prediction file saved as feature_submit_prediction.csv
