In [2]:
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For data preprocessing
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # Evaluation metrics
import tensorflow as tf  # For building Neural Network
from tensorflow.keras.models import Sequential  # Sequential model for Neural Network
from tensorflow.keras.layers import Dense  # Dense layer for Neural Network

In [7]:
# Load the Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(url)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
# Data Preprocessing
# ------------------
# Drop unnecessary columns that are not useful for prediction
data = data.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1)

# Handle missing values

# Fill missing 'Age' values with the median age
data['Age'].fillna(data['Age'].median(), inplace=True)

# Fill missing 'Embarked' values with the most frequent value (mode)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [10]:
# Convert categorical variables to numerical using Label Encoding
le = LabelEncoder()
data['Sex'] = le.fit_transform(data['Sex'])  # Convert 'Sex' to numerical (0 for female, 1 for male)
data['Embarked'] = le.fit_transform(data['Embarked'])  # Convert 'Embarked' to numerical


In [15]:
# Define features (X) and target (y)
# Features: All columns except 'Survived'
# Target: 'Survived' column (0 = Did not survive, 1 = Survived)
X = data.drop(['Survived','Name'], axis=1)
y = data['Survived']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
# Standardize the features to have zero mean and unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # Fit and transform the training data
X_test = scaler.transform(X_test)  # Transform the test data using the same scaler


In [18]:
# Model 1: Logistic Regression
# ----------------------------
lr_model = LogisticRegression(random_state=42)  # Initialize Logistic Regression model
lr_model.fit(X_train, y_train)  # Train the model on the training data
y_pred_lr = lr_model.predict(X_test)  # Predict on the test data

# Model 2: Random Forest
# ----------------------
rf_model = RandomForestClassifier(random_state=42)  # Initialize Random Forest model
rf_model.fit(X_train, y_train)  # Train the model on the training data
y_pred_rf = rf_model.predict(X_test)  # Predict on the test data

# Model 3: Neural Network
# -----------------------
# Initialize a Sequential Neural Network model
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # Input layer with 64 neurons
    Dense(32, activation='relu'),  # Hidden layer with 32 neurons
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron (binary classification)
])
# Compile the model with Adam optimizer and binary cross-entropy loss
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model for 50 epochs with a batch size of 32
nn_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
# Predict on the test data (convert probabilities to binary predictions)
y_pred_nn = (nn_model.predict(X_test) > 0.5).astype(int).flatten()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


In [20]:
# Evaluate Model Performance
# --------------------------
def evaluate_model(y_true, y_pred, model_name):
    """
    Evaluate the performance of a model using accuracy, precision, recall, and F1-score.
    
    Parameters:
    - y_true: True labels
    - y_pred: Predicted labels
    - model_name: Name of the model (for printing purposes)
    
    Returns:
    - Tuple containing accuracy, precision, recall, and F1-score
    """
    accuracy = accuracy_score(y_true, y_pred)  # Calculate accuracy
    precision = precision_score(y_true, y_pred)  # Calculate precision
    recall = recall_score(y_true, y_pred)  # Calculate recall
    f1 = f1_score(y_true, y_pred)  # Calculate F1-score
    
    # Print the evaluation metrics
    print(f"{model_name} Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("-" * 30)
    
    return accuracy, precision, recall, f1


In [21]:
# Evaluate all models
lr_metrics = evaluate_model(y_test, y_pred_lr, "Logistic Regression")
rf_metrics = evaluate_model(y_test, y_pred_rf, "Random Forest")
nn_metrics = evaluate_model(y_test, y_pred_nn, "Neural Network")

# Compare Model Performance
# -------------------------
# Create a DataFrame to compare the performance of all models
metrics_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'Neural Network'],
    'Accuracy': [lr_metrics[0], rf_metrics[0], nn_metrics[0]],
    'Precision': [lr_metrics[1], rf_metrics[1], nn_metrics[1]],
    'Recall': [lr_metrics[2], rf_metrics[2], nn_metrics[2]],
    'F1-Score': [lr_metrics[3], rf_metrics[3], nn_metrics[3]]
})

# Print the comparison table
print(metrics_df)

Logistic Regression Metrics:
Accuracy: 0.8045
Precision: 0.7826
Recall: 0.7297
F1-Score: 0.7552
------------------------------
Random Forest Metrics:
Accuracy: 0.8212
Precision: 0.8088
Recall: 0.7432
F1-Score: 0.7746
------------------------------
Neural Network Metrics:
Accuracy: 0.8268
Precision: 0.8413
Recall: 0.7162
F1-Score: 0.7737
------------------------------
                 Model  Accuracy  Precision    Recall  F1-Score
0  Logistic Regression  0.804469   0.782609  0.729730  0.755245
1        Random Forest  0.821229   0.808824  0.743243  0.774648
2       Neural Network  0.826816   0.841270  0.716216  0.773723
