In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

# Set plot style for better aesthetics
sns.set(style="whitegrid")

# Load the Titanic dataset from OpenML
titanic = fetch_openml(name='titanic', version=1, as_frame=True)
X = titanic.data
y = titanic.target

# Display the shape of the dataset
print(f"Dataset shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Define preprocessing steps for numerical features
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
    ('scaler', StandardScaler())  # Feature scaling
])

# Define preprocessing steps for categorical features
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding
])

# Combine preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# Define the complete pipeline
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('feature_selection', VarianceThreshold(threshold=0.01)),  # Remove low-variance features
    ('classifier', RandomForestClassifier(random_state=42))  # Classification model
])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Retrieve feature names after preprocessing
# Note: This requires accessing the steps within the pipeline
# and may vary depending on the versions of scikit-learn

Dataset shape: (1309, 13)
Target distribution:
survived
0    809
1    500
Name: count, dtype: int64

Model Accuracy: 0.9504

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       162
           1       0.94      0.93      0.93       100

    accuracy                           0.95       262
   macro avg       0.95      0.95      0.95       262
weighted avg       0.95      0.95      0.95       262

