# Lab 3 Project (Titanic)
Jason Ballard
31 March 2025

Import the external Python libraries used (e.g., pandas, numpy, matplotlib, seaborn, sklearn and more).

## Section 1. Import and Inspect the Data

In [None]:
# all imports get moved to the top - import each only once

import seaborn as sns
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from sklearn.neural_network import MLPClassifier

In [None]:
# Load the Titanic example dataset through seaborn
df = sns.load_dataset('titanic')

# List the available columns and how many there are
features = list(df.columns)
print(features)
print(len(features))

# Understand the data
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
df.info()              # See column types and missing values
print(df.head(3))      # Peek at the structure
print(df.describe())   # Summary stats for numerical columns

# Section 2. Data Exploration and Preparation

## 2.1 Handle Missing Values, Clean Data, and Feature Engineering

In [None]:
# Preprocessing: engineer family_size, fill gaps, and encode categoricals

# Family size counts the passenger plus siblings/spouses and parents/children.
df['family_size'] = 1 + df['sibsp'] + df['parch']

# Missing ages take the median age; missing embarkation towns take the most
# frequent town.
median_age = df['age'].median()
df['age'] = df['age'].fillna(median_age)
most_common_town = df['embark_town'].mode()[0]
df['embark_town'] = df['embark_town'].fillna(most_common_town)

# Replace the sex labels with numeric codes; labels missing from the mapping
# become NaN.
sex_codes = {'male': 0, 'female': 1, 'unknown': -1}
df['sex'] = df['sex'].map(sex_codes)

# Give every embarkation town an integer code in a new column.
label_encoder = LabelEncoder()
df['embark_town_encoded'] = label_encoder.fit_transform(df['embark_town'])

<!-- ### Reflection 2.3

1. Why might family size be a useful feature for predicting survival? Family size is a good predictor of survivability, particularly for the women and younger children in a family.
2. Why convert categorical data to numeric? The conversion allows computations to be run on the data. -->

# Section 3. Feature Selection and Justification

- Select two or more input features (numerical for regression, numerical and/or categorical for classification)
- Use 'Survived' as the target

First:
- input features: alone
- target: survived

Second:
- input features - embark_town
- target: survived

Third:
- input features -  age and family_size (embark_town)
- target: survived
- Justify your selection with reasoning.

## 3.1 Choose features and target

In [None]:
# Define the available features and target
features = ['alone', 'age', 'family_size', 'embark_town_encoded', 'sex']
target = 'survived'

## 3.2 Define X (features) and y (target)
- Assign input features to X a pandas DataFrame with 1 or more input features
- Assign target variable to y (as applicable) - a pandas Series with a single target feature
- Again - use comments to run a single case at a time

- The following starts with only the statements needed for case 1.
- Double brackets [[ ]]  makes a 2D DataFrame
- Single brackets [ ]  make a 1D Series

In [None]:
def select_case(case_number, df):
    """Return the inputs, target and feature names for one predefined case.

    Args:
        case_number: 1, 2 or 3, choosing which feature columns to use.
        df: DataFrame holding the feature columns and a 'survived' column.

    Returns:
        A tuple ``(X, y, feature_list)``: X is ``df`` restricted to the chosen
        columns with rows containing missing values dropped, y is the
        'survived' column for the rows kept in X, and feature_list is the list
        of chosen column names.

    Raises:
        ValueError: If case_number is not one of the predefined cases.
    """
    case_table = (
        (1, ['alone']),
        (2, ['embark_town_encoded']),
        (3, ['age', 'family_size', 'embark_town_encoded']),
    )
    for number, columns in case_table:
        if case_number == number:
            feature_list = columns
            break
    else:
        raise ValueError("Invalid case number.")

    # Drop incomplete rows first, then pick the target by the surviving index
    # so X and y stay aligned row for row.
    inputs = df[feature_list].dropna()
    labels = df.loc[inputs.index, 'survived']
    return inputs, labels, feature_list

### 3.3 Select and run a Specific Case (loop)

In [None]:
# Step through every predefined case. X, y and features are overwritten on each
# pass, so once the loop ends they hold the values of the last case (3).
for case in (1, 2, 3):
    selection = select_case(case, df)
    X, y, features = selection
    print(f"Running Case {case} with features: {features}")

### 3.4 Eval Features 
Plot correlations, value counts, or feature distributions

Help to justify why the features were selected (especially for reports)

In [None]:
# Columns under evaluation: the currently selected features plus the target
eval_columns = features + ['survived']

# Correlation of each selected feature with 'survived', strongest first
survival_corr = df[eval_columns].corr()['survived']
print(survival_corr.sort_values(ascending=False))

# Pairwise feature distributions, coloured by survival outcome
sns.pairplot(df[eval_columns], hue='survived')
plt.show()

### Reflection 3:

1. Why are these features selected? **The selected features are the strongest indicators of survivability.**
2. Are there any features that are likely to be highly predictive of survival? **Yes, age and class.**

# Section 4. Train a Classification Model (Decision Tree)

## 4.1 Basic Train/Test split 

In [None]:
# def stratified_split(X, y, test_size=0.2, random_state=42):
#     splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
#     for train_idx, test_idx in splitter.split(X, y):
#         return X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]
    
#     X_train, X_test, y_train, y_test = stratified_split(X, y)
#     print(f"Train set size: {len(X_train)}, Test set size: {len(X_test)}")
#     return X_train, X_test, y_train, y_test

In [None]:
# Stratified 80/20 split: a single shuffle that keeps the class proportions of
# y in both partitions. The seed makes the split repeatable.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)

# With n_splits=1 the generator yields exactly one (train, test) index pair.
train_indices, test_indices = next(splitter.split(X, y))

X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]


## 4.2 Stratified Train/Test split

In [None]:
# Fit a depth-limited decision tree on the training split (seeded for
# repeatable results).
tree_params = {'max_depth': 3, 'random_state': 42}
tree_model = DecisionTreeClassifier(**tree_params)
tree_model.fit(X_train, y_train)

## 4.3 Compare Results


In [None]:
# Compare the share of each class in the full target and in both partitions
distributions = (
    ("Original Class Distribution:\n", y),
    ("Train Set Class Distribution:\n", y_train),
    ("Test Set Class Distribution:\n", y_test),
)
for title, labels in distributions:
    print(title, labels.value_counts(normalize=True))


### 4.4 Eval the predictions

In [None]:
# Score the fitted decision tree on the held-out test split
y_pred = tree_model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


# Draw the fitted tree. feature_names is passed as a plain list because some
# scikit-learn releases validate this parameter strictly and reject a pandas
# Index.
plt.figure(figsize=(12, 6))
plot_tree(tree_model, feature_names=list(X.columns), class_names=['Died', 'Survived'], filled=True)
plt.title('Decision Tree Visualization')
plt.show()


### Reflection 4:

1. Why might stratification improve model performance? **It ensures that each class is represented in the training and test sets in the same proportion as in the whole data set.**
2. How close are the training and test distributions to the original dataset? **Identical.**
3. Which split method produced better class balance? **I am not sure, because the numbers are so close.**

## Section 5. Compare Alternative Models (SVC, NN) 

In a Support Vector Machine, the kernel function defines how the algorithm transforms data to find a hyperplane that separates the classes. If the data is not linearly separable, changing the kernel can help the model find a better decision boundary.

SVC Kernel: Common Types

RBF (Radial Basis Function) – Most commonly used; handles non-linear data well (default)
Linear – Best for linearly separable data (straight line separation)
Polynomial – Useful when the data follows a curved pattern
Sigmoid – Similar to a neural network activation function; less common
Commenting the options in and out in the code can be helpful. The analyst decides which to use based on their understanding of the results. 

In [None]:
# kernels = ['rbf', 'linear', 'poly', 'sigmoid']
# for kernel in kernels:
#     model = SVC(kernel=kernel)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     acc = accuracy_score(y_test, y_pred)
#     print(f"SVC Kernel: {kernel} | Accuracy: {acc:.4f}")


### 5.1 Train and Evaluate Model (SVC)

In [None]:
# Fit one SVC per kernel and report its test-set accuracy and confusion matrix.
# svc and y_pred are overwritten on each pass, so after the loop they belong to
# the last kernel in the list.
kernels = ['linear', 'rbf', 'poly', 'sigmoid']

for kernel in kernels:
    svc = SVC(kernel=kernel)
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)

    kernel_accuracy = accuracy_score(y_test, y_pred)
    kernel_confusion = confusion_matrix(y_test, y_pred)

    print(f"\nSVC Kernel: {kernel}")
    print("Accuracy:", kernel_accuracy)
    print("Confusion Matrix:\n", kernel_confusion)


### 5.1A  Dynamic plot- 

In [None]:
# Plot the first feature of the test split against the survival label and
# overlay the support vectors of `svc` (when the cells run in order, this is
# the model fitted for the last kernel in the SVC loop).
plot_feature = X.columns[0]  # Dynamically pick the first feature

survived_vals = X_test.loc[y_test == 1, plot_feature]
not_survived_vals = X_test.loc[y_test == 0, plot_feature]

plt.figure(figsize=(8, 6))
plt.scatter(survived_vals, y_test[y_test == 1], c='yellow', marker='s', label='Survived')
plt.scatter(not_survived_vals, y_test[y_test == 0], c='cyan', marker='^', label='Not Survived')

if hasattr(svc, 'support_vectors_'):
    # Column 0 of the support vectors is the same feature as plot_feature,
    # because svc was fitted on X_train, whose first column is X.columns[0].
    support_x = svc.support_vectors_[:, 0]
    # The y-axis shows the survival label, so each support vector is placed at
    # the label of its training row. svc.support_ holds the positional indices
    # of those rows in the training data. Plotting a second feature (or a
    # constant 0) here would mix unrelated quantities on the same axis.
    support_y = y_train.iloc[svc.support_]
    plt.scatter(support_x, support_y, c='black', marker='+', label='Support Vectors')

plt.xlabel(plot_feature)
plt.ylabel('Survived')
plt.title('Support Vectors (SVC)')
plt.legend()
plt.show()


## 5.2 Train and Evaluate Model (NN MLP)

In [None]:
# Train a multi-layer perceptron with three hidden layers (50, 25 and 10 units)
# using the L-BFGS solver. random_state fixes the weight initialisation so the
# reported metrics are the same from run to run, matching the seeded split and
# decision tree.
nn_model = MLPClassifier(hidden_layer_sizes=(50, 25, 10), solver='lbfgs', random_state=42)
nn_model.fit(X_train, y_train)

y_pred_nn = nn_model.predict(X_test)

print("Results for Neural Network on test data:")
print(classification_report(y_test, y_pred_nn))

# fmt='d' prints the cell counts as whole numbers; the default annotation
# format switches to scientific notation for counts of 100 or more.
cm_nn = confusion_matrix(y_test, y_pred_nn)
sns.heatmap(cm_nn, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Model comparison Summary Table
# The accuracies are computed here so the cell does not depend on variables
# that no earlier cell defines.
dt_acc = accuracy_score(y_test, tree_model.predict(X_test))
nn_acc = accuracy_score(y_test, nn_model.predict(X_test))

# The kernel loop keeps only the last fitted SVC, so the rbf and linear models
# are refitted on the same training split to obtain their scores.
svc_rbf_acc = accuracy_score(y_test, SVC(kernel='rbf').fit(X_train, y_train).predict(X_test))
svc_linear_acc = accuracy_score(y_test, SVC(kernel='linear').fit(X_train, y_train).predict(X_test))

results = {
    "Model": ["Decision Tree", "SVC (rbf)", "SVC (linear)", "Neural Net"],
    "Accuracy": [dt_acc, svc_rbf_acc, svc_linear_acc, nn_acc],
}
pd.DataFrame(results)


#### Reflection 5:
How well did each model perform?

Are there any surprising results?

Why might one model outperform the others?

## Section 6. Final Thoughts & Insights

### 6.1 Summarize Findings
What indicators are strong predictors of survival?

Decision Tree performed well but overfit slightly on training data.

Neural Network showed moderate improvement but introduced complexity.


### 6.2 Discuss Challenges Faced
Small sample size could limit generalizability.

Missing values (if any) could bias the model.

### 6.3 Next Steps
Test more features (e.g., passenger class).

Try hyperparameter tuning for better results.