
# Spaceship Titanic — Fixed-Order Decision Tree 
# A Jupyter notebook documenting the progress made on the tree

**Team rule tree:** Split on `CryoSleep → VIP → HomePlanet → Destination`.  
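**Stopping:** a node becomes a leaf when ≥95% of its rows are transported (predict Y), ≤5% are transported (predict N), it has `n<10` rows (predict the majority), or no features are left (predict the majority).  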
**Missing values** are treated as literal `"NA"` categories.

This notebook gives you:
1. Minimal EDA (tables + 2 plots you can paste into the report)  
2. A readable tree trained on an 80/20 split + metrics  
3. Refit on full train and optional `submission.csv`
4. Room for extensions — please add sections as needed




In [None]:

import os
import numpy as np
import pandas as pd
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import matplotlib.pyplot as plt  

# Features the tree splits on, in this fixed order (one feature per depth level).
FEATURE_ORDER = ["CryoSleep", "VIP", "HomePlanet", "Destination"]
# Name of the label column; prep() coerces it to bool.
TARGET = "Transported"
# Seed handed to train_test_split so the 80/20 split is reproducible.
RANDOM_STATE = 42

# Default CSV locations used by load_data(); relative paths resolve against
# the current working directory.
TRAIN_PATH = "train.csv"
TEST_PATH  = "test.csv"


In [None]:

# 1) Helpers: stats + majority

def class_stats(y: pd.Series):
    """Summarise a label column as ``(positives, negatives, positive_rate)``.

    Args:
        y: Series of labels; entries equal to ``True`` count as positives and
            every other entry counts as a negative.

    Returns:
        A tuple ``(pos, neg, rate)`` where ``rate`` is ``pos / len(y)``.
        An empty series yields ``(0, 0, 0.0)``.
    """
    total = len(y)
    if not total:
        # Nothing to count; also avoids dividing by zero below.
        return 0, 0, 0.0

    # Element-wise comparison on purpose: only values equal to True count.
    positives = int((y == True).sum())
    return positives, total - positives, positives / total

def majority_bool(pos: int, neg: int) -> bool:
    """Return the majority class as a bool, resolving ties in favour of True.

    Deterministic by construction; used when a node has to pick a label
    without a clear-cut rate (small nodes, exhausted features, ties).
    """
    return not pos < neg


In [None]:

# 2) Load + prep

def load_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
    """Read the training CSV and, when it exists, the test CSV.

    Args:
        train_path: Location of the training CSV (required).
        test_path: Location of the test CSV (optional).

    Returns:
        ``(train, test)`` DataFrames; ``test`` is ``None`` when ``test_path``
        does not exist.

    Raises:
        FileNotFoundError: If ``train_path`` does not exist.
    """
    train_found = os.path.exists(train_path)
    if not train_found:
        raise FileNotFoundError("Put Kaggle's train.csv in the same folder as this notebook.")
    train_df = pd.read_csv(train_path)

    # The test set is optional: without it the notebook skips the submission.
    test_df = None
    if os.path.exists(test_path):
        test_df = pd.read_csv(test_path)
    return train_df, test_df

def prep(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce a raw frame to the columns the tree uses and normalise them.

    Keeps ``PassengerId``, the ``FEATURE_ORDER`` columns and ``TARGET``
    (whichever are present). Feature columns become strings, with missing
    values replaced by the literal category ``"NA"``. A non-boolean target
    column is converted to ``bool``.

    Args:
        df: Raw frame as read from the CSV; it is not modified.

    Returns:
        A new DataFrame containing only the kept, normalised columns.

    Raises:
        ValueError: If the target column holds missing or unrecognized
            labels. Previously such rows were silently turned into ``True``
            because ``NaN`` is truthy under ``astype(bool)``.
    """
    keep = [c for c in ["PassengerId", *FEATURE_ORDER, TARGET] if c in df.columns]
    out = df[keep].copy()

    for col in FEATURE_ORDER:
        if col in out.columns:
            # Missing values are a category of their own, not dropped rows.
            out[col] = out[col].where(out[col].notna(), "NA").astype(str)

    if TARGET in out.columns and out[TARGET].dtype != bool:
        label_map = {True: True, False: False, "True": True, "False": False, 1: True, 0: False}
        mapped = out[TARGET].map(label_map)
        # map() yields NaN for anything outside label_map (including NaN).
        unmapped = mapped.isna()
        if unmapped.any():
            examples = out.loc[unmapped, TARGET].unique().tolist()[:5]
            raise ValueError(
                f"Column {TARGET!r} has {int(unmapped.sum())} missing or "
                f"unrecognized label(s), e.g. {examples}"
            )
        out[TARGET] = mapped.astype(bool)
    return out

# Read the raw CSVs, then reduce each frame to the columns the tree uses.
train_raw, test_raw = load_data()
train = prep(train_raw)
# test.csv is optional, so the prepared test frame may be None.
test = None if test_raw is None else prep(test_raw)

# Last expression of the cell: preview the prepared training data.
train.head()


### 3) Mini EDA — counts and two simple plots

In [None]:

# Per-leaf counts for the hand-made tree: one row per combination of the
# split features, with transported / not-transported totals and the rate.
grouped = train.groupby(FEATURE_ORDER)[TARGET].agg(["sum", "count"])
grouped["not_transported"] = grouped["count"].sub(grouped["sum"])
grouped["pct_transported"] = grouped["sum"].div(grouped["count"]).round(4)

# Last expression of the cell: show the first 15 combinations.
grouped.reset_index().head(15)


In [None]:

# Plot 1: mean Transported per CryoSleep category ("NA" included),
# bars ordered from the highest rate to the lowest.
rate_by_cryo = (
    train.groupby("CryoSleep")[TARGET]
    .mean()
    .sort_values(ascending=False)
)

plt.figure()
rate_by_cryo.plot.bar()
plt.title("Transported Rate by CryoSleep")
plt.xlabel("CryoSleep")
plt.ylabel("Mean Transported")
plt.show()


In [None]:

# Plot 2: mean Transported per HomePlanet category ("NA" included),
# bars ordered from the highest rate to the lowest.
rate_by_home = (
    train.groupby("HomePlanet")[TARGET]
    .mean()
    .sort_values(ascending=False)
)

plt.figure()
rate_by_home.plot.bar()
plt.title("Transported Rate by HomePlanet")
plt.xlabel("HomePlanet")
plt.ylabel("Mean Transported")
plt.show()


### 4) Build the fixed-order decision tree

In [None]:

from dataclasses import dataclass

@dataclass
class Node:
    """A single node (internal split or leaf) of the fixed-order tree."""

    depth: int        # distance from the root (root is 0)
    n: int            # number of training rows that reached this node
    positives: int    # rows with a True label
    negatives: int    # rows with a False label
    rate: float       # positive rate stored for this node
    is_leaf: bool     # True when the node predicts instead of splitting
    prediction: Optional[bool] = None   # label predicted by a leaf
    feature: Optional[str] = None       # feature an internal node splits on
    children: Dict[Any, "Node"] = field(default_factory=dict)  # value -> subtree
    reason: str = ""  # why a leaf stopped splitting (shown by pretty())

    def pretty(self, indent: str = "  ") -> str:
        """Render this node and its subtree as indented, human-readable text.

        Args:
            indent: String repeated ``depth`` times in front of each line.

        Returns:
            A multi-line string without a trailing newline.
        """
        pad = indent * self.depth
        summary = (
            f"{pad}Node(depth={self.depth}, n={self.n}, pos={self.positives}, "
            f"neg={self.negatives}, rate={self.rate:.3f})"
        )

        if not self.is_leaf:
            pieces = [f"{summary} -> split on [{self.feature}]"]
            for value, subtree in self.children.items():
                # One condition line per branch, followed by that branch's subtree.
                pieces.append(f"{pad}- if {self.feature} == {value!r}:")
                pieces.append(subtree.pretty(indent))
            return "\n".join(pieces)

        label = "Y" if self.prediction else "N"
        return f"{summary} -> LEAF: predict {label} ({self.reason})"


def build_tree(df: pd.DataFrame, features: List[str], depth: int = 0) -> Node:
    """Recursively grow the tree, splitting on ``features`` in the given order.

    A node becomes a leaf when its positive rate is >= 0.95 or <= 0.05, when
    it holds fewer than 10 rows, or when no features remain; the last two
    cases predict the majority class. Every split also gets an ``"NA"``
    branch, which falls back to the parent's majority when it has no rows.

    Args:
        df: Rows reaching this node; must contain ``TARGET`` and the features.
        features: Features still available, consumed front to back.
        depth: Depth of the node being built (root is 0).

    Returns:
        The root ``Node`` of the (sub)tree built from ``df``.
    """
    n = len(df)
    pos, neg, rate = class_stats(df[TARGET])

    def make_leaf(prediction: bool, reason: str) -> Node:
        return Node(depth, n, pos, neg, rate, True, prediction, None, {}, reason)

    # Stopping rules, checked in priority order.
    if n > 0:
        if rate >= 0.95:
            return make_leaf(True, "≥95% transported (Y)")
        if rate <= 0.05:
            return make_leaf(False, "≤5% transported (N)")
    if n < 10:
        return make_leaf(majority_bool(pos, neg), "n<10 -> majority")
    if not features:
        return make_leaf(majority_bool(pos, neg), "no features left -> majority")

    feat, remaining = features[0], features[1:]
    node = Node(depth, n, pos, neg, rate, False, None, feat, {}, "")

    # Branch on every observed value, plus "NA" even when it was not observed.
    branch_values = list(pd.Series(df[feat]).unique())
    if "NA" not in branch_values:
        branch_values.append("NA")

    for value in branch_values:
        subset = df[df[feat] == value]
        if len(subset) > 0:
            node.children[value] = build_tree(subset, remaining, depth + 1)
        else:
            # No rows took this branch: predict the parent's majority.
            node.children[value] = Node(
                depth + 1, 0, 0, 0, rate, True,
                majority_bool(pos, neg), None, {}, "empty -> parent majority",
            )
    return node


def predict_one(node: Node, row: pd.Series) -> bool:
    """Walk the tree from ``node`` and return the prediction for one row.

    At each split the row's feature value is looked up as a string (``"NA"``
    when the feature is absent from the row). A value the split has no branch
    for follows the ``"NA"`` branch; if that does not exist either, the
    current node's majority class is returned.
    """
    current = node
    while not current.is_leaf:
        branches = current.children
        key = str(row.get(current.feature, "NA"))
        if key not in branches:
            if "NA" not in branches:
                return majority_bool(current.positives, current.negatives)
            key = "NA"
        current = branches[key]
    return bool(current.prediction)


def predict_df(node: Node, df: pd.DataFrame):
    """Predict every row of ``df`` and return a boolean NumPy array.

    The array has one entry per row, in the frame's row order.
    """
    row_predictions = (predict_one(node, record) for _, record in df.iterrows())
    return np.fromiter(row_predictions, dtype=bool, count=len(df))


### 5) Train/valid split, fit, metrics, and readable tree

In [None]:

# Hold out a stratified 20% of the training rows for validation.
tr, va = train_test_split(
    train,
    test_size=0.2,
    stratify=train[TARGET],
    random_state=RANDOM_STATE,
)

# Fit on the 80% split and print the tree in readable form.
tree = build_tree(tr, FEATURE_ORDER)
print("=== TREE (trained on 80%) ===")
print(tree.pretty())

# Score the held-out rows; zero_division=0 keeps degenerate cases at 0.
va_pred = predict_df(tree, va)
acc, prec, rec, f1 = (
    accuracy_score(va[TARGET], va_pred),
    precision_score(va[TARGET], va_pred, zero_division=0),
    recall_score(va[TARGET], va_pred, zero_division=0),
    f1_score(va[TARGET], va_pred, zero_division=0),
)
cm = confusion_matrix(va[TARGET], va_pred)

print(f"\n[Validation] Accuracy={acc:.4f} Precision={prec:.4f} Recall={rec:.4f} F1={f1:.4f}")
print("Confusion Matrix:\n", cm)


### 6) Refit on full train and (optional) submission

In [None]:

# Refit on every training row and print the final tree.
full_tree = build_tree(train, FEATURE_ORDER)
print("=== TREE (refit on 100% train) ===")
print(full_tree.pretty())

# In-sample metrics: the tree is scored on the same rows it was fitted on.
train_pred = predict_df(full_tree, train)
acc, prec, rec, f1 = (
    accuracy_score(train[TARGET], train_pred),
    precision_score(train[TARGET], train_pred, zero_division=0),
    recall_score(train[TARGET], train_pred, zero_division=0),
    f1_score(train[TARGET], train_pred, zero_division=0),
)
cm = confusion_matrix(train[TARGET], train_pred)
print(f"\n[Train] Accuracy={acc:.4f} Precision={prec:.4f} Recall={rec:.4f} F1={f1:.4f}")
print("Confusion Matrix:\n", cm)

# The submission needs a test frame that carries PassengerId.
if test is None or "PassengerId" not in test.columns:
    print("\nNo test.csv found — skipping submission.")
else:
    test_pred = predict_df(full_tree, test)
    sub = pd.DataFrame({"PassengerId": test["PassengerId"], "Transported": test_pred})
    sub.to_csv("submission.csv", index=False)
    print(f"\nWrote submission.csv with {len(sub)} rows.")
