In [3]:
# ===== Global Setup: imports, RNG, and small helpers =====
# NOTE(review): this cell only imports NumPy and creates RNG; no helper
# functions are defined here despite the heading.
import numpy as np

# Reproducibility
# Module-level NumPy Generator with a fixed seed (42).
# NOTE(review): none of the code shown in this notebook section reads RNG —
# the split and the models each build their own generator from a seed
# argument — confirm whether this global is still needed.
RNG = np.random.default_rng(42)

In [9]:
# Import pandas library first
import pandas as pd
# Read the raw CSV into a DataFrame for a quick visual check of the columns.
# The path is absolute and machine-specific (same literal as CSV_PATH in the
# experiment cell); it must be edited before running on another machine.
# NOTE(review): the experiment code shown in this notebook re-reads the file
# with NumPy and does not reference `df` — confirm nothing else depends on it.
df = pd.read_csv(r"C:\Users\JOSHUVA\OneDrive\Desktop\Study Materials\Machine Learning\Assignment_1\default+of+credit+card+clients\default of credit card clients.csv")
# Last expression of the cell: renders the first five rows.
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [13]:
import numpy as np

# ====== USER CONFIG ======
# Absolute, machine-specific location of the dataset CSV; edit before running elsewhere.
CSV_PATH = r"C:\Users\JOSHUVA\OneDrive\Desktop\Study Materials\Machine Learning\Assignment_1\default+of+credit+card+clients\default of credit card clients.csv"
# Fraction of each class that run_experiment places in the test split.
TEST_SIZE = 0.2
# Seed passed by run_experiment to the split and to every model.
SEED = 42
# =========================

# Known names in the dataset
# Candidate names for the target column. The loader looks these up among header
# names normalised by _safe_name (lower-cased, spaces replaced by underscores),
# so the spelling that contains spaces cannot match as written; the underscore
# spelling is the one that matches that header.
KNOWN_TARGET_NAMES = {
    'default payment next month',
    'default_payment_next_month',
    'default',
}
# Columns removed from the feature matrix; compared against _safe_name-normalised
# header names, hence the lower-case spelling.
KNOWN_DROP_COLS = {'id'}
# Columns that are one-hot encoded; compared against upper-cased header names.
CATEGORICAL_NAMES = {
    'SEX', 'EDUCATION', 'MARRIAGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'
}

# -----------------------------
# Helpers: string processing
# -----------------------------
def _safe_name(s):
    return s.strip().lower().replace(' ', '_')

def _upper(s):
    return s.strip().upper()

def _try_float(x):
    try:
        return float(x)
    except:
        return np.nan

# -----------------------------
# Loading CSV (NumPy + builtins only)
# -----------------------------
def read_csv_numpy_only(path, assume_header=True):
    """
    Reads a CSV using only builtins (open/read/split) + NumPy.

    Fields are split on every comma (quoted fields are not supported) and
    converted with _try_float, so non-numeric cells become NaN.

    Parameters:
        path: location of the CSV file.
        assume_header: when True the first line is taken as column names and
            its field count defines the expected width of every data row.
            When False the first non-blank row defines the width.

    Returns:
        (header, data) where header is a list of stripped column names (or
        None when assume_header is False) and data is a 2-D float ndarray.
        (None, None) is returned if the file cannot be read or is empty, and
        (header, None) if no usable data row was found.
    """
    try:
        with open(path, 'r', encoding='utf-8-sig') as f:
            lines = f.read().splitlines()
    except (OSError, ValueError, TypeError) as e:
        # OSError: missing/unreadable file; ValueError: decode errors;
        # TypeError: path of an unsupported type.
        print("Failed to read CSV. Check CSV_PATH.")
        print(e)
        return None, None

    if not lines:
        print("CSV is empty.")
        return None, None

    header = None
    start = 0
    expected_cols = None
    if assume_header:
        header = [h.strip() for h in lines[0].split(',')]
        start = 1
        # The header defines the table width, so a short or long first data
        # row can no longer cause every well-formed row to be discarded.
        expected_cols = len(header)

    raw_rows = []
    skipped = 0
    for line in lines[start:]:
        if not line.strip():
            continue
        parts = [p.strip() for p in line.split(',')]
        if expected_cols is None:
            expected_cols = len(parts)
        if len(parts) != expected_cols:
            # skip malformed
            skipped += 1
            continue
        raw_rows.append([_try_float(x) for x in parts])

    if skipped:
        # Make dropped rows visible instead of discarding them silently.
        print(f"Skipped {skipped} malformed row(s) (expected {expected_cols} columns).")

    if len(raw_rows) == 0:
        print("No data rows found after header.")
        return header, None

    data = np.array(raw_rows, dtype=float)
    return header, data

def one_hot_encode_numpy_only(X, feature_names, categorical_indices):
    """
    Expand the selected columns of X into 0/1 indicator columns (NumPy only).

    Each categorical column is truncated to int and replaced by one indicator
    column per distinct value, named "<feature>==<value>"; every other column
    is copied through unchanged and keeps its position relative to the rest.

    Returns (X_new, feature_names_new, one_hot_mask): X_new is float32, and
    one_hot_mask flags which output columns are indicators.
    """
    n_rows, n_cols = X.shape
    categorical = set(categorical_indices)

    pieces = []
    out_names = []
    is_indicator = []

    for col_idx in range(n_cols):
        column = X[:, col_idx]
        name = feature_names[col_idx]

        if col_idx not in categorical:
            # Pass-through column.
            pieces.append(column.reshape(n_rows, 1))
            out_names.append(name)
            is_indicator.append(False)
            continue

        # Integer codes are computed once and reused for every level.
        codes = column.astype(int)
        for level in np.unique(codes):
            indicator = (codes == level).astype(np.float32)
            pieces.append(indicator.reshape(n_rows, 1))
            out_names.append(f"{name}=={level}")
            is_indicator.append(True)

    encoded = np.hstack(pieces).astype(np.float32)
    return encoded, out_names, np.array(is_indicator, dtype=bool)

def load_credit_default_csv_numpy_only(csv_path, assume_header=True, do_one_hot=True, verbose=True):
    """
    Loads the UCI credit default dataset from CSV with NumPy & builtins only.

    With a header, the target is the first column (in file order) whose
    normalised name matches a normalised entry of KNOWN_TARGET_NAMES; if none
    matches, the last column is used. Columns whose normalised name is in
    KNOWN_DROP_COLS are removed, and columns whose upper-cased name is in
    CATEGORICAL_NAMES are one-hot encoded when do_one_hot is True.
    Without a header, the first column is dropped and the last is the target.

    Parameters:
        csv_path: path of the CSV file.
        assume_header: whether the first line holds column names.
        do_one_hot: whether to one-hot encode the known categorical columns.
        verbose: print the loaded shapes and the positive-class rate.

    Returns:
        X: (n_samples, n_features) float32
        y: (n_samples,) int32 in {0,1}
        feature_names: list of names for X columns
        one_hot_mask: boolean mask of shape (n_features,)

    Raises:
        ValueError: if the file could not be read or held no usable rows.
    """
    header, values = read_csv_numpy_only(csv_path, assume_header=assume_header)
    if values is None:
        raise ValueError("Failed to load data.")

    # Impute NaN with column means
    col_means = np.nanmean(values, axis=0)
    nan_pos = np.where(np.isnan(values))
    values[nan_pos] = col_means[nan_pos[1]]

    if header is not None:
        raw_names = [h.strip() for h in header]
        safe_names = [_safe_name(h) for h in raw_names]
        upper_names = [_upper(h) for h in raw_names]

        # Find target. Candidates are normalised the same way as the header so
        # every listed spelling can match, and the header is scanned in file
        # order so the choice does not depend on set iteration order.
        known_targets = {_safe_name(t) for t in KNOWN_TARGET_NAMES}
        target_idx = next(
            (i for i, nm in enumerate(safe_names) if nm in known_targets),
            values.shape[1] - 1,  # fallback to last column
        )

        y = values[:, target_idx].astype(np.int32)

        # Keep everything except the target and the known drop columns (ID).
        keep_mask = np.ones(values.shape[1], dtype=bool)
        keep_mask[target_idx] = False
        for i, safe in enumerate(safe_names):
            if safe in KNOWN_DROP_COLS:
                keep_mask[i] = False

        X = values[:, keep_mask]
        kept_names = [raw_names[i] for i in range(len(raw_names)) if keep_mask[i]]
        kept_upper = [upper_names[i] for i in range(len(upper_names)) if keep_mask[i]]

        # One-hot for categorical
        feature_names = kept_names
        one_hot_mask = np.zeros(X.shape[1], dtype=bool)
        if do_one_hot:
            cat_indices = [i for i, nm in enumerate(kept_upper) if nm in CATEGORICAL_NAMES]
            if len(cat_indices) > 0:
                X, feature_names, one_hot_mask = one_hot_encode_numpy_only(X, kept_names, cat_indices)
    else:
        # No header: assume last column is target, first is ID to drop
        y = values[:, -1].astype(np.int32)
        X = values[:, 1:-1]
        feature_names = [f"x{i}" for i in range(X.shape[1])]
        one_hot_mask = np.zeros(X.shape[1], dtype=bool)

    if verbose:
        pos = int(y.sum())
        print(f"Loaded: X.shape={X.shape}, y.shape={y.shape}, positives={pos} ({100.0*y.mean():.2f}%)")
    return X.astype(np.float32), y.astype(np.int32), feature_names, one_hot_mask

# -----------------------------
# Train/Test Split (stratified)
# -----------------------------
def stratified_train_test_split(X, y, test_size=0.2, seed=42):
    """
    Split X/y into train and test parts while preserving class proportions.

    For every distinct label the row indices are shuffled with a generator
    seeded by `seed`; the first round(count * test_size) go to the test part
    and the remainder to the train part. Rows in each returned array are
    grouped by label, in ascending label order.

    Returns (X_train, X_test, y_train, y_test).
    """
    generator = np.random.default_rng(seed)
    train_parts, test_parts = [], []

    for label in np.unique(y):
        members = np.where(y == label)[0]
        generator.shuffle(members)
        cut = int(round(len(members) * test_size))
        test_parts.append(members[:cut])
        train_parts.append(members[cut:])

    train_rows = np.concatenate(train_parts)
    test_rows = np.concatenate(test_parts)
    return X[train_rows], X[test_rows], y[train_rows], y[test_rows]

# -----------------------------
# Standard Scaler
# -----------------------------
class StandardScaler:
    """
    Column-wise standardiser: (X - mean) / std for the selected columns.

    `mask_scale` is a boolean mask over columns; unselected columns keep
    mean 0 and std 1, so transform leaves them unchanged. When no mask is
    given, the first fit selects every column.
    """

    def __init__(self, mask_scale=None):
        self.mask_scale = mask_scale
        self.mean_ = None
        self.std_ = None

    def fit(self, X):
        """Learn per-column mean and std from X; returns self."""
        n_features = X.shape[1]
        if self.mask_scale is None:
            self.mask_scale = np.ones(n_features, dtype=bool)

        # Identity statistics by default; overwritten for the selected columns.
        self.mean_ = np.zeros(n_features, dtype=np.float32)
        self.std_ = np.ones(n_features, dtype=np.float32)

        selected = np.where(self.mask_scale)[0]
        if len(selected) > 0:
            block = X[:, selected]
            self.mean_[selected] = block.mean(axis=0)
            spread = block.std(axis=0)
            # Constant columns would divide by zero; treat their std as 1.
            spread[spread == 0] = 1.0
            self.std_[selected] = spread
        return self

    def transform(self, X):
        """Apply the learned centring and scaling to X."""
        centred = X - self.mean_
        return centred / self.std_

    def fit_transform(self, X):
        """Fit on X, then return the transformed X."""
        return self.fit(X).transform(X)

# -----------------------------
# Metrics
# -----------------------------
def sigmoid(z):
    """
    Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    The input is clipped before exponentiation. The bound is the smaller of
    500 and the largest magnitude whose exponential is still finite in the
    input's floating dtype (about 87.7 for float32), so float32 inputs no
    longer overflow inside exp. Floating inputs keep their dtype;
    non-floating inputs are evaluated in float64.
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(np.float64)
    # log(max) is where exp() reaches the dtype's largest finite value; back
    # off by 1 so that 1 + exp(limit) is comfortably finite as well.
    limit = min(500.0, float(np.log(np.finfo(z.dtype).max)) - 1.0)
    z = np.clip(z, -limit, limit)
    return 1.0 / (1.0 + np.exp(-z))

def accuracy_score(y_true, y_pred):
    """Fraction of positions where the prediction equals the true label."""
    matches = y_true == y_pred
    return matches.mean()

def precision_recall_f1(y_true, y_pred):
    """
    Precision, recall and F1 for the positive class (label 1).

    A tiny constant is added to every denominator so that an empty
    denominator yields 0 instead of a division error.
    Returns (precision, recall, f1).
    """
    eps = 1e-12
    actual_pos = y_true == 1
    predicted_pos = y_pred == 1

    tp = np.sum(actual_pos & predicted_pos)
    fp = np.sum((y_true == 0) & predicted_pos)
    fn = np.sum(actual_pos & (y_pred == 0))

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    return precision, recall, f1

def confusion_matrix(y_true, y_pred):
    """
    2x2 confusion matrix for labels 0/1 as an int32 array.

    Layout is [[tn, fp], [fn, tp]]: rows are the true label, columns the
    predicted label. Entries with any other label value are not counted.
    """
    true_neg, true_pos = (y_true == 0), (y_true == 1)
    pred_neg, pred_pos = (y_pred == 0), (y_pred == 1)
    counts = [
        [np.sum(true_neg & pred_neg), np.sum(true_neg & pred_pos)],
        [np.sum(true_pos & pred_neg), np.sum(true_pos & pred_pos)],
    ]
    return np.array(counts, dtype=np.int32)

# -----------------------------
# Models
# -----------------------------
class Perceptron:
    """
    Classic perceptron for labels {0, 1}, trained one sample at a time.

    Labels are mapped to {-1, +1} internally; a weight update is applied
    whenever a sample's signed margin is not strictly positive. The bias is
    stored as the last entry of `w`.
    """

    def __init__(self, learning_rate=0.001, epochs=5, shuffle=True, seed=42):
        self.lr = learning_rate
        self.epochs = epochs
        self.shuffle = shuffle
        self.seed = seed
        self.w = None  # includes bias at last index

    def _add_bias(self, X):
        """Append a column of ones (same dtype as X) for the bias term."""
        ones = np.ones((X.shape[0], 1), dtype=X.dtype)
        return np.hstack([X, ones])

    def fit(self, X, y):
        """Run `epochs` passes over the data; returns self."""
        signs = np.where(y == 1, 1, -1).astype(np.int32)
        augmented = self._add_bias(X)
        n_rows, n_weights = augmented.shape
        rng = np.random.default_rng(self.seed)
        self.w = np.zeros(n_weights, dtype=np.float32)

        for _epoch in range(self.epochs):
            order = np.arange(n_rows)
            if self.shuffle:
                rng.shuffle(order)
            for row in order:
                sample = augmented[row]
                sign = signs[row]
                margin = sign * (sample @ self.w)
                # Misclassified, or exactly on the boundary: nudge the weights.
                if margin <= 0:
                    self.w += self.lr * sign * sample
        return self

    def decision_function(self, X):
        """Raw linear score w·x + b for each row of X."""
        return self._add_bias(X) @ self.w

    def predict(self, X):
        """Predict 1 where the score is non-negative, otherwise 0 (int32)."""
        scores = self.decision_function(X)
        return (scores >= 0).astype(np.int32)

class LogisticRegressionGD:
    """
    Binary logistic regression trained by (mini-batch) gradient descent.

    The bias is stored as the last entry of `w` and is excluded from the L2
    penalty. With batch_size=None every epoch is a single full-batch step.
    """

    def __init__(self, learning_rate=0.1, epochs=200, l2=1e-4, batch_size=256, seed=42):
        self.lr = learning_rate
        self.epochs = epochs
        self.l2 = l2
        self.batch_size = batch_size
        self.seed = seed
        self.w = None  # includes bias

    def _add_bias(self, X):
        """Append a column of ones (same dtype as X) for the bias term."""
        ones = np.ones((X.shape[0], 1), dtype=X.dtype)
        return np.hstack([X, ones])

    def fit(self, X, y):
        """Fit the weights on X (features) and y (0/1 labels); returns self."""
        design = self._add_bias(X).astype(np.float32)
        targets = y.astype(np.float32)
        n_rows, n_weights = design.shape
        rng = np.random.default_rng(self.seed)
        # Small random start drawn from N(0, 0.01^2).
        self.w = rng.normal(0.0, 0.01, size=n_weights).astype(np.float32)

        for _epoch in range(self.epochs):
            if self.batch_size is None:
                minibatches = [(design, targets)]
            else:
                # Fresh row order every epoch, cut into consecutive chunks.
                order = np.arange(n_rows)
                rng.shuffle(order)
                chunks = [
                    order[lo:lo + self.batch_size]
                    for lo in range(0, n_rows, self.batch_size)
                ]
                minibatches = [(design[rows], targets[rows]) for rows in chunks]

            for batch_X, batch_y in minibatches:
                residual = sigmoid(batch_X @ self.w) - batch_y
                grad = (batch_X.T @ residual) / batch_X.shape[0]
                if self.l2 > 0:
                    grad[:-1] += self.l2 * self.w[:-1]  # don't regularize bias
                self.w -= self.lr * grad
        return self

    def predict_proba(self, X):
        """Probability of class 1 for each row of X."""
        scores = self._add_bias(X) @ self.w
        return sigmoid(scores)

    def predict(self, X, threshold=0.5):
        """Predict 1 where the probability is at least `threshold` (int32)."""
        probabilities = self.predict_proba(X)
        return (probabilities >= threshold).astype(np.int32)

class SimpleNN:
    """
    One-hidden-layer binary classifier trained with mini-batch gradient descent.

    Architecture: input -> hidden (ReLU or tanh) -> single sigmoid output,
    trained on binary cross-entropy with an L2 penalty on both weight
    matrices (biases are not penalised).

    Parameters:
        hidden_units: width of the hidden layer.
        activation: 'relu' or 'tanh'.
        learning_rate: gradient-descent step size.
        epochs: number of passes over the training data.
        l2: L2 penalty coefficient (0 disables it).
        batch_size: rows per update; None means one full-batch step per epoch.
        seed: seed for weight initialisation and for batch shuffling.
    """

    def __init__(self, hidden_units=32, activation='relu', learning_rate=0.01,
                 epochs=50, l2=1e-4, batch_size=256, seed=42):
        self.h = hidden_units
        self.activation = activation
        self.lr = learning_rate
        self.epochs = epochs
        self.l2 = l2
        self.batch_size = batch_size
        self.seed = seed
        self.W1 = None; self.b1 = None
        self.W2 = None; self.b2 = None

    def _init_params(self, in_dim):
        """Initialise weights (scaled normal draws) and zero biases as float32."""
        rng = np.random.default_rng(self.seed)
        if self.activation == 'relu':
            # Variance 2/in_dim for the ReLU hidden layer.
            self.W1 = (rng.standard_normal((in_dim, self.h)) * np.sqrt(2.0 / in_dim)).astype(np.float32)
        else:
            # Variance 1/in_dim otherwise.
            self.W1 = (rng.standard_normal((in_dim, self.h)) * np.sqrt(1.0 / in_dim)).astype(np.float32)
        self.b1 = np.zeros((1, self.h), dtype=np.float32)
        self.W2 = (rng.standard_normal((self.h, 1)) * np.sqrt(1.0 / self.h)).astype(np.float32)
        self.b2 = np.zeros((1, 1), dtype=np.float32)

    def _act(self, Z):
        """Hidden-layer activation. Raises ValueError for an unknown name."""
        if self.activation == 'relu':
            return np.maximum(0.0, Z)
        elif self.activation == 'tanh':
            return np.tanh(Z)
        else:
            raise ValueError("activation must be 'relu' or 'tanh'")

    def _act_deriv(self, Z):
        """
        Derivative of the hidden activation with respect to its input Z.

        Raises ValueError for an unknown activation name, matching _act,
        instead of silently returning None.
        """
        if self.activation == 'relu':
            return (Z > 0.0).astype(np.float32)
        elif self.activation == 'tanh':
            A = np.tanh(Z)
            return 1.0 - A*A
        else:
            raise ValueError("activation must be 'relu' or 'tanh'")

    def fit(self, X, y):
        """Train on X (n, d) and 0/1 labels y (n,); returns self."""
        X = X.astype(np.float32)
        y = y.reshape(-1, 1).astype(np.float32)
        n, d = X.shape
        self._init_params(d)
        # Separate generator (same seed) that drives the per-epoch shuffling.
        rng = np.random.default_rng(self.seed)

        for _ in range(self.epochs):
            if self.batch_size is None:
                batches = [(X, y)]
            else:
                idx = np.arange(n)
                rng.shuffle(idx)
                batches = []
                for start in range(0, n, self.batch_size):
                    end = min(start + self.batch_size, n)
                    bi = idx[start:end]
                    batches.append((X[bi], y[bi]))

            for Xb, yb in batches:
                # Forward
                Z1 = Xb @ self.W1 + self.b1
                A1 = self._act(Z1)
                Z2 = A1 @ self.W2 + self.b2
                A2 = sigmoid(Z2)

                # Backprop (BCE): with a sigmoid output the gradient w.r.t. the
                # output logit is (prediction - label), averaged over the batch.
                m = Xb.shape[0]
                dZ2 = (A2 - yb) / m
                dW2 = A1.T @ dZ2
                db2 = np.sum(dZ2, axis=0, keepdims=True)

                dA1 = dZ2 @ self.W2.T
                dZ1 = dA1 * self._act_deriv(Z1)
                dW1 = Xb.T @ dZ1
                db1 = np.sum(dZ1, axis=0, keepdims=True)

                if self.l2 > 0:
                    dW2 += self.l2 * self.W2
                    dW1 += self.l2 * self.W1

                # Update
                self.W2 -= self.lr * dW2
                self.b2 -= self.lr * db2
                self.W1 -= self.lr * dW1
                self.b1 -= self.lr * db1
        return self

    def predict_proba(self, X):
        """Probability of class 1 for each row of X, as a 1-D array."""
        X = X.astype(np.float32)
        Z1 = X @ self.W1 + self.b1
        A1 = self._act(Z1)
        Z2 = A1 @ self.W2 + self.b2
        return sigmoid(Z2).ravel()

    def predict(self, X, threshold=0.5):
        """Predict 1 where the probability is at least `threshold` (int32)."""
        return (self.predict_proba(X) >= threshold).astype(np.int32)

# -----------------------------
# Run experiment
# -----------------------------
def _fit_and_report(title, model, X_train, y_train, X_test, y_test):
    """Train one model, print its test-set metrics, and return them as a dict."""
    print(f"\n=== {title} ===")
    model.fit(X_train, y_train)
    # Every model is evaluated at its default decision rule (threshold 0.5
    # for the probabilistic models).
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    pr, rc, f1 = precision_recall_f1(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
    print("Confusion Matrix:\n", cm)
    return {
        'accuracy': acc,
        'precision': pr,
        'recall': rc,
        'f1': f1,
        'confusion_matrix': cm,
    }


def run_experiment():
    """
    Load the dataset, split and scale it, then train and report three models.

    Steps: load CSV_PATH with one-hot encoding, make a stratified
    TEST_SIZE split with SEED, standardise the non-indicator columns using
    statistics from the training part only, then fit the Perceptron,
    logistic regression and the one-hidden-layer network in that order.

    Returns:
        dict mapping each model's printed title to its metrics dict
        (accuracy, precision, recall, f1, confusion_matrix).
    """
    X, y, feature_names, one_hot_mask = load_credit_default_csv_numpy_only(
        CSV_PATH, assume_header=True, do_one_hot=True, verbose=True
    )

    X_train, X_test, y_train, y_test = stratified_train_test_split(
        X, y, test_size=TEST_SIZE, seed=SEED
    )

    # Scale numeric (non one-hot) features
    if one_hot_mask is not None and one_hot_mask.size == X.shape[1]:
        scale_mask = ~one_hot_mask
    else:
        scale_mask = np.ones(X.shape[1], dtype=bool)

    # Fit the scaler on the training part only, then reuse it for the test part.
    scaler = StandardScaler(mask_scale=scale_mask)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # (title, model) pairs, trained and reported in this order.
    models = [
        ("Perceptron",
         Perceptron(learning_rate=0.001, epochs=5, shuffle=True, seed=SEED)),
        ("Logistic Regression (GD)",
         LogisticRegressionGD(learning_rate=0.1, epochs=200, l2=1e-4, batch_size=256, seed=SEED)),
        ("Simple Neural Network (1 hidden layer)",
         SimpleNN(hidden_units=32, activation='relu', learning_rate=0.01,
                  epochs=50, l2=1e-4, batch_size=256, seed=SEED)),
    ]

    results = {}
    for title, model in models:
        results[title] = _fit_and_report(title, model, X_train, y_train, X_test, y_test)
    return results

# Run the whole experiment when this cell/module is executed directly.
if __name__ == "__main__":
    run_experiment()

Loaded: X.shape=(30000, 91), y.shape=(30000,), positives=6636 (22.12%)

=== Perceptron ===
Accuracy: 0.7553 | Precision: 0.4061 | Recall: 0.2298 | F1: 0.2936
Confusion Matrix:
 [[4227  446]
 [1022  305]]

=== Logistic Regression (GD) ===
Accuracy: 0.8172 | Precision: 0.6681 | Recall: 0.3444 | F1: 0.4545
Confusion Matrix:
 [[4446  227]
 [ 870  457]]

=== Simple Neural Network (1 hidden layer) ===
Accuracy: 0.8100 | Precision: 0.6398 | Recall: 0.3225 | F1: 0.4289
Confusion Matrix:
 [[4432  241]
 [ 899  428]]
