In [None]:
import numpy as np
import pandas as pd 
from pathlib import Path
from matplotlib import pyplot as plt 

In [None]:
# ▸▸ Amazon Reviews – download via Kaggle API if not present ▸▸
# Expected location of the training CSV, relative to the notebook's working directory.
csv_path = Path('datasets/reviews/amazon_review_ID.shuf.lrn.csv')
if not csv_path.exists():
    # IPython shell escapes: fetch the competition archive into datasets/reviews,
    # then unpack it into the same directory.
    # NOTE(review): presumably needs the Kaggle CLI (with configured credentials)
    # and `unzip` on PATH — confirm for your environment.
    !kaggle competitions download -c 184-702-tu-ml-2025-s-reviews -p datasets/reviews --force
    !unzip -o datasets/reviews/184-702-tu-ml-2025-s-reviews.zip -d datasets/reviews

# Load the CSV into `df` and print its (rows, columns) shape as a quick sanity check.
df = pd.read_csv(csv_path)
print(df.shape)

# Dataset Reviews

In [None]:
# Feature matrix = every column except the row identifier and the label
X_all = df.drop(columns=['ID', 'Class']).values
y_all = df['Class'].values

# Seed NumPy's global RNG so the shuffle below — and every later np.random
# draw on a Restart & Run All — is reproducible. Re-running this cell always
# yields the same split.
SEED = 42
np.random.seed(SEED)

# shuffle rows (features and labels with the same permutation)
perm   = np.random.permutation(len(df))
X_all  = X_all[perm]
y_all  = y_all[perm]

# train / test split (70/30)
split  = int(0.7 * X_all.shape[0])
X_train, X_test = X_all[:split, :],  X_all[split:, :]
y_train_string, y_test_string = y_all[:split],     y_all[split:]
n1, m1 = X_train.shape
print(n1,m1)


# Label encode: map the raw class labels to integer ids 0..k-1.
# The encoder is fitted on the training labels only, so a label that occurs
# exclusively in the test split makes `le.transform` raise.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train_string)
y_test  = le.transform(y_test_string)



print(f"X_train = {X_train.shape}, X_test = {X_test.shape} y_train = {y_train.shape}, y_test {y_test.shape} ")



# Dataset Credit Card


In [None]:
from ucimlrepo import fetch_ucirepo 
  
# Fetch UCI ML Repository dataset id=350 (network access required).
# The returned object is read below through `.data.features` and `.data.targets`.
df_c = fetch_ucirepo(id=350)

In [None]:
X_credit  = df_c.data.features.copy()
y_credit  = df_c.data.targets.iloc[:, 0]

print(repr(X_credit.columns.tolist()))

# remove leading / trailing blanks in all column labels
X_credit.columns = X_credit.columns.str.strip()

X_credit = X_credit.drop(columns='ID', errors='ignore')

# Some raw codes do not correspond to a category, so we collapse them into
# the "other" bucket (X3: 0/5/6 -> 4, X4: 0 -> 3)
X_credit['X3'] = X_credit['X3'].replace({0: 4, 5: 4, 6: 4})
X_credit['X4']  = X_credit['X4'].replace({0: 3})

# One-hot encode the three categorical columns.
# `columns=` is essential here: without it get_dummies only converts
# object/string/category columns, so integer-coded categories (as the integer
# keys in the replace() calls above suggest) would pass through unencoded.
cat_cols = ['X2', 'X3', 'X4']
X_cat = pd.get_dummies(X_credit[cat_cols], columns=cat_cols, drop_first=True)

# numeric + ordinal columns (everything that is NOT categorical)
num_cols = X_credit.columns.difference(cat_cols)
X_num = X_credit[num_cols].astype(float)

# stitch everything together
X_pre = pd.concat([X_cat, X_num], axis=1)

# Convert to NumPy, shuffle, 70 / 30 split
X_all_c = X_pre.to_numpy(dtype=np.float32)
y_all_c = y_credit.to_numpy(dtype=np.int64)

# Local seeded generator: the split is identical on every run of this cell,
# independent of how much global random state earlier cells consumed.
perm = np.random.default_rng(42).permutation(len(X_all_c))
X_all_c, y_all_c = X_all_c[perm], y_all_c[perm]

# The split point must come from THIS dataset's length (not the reviews array)
split = int(0.7 * len(X_all_c))
X_train_c, X_test_c = X_all_c[:split],  X_all_c[split:]
y_train_c, y_test_c = y_all_c[:split],  y_all_c[split:]

print("train shape:", X_train_c.shape, "test shape:", X_test_c.shape)

# Model Build

In [None]:
def he_init(fan_in: int, fan_out: int):
    """Return a (fan_in, fan_out) weight matrix drawn with He initialisation.

    Entries are standard-normal samples from NumPy's global RNG, scaled by
    sqrt(2 / fan_in).
    """
    return np.sqrt(2.0 / fan_in) * np.random.randn(fan_in, fan_out)


def params(n_x, hidden_sizes, n_y):
    """Create the initial weights and biases for a fully connected network.

    Args:
        n_x: number of input features.
        hidden_sizes: one hidden width (int) or an iterable of hidden widths.
        n_y: number of output units.

    Returns:
        (weights, biases): two lists with one entry per layer. Each weight is
        a He-initialised (fan_in, fan_out) matrix, each bias a zero row vector
        of shape (1, fan_out).
    """
    hidden = [hidden_sizes] if isinstance(hidden_sizes, int) else list(hidden_sizes)
    dims = [n_x, *hidden, n_y]

    weights, biases = [], []
    for idx in range(len(dims) - 1):
        rows, cols = dims[idx], dims[idx + 1]
        weights.append(he_init(rows, cols))
        biases.append(np.zeros((1, cols)))

    return weights, biases


def activation(Z, kind):
    """Apply the element-wise non-linearity named by `kind` to `Z`.

    Supported kinds are "relu", "sigmoid" and "tanh"; anything else raises
    ValueError.
    """
    if kind not in ("relu", "sigmoid", "tanh"):
        raise ValueError(f"Unknown activation: {kind}")

    if kind == "tanh":
        return np.tanh(Z)
    if kind == "relu":
        return np.maximum(0, Z)
    # only "sigmoid" is left
    return 1.0 / (1.0 + np.exp(-Z))


def deriv_activation(Z, A, kind):
    """Element-wise derivative of the activation `kind`.

    Args:
        Z: pre-activation values (used for "relu").
        A: post-activation values (used for "sigmoid" and "tanh").
        kind: "relu", "sigmoid" or "tanh".

    Raises:
        ValueError: for any other `kind`.
    """
    if kind == "tanh":
        return 1.0 - np.power(A, 2)
    if kind == "sigmoid":
        return A * (1.0 - A)
    if kind != "relu":
        raise ValueError(f"Unknown activation: {kind}")
    # relu: slope 1 where the input was strictly positive, else 0
    positive = Z > 0
    return positive.astype(Z.dtype)


def softmax(Z):
    """Row-wise softmax of a 2-D array of logits.

    Each row has its maximum subtracted before exponentiation, which keeps
    np.exp from overflowing without changing the result.
    """
    exps = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    row_totals = np.sum(exps, axis=1, keepdims=True)
    return exps / row_totals


def one_hot(y, n_classes=None):
    """One-hot encode a 1-D vector of integer class ids.

    Args:
        y: integer labels, one per sample.
        n_classes: width of the encoding. Defaults to ``y.max() + 1``, which
            is only correct when `y` contains the highest class id. Pass the
            real class count whenever `y` may be a subset (mini-batch,
            validation split), so the result lines up with the network output.

    Returns:
        Float array of shape (y.size, n_classes) with a single 1 per row.
    """
    y = np.asarray(y)
    if n_classes is None:
        n_classes = int(y.max()) + 1
    out = np.zeros((y.size, n_classes))
    out[np.arange(y.size), y] = 1
    return out


def forward_prop(Ws, bs, X, act):
    """Run one forward pass through the network.

    Every layer except the last applies `activation(..., act)`; the last
    layer's logits go through `softmax`.

    Returns:
        Zs : list of pre-activations for hidden layers
        As : list of post-activations for hidden layers (same length as Zs)
        ZL : logits of the output layer (pre-softmax)
        AL : probabilities after softmax
    """
    hidden_pre, hidden_post = [], []
    signal = X  # input to the layer currently being evaluated

    for W_hid, b_hid in zip(Ws[:-1], bs[:-1]):
        pre = signal @ W_hid + b_hid
        signal = activation(pre, act)
        hidden_pre.append(pre)
        hidden_post.append(signal)

    # Output layer
    logits = signal @ Ws[-1] + bs[-1]
    probs = softmax(logits)

    return hidden_pre, hidden_post, logits, probs



def backward_prop(Zs, As, ZL, AL, Ws, X, y, act):
    """Back-propagate the softmax cross-entropy loss through the network.

    Args:
        Zs: hidden-layer pre-activations, as returned by `forward_prop`.
        As: hidden-layer post-activations (same length as Zs).
        ZL: output logits (unused; kept for interface compatibility).
        AL: softmax probabilities, shape (m, n_classes).
        Ws: list of weight matrices, one per layer.
        X:  input batch, shape (m, n_features).
        y:  integer class ids, shape (m,).
        act: name of the hidden-layer activation.

    Returns:
        (dWs, dbs): gradient lists in the same order as `Ws`.
    """
    m = X.shape[0]

    # Output-layer gradient for softmax + cross-entropy: probabilities minus
    # the one-hot target. Subtracting 1 at the true-class position of a copy
    # of AL keeps the class count tied to the network output, so a batch that
    # happens to miss the highest class id cannot cause a shape mismatch.
    dZ = np.array(AL, dtype=float)            # copy; (m, n_classes)
    dZ[np.arange(m), y] -= 1.0

    dWs = []
    dbs = []

    # Walk the layers from last to first, collecting gradients back-to-front
    for layer_idx in reversed(range(len(Ws))):
        A_prev = As[layer_idx - 1] if layer_idx > 0 else X

        dWs.append((A_prev.T @ dZ) / m)                    # weight gradient
        dbs.append(np.mean(dZ, axis=0, keepdims=True))     # bias gradient

        # Move gradient one layer backward, unless we just finished first layer
        if layer_idx > 0:
            dA_prev = dZ @ Ws[layer_idx].T
            dZ = dA_prev * deriv_activation(Zs[layer_idx - 1], As[layer_idx - 1], act)

    # Restore front-to-back order so the lists line up with Ws / bs
    dWs.reverse()
    dbs.reverse()
    return dWs, dbs


def update_params(Ws, bs, dWs, dbs, alpha: float = 0.01):
    """Apply one gradient-descent step of size `alpha` to every layer.

    The entries of `Ws` and `bs` are updated in place with `-=`; the same two
    lists are returned for convenience.
    """
    n_layers = len(Ws)
    for idx in range(n_layers):
        weight_step = alpha * dWs[idx]
        Ws[idx] -= weight_step
        bias_step = alpha * dbs[idx]
        bs[idx] -= bias_step
    return Ws, bs


# Fit and Train

In [None]:
def compute_loss(AL, y):
    """Mean cross-entropy between predicted probabilities and integer labels.

    Args:
        AL: predicted class probabilities, shape (m, n_classes).
        y:  integer class ids, shape (m,).

    Returns:
        The average of -log(p_true + 1e-15) over the m samples.

    The true-class probability is read directly from `AL` rather than via a
    one-hot matrix whose width is inferred from `y`. The loss therefore stays
    well-defined when `y` does not contain the highest class id.
    """
    AL = np.asarray(AL)
    m = y.shape[0]
    p_true = AL[np.arange(m), y]
    return -np.sum(np.log(p_true + 1e-15)) / m  # 1e-15 avoids log(0)


def predict(Ws, bs, X, act):
    """Return the most probable class id for every row of `X`.

    Runs `forward_prop` and takes the arg-max over the softmax output.
    """
    probabilities = forward_prop(Ws, bs, X, act)[3]
    return np.argmax(probabilities, axis=1)


def fit(X, y, *, n_h, n_layers, iters, alpha, act, es_tol=1e-4, es_patience=10):
    """Train the network with full-batch gradient descent and early stopping.

    Args:
        X: training features, shape (m, n_features).
        y: integer class ids, shape (m,); the output width is ``y.max() + 1``.
        n_h: width of every hidden layer.
        n_layers: number of hidden layers (values <= 1 give one hidden layer).
        iters: maximum number of gradient steps.
        alpha: learning rate.
        act: hidden-layer activation name.
        es_tol: minimum decrease of the training loss that counts as progress.
        es_patience: stop after this many consecutive steps without progress.

    Returns:
        (Ws, bs): the parameters after the last update that was applied. These
        are not necessarily the ones that achieved the lowest loss.
    """
    n_x = X.shape[1]
    n_y = y.max() + 1

    hidden_sizes = [n_h] * n_layers if n_layers > 1 else n_h

    Ws, bs = params(n_x, hidden_sizes, n_y)

    # Early-stopping bookkeeping
    best_loss = np.inf
    stale     = 0

    for i in range(iters):
        Zs, As, ZL, AL = forward_prop(Ws, bs, X, act)
        loss = compute_loss(AL, y)   # loss of the parameters BEFORE this step
        dWs, dbs = backward_prop(Zs, As, ZL, AL, Ws, X, y, act)
        Ws, bs = update_params(Ws, bs, dWs, dbs, alpha)

        if loss < best_loss - es_tol:
            best_loss = loss
            stale     = 0
        else:
            stale    += 1
            if stale >= es_patience:
                # Always report the stop. The `break` skips the periodic log
                # below, so suppressing this message on multiples of 100
                # would make the stop completely silent.
                print(f"iter {i:5d}: loss {loss:.4f}  (early stop)")
                break

        if i % 100 == 0:
            print(f"iter {i:5d}: loss {loss:.4f}")

    return Ws, bs

def inspect_model(weights, biases, dtype=np.float64):
    """Count learnable parameters and estimate their memory footprint.

    Returns:
        (n_params, ram): total number of scalar entries across all weight and
        bias arrays, and that count multiplied by the item size of `dtype`
        in bytes.
    """
    n_params = sum(arr.size for arr in (*weights, *biases))
    bytes_per_value = np.dtype(dtype).itemsize
    return n_params, n_params * bytes_per_value



# Grid Search

In [None]:
from itertools import product
import numpy as np          # ← NEW

def grid_search(X_train, y_train,
                X_val, y_val,
                param_grid,
                scaler,
                n_samples=None,
                verbose=True):
    """Train one model per hyper-parameter combination and keep the best.

    Args:
        X_train, y_train: data passed to `fit`.
        X_val, y_val: data used to score each model (plain accuracy).
        param_grid: dict mapping `fit` keyword names to lists of candidates.
        scaler: fitted transformer with a `transform` method, or None to use
            the features as they are.
        n_samples: if set and smaller than the full grid, evaluate only this
            many combinations, drawn without replacement with a fixed seed.
        verbose: print one line per evaluated combination.

    Returns:
        (best_params, best_weights, best_score), where best_weights is the
        (Ws, bs) tuple of the winning model. With no combinations to evaluate
        the result is (None, None, -inf).

    Raises:
        ValueError: if `param_grid` is empty.
    """
    if not param_grid:
        raise ValueError("param_grid must contain at least one hyper-parameter")

    keys, values = zip(*param_grid.items())
    combos = [dict(zip(keys, v)) for v in product(*values)]

    if n_samples is not None and n_samples < len(combos):
        # Sample positions rather than the dicts themselves, so the
        # combinations never pass through a NumPy object array.
        rng = np.random.default_rng(42)
        chosen = rng.choice(len(combos), size=n_samples, replace=False)
        combos = [combos[idx] for idx in chosen]

    # Scaling does not depend on the hyper-parameters: do it once, not per combo
    if scaler is not None:
        X_tr = scaler.transform(X_train)
        X_v = scaler.transform(X_val)
    else:
        X_tr, X_v = X_train, X_val

    best_score   = -np.inf
    best_params  = None
    best_weights = None

    # `combo` (not `params`) so the module-level params() helper is not shadowed
    for i, combo in enumerate(combos, start=1):
        Ws, bs = fit(X_tr, y_train, **combo)

        y_pred = predict(Ws, bs, X_v, act=combo.get("act", "relu"))
        score = np.mean(y_pred == y_val)

        if verbose:
            print(f"[{i:03d}/{len(combos)}] {combo} → acc={score:.3f}")

        if score > best_score:
            best_score, best_params, best_weights = score, combo, (Ws, bs)

    return best_params, best_weights, best_score


# Evaluation

## 1. Reviews

In [None]:
from sklearn.preprocessing import StandardScaler

# Hyper-parameters must be chosen on data the final score is NOT computed on.
# Hold out the tail of the (already shuffled) training rows as a validation
# set, so the test set is used exactly once, for the final estimate below.
VAL_FRACTION = 0.2
n_fit = int((1 - VAL_FRACTION) * X_train.shape[0])
X_fit, X_val = X_train[:n_fit], X_train[n_fit:]
y_fit, y_val = y_train[:n_fit], y_train[n_fit:]

# Standardisation statistics come from the fitting rows only
scaler = StandardScaler().fit(X_fit)

param_grid = {
    "n_h":     [10, 64, 500],
    "n_layers": [1, 2, 3],
    "alpha":   [0.1, 0.05, 0.01],
    "iters":   [50, 100, 500],
    "act":     ["relu", "tanh", "sigmoid"],
}

best_params, best_weights, best_val_acc = grid_search(
    X_fit, y_fit,
    X_val, y_val,
    param_grid,
    scaler=scaler,
    n_samples=40
    )

print("\nBest combo:", best_params)
print("Validation accuracy:", best_val_acc)

n_params, ram_bytes = inspect_model(*best_weights, dtype=np.float64)
print(f"Learnable parameters: {n_params:,}")
print(f"Estimated RAM for weights+biases: {ram_bytes/1024:.1f} KiB")

# Final, unbiased estimate: the test rows played no part in model selection
X_test_s = scaler.transform(X_test)
y_pred   = predict(*best_weights, X_test_s, act=best_params["act"])
test_acc = np.mean(y_pred == y_test)
print(f"Test accuracy: {test_acc:.3f}")

## 2. Credit Card

In [None]:
# Hyper-parameters must be chosen on data the final score is NOT computed on.
# Hold out the tail of the (already shuffled) training rows as a validation
# set, so the test set is used exactly once, for the final estimate below.
VAL_FRACTION = 0.2
n_fit_c = int((1 - VAL_FRACTION) * X_train_c.shape[0])
X_fit_c, X_val_c = X_train_c[:n_fit_c], X_train_c[n_fit_c:]
y_fit_c, y_val_c = y_train_c[:n_fit_c], y_train_c[n_fit_c:]

# Standardisation statistics come from the fitting rows only
scaler_c = StandardScaler().fit(X_fit_c)


param_grid_c = {
    "n_h":      [10, 64, 500],
    "n_layers": [1, 2, 3],
    "alpha":    [0.1, 0.05, 0.01],
    "iters":    [50, 100, 500],
    "act":      ["relu", "tanh", "sigmoid"],
}

best_params_c, best_weights_c, best_val_acc_c = grid_search(
    X_fit_c, y_fit_c,
    X_val_c, y_val_c,
    param_grid_c,
    scaler=scaler_c,
    n_samples=40

)

print("\nBest combo (_c):", best_params_c)
print("Validation accuracy (_c):", best_val_acc_c)


n_params_c, ram_bytes_c = inspect_model(*best_weights_c, dtype=np.float64)
print(f"Learnable parameters: {n_params_c:,}")
print(f"Estimated RAM for weights+biases: {ram_bytes_c/1024:.1f} KiB")

# Final, unbiased estimate: the test rows played no part in model selection
X_test_s_c = scaler_c.transform(X_test_c)
y_pred_c   = predict(*best_weights_c, X_test_s_c, act=best_params_c["act"])
test_acc_c = np.mean(y_pred_c == y_test_c)
print(f"Test accuracy (_c): {test_acc_c:.3f}")
