In [19]:
%pip install numpy pandas

Note: you may need to restart the kernel to use updated packages.


In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from itertools import combinations_with_replacement

In [21]:
# Predictor columns used by every model in this notebook.
selected_feature = [
    "Node Positive Ratio",
    "M Stage Encoded",
    "N Stage Encoded",
    "Estrogen Status Encoded",
    "Progesterone Status Encoded",
]

df = pd.read_csv("../regression_breast_cancer_data.csv")
X, y = df[selected_feature], df["Survival Months"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only and reuse it for the test rows,
# so nothing from the test split influences the fitted scaler.
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [22]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Sweep polynomial degrees 1..5: expand the scaled features, fit ordinary least
# squares on the training split and report R² on both splits for each degree.
degrees = list(range(1, 6))
for d in degrees:
    poly = PolynomialFeatures(degree=d, include_bias=False)
    X_train_poly = poly.fit_transform(X_scaled_train)
    X_test_poly = poly.transform(X_scaled_test)

    # fit() returns the estimator itself, so construction and fitting can be chained.
    model = LinearRegression().fit(X_train_poly, y_train)

    y_train_pred, y_test_pred = model.predict(X_train_poly), model.predict(X_test_poly)
    r2_train, r2_test = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

    print(f"Degree {d}: R² train = {r2_train:.3f}, R² test = {r2_test:.3f}")

Degree 1: R² train = 0.030, R² test = 0.060
Degree 2: R² train = 0.037, R² test = 0.071
Degree 3: R² train = 0.048, R² test = 0.053
Degree 4: R² train = 0.052, R² test = -5.902
Degree 5: R² train = 0.059, R² test = -19.861


In [23]:
class PolyRegressionScratch:
    """Polynomial regression trained with batch gradient descent on mean squared error.

    The input features are expanded into every monomial of total degree
    1..``degree`` (plus a leading constant column for the intercept) and a
    linear model is fitted on that expansion.

    Parameters
    ----------
    degree : int
        Highest total degree of the generated monomials.
    learning_rate : float
        Step size of each gradient-descent update.
    n_iterations : int
        Number of full-batch gradient-descent updates performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray or None
        Fitted coefficients; index 0 is the intercept. ``None`` until ``fit`` runs.
    powers : list[list[int]] or None
        Exponent vector of each non-constant column, in column order.
        Refreshed on every feature expansion.
    """

    def __init__(self, degree=2, learning_rate=0.01, n_iterations=1000):
        self.degree = degree
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.powers = None

    def _expand_features(self, X):
        """Return the polynomial design matrix for ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Raw features; anything ``numpy.asarray`` accepts (including a
            pandas DataFrame).

        Returns
        -------
        numpy.ndarray of shape (n_samples, 1 + n_terms)
            Column 0 is all ones; the remaining columns are the monomials
            described by ``self.powers``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        # Coerce up front so DataFrames and integer arrays behave like float ndarrays.
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got array with shape {X.shape}"
            )
        n_samples, n_features = X.shape

        # Each multiset of feature indices of size `deg` is one monomial; turn it
        # into a per-feature exponent vector, e.g. (0, 0, 1) -> [2, 1, 0, ...].
        self.powers = []
        for deg in range(1, self.degree + 1):
            for comb in combinations_with_replacement(range(n_features), deg):
                count = [0] * n_features
                for idx in comb:
                    count[idx] += 1
                self.powers.append(count)

        # Collect the columns and stack once; growing the matrix with hstack in
        # the loop would re-copy it for every term.
        columns = [np.ones(n_samples)]
        columns.extend(
            np.prod(X ** np.asarray(power), axis=1) for power in self.powers
        )
        return np.column_stack(columns)

    def predict(self, X):
        """Predict targets for ``X`` using the fitted weights.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``X`` expands to a different number of columns than the model
            was fitted with.
        """
        if self.weights is None:
            raise RuntimeError(
                "PolyRegressionScratch is not fitted yet; call fit() before predict()."
            )
        X_poly = self._expand_features(X)
        if X_poly.shape[1] != len(self.weights):
            raise ValueError(
                f"X expands to {X_poly.shape[1]} columns but the model was fitted "
                f"with {len(self.weights)}"
            )
        return X_poly @ self.weights

    def compute_cost(self, X, y):
        """Return the mean squared error of the current weights on ``(X, y)``.

        Raises
        ------
        ValueError
            If ``y`` does not have one value per row of ``X``.
        """
        # Flatten to 1-D so a column vector or a pandas Series compares by position.
        y = np.asarray(y, dtype=float).ravel()
        y_pred = self.predict(X)
        if y.shape != y_pred.shape:
            raise ValueError(
                f"y has {y.shape[0]} values but X has {y_pred.shape[0]} rows"
            )
        m = len(y)
        return (1 / m) * np.sum((y_pred - y) ** 2)

    def fit(self, X, y):
        """Fit the weights with ``n_iterations`` steps of batch gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        PolyRegressionScratch
            ``self``, so construction and fitting can be chained.

        Raises
        ------
        ValueError
            If ``y`` does not have one value per row of ``X``.
        """
        X_poly = self._expand_features(X)
        # Flatten y: a (n, 1) column would otherwise broadcast the residuals to (n, n).
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features_poly = X_poly.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"y has {y.shape[0]} values but X has {n_samples} rows"
            )

        self.weights = np.ones(n_features_poly)

        # Loop invariants: the transposed design matrix and the combined step factor.
        X_poly_T = X_poly.T
        step = self.learning_rate * (2 / n_samples)

        for _ in range(self.n_iterations):
            residuals = X_poly @ self.weights - y
            # Gradient of the MSE is (2 / n) * X^T (X w - y).
            self.weights -= step * (X_poly_T @ residuals)

        return self


In [24]:
def r2_score_scratch(y_true, y_pred):
    """Return the coefficient of determination R² of ``y_pred`` against ``y_true``.

    R² = 1 - SS_res / SS_tot, where SS_res is the sum of squared residuals and
    SS_tot is the sum of squared deviations of ``y_true`` from its mean.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, one per observation (a single scalar is broadcast).

    Returns
    -------
    float
        1.0 for a perfect fit, 0.0 for a model no better than predicting the
        mean, negative when worse than that. When ``y_true`` is constant
        (SS_tot == 0) the result is 1.0 if the predictions are exact and 0.0
        otherwise, instead of dividing by zero.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs have different lengths.
    """
    # Work on flat float arrays: pandas would align two Series by index label
    # (wrong after a shuffled train/test split), and (n,) vs (n, 1) would
    # silently broadcast to an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    if y_true.size == 0:
        raise ValueError("y_true must contain at least one value")
    if y_pred.size not in (1, y_true.size):
        raise ValueError(
            f"y_true has {y_true.size} values but y_pred has {y_pred.size}"
        )

    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)

    if ss_total == 0:
        # Constant target: the ratio is undefined, so report a finite score.
        return 1.0 if ss_residual == 0 else 0.0
    return 1 - (ss_residual / ss_total)

In [25]:
# Train the from-scratch degree-2 model on the scaled training split.
model = PolyRegressionScratch(
    degree=2,
    learning_rate=0.01,
    n_iterations=10000,
)
model.fit(X_scaled_train, y_train)

In [26]:
# Score the from-scratch model on the held-out split.
y_pred = model.predict(X_scaled_test)
r2 = r2_score_scratch(y_true=y_test, y_pred=y_pred)
print(f"R² test (scratch): {r2:.3f}")

R² test (scratch): 0.071
