In [161]:
%pip install numpy pandas matplotlib scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [162]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [163]:
df = pd.read_csv("../regression_breast_cancer_data.csv")

df.head()

Unnamed: 0,Age,differentiate,Tumor Size,Survival Months,Node Positive Ratio,T Stage Encoded,M Stage Encoded,N Stage Encoded,Estrogen Status Encoded,Progesterone Status Encoded
0,68,3,4,60,0.041667,0.0,1,0.0,1,1
1,50,2,35,62,0.357143,1.0,1,1.0,1,1
2,58,2,63,75,0.5,2.0,1,2.0,1,1
3,58,3,18,84,0.5,0.0,1,0.0,1,1
4,47,3,41,50,0.333333,1.0,1,0.0,1,1


In [164]:
selected_feature = [
    "N Stage Encoded",
]

X = df[selected_feature].values
y = df["Survival Months"].values

y = y.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.reshape(-1, 1))
X_test_scaled = scaler.transform(X_test.reshape(-1, 1))


In [165]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

degrees = list(range(1, 6))
for d in degrees:
    poly = PolynomialFeatures(degree=d, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    y_train_pred = model.predict(X_train_poly)
    y_test_pred = model.predict(X_test_poly)
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    print(f"Degree {d}: R² train = {r2_train:.3f}, R² test = {r2_test:.3f}")

Degree 1: R² train = 0.015, R² test = 0.034
Degree 2: R² train = 0.015, R² test = 0.035
Degree 3: R² train = 0.015, R² test = 0.035
Degree 4: R² train = 0.015, R² test = 0.035
Degree 5: R² train = 0.015, R² test = 0.035


In [166]:
class PolyRegressionScratch:
    """
    Custom implementation of quadratic regression (degree 2) using gradient descent.
    """

    def __init__(self, alpha=1e-3, iterations=1000):
        self.a2 = 1  # coef of x^2
        self.a1 = 1  # coef of x
        self.a0 = 1  # intercept
        self.alpha = alpha
        self.iterations = iterations

    def predict(self, x):
        return self.a2 * x**2 + self.a1 * x + self.a0

    # Cost function (MSE)
    def compute_cost(self, x, y):
        m = len(x)
        predictions = self.predict(x)
        cost = (1 / m) * np.sum((predictions - y) ** 2)
        return cost

    def fit(self, x, y):
        m = len(x)

        for i in range(self.iterations):
            predictions = self.predict(x)

            # Gradients
            dJ_da0 = (2 / m) * np.sum(predictions - y)
            dJ_da1 = (2 / m) * np.sum((predictions - y) * x)
            dJ_da2 = (2 / m) * np.sum((predictions - y) * x**2)

            # Update coefficients
            self.a0 -= self.alpha * dJ_da0
            self.a1 -= self.alpha * dJ_da1
            self.a2 -= self.alpha * dJ_da2

            if i % 1000 == 0:
                cost = self.compute_cost(x, y)
                print(f"Iteration {i}, Cost: {cost}")

        print(f"Optimized a0 (Intercept): {self.a0}")
        print(f"Optimized a1 (coef of x): {self.a1}")
        print(f"Optimized a2 (coef of x^2): {self.a2}")


In [167]:
def r2_score_scratch(y_true, y_pred):
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    return 1 - (ss_residual / ss_total)

In [168]:
alpha = 1e-3
iterations = 10000

model = PolyRegressionScratch(alpha=alpha, iterations=iterations)
model.fit(X_train_scaled, y_train)

Iteration 0, Cost: 5290.278684969754
Iteration 1000, Cost: 682.4484427125227
Iteration 2000, Cost: 577.1468161327846
Iteration 3000, Cost: 549.5893642045271


Iteration 4000, Cost: 534.5543316292567
Iteration 5000, Cost: 526.0422591499216
Iteration 6000, Cost: 521.2172965220146
Iteration 7000, Cost: 518.482219327507
Iteration 8000, Cost: 516.9318121519656
Iteration 9000, Cost: 516.0529475582314
Optimized a0 (Intercept): 70.13887820532936
Optimized a1 (coef of x): -4.107267020359993
Optimized a2 (coef of x^2): 0.8562881238315246


In [169]:
y_pred = model.predict(X_test_scaled)
r2 = r2_score_scratch(y_test, y_pred)
print(f"R² test (scratch): {r2:.3f}")

R² test (scratch): 0.029
