# Regresja Liniowa i Logistyczna w czystym NumPy

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the dataset from a CSV file (path is relative to the working directory)
data = pd.read_csv("../data/ObesityDataSet.csv")

### 1. CZĘŚĆ: Regresja liniowa (zamknięta forma)


In [None]:
# Predict body weight ("Weight") from the remaining int64/float64 columns.

numerical_features = data.select_dtypes(include=['int64', 'float64']).columns.drop('Weight')
# NOTE(review): with the expected dataset this presumably selects
# ["Age", "Height", "FCVC", "NCP", "CH2O", "FAF", "TUE"] - confirm against the CSV.
X = data[numerical_features].values
y = data["Weight"].values.reshape(-1, 1)

# Prepend a column of ones so that theta[0] plays the role of the bias term.
X_b = np.hstack([np.ones((X.shape[0], 1)), X])

# Closed form: theta = (X^T X)^(-1) X^T y.
# The same least-squares solution is obtained with np.linalg.lstsq rather than
# an explicit inverse: inverting X^T X squares the condition number of the
# problem and raises LinAlgError when columns are collinear, whereas lstsq
# stays well-behaved and returns the minimum-norm solution in that case.
theta_closed, *_ = np.linalg.lstsq(X_b, y, rcond=None)

# Predictions on the training data
y_pred_closed = X_b @ theta_closed

# Mean squared error of the closed-form fit
mse_closed = np.mean((y - y_pred_closed) ** 2)
print("Zamknięta forma - MSE:", mse_closed)

### 2. CZĘŚĆ: Regresja Liniowa – Gradient Descent

In [None]:
# Cost function: mean squared error
def mse_loss(y_true, y_pred):
    """Return the mean of the squared element-wise differences.

    Args:
        y_true: Reference values.
        y_pred: Predicted values, broadcast-compatible with ``y_true``.

    Returns:
        The average of ``(y_true - y_pred) ** 2`` over all elements.
    """
    residual = y_true - y_pred
    return np.mean(np.square(residual))

# Gradient of the MSE with respect to the parameter vector theta
def mse_gradient(X, y, theta):
    """Return the gradient of the mean squared error at ``theta``.

    Args:
        X: Design matrix with one sample per row.
        y: Targets, shaped to match ``X @ theta``.
        theta: Current parameters.

    Returns:
        ``(2 / n) * X^T (X theta - y)``, where ``n`` is the number of rows of ``X``.
    """
    n_rows = X.shape[0]
    residual = X @ theta - y
    # `*` and `@` share precedence, so the scale is applied to X.T first.
    return ((2 / n_rows) * X.T) @ residual

# Standardize every feature column: subtract its mean, divide by its std
feature_mean = X.mean(axis=0)
feature_std = X.std(axis=0)
X_scaled = (X - feature_mean) / feature_std
X_b = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])

# Random initial parameters (one per column of X_b) and hyperparameters
theta_gd = np.random.randn(X_b.shape[1], 1)
learning_rate = 0.01
n_epochs = 1000
batch_size = 32
n_samples = X_b.shape[0]

# Mini-batch gradient descent: each epoch visits the samples in a fresh random
# order, batch_size rows at a time (the last batch may be shorter).
# A full-batch variant would instead perform a single update per epoch using
# mse_gradient(X_b, y, theta_gd).
for epoch in range(n_epochs):
    order = np.random.permutation(n_samples)

    for start in range(0, n_samples, batch_size):
        rows = order[start:start + batch_size]
        theta_gd -= learning_rate * mse_gradient(X_b[rows], y[rows], theta_gd)

# Predictions on the training data and their MSE
y_pred_gd = X_b @ theta_gd
mse_gd = mse_loss(y, y_pred_gd)
print("Gradient Descent - MSE:", mse_gd)

### 3. CZĘŚĆ: Regresja Logistyczna – Gradient Descent

In [None]:
# Binary target: 1 when the NObeyesdad label contains "Overweight" or "Obesity", else 0
data["target_binary"] = data["NObeyesdad"].apply(
    lambda label: int("Overweight" in label or "Obesity" in label)
)

# Features: every int64/float64 column except the target that was just added
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.drop("target_binary")
X_cls = data[numerical_cols].values
y_cls = data["target_binary"].values.reshape(-1, 1)

# Standardize the features, then prepend a bias column of ones
scaler = StandardScaler()
X_cls_scaled = scaler.fit_transform(X_cls)
bias_column = np.ones((X_cls_scaled.shape[0], 1))
X_cls_b = np.hstack([bias_column, X_cls_scaled])

# Sigmoid
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Evaluated in a numerically stable way: the exponential is only ever taken
    of a non-positive number, so large-magnitude inputs cannot overflow
    (the naive form computes exp(-z), which overflows for very negative z).

    Args:
        z: Scalar or array-like of real values.

    Returns:
        Values in [0, 1] with the same shape as ``z`` (a NumPy scalar for
        scalar input).
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # always in (0, 1], never overflows
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) - algebraically identical.
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return result[()]

# Binary cross-entropy loss
def cross_entropy(y_true, y_pred):
    """Return the mean binary cross-entropy between labels and probabilities.

    A small constant is added inside both logarithms so that predictions of
    exactly 0 or 1 do not produce log(0).

    Args:
        y_true: Labels (0 or 1).
        y_pred: Predicted probabilities, broadcast-compatible with ``y_true``.

    Returns:
        The negated mean of the per-element log-likelihood terms.
    """
    eps = 1e-10
    positive_term = y_true * np.log(y_pred + eps)
    negative_term = (1 - y_true) * np.log(1 - y_pred + eps)
    return -np.mean(positive_term + negative_term)

# Mini-batch gradient descent for logistic regression
batch_size = 32
num_samples = X_cls_b.shape[0]
theta_log = np.random.randn(X_cls_b.shape[1], 1)
learning_rate = 0.05
n_epochs = 1000

for epoch in range(n_epochs):
    # Visit the samples in a new random order every epoch
    indices = np.arange(num_samples)
    np.random.shuffle(indices)

    for start_idx in range(0, num_samples, batch_size):
        # Slicing clips at the end of the array, so the last batch may be shorter
        batch_idx = indices[start_idx:start_idx + batch_size]
        X_batch, y_batch = X_cls_b[batch_idx], y_cls[batch_idx]

        # Update step: X^T (sigmoid(X theta) - y), averaged over the batch
        error = sigmoid(X_batch @ theta_log) - y_batch
        theta_log -= learning_rate * (X_batch.T @ error / len(batch_idx))

# Evaluation on the training data: threshold the probabilities at 0.5
probabilities = sigmoid(X_cls_b @ theta_log)
predicted_classes = (probabilities >= 0.5).astype(int)
accuracy = np.mean(predicted_classes == y_cls)
print("Logistyczna regresja (Gradient Descent) - Accuracy:", accuracy)

### 4. CZĘŚĆ: Porównanie z scikit-learn

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression

# Linear regression: fit on the unscaled features and report the training MSE
lr = LinearRegression().fit(X, y)
sklearn_predictions = lr.predict(X)
mse_sklearn = np.mean(np.square(sklearn_predictions - y))
print("sklearn LinearRegression - MSE:", mse_sklearn)

# Logistic regression: fit on the standardized features and report training accuracy.
# NOTE(review): per the scikit-learn docs, LogisticRegression() applies L2
# regularization by default, so this is not an exact like-for-like baseline
# for an unregularized model - confirm whether that is intended.
binary_labels = data["target_binary"]
logreg = LogisticRegression().fit(X_cls_scaled, binary_labels)
acc_sklearn = logreg.score(X_cls_scaled, binary_labels)
print("sklearn LogisticRegression - Accuracy:", acc_sklearn)