In [342]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from data_process import process_data
from constants import numeric_features, categorical_features
from ai_models.shared import load_train_with_validation_data, load_processed_data

In [343]:
# Load the frame returned by load_processed_data() and keep only the rows
# whose "Curricular units 1st sem (enrolled)" value is present.
df = load_processed_data().dropna(
    subset=["Curricular units 1st sem (enrolled)"]
)

In [344]:
# Separate the target series (y) from the remaining columns (X).
y = df["Curricular units 1st sem (enrolled)"]
X = df.drop(labels=["Curricular units 1st sem (enrolled)"], axis="columns")

In [345]:
# Keep only the configured feature names that are actually present in X,
# preserving the order in which the constants list them.
num_features = [name for name in numeric_features if name in X.columns]
cat_features = [name for name in categorical_features if name in X.columns]

In [346]:
# Numeric preprocessing: replace missing values with the column mean, then
# standardize each column with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [347]:
# Categorical preprocessing: replace missing values with the most frequent
# category, then one-hot encode. handle_unknown="ignore" lets transform()
# accept categories that were not seen during fit instead of raising.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [348]:
# Route numeric and categorical columns through their own pipelines and stack
# the results side by side. No `remainder` is given, so the ColumnTransformer
# default applies to columns that appear in neither feature list.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [349]:
X_train_raw, X_val_raw, X_test_raw, y_train, y_val, y_test = load_train_with_validation_data(X, y)


def _to_dense(matrix):
    """Return ``matrix`` as a dense NumPy array.

    ColumnTransformer yields a SciPy sparse matrix only when the stacked
    output is sparse enough (see its ``sparse_threshold`` parameter);
    otherwise it already returns a dense ndarray, which has no ``toarray()``.
    Handling both cases keeps this cell working regardless of the feature mix.
    """
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


# Fit the preprocessing on the training split only, then reuse the fitted
# transformer for the test and validation splits.
X_train = _to_dense(full_pipeline.fit_transform(X_train_raw))
X_test = _to_dense(full_pipeline.transform(X_test_raw))
X_val = _to_dense(full_pipeline.transform(X_val_raw))

In [350]:
def closed_form_linear_regression(X_train, y_train, X_test):
    """Fit linear regression with the normal equations and predict.

    A column of ones is prepended to both feature matrices, the weights are
    computed as ``pinv(X^T X) @ X^T y`` on the training data and then applied
    to the test matrix.

    Args:
        X_train: 2-D dense array of training features, one row per sample.
        y_train: Training targets. Objects exposing ``to_numpy()`` (e.g. a
            pandas Series) are converted; anything else is used as-is.
        X_test: 2-D dense array of features to predict for, with the same
            number of columns as ``X_train``.

    Returns:
        Tuple ``(y_pred, w)``: the predictions for ``X_test`` and the weight
        vector, whose first entry is the intercept.
    """
    def with_intercept(features):
        # Prepend a column of ones so the first weight acts as the bias term.
        ones = np.ones((features.shape[0], 1))
        return np.hstack([ones, features])

    if hasattr(y_train, 'to_numpy'):
        targets = y_train.to_numpy()
    else:
        targets = y_train

    design_train = with_intercept(X_train)
    design_test = with_intercept(X_test)

    gram = design_train.T @ design_train
    moment = design_train.T @ targets
    # The pseudo-inverse keeps the solve defined even if the Gram matrix is singular.
    w = np.linalg.pinv(gram) @ moment
    return design_test @ w, w



$$
\mathbf{W} = (X^T X)^{-1} X^T Y
$$

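Gdzie:
- $\mathbf{W}$ — wektor współczynników regresji (pierwszy element to wyraz wolny),
- $X$ — macierz cech z dodaną kolumną jedynek (bias),
- $Y$ — wektor wartości docelowych (target).

W kodzie odwrotność $(X^T X)^{-1}$ jest zastąpiona pseudoodwrotnością Moore'a–Penrose'a (`np.linalg.pinv`), dzięki czemu wzór daje wynik również wtedy, gdy macierz $X^T X$ jest osobliwa.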

In [351]:
def print_metrics(y_true, y_pred, label):
    """Print MSE, RMSE, MAE and R2 for one set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values aligned with ``y_true``.
        label: Name of the evaluated split; used in the header line.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE is derived from the MSE rather than recomputed.

    report = [
        f"{label} Results:",
        f"  MSE : {mse:.4f}",
        f"  RMSE: {rmse:.4f}",
        f"  MAE : {mae:.4f}",
        f"  R2  : {r2:.4f}\n",
    ]
    # The trailing "\n" on the last entry leaves a blank line after the report.
    print("\n".join(report))

# Fit on the training split and predict the test and validation splits, then
# report the metrics for each. The second call refits on the same training
# data, so both predictions come from the same weights.
y_test_pred, weights = closed_form_linear_regression(
    X_train=X_train, y_train=y_train, X_test=X_test
)
y_val_pred, _ = closed_form_linear_regression(
    X_train=X_train, y_train=y_train, X_test=X_val
)
print_metrics(y_true=y_test, y_pred=y_test_pred, label="Test")
print_metrics(y_true=y_val, y_pred=y_val_pred, label="Validation")

Test Results:
  MSE : 0.2266
  RMSE: 0.4760
  MAE : 0.3109
  R2  : 0.9657

Validation Results:
  MSE : 0.2396
  RMSE: 0.4895
  MAE : 0.3143
  R2  : 0.9641

