In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [3]:
# Synthetic regression data: y is built from x**3, sin(x) and Gaussian noise.
# Seeding the legacy global generator makes both random draws below repeatable.
np.random.seed(42)

# Sample size and noise parameters; the noise is specified through its variance.
n_samples = 1000
mu = 1
sigma2 = 0.2
sigma = np.sqrt(sigma2)

# Base variable, drawn uniformly between 0 and 2*pi.
x = np.random.uniform(low=0, high=2 * np.pi, size=n_samples)

# The two regressors are non-linear transforms of the same base variable.
x1 = x ** 3
x2 = np.sin(x)

# The noise has mean mu rather than zero, so the expected constant term of y is 2 + mu.
noise = np.random.normal(loc=mu, scale=sigma, size=n_samples)

y = 2 - x1 + 3 * x2 + noise

df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))

df.head()

Unnamed: 0,x1,x2,y
0,13.032707,0.709147,-7.825796
1,213.152138,-0.304745,-211.663558
2,97.288625,-0.993607,-97.099416
3,53.220265,-0.580945,-51.690038
4,0.942039,0.830661,4.80029


In [6]:
# Feature matrix (both regressors) and target column, taken from the generated frame.
X = df.loc[:, ["x1", "x2"]]
y = df.loc[:, "y"]

# Reserve a 0.2 fraction of the rows for testing; the fixed random_state
# makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
def regression_metrics(y_true, y_pred):
    """Return ``(mse, rmse, ndei)`` for a set of regression predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, compared against ``y_true`` by
        ``mean_squared_error``.

    Returns
    -------
    tuple
        ``mse`` exactly as returned by ``mean_squared_error``, ``rmse`` as its
        square root, and ``ndei`` as ``rmse`` divided by ``np.std(y_true)``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    root_error = np.sqrt(squared_error)
    # NDEI expresses the RMSE relative to the spread of the observed targets.
    target_spread = np.std(y_true)
    return squared_error, root_error, root_error / target_spread

In [12]:
# Model A: linear regression on x2 alone, with an intercept.
X_train_a, X_test_a = X_train[["x2"]], X_test[["x2"]]

# fit() returns the estimator itself, so construction and fitting can be chained.
model_a = LinearRegression(fit_intercept=True).fit(X_train_a, y_train)

# Score the training rows, then the held-out rows, with the shared metric helper.
y_pred_train = model_a.predict(X_train_a)
train_metrics_a = regression_metrics(y_train, y_pred_train)

y_pred_test = model_a.predict(X_test_a)
test_metrics_a = regression_metrics(y_test, y_pred_test)

train_metrics_a, test_metrics_a

((2722.784513877546,
  np.float64(52.180307721184874),
  np.float64(0.7189231131525)),
 (2418.371458169814,
  np.float64(49.17694030915114),
  np.float64(0.7243218572196835)))

In [15]:
# Model A again, this time on a standardised copy of x2.
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train_a)

X_train_a_scaled = scaler.transform(X_train_a)
X_test_a_scaled = scaler.transform(X_test_a)

# fit_intercept=True is the LinearRegression default, spelled out here for clarity.
model_a_scaled = LinearRegression(fit_intercept=True).fit(X_train_a_scaled, y_train)

y_pred_train = model_a_scaled.predict(X_train_a_scaled)
train_metrics_a_scaled = regression_metrics(y_train, y_pred_train)

y_pred_test = model_a_scaled.predict(X_test_a_scaled)
test_metrics_a_scaled = regression_metrics(y_test, y_pred_test)

train_metrics_a_scaled, test_metrics_a_scaled

((2722.784513877546,
  np.float64(52.180307721184874),
  np.float64(0.7189231131525)),
 (2418.3714581698146,
  np.float64(49.17694030915114),
  np.float64(0.7243218572196835)))

In [17]:
# Model B: both regressors (x1 and x2), fitted without an intercept term.
model_b = LinearRegression(fit_intercept=False).fit(X_train, y_train)

# Score the training rows, then the held-out rows, with the shared metric helper.
y_pred_train = model_b.predict(X_train)
train_metrics_b = regression_metrics(y_train, y_pred_train)

y_pred_test = model_b.predict(X_test)
test_metrics_b = regression_metrics(y_test, y_pred_test)

train_metrics_b, test_metrics_b

((4.061958970807997,
  np.float64(2.015430219781374),
  np.float64(0.027767934518304734)),
 (4.223536281553006,
  np.float64(2.0551243956395937),
  np.float64(0.030269705876559947)))

In [18]:
# Model C: both regressors (x1 and x2), fitted with an intercept term.
model_c = LinearRegression(fit_intercept=True).fit(X_train, y_train)

y_pred_train = model_c.predict(X_train)
train_metrics_c = regression_metrics(y_train, y_pred_train)

y_pred_test = model_c.predict(X_test)
test_metrics_c = regression_metrics(y_test, y_pred_test)

# The metrics are stored but not displayed; this cell shows the fitted parameters.
model_c.coef_, model_c.intercept_

(array([-1.00050827,  2.98714397]), np.float64(3.0775816678771477))