Задача: научиться обучать простейшую нейросетевую модель, практически эквивалентную алгоритму линейной регрессии, с помощью метода градиентного спуска в pytorch

https://www.kaggle.com/datasets/camnugent/california-housing-prices/code?datasetId=5227&searchQuery=NN

In [170]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [171]:
df = pd.read_csv('housing.csv')

In [149]:
# df = df.dropna().drop('ocean_proximity', axis=1)

In [172]:
df = df.dropna()

In [173]:
df['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [174]:
df['ocean_proximity'] = df['ocean_proximity'].map({'NEAR BAY':1, '<1H OCEAN':2, 'INLAND':3, 'NEAR OCEAN':4, 'ISLAND':5})

In [175]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  int64  
dtypes: float64(9), int64(1)
memory usage: 1.7 MB


In [176]:
features = df.drop("median_house_value", axis=1)
labels = df["median_house_value"]

In [177]:
scaler = StandardScaler()
features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

In [178]:
features.values

array([[-1.32731375,  1.05171726,  0.98216331, ..., -0.97683327,
         2.34516291, -1.71481644],
       [-1.32232256,  1.04235526, -0.60621017, ...,  1.67037262,
         2.33263161, -1.71481644],
       [-1.33230494,  1.03767426,  1.85576873, ..., -0.84342665,
         1.78293943, -1.71481644],
       ...,
       [-0.82320322,  1.77727236, -0.92388486, ..., -0.17377773,
        -1.14317103,  0.62656975],
       [-0.87311515,  1.77727236, -0.84446619, ..., -0.39350628,
        -1.05513604,  0.62656975],
       [-0.83318561,  1.74918635, -1.00330353, ...,  0.07995643,
        -0.78060586,  0.62656975]])

In [179]:
X_np = features.to_numpy()
y_np = labels.to_numpy()

In [180]:
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np/1000, test_size=0.2, random_state=42)

In [181]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16346, 9), (4087, 9), (16346,), (4087,))

In [182]:
y_train.shape = (y_train.shape[0], 1)
y_test.shape = (y_test.shape[0], 1)

In [183]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16346, 9), (4087, 9), (16346, 1), (4087, 1))

In [184]:
class LinearRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        return out

X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float()

input_size = X_train.shape[1]
output_size = 1
learning_rate = 0.001
num_epochs = 100

model = LinearRegression(input_size, output_size)

criterion = nn.MSELoss()

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


for epoch in range(num_epochs):

    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Эпоха [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

print('Параметры модели:')
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f'{name}: {param.data.numpy()}')

Эпоха [10/100], Loss: 54205.2500
Эпоха [20/100], Loss: 52300.3828
Эпоха [30/100], Loss: 50471.4648
Эпоха [40/100], Loss: 48715.4258
Эпоха [50/100], Loss: 47029.2734
Эпоха [60/100], Loss: 45410.1953
Эпоха [70/100], Loss: 43855.4648
Эпоха [80/100], Loss: 42362.5000
Эпоха [90/100], Loss: 40928.7891
Эпоха [100/100], Loss: 39551.9766
Параметры модели:
linear.weight: [[-1.246897   -2.7881444   2.5287776   2.4479282   0.9156868  -0.79199004
   0.7644151  14.238003   -4.221607  ]]
linear.bias: [37.417034]


In [185]:
with torch.no_grad():
    predicted = model(X_test).detach().numpy()

In [186]:
def calculate_mse(predicted, actual):

    assert len(predicted) == len(actual)

    squared_errors = np.square(predicted - actual)
    sum_squared_errors = np.sum(squared_errors)
    mse = sum_squared_errors / len(predicted)

    return mse

In [187]:
mse = calculate_mse(predicted, y_test.numpy())
print("TEST MSE:", mse)

TEST MSE: 40159.56936628334
