In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
from torch.utils.data import TensorDataset, DataLoader

In [3]:
df = pd.read_csv('housing.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
df.dropna(inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


In [8]:
df = pd.get_dummies(df, columns=['ocean_proximity'])

In [9]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
scaler = MinMaxScaler()

In [12]:
names = df.columns
d = scaler.fit_transform(df)

scaled_df = pd.DataFrame(d, columns=names)
scaled_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.211155,0.567481,0.784314,0.022331,0.019863,0.008941,0.020556,0.539668,0.902266,0.0,0.0,0.0,1.0,0.0
1,0.212151,0.565356,0.392157,0.180503,0.171477,0.06721,0.186976,0.538027,0.708247,0.0,0.0,0.0,1.0,0.0
2,0.210159,0.564293,1.0,0.03726,0.02933,0.013818,0.028943,0.466028,0.695051,0.0,0.0,0.0,1.0,0.0
3,0.209163,0.564293,1.0,0.032352,0.036313,0.015555,0.035849,0.354699,0.672783,0.0,0.0,0.0,1.0,0.0
4,0.209163,0.564293,1.0,0.04133,0.043296,0.015752,0.042427,0.230776,0.674638,0.0,0.0,0.0,1.0,0.0


In [13]:
value = scaled_df['median_house_value']

In [14]:
value

0        0.902266
1        0.708247
2        0.695051
3        0.672783
4        0.674638
           ...   
20428    0.130105
20429    0.128043
20430    0.159383
20431    0.143713
20432    0.153403
Name: median_house_value, Length: 20433, dtype: float64

In [15]:
scaled_df.drop('median_house_value', axis=1, inplace=True)

### Linear regression

In [16]:
X_train = torch.tensor(scaled_df.values[::2], dtype=torch.float32)

In [17]:
X_test = torch.tensor(scaled_df.values[1::2], dtype=torch.float32)

In [18]:
y_train = torch.tensor(value.values[::2], dtype=torch.float32)

In [19]:
y_test = torch.tensor(value.values[1::2], dtype=torch.float32)

In [20]:
X_train

tensor([[0.2112, 0.5675, 0.7843,  ..., 0.0000, 1.0000, 0.0000],
        [0.2102, 0.5643, 1.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.2092, 0.5643, 1.0000,  ..., 0.0000, 1.0000, 0.0000],
        ...,
        [0.3247, 0.7375, 0.4706,  ..., 0.0000, 0.0000, 0.0000],
        [0.3118, 0.7322, 0.3137,  ..., 0.0000, 0.0000, 0.0000],
        [0.3098, 0.7258, 0.2941,  ..., 0.0000, 0.0000, 0.0000]])

In [21]:
batch_size = 10
dataset = TensorDataset(X_train, y_train)
# Randomly reading mini-batches
data_iter = DataLoader(dataset, batch_size, shuffle=True)

# Read a batch to see how it works
for X, y in data_iter:
    print(X, y)
    break

tensor([[0.7072, 0.1477, 0.1373, 0.2571, 0.2731, 0.1626, 0.2799, 0.2597, 0.0000,
         1.0000, 0.0000, 0.0000, 0.0000],
        [0.4064, 0.6132, 0.2353, 0.0838, 0.1026, 0.0074, 0.0158, 0.1776, 0.0000,
         1.0000, 0.0000, 0.0000, 0.0000],
        [0.7131, 0.0638, 0.2157, 0.0928, 0.1431, 0.0469, 0.1196, 0.1260, 1.0000,
         0.0000, 0.0000, 0.0000, 0.0000],
        [0.6414, 0.1137, 0.5098, 0.0457, 0.0531, 0.0121, 0.0332, 0.3738, 1.0000,
         0.0000, 0.0000, 0.0000, 0.0000],
        [0.7181, 0.0276, 0.7843, 0.0616, 0.0726, 0.0507, 0.0765, 0.1114, 0.0000,
         0.0000, 0.0000, 0.0000, 1.0000],
        [0.5966, 0.1498, 0.8039, 0.0283, 0.0469, 0.0302, 0.0490, 0.2223, 1.0000,
         0.0000, 0.0000, 0.0000, 0.0000],
        [0.3058, 0.6514, 0.5490, 0.0276, 0.0268, 0.0120, 0.0284, 0.2664, 0.0000,
         1.0000, 0.0000, 0.0000, 0.0000],
        [0.1992, 0.5717, 0.6863, 0.0234, 0.0309, 0.0163, 0.0386, 0.0981, 0.0000,
         0.0000, 0.0000, 1.0000, 0.0000],
        [0.6106,

In [22]:
inputs = X_train.shape[1]
model = torch.nn.Sequential(torch.nn.Linear(inputs, 1))

In [23]:
model

Sequential(
  (0): Linear(in_features=13, out_features=1, bias=True)
)

In [24]:
true_w = torch.randn(X_train.shape[1], dtype=torch.float32)
true_w

tensor([ 1.5682,  2.1345, -1.0208,  0.2714,  0.1695,  0.4788,  0.2524, -1.4712,
         0.4939, -0.0929,  1.3914,  1.8194,  1.7034])

In [25]:
true_b = torch.zeros((1))
true_b

tensor([0.])

In [26]:
model[0].weight.data = true_w.clone().detach().requires_grad_(True).reshape((1, X_train.shape[1]))
model[0].bias.data = torch.tensor([true_b], requires_grad = True)

In [27]:
loss = torch.nn.MSELoss(reduction='mean')

In [28]:
trainer = torch.optim.SGD(model.parameters(), lr=0.001) #stohastic gradient descent

In [29]:
num_epochs = 100
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        trainer.zero_grad()
        l = loss(model(X).reshape(-1), y)
        l.backward()
        trainer.step()
    l = loss(model(X_train).reshape(-1), y_train)
    if epoch % 5 == 0:
        print('epoch %d, loss: %f' % (epoch, l.item()),'|\tw', model[0].weight.data, '|\tb', model[0].bias.data)

epoch 5, loss: 0.187019 |	w tensor([[ 1.4685,  1.6386, -0.9700,  0.1927,  0.0357,  0.4171,  0.1254, -1.2965,
          0.8313,  0.1129,  1.3919,  1.1954,  1.0660]]) |	b tensor([-0.7177])
epoch 10, loss: 0.130913 |	w tensor([[ 1.4370,  1.4911, -0.6761,  0.1665, -0.0294,  0.3838,  0.0656, -1.0077,
          0.8435,  0.2259,  1.3924,  1.0446,  0.9361]]) |	b tensor([-0.8729])
epoch 15, loss: 0.100133 |	w tensor([[ 1.3896,  1.3920, -0.4667,  0.1552, -0.0724,  0.3598,  0.0268, -0.7352,
          0.8177,  0.2675,  1.3926,  0.9705,  0.8866]]) |	b tensor([-0.9803])
epoch 20, loss: 0.080125 |	w tensor([[ 1.3451,  1.3194, -0.3201,  0.1533, -0.0999,  0.3424,  0.0024, -0.4865,
          0.7964,  0.2942,  1.3927,  0.9207,  0.8577]]) |	b tensor([-1.0535])
epoch 25, loss: 0.066184 |	w tensor([[ 1.3013,  1.2606, -0.2196,  0.1563, -0.1178,  0.3292, -0.0130, -0.2656,
          0.7773,  0.3120,  1.3926,  0.8836,  0.8378]]) |	b tensor([-1.1119])
epoch 30, loss: 0.056191 |	w tensor([[ 1.2608,  1.2119, -0.14

In [30]:
w = model[0].weight.data

In [31]:
b = model[0].bias.data

In [32]:
b

tensor([-1.2841])

In [33]:
def model_(x):
    return x @ w.T + b

In [34]:
y_predicted = model_(X_test)

In [35]:
y_predicted.T

tensor([[0.7929, 0.6364, 0.5194,  ..., 0.3327, 0.2664, 0.2089]])

## MSE

In [36]:
mse = (((y_test - y_predicted.T)**2).sum())/len(y_test)
mse

tensor(0.0275)

In [37]:
from sklearn.metrics import mean_squared_error

In [45]:
mean_squared_error(y_test, y_predicted)

0.027469471