# Regression w/ California Houssing

### 1_Import and load dataset

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
import torch
import torch.nn as nn # clas Model(nn.Module)
from torch.utils.data import Dataset, DataLoader
# Dataset -> Column: x, y
# DataLoader -> mini-batch: 
# batch: 한 번에 신경망에 여러 입력 데이터를 묶어 전달하는 과정에서 '여러 입력 데이터의 묶음'을 'batch' 정의.
# mini-batch: 학습용 데이터셋에서 특정 개수로 이뤄진 배치를 랜덤으로 뽑은 것.

In [9]:
from sklearn.datasets import fetch_california_housing

ch = fetch_california_housing()
df = pd.DataFrame(ch.data, columns = ch.feature_names)
df['target'] = ch.target
print(df.shape)
df.head()

(20640, 9)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


### 2_Dataset -> DataLoader

In [10]:
df.iloc[:, :-1].head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [11]:
df.iloc[:, :-1].values

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [13]:
type(df.iloc[:, :-1].values)

numpy.ndarray

In [12]:
df.shape[1], len(df)

(9, 20640)

In [25]:
class MyDataset(Dataset):
    def __init__(self, df = df):
        self.df = df
        self.x = self.df.iloc[:, :-1].values # numpy.ndarray
        self.y = self.df.iloc[:, -1:].values # numpy.ndarray
        
    ## Ovveride 01
    def __len__(self):
        return len(self.df) # self.df.shape[0]
    
    ## Override 02
    def __getitem__(self, index):
        x = self.x[index]
        y = self.y[index]
        # return x, y
        return torch.tensor(x, dtype = torch.float), torch.tensor(y, dtype = torch.float)

In [28]:
sample_ds = MyDataset(df)
sample = next(iter(sample_ds))
sample[0].shape, sample[1].shape, sample_ds

(torch.Size([8]), torch.Size([1]), <__main__.MyDataset at 0x281321940>)

In [36]:
import os
def prepare_loaders(df = df, index = 15640, batch_size = 2*512):
    # 1) Train, Valid Split
    train = df[:index].reset_index(drop = True)
    valid = df[index:].reset_index(drop = True)

    # 2) train, valid --> MyDataset --> train_ds, valid_ds 
    train_ds = MyDataset(df = train)
    valid_ds = MyDataset(df = valid)

    # 3) train_ds, valid_ds --> DataLoader --> train_loader, valid_loader
    train_loader = DataLoader(train_ds, batch_size = batch_size, num_workers = os.cpu_count(), shuffle = True, drop_last = True)
    valid_loader = DataLoader(valid_ds, batch_size = batch_size, num_workers = os.cpu_count(), shuffle = False, drop_last = True)
    print("DataLoader Completed")
    return train_loader, valid_loader

train_loader, valid_loader = prepare_loaders(df = df, index = 15640, batch_size = 2*512)

DataLoader Completed


In [37]:
os.cpu_count()

8

In [42]:
type(train_loader)

torch.utils.data.dataloader.DataLoader

In [43]:
sample = next(iter(train_loader))

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/miniconda3/envs/pytorch_gpu/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/miniconda3/envs/pytorch_gpu/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'MyDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

### 3_Model

In [46]:
device = torch.device("mps:0") if torch.backends.mps.is_available() else torch.device("cpu") # 기억할 것!
#device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
torch.backends.mps.is_available()

True

In [47]:
a = torch.randn(1, 2)
a

tensor([[-0.0668,  0.7401]])

In [49]:
b = a.to(device)
b

  nonzero_finite_vals = torch.masked_select(


tensor([[-0.0668,  0.7401]], device='mps:0')

In [52]:
c = a.cuda() # m2 macbook air엔 cuda가 없음
c

AssertionError: Torch not compiled with CUDA enabled

In [56]:
class Model(nn.Module):
    def __init__(self, input_dim = 8, output_dim = 1):
        super().__init__()
        # frame
        # 1)
        self.fc1 = nn.Linear(8, 6)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(6, 3)
        self.relu2 = nn.ReLU()
        self.last = nn.Linear(3, 1)
        
        # 2)
        # self.seq = nn.Sequential(
        #    nn.Linear(8, 4), nn.ReLU(), nn.Linear(4, 1)
        #)
        
    # 모델(nn.Module) 만들 때, override 01
    def forward(self, x):
        # X's inputshape = |(bs, 8)|
        # 1)
        output = self.last(self.relu2(self(fc2(self.relu1(self.fc1(x)))))
                           
        # 2)
        # output = self.seq(x)
        return output

SyntaxError: invalid syntax (536410290.py, line 25)