<a href="https://colab.research.google.com/github/Greg8128/4.10-Kaggle-house-prices/blob/main/Pytorch_Lightning_4_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import importlib.util
import os
import subprocess
import sys

# Map each pip distribution name to the top-level module it provides, so that
# only packages that are actually missing get installed. `statistics` is part
# of the Python standard library, so it is not installed from PyPI.
_REQUIRED_PACKAGES = {
    'd2l': 'd2l',
    'torch': 'torch',
    'pytorch-lightning': 'pytorch_lightning',
}

_failed_installs = []
for lib, module_name in _REQUIRED_PACKAGES.items():
    # find_spec reports whether the module is importable; checking
    # sys.modules would only tell us whether it has already been imported.
    if importlib.util.find_spec(module_name) is not None:
        continue
    print("Now installing: " + lib)
    # Run pip through the current interpreter so the package is installed
    # into the environment this notebook is running in, and pass the
    # arguments as a list so no shell is involved.
    result = subprocess.run([sys.executable, '-m', 'pip', 'install', lib])
    if result.returncode != 0:
        _failed_installs.append(lib)

# Make freshly installed packages visible to the import system.
importlib.invalidate_caches()

if _failed_installs:
    print("Failed to install: " + ", ".join(_failed_installs))
else:
    print("All libraries installed!")


import hashlib                                                                  
import os
import tarfile
import zipfile
import requests
import statistics
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from torch import nn
import torch.utils.data as _data
from d2l import torch as d2l
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import pytorch_lightning as pl

# Download
def download(name, cache_dir=os.path.join('..', 'data')):
    """Download a file registered in DATA_HUB and return its local path.

    If a file with the expected name already exists in `cache_dir` and its
    SHA-1 digest matches the one registered in DATA_HUB, the cached copy is
    returned without touching the network.

    Args:
        name: Key into DATA_HUB; the entry is a `(url, sha1_hash)` pair.
        cache_dir: Directory the file is stored in; created if missing.

    Returns:
        Path of the local file (`cache_dir` joined with the last URL segment).

    Raises:
        AssertionError: If `name` is not a key of DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            # Hash in 1 MiB chunks so large files are not read into memory
            # at once.
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # Hit cache
    print(f'Downloading {fname} from {url}...')
    with requests.get(url, stream=True, verify=True) as r:
        # Fail before opening the target file: otherwise an HTTP error page
        # would be written to the cache as if it were the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Stream the body to disk instead of buffering it via r.content.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname

def download_extract(name, folder=None):
    """Download an archive registered in DATA_HUB and extract it in place.

    The archive is fetched with `download(name)` and unpacked into the
    directory that contains it.

    Args:
        name: Key into DATA_HUB identifying the archive.
        folder: Optional sub-directory name; when given, the returned path is
            that folder inside the extraction directory.

    Returns:
        `base_dir/folder` if `folder` is truthy, otherwise the archive path
        with its final extension removed.

    Raises:
        AssertionError: If the file extension is not `.zip`, `.tar` or `.gz`.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        open_archive = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        open_archive = tarfile.open
    else:
        # Raised explicitly (rather than `assert False`) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        raise AssertionError('Only zip/tar files can be extracted.')
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this with archives from a trusted source.
    with open_archive(fname, 'r') as fp:
        # The context manager closes the archive handle after extraction.
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():
    """Download every file registered in DATA_HUB.

    For each entry, prints its name, its DATA_HUB value, and the local path
    returned by `download`.
    """
    for dataset_name, hub_entry in DATA_HUB.items():
        print(dataset_name)
        print(hub_entry)
        local_path = download(dataset_name)
        print(local_path)

# Base URL that every dataset file name below is appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

# Registry of downloadable files: name -> (URL, expected SHA-1 hex digest).
DATA_HUB = {
    'kaggle_house_train': (
        f'{DATA_URL}kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce',
    ),
    'kaggle_house_test': (
        f'{DATA_URL}kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90',
    ),
}

# Get training data and test data
train_path = download('kaggle_house_train')
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(download('kaggle_house_test'))
print(train_path)

print(train_data.shape)
print(test_data.shape)

# Combine the feature columns of both frames so they are preprocessed
# together: the first column is dropped from each frame (presumably the row
# id -- confirm against the CSV), and the last column of the training frame
# (SalePrice, the label) is dropped as well.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Standardize the numeric columns exactly once, *before* one-hot encoding.
# Running this step again after `get_dummies` would treat the indicator
# columns as numeric and rescale them too.
# If test data were inaccessible, mean and standard deviation could be
# calculated from training data
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# After standardizing the data all means vanish, hence we can set missing
# values to 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# `dummy_na=True` considers "na" (missing value) as a valid feature value, and
# creates an indicator feature for it. `dtype=float` keeps every column
# floating point, so `.values` below is a homogeneous float array regardless
# of the default dummy dtype of the installed pandas version.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
print(all_features.shape)

n_train = train_data.shape[0]

# Number of input features after encoding; read by `Model` to size its first
# linear layer.
IN_FEATURES = all_features.shape[1]
print(IN_FEATURES)

# The first `n_train` rows of the combined frame are the training rows, the
# remainder are the test rows (same order as in the concat above).
train_features = torch.tensor(all_features[:n_train].values,
                              dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values,
                             dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1),
                            dtype=torch.float32)


class DataModule(pl.LightningDataModule):
    """Data module serving the house-price training CSV as tensor datasets.

    The labelled rows read from `data_dir` are preprocessed together with the
    unlabelled test rows (so both share the same standardization statistics
    and one-hot columns) and then split into train / validation / test
    subsets.

    Args:
        data_dir: Path of the training CSV. Its first column is dropped and
            its last column (`SalePrice`) is used as the label.
        batch_size: Batch size used by every dataloader.
        test_dir: Optional path of the unlabelled test CSV. When omitted, the
            module-level `test_data` frame is used instead.
    """

    def __init__(self, data_dir="path/to/dir", batch_size=32, test_dir=None):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.test_dir = test_dir

    def setup(self, stage=None):
        """Load and preprocess the data, then create the three subsets."""
        self.data = pd.read_csv(self.data_dir)
        if self.test_dir is not None:
            unlabeled = pd.read_csv(self.test_dir)
        else:
            # Fall back to the frame loaded at module level.
            unlabeled = test_data
        n_labeled = self.data.shape[0]

        # Training features (first and last column dropped) stacked on top of
        # the test features (first column dropped).
        all_features = pd.concat(
            (self.data.iloc[:, 1:-1], unlabeled.iloc[:, 1:]))

        # Standardize numeric columns once, before one-hot encoding, so the
        # indicator columns created below are left as 0/1.
        numeric_features = all_features.dtypes[
            all_features.dtypes != 'object'].index
        all_features[numeric_features] = all_features[numeric_features].apply(
            lambda x: (x - x.mean()) / (x.std()))
        # After standardizing, the mean is 0, so missing values become 0.
        all_features[numeric_features] = (
            all_features[numeric_features].fillna(0))

        # `dummy_na=True` adds an indicator column for missing values;
        # `dtype=float` keeps the frame convertible to a float tensor.
        all_features = pd.get_dummies(
            all_features, dummy_na=True, dtype=float)

        # Only the first `n_labeled` rows have labels.
        features = torch.tensor(all_features[:n_labeled].values,
                                dtype=torch.float32)
        labels = torch.tensor(self.data.SalePrice.values.reshape(-1, 1),
                              dtype=torch.float32)
        dataset = _data.TensorDataset(features, labels)

        # 80 / 10 / 10 split; the training subset absorbs the rounding.
        n_val = n_labeled // 10
        n_test = n_labeled // 10
        n_fit = n_labeled - n_val - n_test
        # A fixed seed makes the split identical every time setup() runs, so
        # rows held out for validation/testing do not end up in the training
        # subset if setup() is called again for another stage.
        self.train_set, self.val_set, self.test_set = _data.random_split(
            dataset, [n_fit, n_val, n_test],
            generator=torch.Generator().manual_seed(0))

    def train_dataloader(self):
        """Return a shuffled loader over the training subset."""
        return _data.DataLoader(self.train_set, batch_size=self.batch_size,
                                shuffle=True)

    def val_dataloader(self):
        """Return a loader over the validation subset."""
        return _data.DataLoader(self.val_set, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return a loader over the held-out test subset."""
        return _data.DataLoader(self.test_set, batch_size=self.batch_size)

    def teardown(self, stage=None):
        """Clean-up hook for when the run is finished; nothing to release."""
        # Used to clean-up when the run is finished


# PyTorch Lightning model
class Model(pl.LightningModule):
    """Two-layer MLP regressor trained with mean squared error.

    Args:
        in_features: Number of input features. When omitted, the module-level
            `IN_FEATURES` value is used.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            in_features = IN_FEATURES
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Return the network's prediction for a batch of feature rows."""
        return self.net(x)

    def _shared_step(self, batch, log_name):
        """Compute and log the MSE loss for one `(features, labels)` batch."""
        X, y = batch
        # Run the network on the features and compare its prediction with the
        # labels; `nn.Flatten` inside `self.net` already flattens the input.
        y_hat = self(X)
        loss = nn.functional.mse_loss(y_hat, y)
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch (logged as 'train_loss')."""
        return self._shared_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for one batch (logged as 'val_loss')."""
        return self._shared_step(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        """Return the test loss for one batch (logged as 'test_loss')."""
        return self._shared_step(batch, 'test_loss')

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

# Resolve the local path of the training CSV, then wire the data module and
# the model into a default-configured trainer.
train_csv_path = download('kaggle_house_train')
dm = DataModule(train_csv_path)
model = Model()
trainer = pl.Trainer()

# Fit first, then run the test loop using the same data module.
trainer.fit(model, dm)
trainer.test(datamodule=dm)

All libraries installed!
../data/kaggle_house_pred_train.csv
(1460, 81)
(1459, 80)
(2919, 331)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


(2919, 331)
331
      MSSubClass MSZoning  LotFrontage  ...  SaleType SaleCondition SalePrice
0             60       RL         65.0  ...        WD        Normal       NaN
1             20       RL         80.0  ...        WD        Normal       NaN
2             60       RL         68.0  ...        WD        Normal       NaN
3             70       RL         60.0  ...        WD       Abnorml       NaN
4             60       RL         84.0  ...        WD        Normal       NaN
...          ...      ...          ...  ...       ...           ...       ...
1455          60       RL         62.0  ...        WD        Normal  175000.0
1456          20       RL         85.0  ...        WD        Normal  210000.0
1457          70       RL         66.0  ...        WD        Normal  266500.0
1458          20       RL         68.0  ...        WD        Normal  142125.0
1459          20       RL         75.0  ...        WD        Normal  147500.0

[2920 rows x 80 columns]


  rank_zero_warn(f"you passed in a {loader_name} but have no {step_name}. Skipping {stage} loop")


(1460, 81)
(1459, 80)
(2920, 332)



  | Name | Type       | Params
------------------------------------
0 | net  | Sequential | 21.3 K
------------------------------------
21.3 K    Trainable params
0         Non-trainable params
21.3 K    Total params
0.085     Total estimated model params size (MB)


(2920, 332)
332


  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

tensor([[ 7.3362e-02, -2.2933e-01, -2.0711e-01,  ...,  4.6757e-01,
         -3.0594e-01,  0.0000e+00],
        [-8.7241e-01,  4.5186e-01, -9.1871e-02,  ...,  4.6757e-01,
         -3.0594e-01,  0.0000e+00],
        [ 7.3362e-02, -9.3094e-02,  7.3467e-02,  ...,  4.6757e-01,
         -3.0594e-01,  0.0000e+00],
        ...,
        [-6.3597e-01, -4.5640e-01, -4.2014e-01,  ...,  4.6757e-01,
         -3.0594e-01,  0.0000e+00],
        [ 3.0981e-01, -9.1052e-01, -2.0210e-01,  ...,  4.6757e-01,
         -3.0594e-01,  0.0000e+00],
        [-8.7241e-01,  2.2976e-16, -1.9769e-01,  ...,  4.6757e-01,
         -3.0594e-01,  0.0000e+00]])


ValueError: ignored