In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.nn.utils import clip_grad_norm_

import tensorflow as tf
import tensorflow_datasets as tfds
import tfDatasets


# PyTorch
The first step is to create the dataset, which is a custom class because the way in which the data is accessed depends on the current configuration. 

**TODO:** a validation split still needs to be implemented.

In [2]:
class weathTrueValuesBasic(Dataset):
    """In-memory dataset backed by a single CSV file.

    The whole CSV is read as float32, its ``id`` column is dropped, and the
    remaining values are stored as one tensor on ``device``. Each item is a
    ``(features, label)`` pair taken from one row of that tensor.

    Args:
        csv_file: File name of the CSV, relative to ``root_dir``.
        root_dir: Directory that contains ``csv_file``.
        device: Device on which the tensor is stored. When ``None`` (default)
            CUDA is used if it is available, otherwise the CPU, so the
            notebook also runs on machines without a GPU.
    """

    # Column positions refer to the table *after* the 'id' column is dropped.
    LABEL_COLUMN = 2
    FEATURE_COLUMNS = [0, 1, 3, 4, 5, 6]

    def __init__(self, csv_file, root_dir, device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        frame = pd.read_csv(os.path.join(root_dir, csv_file), dtype='float32')
        self.trueValues = torch.tensor(frame.drop('id', axis=1).values, device=device)
        self.root_dir = root_dir
        self.device = device

    def __len__(self):
        """Return the number of rows (samples) held in memory."""
        return len(self.trueValues)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``.

        ``features`` holds the six feature columns and ``label`` is the single
        value stored in the label column.
        """
        data = self.trueValues[idx, self.FEATURE_COLUMNS]
        label = self.trueValues[idx, self.LABEL_COLUMN]
        return data, label


Load the dataset

In [3]:
# Build the dataset from the CSV file 'weathTrueValuesBasic.csv' inside the 'data' directory.
data = weathTrueValuesBasic(
    csv_file='weathTrueValuesBasic.csv',
    root_dir='data',
)

Split it into train and test sets. Validation will be implemented at a later stage.

In [4]:
# Split the sample indices (not the data itself): 20% are held out for testing.
# The fixed random_state keeps the split identical across runs.
trainValuesIdx, testValuesIdx = train_test_split(
    range(len(data)),
    test_size=0.2,
    random_state=42,
)

In [5]:
# Index-based views over the single in-memory dataset; no data is copied.
dataTrainSubset = Subset(data, trainValuesIdx)
dataTestSubset = Subset(data, testValuesIdx)

# Training batches are drawn in a new random order every epoch.
trainLoader = DataLoader(dataTrainSubset, batch_size=128, shuffle=True)
# Evaluation only aggregates metrics over the whole split, so the order is
# irrelevant; not shuffling avoids needless work and keeps batches reproducible.
testLoader = DataLoader(dataTestSubset, batch_size=128, shuffle=False)

The dataset and dataloaders are ready. Next, set up the neural network. The first step is to select the device so that the network can run on the GPU.

In [6]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device")

Using cuda device


Set up the neural network class

In [7]:
class neuralNetwork(nn.Module):
    """Fully connected network: two ReLU hidden layers and a linear output layer.

    Args:
        in_features: Number of input values per sample (default 6).
        hidden_features: Width of both hidden layers (default 128).
        out_features: Number of output values per sample (default 1).

    With the default arguments the layer sizes, the ``linear_relu_stack``
    attribute name and the layer positions inside the ``nn.Sequential`` are
    exactly those of the original fixed-size network, so state dicts saved
    from it keep loading unchanged.
    """

    def __init__(self, in_features=6, hidden_features=128, out_features=1):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the raw (un-activated) output."""
        return self.linear_relu_stack(x)

In [8]:
# The model must live on the same device as the batches it is fed; a CPU model
# given CUDA tensors fails with a device-mismatch RuntimeError.
model = neuralNetwork().to(device)

# Warm-start from an earlier run when a checkpoint exists. On a fresh checkout
# there is none yet (it is written by the save cell further down), so training
# simply starts from the random initialisation instead of crashing.
if os.path.exists('model.pth'):
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint saved on a GPU also loads on a CPU-only machine.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load('model.pth', map_location=device))
else:
    print("No checkpoint found at 'model.pth'; starting from randomly initialised weights")

print(model)

neuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)


The model is built; now on to the training.

In [9]:
# Objective: mean squared error between predictions and targets.
loss_fn = nn.MSELoss()

# Training hyperparameters.
epochs = 1            # number of full passes over the training set
batch_size = 128      # NOTE(review): not referenced by the cells visible here; the loaders hard-code 128
learning_rate = 1e-4  # step size handed to the optimizer

# Plain stochastic gradient descent over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

In [10]:
def train_loop(dataloader, model, loss_fn, optimizer, log_every=7800):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches; ``dataloader.dataset``
            must support ``len()`` (used only for the progress read-out).
        model: Module whose parameters are optimised.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
        log_every: Print the current batch loss every ``log_every`` batches
            (default 7800, the original fixed interval). ``0`` or ``None``
            disables the progress output.
    """
    size = len(dataloader.dataset)
    seen = 0  # samples processed so far; exact even when the last batch is short

    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss; targets are reshaped to (N, 1) to match
        # the model output and avoid silent broadcasting inside the loss.
        pred = model(X)
        y = y.view(-1, 1)
        loss = loss_fn(pred, y)

        # Backpropagation. Gradients are cleared *before* backward() so that
        # anything left in .grad (e.g. from an interrupted earlier run of this
        # cell) cannot leak into the first update.
        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 0.5 before stepping.
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        seen += len(X)
        if log_every and batch % log_every == 0:
            loss_value = loss.item()
            print(f"loss: {loss_value:>7f}  [{seen:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()

    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            y = y.view(-1, 1)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [11]:
# One iteration per epoch: fit on the training loader, then report on the test loader.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(dataloader=trainLoader, model=model, loss_fn=loss_fn, optimizer=optimizer)
    test_loop(dataloader=testLoader, model=model, loss_fn=loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 73316.187500  [  128/10118486]
loss: 50.557724  [998528/10118486]
loss: 46.690651  [1996928/10118486]
loss: 39.586533  [2995328/10118486]
loss: 43.447517  [3993728/10118486]
loss: 38.924362  [4992128/10118486]
loss: 40.351791  [5990528/10118486]
loss: 40.841446  [6988928/10118486]
loss: 40.986622  [7987328/10118486]
loss: 34.269966  [8985728/10118486]
loss: 33.416607  [9984128/10118486]
Test Error: 
 Accuracy: 12431.8%, Avg loss: 36.361928 

Done!


In [18]:
# Persist only the learned parameters (the state dict) to 'model.pth'.
torch.save(obj=model.state_dict(), f='model.pth')

In [21]:
# Rebuild the architecture (on the CPU) and restore the saved weights into it.
newmodel = neuralNetwork()

# map_location='cpu' makes the checkpoint loadable even if it was saved from a
# GPU and no GPU is available now; newmodel itself lives on the CPU.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
newmodel.load_state_dict(torch.load('model.pth', map_location='cpu'))

<All keys matched successfully>

In [20]:
# Display the restored module so its layer structure can be checked by eye.
newmodel

neuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)

# TensorFlow
Unlike PyTorch datasets, which are typically custom Python objects representing the data, TensorFlow datasets are often loaded from serialized files or other data sources (TFRecords, CSV files, NumPy arrays).

In [14]:
# Load the 'tfTrueValueBasic' dataset by name through TensorFlow Datasets (no split is specified).
tfTrueValueBasicDS = tfds.load(name='tfTrueValueBasic')