### Using Multiple GPUs

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd


In [None]:
import os
# Jupyter is launched from $HOME, so switch the working directory to the
# per-user workshop directory under /scratch/vp91 before touching any files.
workshop_dir = os.path.expandvars("/scratch/vp91/$USER/")
os.chdir(workshop_dir)

#### Set Device
Set the default device to the GPU if one is available; otherwise fall back to the CPU.

In [None]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

### Dataloader

In [None]:
# Absolute location of the Pima Indians diabetes CSV inside the workshop tree.
datapath = os.path.expandvars(
    '/scratch/vp91/$USER/intro-to-pytorch/data/pima-indians-diabetes.data.csv'
)

# The CSV is read without a header row, so the column labels are supplied
# here: eight feature columns followed by the 'Outcome' target column.
column_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Define the custom Dataset class
class PimaDataset(Dataset):
    """Pima Indians diabetes records exposed as standardized feature/label tensors.

    The CSV is read without a header row and labelled with the module-level
    ``column_names``. Every column except ``'Outcome'`` is treated as a
    feature. Features are standardized column-wise once, at construction time.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.
    """

    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file, header=None, names=column_names)
        self.features = self.data.drop('Outcome', axis=1).values
        self.labels = self.data['Outcome'].values

        self.features_tensor = torch.tensor(self.features, dtype=torch.float32)
        # Labels are stored as float32 so they can be fed directly to a
        # binary cross-entropy loss, which expects floating-point targets.
        self.labels_tensor = torch.tensor(self.labels, dtype=torch.float32)

        # Column-wise statistics used for standardization.
        self.mean = self.features_tensor.mean(dim=0)
        raw_std = self.features_tensor.std(dim=0)
        # A constant column has std 0 and a single-row file has std NaN; either
        # would turn the features into NaN/inf. Use a divisor of 1 for such
        # columns so they are only centred. The comparison is False for NaN.
        self.std = torch.where(raw_std > 0, raw_std, torch.ones_like(raw_std))
        self.features_tensor = (self.features_tensor - self.mean) / self.std

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(standardized_features, label)`` pair at ``idx``."""
        feature = self.features_tensor[idx]
        label = self.labels_tensor[idx]
        return feature, label

In [None]:
# Read the CSV at `datapath`; PimaDataset standardizes the features on construction.
dataset = PimaDataset(datapath)

In [None]:
# Serve the dataset in mini-batches of `batch_size` samples, shuffled (shuffle=True).
batch_size = 32
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

### Defining the Model

In [None]:
class PimaClassifier(nn.Module):
    """Feed-forward binary classifier: two ReLU hidden layers and a sigmoid output.

    The defaults reproduce the 8 -> 12 -> 8 -> 1 network used in this notebook.

    Args:
        n_features: Number of input features per sample.
        n_hidden1: Width of the first hidden layer.
        n_hidden2: Width of the second hidden layer.
    """

    def __init__(self, n_features=8, n_hidden1=12, n_hidden2=8):
        super().__init__()
        self.hidden1 = nn.Linear(n_features, n_hidden1)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(n_hidden1, n_hidden2)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(n_hidden2, 1)
        # The sigmoid squashes the single output into (0, 1), so the model
        # returns probabilities rather than raw logits.
        self.act_output = nn.Sigmoid()

    def forward(self, x):
        """Map a ``[batch, n_features]`` tensor to ``[batch, 1]`` probabilities."""
        hidden = self.act1(self.hidden1(x))
        hidden = self.act2(self.hidden2(hidden))
        return self.act_output(self.output(hidden))

In [None]:
# Instantiate the classifier and print its module structure.
class_model = PimaClassifier()
print(class_model)

#### Data Parallelism
PyTorch uses only one GPU by default. You can run your operations on multiple GPUs by wrapping your model in `nn.DataParallel`, which runs it in parallel across the available GPUs.

First move the model to the primary device using `model.to(device)`. Then check how many GPUs are available and, if there is more than one, wrap the model with `nn.DataParallel`.

In [None]:
# Number of GPUs PyTorch can see in this session.
print(torch.cuda.device_count())

In [None]:
# Move the model's parameters to the primary device first; the following
# cells inspect that device and optionally wrap the model in nn.DataParallel.
class_model = class_model.to(device)

In [None]:
# Device of the model's first parameter tensor.
print("Model main device:", next(class_model.parameters()).device)

In [None]:
# Wrap in DataParallel if multiple GPUs are available. The isinstance check
# keeps this cell idempotent: re-running it must not nest one DataParallel
# wrapper inside another.
n_gpus = torch.cuda.device_count()
if n_gpus > 1 and not isinstance(class_model, nn.DataParallel):
    class_model = nn.DataParallel(class_model)
    gpu_names = ', '.join(torch.cuda.get_device_name(i) for i in range(n_gpus))
    print(f"Using {n_gpus} GPUs: {gpu_names}")


In [None]:
# Binary cross-entropy on probabilities (the model already ends in a sigmoid).
# Mean reduction is spelled out because the training loop multiplies each
# batch loss by the batch size to accumulate a per-sample total.
loss_fn = nn.BCELoss(reduction='mean')

In [None]:
# Adam over the model's parameters with learning rate 0.001. When the model has
# been wrapped in nn.DataParallel, these are the wrapper's parameters().
optimizer = optim.Adam(class_model.parameters(), lr=0.001)

#### Training the Model

DataParallel automatically splits each input batch and sends the chunks to replicas of the model on the available GPUs. After each replica finishes its work, DataParallel collects and merges the results before returning them to you.

In [None]:
%%time
n_epochs = 100

# Put the model in training mode explicitly so this cell behaves the same on a
# re-run, even if another cell switched the model to eval mode in between.
class_model.train()

for epoch in range(n_epochs):
    running_loss = 0.0
    for batch_features, batch_labels in data_loader:
        # Move the batch to `device` (the GPU if available, otherwise the CPU),
        # which is also where the model was placed.
        batch_features = batch_features.to(device)
        # BCELoss needs targets with the same [B, 1] shape as the model output,
        # and of floating-point type.
        batch_labels = batch_labels.to(device).unsqueeze(1).float()

        # Forward pass: the model ends in a sigmoid, so `outputs` are probabilities.
        outputs = class_model(batch_features)

        # Compute loss
        loss = loss_fn(outputs, batch_labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is a per-batch mean; weight it by the batch size so that
        # dividing by len(dataset) below gives the per-sample epoch average.
        running_loss += loss.item() * batch_features.size(0)

    epoch_loss = running_loss / len(dataset)
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss:.4f}')

In [None]:
# Report where the parameters live and, for a DataParallel wrapper,
# which device ids it is configured with.
param_device = next(class_model.parameters()).device
print(f"Model is on device: {param_device}")
if isinstance(class_model, nn.DataParallel):
    print(f"DataParallel devices: {class_model.device_ids}")

### Exercise

1. **What is the difference in training time?** Compare it with the previous training run (change the number of epochs there to 100 so the two runs are comparable).