# Parameter
  
如何用DataParallel来使用多 GPU。  
通过PyTorch使用多个GPU非常简单。你可以将模型放在一个 GPU：  
```
device = torch.device("cuda:0")  
model.to(device)  
```
然后，你可以复制所有的张量到 GPU：  
```
mytensor = my_tensor.to(device)
```
请注意，调用 my_tensor.to(device) 只会返回 my_tensor 在 GPU 上的一份新副本，而不会改写 my_tensor 本身。你需要把返回值赋给一个新的张量，并在 GPU 上使用这个新张量。  
在多个 GPU 上执行前向、反向传播是非常自然的。尽管如此，PyTorch 默认只会使用一个 GPU。通过使用 DataParallel 让模型并行运行，你可以很容易地在多个 GPU 上执行这些操作。  
```
model = nn.DataParallel(model)
```

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# Parameters for the demo
input_size, output_size = 5, 2  # passed to Model(...) below as the linear layer's in/out widths
batch_size = 30                 # batch size handed to the DataLoader
data_size = 100                 # number of random samples in the dataset

In [3]:
# Use the first CUDA device when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda:0


In [4]:
# Data generation
# Build a toy dataset of random vectors. Item access goes through __getitem__;
# __len__ reports how many samples there are.
class RandomDataset(Dataset):
    """Dataset of ``length`` random vectors, each with ``size`` elements."""

    def __init__(self, size, length):
        """Draw the whole ``(length, size)`` table from ``torch.randn`` up front."""
        self.data = torch.randn(length, size)
        self.len = length

    def __len__(self):
        """Return the number of samples given at construction time."""
        return self.len

    def __getitem__(self, index):
        """Return the stored row(s) selected by ``index``."""
        return self.data[index]

In [12]:
# Unpack every sample of a freshly generated dataset into one print call.
print(*RandomDataset(input_size, data_size))

tensor([-0.7808, -0.2692,  0.2331,  1.4260,  0.1552]) tensor([-0.0328, -0.2052,  0.1001,  0.1216,  0.5792]) tensor([ 0.1884, -0.6037, -0.1976,  0.3486, -0.5048]) tensor([-0.9775,  0.4831, -1.2960, -1.4174, -0.9343]) tensor([ 1.3104, -0.1092,  0.4479, -1.1926,  0.0422]) tensor([-0.6270, -0.8805, -1.3384, -0.6248, -0.5144]) tensor([-0.1781, -0.2820,  1.3492, -0.1941,  0.4501]) tensor([-0.7169,  0.6996,  0.3792,  0.3767, -0.0820]) tensor([ 1.8997, -0.7821, -0.8401,  0.4070,  1.4371]) tensor([ 1.3957, -0.6585,  3.1682,  0.9110, -1.1238]) tensor([-1.8214,  0.7353, -1.0977, -0.3508,  0.4149]) tensor([-0.9801, -0.0915, -0.1385, -0.6687, -0.6169]) tensor([-0.0890,  0.0896,  0.5461,  0.1845,  1.9233]) tensor([-1.0313, -0.2603,  0.7342,  1.5796, -2.2195]) tensor([-0.5531,  1.0040, -1.0429,  0.6244, -0.0396]) tensor([ 0.3229,  0.4049, -2.2026,  0.3023, -0.4129]) tensor([ 1.3366,  1.4404, -1.5362, -1.0688, -0.9580]) tensor([ 0.6549,  1.1600, -1.2970, -0.2273,  2.8085]) tensor([ 0.2425, -1.5962, -0

In [5]:
# Serve a fresh random dataset in shuffled batches of `batch_size` samples.
rand_loader = DataLoader(
    RandomDataset(input_size, data_size),
    batch_size=batch_size,
    shuffle=True,
)

In [6]:
# Simple model
# The model takes an input, applies a single linear operation, and returns the output.
# A print statement inside forward() reports the sizes of the input and output tensors.
# Pay attention to what is printed at batch rank 0.
class Model(nn.Module):
    """Single linear layer that prints tensor sizes on every forward pass."""

    def __init__(self, input_size, output_size):
        """Create the ``input_size`` -> ``output_size`` linear layer ``fc``."""
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        """Apply ``fc`` to ``input``, print both tensor sizes, and return the result."""
        result = self.fc(input)
        print("\tIn Model: input size", input.size(), "output size", result.size())
        return result

In [7]:
# Number of CUDA devices PyTorch reports; used below to decide whether to wrap the model.
cnt = torch.cuda.device_count()
print(cnt)

1


In [8]:
# Build the model; if more than one CUDA device was counted, wrap it in DataParallel.
model = Model(input_size, output_size)
if cnt > 1:
    print("Let's use", cnt, "GPU")
    model = nn.DataParallel(model) # key step ⭐: wrap the model for multi-GPU data parallelism
# Move the (possibly wrapped) model to the selected device; as the last expression of the
# cell, its value is what the notebook displays.
model.to(device)

Model(
  (fc): Linear(in_features=5, out_features=2, bias=True)
)

In [9]:
# Send each batch to the target device, run the model on it, and print the tensor
# sizes as seen from outside the model.
for data in rand_loader:
    input = data.to(device)
    output = model(input)
    print("Outside: input size", input.size(), "output_size", output.size())

	In Model: input size torch.Size([30, 5]) output size torch.Size([30, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
	In Model: input size torch.Size([30, 5]) output size torch.Size([30, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
	In Model: input size torch.Size([30, 5]) output size torch.Size([30, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
	In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
Outside: input size torch.Size([10, 5]) output_size torch.Size([10, 2])


DataParallel 会自动拆分数据，并把各部分的计算任务分发到多个 GPU 上的模型副本。当每个模型副本都完成自己的任务之后，DataParallel 会收集并合并这些结果，然后返回给你。