Visit this link to view different types of weight initialization method:
https://pytorch.org/docs/stable/nn.init.html

In [2]:
import torch
import torch.nn as nn

In [3]:
#initially create a linear layer and access it's weights
layer = nn.Linear(5,5)
layer

Linear(in_features=5, out_features=5, bias=True)

In [4]:
#to access the weights of the layers
layer.weight

Parameter containing:
tensor([[-0.0074, -0.4313,  0.0271, -0.2435,  0.3508],
        [ 0.3326,  0.2012,  0.0841,  0.3262,  0.1556],
        [ 0.0854, -0.3794, -0.2710, -0.1914, -0.3862],
        [-0.1178,  0.1093, -0.0612,  0.2609,  0.1188],
        [-0.2237, -0.3670, -0.4126,  0.1543,  0.0017]], requires_grad=True)

In [5]:
#if we just want to extract the weight data
layer.weight.data

tensor([[-0.0074, -0.4313,  0.0271, -0.2435,  0.3508],
        [ 0.3326,  0.2012,  0.0841,  0.3262,  0.1556],
        [ 0.0854, -0.3794, -0.2710, -0.1914, -0.3862],
        [-0.1178,  0.1093, -0.0612,  0.2609,  0.1188],
        [-0.2237, -0.3670, -0.4126,  0.1543,  0.0017]])

# Uniform distribution

In [7]:
#now we will use uniform weight initialization 
#nn.init.uniform_(tensor, a=min, b=max)
#see uniform is an inplace operation, we can modify already created tensor of weight.
nn.init.uniform_(layer.weight, a=0, b=3)

Parameter containing:
tensor([[0.9865, 1.3096, 2.0779, 1.8543, 2.7495],
        [2.7076, 1.8966, 0.1383, 1.3601, 0.1197],
        [1.0159, 2.6194, 2.3701, 2.5046, 1.1396],
        [0.6206, 2.3125, 1.7597, 1.3293, 1.6841],
        [2.2100, 1.3446, 1.9831, 0.6279, 0.6509]], requires_grad=True)

# Normal distribution

In [8]:
nn.init.normal_(layer.weight, mean=0, std=1)
#we can change the mean and standard deviation according to our application

Parameter containing:
tensor([[-0.0147,  0.2773, -0.2340,  0.7025, -0.1415],
        [-0.0642, -0.1373, -0.9278,  0.0841,  0.2325],
        [-1.5281,  1.9706,  0.7277,  0.5208,  2.2696],
        [ 1.3558, -0.0715, -0.3536, -1.1408,  0.4176],
        [-0.7229, -0.4758,  0.0793,  0.0946, -1.0895]], requires_grad=True)

# Constant value
 we studied that, if we set all of our weights to same value, it leads to a serious problem. But when we set our biases, in that cases we can use  this constant value weight initialization technique.

In [9]:
nn.init.constant_(layer.bias, 0)

Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)

In [12]:
#alternative way of setting bias to zeros
nn.init.zeros_(layer.bias)

Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)

# Xavier initialization

In [13]:
nn.init.xavier_uniform_(layer.weight, gain=1)

Parameter containing:
tensor([[ 0.4952,  0.6450,  0.1572,  0.1950, -0.2333],
        [ 0.2018,  0.6011, -0.6068, -0.4076,  0.7107],
        [ 0.1462, -0.2949,  0.5533,  0.5500,  0.4321],
        [-0.4588,  0.1976, -0.5303, -0.3195,  0.7570],
        [ 0.3256, -0.3610,  0.7669, -0.0242,  0.2205]], requires_grad=True)

In [14]:
nn.init.xavier_normal(layer.weight, gain=1)

  nn.init.xavier_normal(layer.weight, gain=1)


Parameter containing:
tensor([[-0.3224, -0.2205,  0.4279,  0.2927,  0.4726],
        [ 0.7371,  0.5861, -0.2509,  0.4303, -0.2600],
        [-0.5753,  0.2633,  0.2019, -0.7946, -0.0590],
        [ 0.1780,  0.2014,  0.2154,  0.1619,  0.3781],
        [-0.3715, -0.6067, -0.2076, -0.4693, -0.0506]], requires_grad=True)

xavier_uniform takes sample from U(−a,a) whereas xavier_normal takes sample from N(0,std^2). To know more differences between them and other weight initialization technique check out the link given in the first cell.