<a href="https://colab.research.google.com/github/Its-Shivanshu-Sharma/HumanPoseDetection/blob/main/HumanPoseDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing Libraries

In [None]:
%%bash
pip3 install -qq torch==1.10.2+cu113 torchvision==0.11.3+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

## Fetching & Preparing the Dataset

In [None]:
# Set the seed for the RNG (Random Number Generator) manually
# (for reproducibility)
torch.manual_seed(10)

### NOTE:
The dataset used in this project is the [`Leeds Sports Pose Dataset`](http://sam.johnson.io/research/lsp.html).

In [None]:
%%bash
# Create a new directory called `pose_dataset` to store the data
mkdir pose_dataset
cd pose_dataset
wget -q http://sam.johnson.io/research/lsp_dataset.zip
unzip -qq lsp_dataset.zip
rm lsp_dataset.zip

In [None]:
from pathlib import Path
import pandas as pd
from scipy.io import loadmat

In [None]:
root = Path("./pose_dataset")

In [None]:
raw_data = loadmat(root/"joints.mat")

In [None]:
raw_data_x = raw_data["joints"][0].T.reshape(-1, 1).squeeze()
raw_data_y = raw_data["joints"][1].T.reshape(-1, 1).squeeze()

In [None]:
raw_data_x.shape

In [None]:
n_joints = 14
index = pd.MultiIndex.from_product((range(len(raw_data_x)//n_joints), range(n_joints)), names=["image_num", "joint"])
cols = pd.Index(["x", "y"], name="coordinates")

In [None]:
data = pd.DataFrame(zip(raw_data_x, raw_data_y), index=index, columns=cols)
data

In [None]:
data = data.unstack(1).swaplevel(i=0, j=1, axis=1).sort_index(axis=1)
data

In [None]:
def train_val_split(data, train_size=None, val_size=None):
    """Randomly split the data into training & validation sets.

    The rows of `data` are shuffled (using `torch.randperm`) and split as per
    the size specified by the parameters `train_size` or `val_size`.
    Indexes of training & validation sets are reset to start from 0. The old
    index is kept as a regular column (named after the index, e.g.
    `image_num`) as it is used to refer to the images.

    Parameters:
        data (pandas.DataFrame): dataframe containing the data(i.e. annotations)
        train_size (float): fraction of the data to be allocated to training set
        val_size (float): fraction of the data to be allocated to validation
                            set. Ignored if `train_size` is given.

    Returns:
        tuple: `(train_set, val_set)` dataframes.

    Raises:
        ValueError: if neither `train_size` nor `val_size` is given.
    """
    data_len = len(data)
    if train_size:
        train_len = int(data_len*train_size)
    elif val_size:
        train_len = data_len - int(data_len*val_size)
    else:
        # Without this guard `train_len` would be unbound below
        raise ValueError(
            "Either `train_size` or `val_size` must be a non-zero fraction."
        )

    # Convert the permutation to a numpy array so that pandas receives plain
    # integer positions, & select by position so that any index works
    rand_idxs = torch.randperm(data_len).numpy()
    train_set = data.iloc[rand_idxs[:train_len]]
    val_set = data.iloc[rand_idxs[train_len:]]

    # Reset indices of `train_set` & `val_set` to start from 0
    train_set = train_set.reset_index().sort_index(axis=1)
    val_set = val_set.reset_index().sort_index(axis=1)

    return train_set, val_set

In [None]:
train_set, val_set = train_val_split(data, train_size=0.8)
train_set.shape, val_set.shape

### Creating `Dataset` & `DataLoader` objects

In [None]:
import os
from torchvision.io import read_image

In [None]:
class PoseDataset(Dataset):
    """Dataset pairing every image with the coordinates of its joints.

    Parameters:
    annotations (pandas.DataFrame) - one row per image; must contain the joint
                        columns (labels 0 to 13) & an `image_num` column.
    img_dir (str or path-like) - directory containing the images.
    img_format (str: default="jpg") - file extension of the images.
    transform (callable) - optional transform applied to the image tensor.
    target_transform (callable) - optional callable invoked as
                        `target_transform(pts, org_size, new_size)`.
    """
    def __init__(self, annotations, img_dir, img_format="jpg", transform=None, target_transform=None):
        self.annotations = annotations
        self.img_dir = img_dir
        self.img_format = img_format
        self.transform = transform
        self.target_transform = target_transform
    
    def __len__(self):
        """Return the number of annotated images."""
        return len(self.annotations)
    
    def __getitem__(self, index):
        """Return the `(image, points)` pair stored at row `index`."""
        # `.loc` slices by label (both ends included): joints 0 to 13
        pts = self.annotations.loc[index, slice(0, 13)]
        img_num = self.annotations.loc[index, "image_num"]
        # `image_num` starts from 0 whereas the file names start from 1
        file_name = f"im{int(img_num)+1:04d}.{self.img_format}"
        img_file = os.path.join(self.img_dir, file_name)
        img = read_image(img_file)

        org_size = img.size()[-2:]
        if self.transform:
            img = self.transform(img)
        # Read the final size from the image itself instead of from
        # `self.transform.size`, so any transform can be used (not only a
        # `Resize` constructed with an explicit pair)
        new_size = img.size()[-2:]

        if self.target_transform:
            # `org_size` & `new_size` are used to transform the coordinates
            # of the joints. Applied even when there is no image transform
            # (the two sizes are then equal).
            pts = self.target_transform(pts, org_size, new_size)

        return img, pts

In [None]:
from torchvision.transforms import Resize

In [None]:
def transform_coords(pts, org_size, new_size, reshape=True):
    """Function to scale the coordinates in proportion to image resizing.
    Parameters:
    pts (pandas object) - Points to be scaled; the "x" & "y" entries are
                        selected using `pts.loc[:, "x"]` & `pts.loc[:, "y"]`.
    org_size (tuple or array-like) - Original size of the image, indexed as
                        (height, width).
    new_size (tuple or array-like) - Final size of the image after resizing,
                        indexed as (height, width).
    reshape (bool) - Boolean denoting whether the returned points should be 
                        a numpy 1d-ndarray or a pandas dataframe.
    Returns:
    numpy 1d-ndarray of the form [x0, y0, x1, y1, ...] if `reshape` is True
    else a pandas dataframe having the scaled x & y values as its columns.
    """
    # x is scaled by the ratio of the widths & y by the ratio of the heights
    new_x = pts.loc[:, "x"]*new_size[1]/org_size[1]
    new_y = pts.loc[:, "y"]*new_size[0]/org_size[0]
    new_pts = pd.concat([new_x, new_y], axis=1)

    if reshape:
        return new_pts.to_numpy().reshape(-1)
    # Previously nothing was returned on this path (i.e. `None`)
    return new_pts

In [None]:
img_size = (3, 150, 125)

In [None]:
train_data = PoseDataset(annotations=train_set,
                      img_dir=root/"images",
                      transform=Resize(img_size[1:]),
                      target_transform=transform_coords)
val_data = PoseDataset(annotations=val_set,
                      img_dir=root/"images",
                      transform=Resize(img_size[1:]),
                      target_transform=transform_coords)

In [None]:
len(train_data), len(val_data)

In [None]:
# Define the batch size
bs = 32

In [None]:
# Only the training batches are shuffled; the order of the validation samples
# does not affect the average validation loss, so it is kept fixed
train_dataloader = DataLoader(train_data, batch_size=bs, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=bs, shuffle=False)

## Visualizing the Data

In [None]:
from google.colab.patches import cv2_imshow
import cv2

In [None]:
joints = [
    "right ankle",
    "right knee",
    "right hip",
    "left hip",
    "left knee",
    "left ankle",
    "right wrist",
    "right elbow",
    "right shoulder",
    "left shoulder",
    "left elbow",
    "left wrist",
    "neck",
    "head top",
]
joints_map = {joint:i for i, joint in enumerate(joints)}

In [None]:
class COLOR:
    """Class containing variables for various colors.
    Color scheme = (B,G,R)

    Every color is a 3-tuple of ints. These constants are used in
    `joints_relations` & end up as the `color` argument of `cv2.line` in
    `annotate_image`.
    """
    # Primary colors
    RED = (0, 0, 255)
    GREEN = (0, 255, 0)
    BLUE = (255, 0, 0)
    # Mixtures of two primary colors
    CYAN = (255, 255, 0)
    YELLOW = (0, 255, 255)
    MAGENTA = (255, 0, 255)
    # Other colors
    LAVENDER = (250, 230, 230)
    ORANGE = (0, 175, 255)

# List containing which joints must be connected using a line
# and with what color
joints_relations = [
    ("right ankle", "right knee", COLOR.CYAN),
    ("left ankle", "left knee", COLOR.CYAN),
    ("right knee", "right hip", COLOR.RED),
    ("left knee", "left hip", COLOR.RED),
    ("right wrist", "right elbow", COLOR.GREEN),
    ("left wrist", "left elbow", COLOR.GREEN),
    ("right elbow", "right shoulder", COLOR.BLUE),
    ("left elbow", "left shoulder", COLOR.BLUE),
    ("neck", "head top", COLOR.MAGENTA),
    ("neck", "hip line", COLOR.YELLOW), # hip line = mean of left & right hip
    ("right hip", "left hip", COLOR.LAVENDER),
    ("right shoulder", "left shoulder", COLOR.ORANGE)
]


In [None]:
def annotate_image(img, pts, mapping, relations):
    """Function to annotate an image with lines drawn between specified points.
    The annotated image is displayed using `cv2_imshow`; nothing is returned &
    the `img` passed by the caller is not modified (a copy is annotated).
    Parameters:
    img (torch.Tensor) - Image in (C,H,W) format. Color-scheme = (R,G,B)
    pts (array-like) - Points to be used for annotating lines; `pts[i]` must
                        give the (x, y) pair of the i-th point.
    mapping (dict) - Dict used to get the index (in `pts`) of the points named
                        in `relations`. Must contain "right hip" & "left hip".
    relations (list) - List containing tuples `(point1, point2, color)`
                        defining which points to join & with what color. A
                        name missing from `mapping` (e.g. "hip line") refers to
                        the mid-point of the right & left hip.
    """
    # Copy the image to the CPU (`.numpy()` fails for tensors on the GPU) &
    # change its format from (C, H, W) to (H, W, C)
    img = img.clone().detach().cpu().permute((1, 2, 0))
    # Change image from RGB to BGR (& change to np ndarray to show img)
    img = torch.stack((img[:, :, 2], img[:, :, 1], img[:, :, 0]), dim=2).numpy()
    # Bring the points to the CPU once instead of synchronizing with the
    # device for every single coordinate
    if torch.is_tensor(pts):
        pts = pts.detach().cpu()
    # Calculate the coordinates for the hip from right & left hip
    rhip, lhip = mapping.get("right hip"), mapping.get("left hip")
    hipline_coords = tuple(map(int, (pts[rhip]+pts[lhip])/2))

    for j1, j2, color in relations:
        pt1 = mapping.get(j1)
        pt2 = mapping.get(j2)
        # Convert coordinates to `int` for annotating the image
        # for "hip line" `pt1` or `pt2` will be equal to `None` so we set it 
        # equal to the above calculated `hipline_coords`
        pt1 = tuple(map(int, pts[pt1])) if pt1 is not None else hipline_coords
        pt2 = tuple(map(int, pts[pt2])) if pt2 is not None else hipline_coords
        # Annotate the image
        cv2.line(img, pt1, pt2, color=color)
    cv2_imshow(img)

## Creating & Training a Model on the Data.

In [None]:
class Model(nn.Module):
    """Class implementing the model to be trained on the data.

    The model consists of a stack of convolution & max-pooling layers followed
    by fully connected layers.

    Parameters:
    img_shape (tuple or array-like) - shape of the input images as (C, H, W).
    output_shape (int) - number of values to be predicted for an image.
    """
    def __init__(self, img_shape, output_shape):
        super().__init__()

        self.img_shape = img_shape
        self.output_shape = output_shape
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=img_shape[0],
                      out_channels=8,
                      kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=1),
            nn.Conv2d(in_channels=8,
                      out_channels=16,
                      kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=1),
            nn.Conv2d(in_channels=16,
                      out_channels=32,
                      kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=1),
            nn.Conv2d(in_channels=32,
                      out_channels=64,
                      kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=1),
            # 1x1 convolution: reduces the no. of channels from 64 to 16
            nn.Conv2d(in_channels=64,
                      out_channels=16,
                      kernel_size=1),
        )

        # Flatten the last 3 dimensions i.e. (C, H, W)
        self.flatten = nn.Flatten(start_dim=-3)

        # Calculate the shape of the output produced by the convolution & 
        # pooling layers
        out_shape = self._calc_output_shape()
        self.fc = nn.Sequential(
            nn.Linear(out_shape[0]*out_shape[1]*out_shape[2], 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_shape),
            # The final ReLU restricts the predictions to non-negative values
            nn.ReLU(),
        )
    
    def forward(self, x):
        """Return the predictions of the model for the image(s) `x`."""
        y = self.conv(x)
        y = self.flatten(y)
        y = self.fc(y)
        return y

    @staticmethod
    def _as_pair(value):
        """Return `value` as it is if it is a tuple else as `(value, value)`."""
        return value if isinstance(value, tuple) else (value, value)

    def _calc_output_shape(self):
        """Calculate the shape of the output of `self.conv` for one image.

        Only the convolution & pooling layers change the shape. The formula
        used rounds down (i.e. `ceil_mode` of the pooling layers is not taken
        into account) & expects the padding to be an int or a tuple of ints.

        Returns:
        list - [channels, height, width] of the output of `self.conv`.
        """
        out_img_shape = list(self.img_shape)
        for module in self.conv.modules():
            if not isinstance(module, (nn.Conv2d, nn.MaxPool2d, nn.AvgPool2d)):
                continue
            k = self._as_pair(module.kernel_size)
            s = self._as_pair(module.stride)
            p = self._as_pair(module.padding)
            # `nn.AvgPool2d` has no `dilation` attribute; use 1 (no dilation)
            d = self._as_pair(getattr(module, "dilation", 1))
            # Update no. of channels (for Conv2d layers)
            if hasattr(module, "out_channels"):
                out_img_shape[0] = module.out_channels
            # Update image height
            out_img_shape[1] = (out_img_shape[1]+2*p[0]-d[0]*(k[0]-1)-1)//s[0] + 1
            # Update image width
            out_img_shape[2] = (out_img_shape[2]+2*p[1]-d[1]*(k[1]-1)-1)//s[1] + 1

        return out_img_shape

In [None]:
def train_loop(train_dataloader, model, loss_fn, optimizer, device="cpu"):
    """Function to perform the training on the data for one epoch
    The average of the losses of all the batches is printed & returned.
    Parameters:
    train_dataloader (DataLoader) - dataloader object for the training data
    model - the model which has to be trained
    loss_fn (function) - loss function to use to calculate gradients
    optimizer - optimizer to use to update the parameters of the model
    device (str: default="cpu") - device to use for training
    Returns:
    float - average training loss per batch (`nan` if there were no batches)
    """
    model.train()
    total_loss = 0.0
    # The batches are counted while iterating so that dataloaders having no
    # length (or no batches at all) do not raise an error
    n_batches = 0
    for x, y in train_dataloader:
        x = x.float().to(device)
        y = y.float().to(device)
        optimizer.zero_grad()
        pred = model(x)
        # Calculate loss & gradients
        loss = loss_fn(pred, y)
        total_loss += loss.item()
        n_batches += 1
        loss.backward()
        # Update parameter values
        optimizer.step()
    avg_loss = total_loss/n_batches if n_batches else float("nan")
    print(f"avg. training loss: {avg_loss}")
    return avg_loss

In [None]:
def val_loop(val_dataloader, model, loss_fn, device="cpu"):
    """Function to calculate the loss of the model for the validation dataset.
    The average of the losses of all the batches is printed & returned. The
    parameters of the model are not updated.
    Parameters:
    val_dataloader (DataLoader) - dataloader object for the validation data
    model - the model to be used for making predictions
    loss_fn (function) - loss function to use to calculate the loss
    device (str: default="cpu") - device to use for inference
    Returns:
    float - average validation loss per batch (`nan` if there were no batches)
    """
    model.eval()
    total_loss = 0.0
    # The batches are counted while iterating so that dataloaders having no
    # length (or no batches at all) do not raise an error
    n_batches = 0
    # Gradient need not be computed for the validation data, hence, computations
    # can be sped up
    with torch.no_grad():
        for x, y in val_dataloader:
            x = x.float().to(device)
            y = y.float().to(device)
            pred = model(x)
            total_loss += loss_fn(pred, y).item()
            n_batches += 1
    avg_loss = total_loss/n_batches if n_batches else float("nan")
    print(f"avg. validation loss: {avg_loss}")
    return avg_loss

##### Use `GPU` for training if available else use `CPU`

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device being used: {device}")

In [None]:
model = Model(img_size, n_joints*2)
model.to(device);

In [None]:
from torch.optim import Adam

In [None]:
n_epochs = 15
lr = 1e-3
optimizer = Adam(model.parameters(), lr)
mse_loss = nn.MSELoss()

In [None]:
# Separator line printed around the log of every epoch
line_len = 50
separator = "_"*line_len
for i in range(n_epochs):
    # Header of the epoch
    print(separator, f"Epoch No. {i}", separator, sep="\n")
    # One pass over the training data followed by one over the validation data
    train_loop(train_dataloader, model, mse_loss, optimizer, device)
    val_loop(val_dataloader, model, mse_loss, device)
    print(separator)

In [None]:
model.eval()
with torch.no_grad():
    img, pts = val_data[4]
    preds = model(img.unsqueeze(0).float().to(device))
    annotate_image(img, preds.reshape(-1, 2), joints_map, joints_relations)