# Optim Bottom-Up

## Setup


In [None]:
%load_ext autoreload
%autoreload 2
from importlib import reload
import logging

import torch

import tensorcraft as tc

# Configure tensorcraft logging at INFO level.
# NOTE(review): presumably this is what surfaces the search statistics printed
# by the redistributors in later cells — confirm against tensorcraft.
tc.set_logger_config(level = logging.INFO)

# Communication cost parameters handed to the redistributors as alpha= / beta=.

# Latency term: one microsecond (may need to be larger).
ALPHA = 1.0e-6

# Per-element transfer term: 64 bits (presumably one tensor element — confirm)
# sent over a 200 Gbit/s link, i.e. seconds per element.
_ELEMENT_BITS = 64.0
_LINK_BITS_PER_SECOND = 200.0 * 1e9
BETA = _ELEMENT_BITS / _LINK_BITS_PER_SECOND

In [None]:
# Header labels and minimum cell widths of the LaTeX table emitted by print_path.
columns = ["Step #", "Operation", "Distribution", "Cost[s]", "Memory Usage [MB]"]
columns_width = [8, 20, 40, 8, 8]

# Size of one tensor element in bytes; converts element counts into memory usage.
type_size = 8


def print_path(
    path: list[tuple[str, "tc.dist.MultiAxisDist", float]],
    tensor_shape: torch.Size,
    bytes_per_element: "int | None" = None,
):
    """Print a redistribution path as the rows of a LaTeX table.

    Cells are separated by `` & `` and each row ends with `` \\\\``. The header
    row comes from ``columns``; every cell is left-aligned and padded to the
    matching entry of ``columns_width``.

    Args:
        path: Sequence of ``(operation, distribution, cost)`` steps. Each
            distribution must provide ``latexStr()`` and
            ``maxNumElements(shape)``.
        tensor_shape: Shape passed to ``maxNumElements`` to obtain the element
            count reported in the memory column.
        bytes_per_element: Element size in bytes used for the memory column.
            Defaults to the module-level ``type_size``.
    """
    # Resolve the default at call time so later changes to `type_size` are honoured.
    if bytes_per_element is None:
        bytes_per_element = type_size

    row_end = " \\\\"

    header = " & ".join(f"{col:<{width}}" for col, width in zip(columns, columns_width))
    print(header + row_end)

    for i, (op, s_dist, s_cost) in enumerate(path):
        dist_label = s_dist.latexStr()
        # Memory in MB (10**6 bytes) derived from the step's maxNumElements value.
        mem_mb = s_dist.maxNumElements(tensor_shape) * bytes_per_element / 10**6
        cells = (i, op, dist_label, s_cost, mem_mb)
        row = " & ".join(f"{cell:<{width}}" for cell, width in zip(cells, columns_width))
        print(row + row_end)


In [None]:

def mem_constrained_filter(
    shape: torch.Size,
    start_dist: tc.dist.MultiAxisDist,
    target_dist: tc.dist.MultiAxisDist,
    current_dist: tc.dist.MultiAxisDist,
) -> bool:
    """Tell whether a candidate distribution needs more elements than both endpoints.

    Compares ``maxNumElements(shape)`` of ``current_dist`` with the larger of
    the values reported by ``start_dist`` and ``target_dist``.

    Returns:
        True when the candidate's value is strictly greater than that bound.
        NOTE(review): presumably True makes the redistributor discard the
        candidate — confirm against the ``node_filter`` contract.
    """
    endpoint_bound = max(
        endpoint.maxNumElements(shape) for endpoint in (start_dist, target_dist)
    )
    candidate_elements = current_dist.maxNumElements(shape)
    return candidate_elements > endpoint_bound

## Redistributors

Given a tensor shape, a starting distribution, and a target distribution, a redistributor builds a sequence of collective operations that reaches the target distribution while optimizing for a chosen metric (e.g. communication cost or memory usage).

### Problem 1 (Tiled Matrix to Row Cyclic)

Redistributing a matrix from a tiled distribution to a row-cyclic distribution.

In [None]:
# Problem 1 inputs: a 100000 x 100000 tensor and a 2 x 4 mesh.
tensor_shape = torch.Size((100000, 100000))
mesh = torch.Size((2, 4))

# Starting (tiled) distribution.
# NOTE(review): presumably tensor axis i is split over mesh axis i with block
# size 100 — confirm against MultiAxisDist.
dist = tc.dist.MultiAxisDist(
    mesh,
    ((0,), (1,)),
    100,
)

# Target (row cyclic) distribution.
# NOTE(review): presumably tensor axis 0 is split over mesh axes (0, 1) with
# block size 1 and axis 1 is left undistributed — confirm.
target_dist = tc.dist.MultiAxisDist(
    mesh,
    ((0, 1), None),
    1,
)

#### Naive Gather Split

Simplest redistributor. It just allgathers, then splits. Should be both communication and memory inefficient.

In [None]:
%%time
# Baseline redistributor, configured with the notebook's cost parameters.
naive_rdist = tc.optim.NaiveGathererRedist(
    tc.optim.IdealLowerBoundsCM(),
    alpha=ALPHA,
    beta=BETA,
)
sequence, total_cost = naive_rdist.redistribute(tensor_shape, dist, target_dist)

# Emit the path as LaTeX table rows, then the summed cost.
print_path(sequence, tensor_shape)
print(f"Total cost: {total_cost:.2f}s")


N_procs: 8, N_elements: 10000000000
Step #   & Operation            & Distribution                             & Cost[s]  & Memory Usage [MB] \\
0        &                      & $T_{\perp\{ 0,1 \}(100,100)}$            & 0        & 10000.0  \\
1        & allgather_*          & $T_{\perp\{ \emptyset,\emptyset \}(\emptyset,\emptyset)}$ & 2.800003 & 80000.0  \\
2        & split_*              & $T_{\perp\{ (0,1),\emptyset \}(1,\emptyset)}$ & 0        & 10000.0  \\
Total cost: 2.80s
CPU times: user 576 ms, sys: 31.3 ms, total: 608 ms
Wall time: 535 ms


#### Memory Constrained (Top K = 10)

In [None]:
%%time
# A* redistributor with candidates screened by mem_constrained_filter, top_k=10.
mem_constrained_dist = tc.optim.AStarRedistributor(
    tc.optim.IdealLowerBoundsCM(),
    alpha=ALPHA,
    beta=BETA,
    node_filter=mem_constrained_filter,
    top_k=10,
)
sequence, total_cost = mem_constrained_dist.redistribute(tensor_shape, dist, target_dist)

# Emit the path as LaTeX table rows, then the summed cost.
print_path(sequence, tensor_shape)
print(f"Total cost: {total_cost:.3f}s")

Explored 13 nodes, found 10 possible paths.
Step #   & Operation            & Distribution                             & Cost[s]  & Memory Usage [MB] \\
0        &                      & $T_{\perp\{ 0,1 \}(100,100)}$            & 0        & 10000.0  \\
1        & alltoall_0_1_-1      & $T_{\perp\{ \emptyset,(0,1) \}(\emptyset,100)}$ & 0.40000099999999994 & 10000.0  \\
2        & alltoall_minor_1_0_1 & $T_{\perp\{ 1,0 \}(1,400)}$              & 1.200002 & 10000.0  \\
3        & alltoall_1_0_-1      & $T_{\perp\{ (0,1),\emptyset \}(1,\emptyset)}$ & 0.40000099999999994 & 10000.0  \\
Total cost: 2.000s
CPU times: user 1min, sys: 1.15 s, total: 1min 1s
Wall time: 42.7 s


#### Memory Constrained (Top K = 1)

In [None]:
%%time
# Same memory-screened A* configuration as the Top K = 10 run, but with top_k=1.
mem_constrained_dist = tc.optim.AStarRedistributor(
    tc.optim.IdealLowerBoundsCM(),
    alpha=ALPHA,
    beta=BETA,
    node_filter=mem_constrained_filter,
    top_k=1,
)
sequence, total_cost = mem_constrained_dist.redistribute(tensor_shape, dist, target_dist)

# Emit the path as LaTeX table rows, then the summed cost.
print_path(sequence, tensor_shape)
print(f"Total cost: {total_cost:.3f}s")

Explored 7 nodes, found 1 possible paths.
Step #   & Operation            & Distribution                             & Cost[s]  & Memory Usage [MB] \\
0        &                      & $T_{\perp\{ 0,1 \}(100,100)}$            & 0        & 10000.0  \\
1        & changeBlockSize_0_1  & $T_{\perp\{ 0,1 \}(1,100)}$              & 0.20000099999999998 & 10000.0  \\
2        & alltoall_1_0_-1      & $T_{\perp\{ (1,0),\emptyset \}(1,\emptyset)}$ & 1.200002 & 10000.0  \\
3        & alltoall_minor_0_1_-1 & $T_{\perp\{ 1,0 \}(2,1)}$                & 0.40000099999999994 & 10000.0  \\
4        & alltoall_1_0_-1      & $T_{\perp\{ (0,1),\emptyset \}(2,\emptyset)}$ & 0.40000099999999994 & 10000.0  \\
5        & changeBlockSize_0_1  & $T_{\perp\{ (0,1),\emptyset \}(1,\emptyset)}$ & 0.35000299999999995 & 10000.0  \\
Total cost: 2.550s
CPU times: user 36.1 s, sys: 780 ms, total: 36.9 s
Wall time: 25.4 s


#### A*

In [None]:
%%time
# A* redistributor without a node_filter; the weights, depth limit and top_k
# are passed explicitly.
# NOTE(review): the exact meaning of path_cost_w / estimate_w / max_depth is
# defined by AStarRedistributor — confirm before tuning.
a_star_redist = tc.optim.AStarRedistributor(
    tc.optim.IdealLowerBoundsCM(),
    alpha=ALPHA,
    beta=BETA,
    path_cost_w=10,
    estimate_w=1.0,
    max_depth=5,
    top_k=10,
)
sequence, total_cost = a_star_redist.redistribute(tensor_shape, dist, target_dist)

# Emit the path as LaTeX table rows, then the summed cost.
print_path(sequence, tensor_shape)
print(f"Total cost: {total_cost:.2f}s")

Explored 85 nodes, found 20 possible paths.
Step #   & Operation            & Distribution                             & Cost[s]  & Memory Usage [MB] \\
0        &                      & $T_{\perp\{ 0,1 \}(100,100)}$            & 0        & 10000.0  \\
1        & allgather_1          & $T_{\perp\{ 0,\emptyset \}(100,\emptyset)}$ & 1.200002 & 40000.0  \\
2        & split_minor_0_1_1    & $T_{\perp\{ (0,1),\emptyset \}(25,\emptyset)}$ & 0.0      & 10000.0  \\
3        & changeBlockSize_0_1  & $T_{\perp\{ (0,1),\emptyset \}(1,\emptyset)}$ & 0.35000299999999995 & 10000.0  \\
Total cost: 1.55s
CPU times: user 2min 20s, sys: 3.58 s, total: 2min 23s
Wall time: 1min 49s
