In [2]:
import torch

import numpy as np

# Set the seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Create a 5x5 matrix A
A = torch.randn(5, 5)  # Random values in a 5x5 matrix

# Original code implementation
def original_forward(A):
    v = 0.95
    A_np = A.clone().detach().cpu().numpy()
    row_sums = np.abs(A_np).sum(axis=-1)

    for idx in np.where(row_sums > v)[0]:
        row = A_np[idx, :]
        row_sign = np.sign(row)
        row_abs = np.abs(row)
        sorted_row = np.sort(row_abs)

        s = np.sum(sorted_row) - v
        l = float(len(sorted_row))
        for i in range(len(sorted_row)):
            if s / l > sorted_row[i]:
                s -= sorted_row[i]
                l -= 1
            else:
                break
        alpha = s / l
        A_np[idx, :] = row_sign * np.maximum(row_abs - alpha, 0)

    return torch.tensor(A_np, dtype=A.dtype, device=A.device)

import torch

def fully_vectorized_forward(A, v=0.95):
    # Step 1: Compute the row sums
    row_sums = A.abs().sum(dim=1)

    # Step 2: Mask the rows where sum is greater than v
    mask = row_sums > v
    A_masked = A[mask]

    # Step 3: Sort the absolute values of the selected rows
    row_abs = A_masked.abs()
    sorted_row_abs, sorted_indices = torch.sort(row_abs, dim=1)

    # Step 4: Compute the cumulative sum and find the index to cut
    cumulative_sum = sorted_row_abs.cumsum(dim=1)
    cut_idx = (cumulative_sum > (cumulative_sum[:, -1:] - v)).int().argmax(dim=1)

    # Step 5: Compute alpha and subtract from the original rows
    alpha = (cumulative_sum[torch.arange(cut_idx.size(0)), cut_idx] - v) / (cut_idx.float() + 1)

    # Step 6: Rebuild the rows using the sign and maximum operation
    row_sign = torch.sign(A_masked)
    result_rows = row_sign * torch.maximum(row_abs - alpha.unsqueeze(1), torch.tensor(0.0, device=A.device))

    # Step 7: Reassign the updated rows back to the matrix
    A[mask] = result_rows

    return A

# Set a fixed random seed for reproducibility
torch.manual_seed(42)

# Test the original and optimized versions

A = torch.randn(5, 5)

# Call the fully vectorized forward method
A_vectorized = fully_vectorized_forward(A.clone())
print(A_vectorized)

# Optionally, compare with your original function (if available) to ensure they are the same
A_original = original_forward(A.clone())
print(A_original)

# Check if the results are the same
result = torch.allclose(A_vectorized, A_original)
print("Are the original and optimized results the same?", result)


tensor([[ 0.6971,  0.2575,  0.0000, -0.8757,  0.0000],
        [-0.5603, -0.0000, -0.9305, -0.0779, -0.0124],
        [-0.0000,  0.0000, -0.4501,  0.0000, -1.6561],
        [-0.0000, -0.8902, -0.0000,  0.3090, -0.1272],
        [-0.1076, -0.0000, -0.3441,  0.0000, -1.0732]])
tensor([[ 0.3857,  0.0000,  0.0000, -0.5643,  0.0000],
        [-0.2899, -0.0000, -0.6601, -0.0000, -0.0000],
        [-0.0000,  0.0000, -0.0000,  0.0000, -0.9500],
        [-0.0000, -0.7648, -0.0000,  0.1835, -0.0017],
        [-0.0000, -0.0000, -0.1105,  0.0000, -0.8395]])
Are the original and optimized results the same? False


In [6]:
# Set the seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Create a 5x5 matrix A
A = torch.randn(5, 5) 
print(A)

tensor([[ 1.9269,  1.4873,  0.9007, -2.1055,  0.6784],
        [-1.2345, -0.0431, -1.6047, -0.7521, -0.6866],
        [-0.4934,  0.2415, -1.1109,  0.0915, -2.3169],
        [-0.2168, -1.3847, -0.3957,  0.8034, -0.6216],
        [-0.5920, -0.0631, -0.8286,  0.3309, -1.5576]])


In [123]:
import torch

import numpy as np

# Set the seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Create a 5x5 matrix A
A = torch.randn(10, 5)  # Random values in a 5x5 matrix

# Original code implementation
def original_forward(A):
    v = 0.95
    A_np = A.clone().detach().cpu().numpy()
    row_sums = np.abs(A_np).sum(axis=-1)
    # print(row_sums)
    
    
    for idx in np.where(row_sums > v)[0]:
        # print(idx)
        row = A_np[idx, :]
        # print(row)
        row_sign = np.sign(row)
        row_abs = np.abs(row)
        # print(row_abs)
        sorted_row = np.sort(row_abs)
        print(sorted_row)




        s = np.sum(sorted_row) - v
        # print(s)
        l = float(len(sorted_row))
        # print(l)
        # print(s/l)
        m = 0
        for i in range(len(sorted_row)):
            if s / l > sorted_row[i]:
                # print("base")
                # print(s/l)
                # print("matrix")
                print(sorted_row[i])
                m += 1
                s -= sorted_row[i]
                l -= 1
                
            else:
                break
        # print(m)
        # print(s)
        # print(l)
        alpha = s / l
        A_np[idx, :] = row_sign * np.maximum(row_abs - alpha, 0)

    return torch.tensor(A_np, dtype=A.dtype, device=A.device)

# Set a fixed random seed for reproducibility
torch.manual_seed(42)

# Test the original and optimized versions

A = torch.randn(10, 5)

# Optionally, compare with your original function (if available) to ensure they are the same
A_original = original_forward(A.clone())




[0.67841846 0.9007172  1.4872841  1.9269153  2.105521  ]
0.67841846
0.9007172
1.4872841
[0.04306748 0.7521353  1.2345449  1.604667   1.648723  ]
0.043067478
0.7521353
[0.39247864 0.5594302  0.7278813  0.7688389  1.4036071 ]
0.39247864
0.5594302
[0.15959747 0.43958926 0.49739754 0.7624454  1.6423169 ]
0.15959747
0.43958926
0.49739754
[0.75813115 0.80080056 1.0783176  1.2791244  1.6806206 ]
0.75813115
0.80080056
[0.04175949 0.23162432 0.6104665  1.2964228  1.3347378 ]
0.04175949
0.23162432
0.6104665
[0.07802387 0.2515753  0.8598585  0.87123615 1.3846737 ]
0.07802387
0.2515753
[0.48799172 0.52580875 0.7359928  0.81400764 1.191369  ]
0.48799172
0.52580875
[0.06347727 0.09780689 0.6756149  0.83712465 0.92239004]
0.06347727
0.09780689
[0.70781225 1.1845374  1.2024456  1.3835493  1.844594  ]
0.70781225


In [None]:
row_sums = A.abs().sum(dim=1)
mask = row_sums > v
A_masked = A[mask]
row_abs = A_masked.abs()
sorted_row_abs, _ = torch.sort(row_abs, dim=1)

s = torch.sum(sorted_row_abs, 1) - v
l = torch.tensor([float(len(sorted_row_abs))] * len(s))
mask = sorted_row_abs < (s / l).unsqueeze(1)



# print(mask)
num_elements = torch.sum(mask)
print(num_elements)
s -= np.sum(sorted_row_abs[:num_elements])
l -= num_elements

In [None]:
import torch
import numpy as np

def fully_vectorized_forward(A, v=0.95):
    # Step 1: Compute the row sums
    row_sums = A.abs().sum(dim=1)
    # print(row_sums)
    
    
    
    # Step 2: Mask the rows where sum is greater than v
    sorted_row_abs, _ = torch.sort(A[row_sums > v].abs(), dim=1)
    # print(sorted_row_abs)





    # Step 4: Compute the cumulative sum and find the index to cut
    s = torch.sum(sorted_row_abs, 1) - v
    # print(s.shape)
    l = torch.tensor([float(sorted_row_abs.shape[1])] * len(s))
    alpha = s/l
    # print(alpha)
    # print(l)
    # print((s/l).unsqueeze(1))
    # mask = sorted_row_abs < (s / l).unsqueeze(1)
    # print(mask)
    
    for u in range(sorted_row_abs.shape[0]):
        # print(u)
        # print(s[u])
        # print(sorted_row_abs[u,:])
        s[u] -= sorted_row_abs[u,:]
        print(s[u])
        
        
    # num_elements_to_subtract = mask.sum(dim=1)
    # print(num_elements_to_subtract)
    # Update s and l vectorized:
    s = s - (sorted_row_abs * mask).sum(dim=1)
    l = l - num_elements_to_subtract
    # print(s)
    # print(l)
    return A

# Set a fixed random seed for reproducibility
torch.manual_seed(42)

# Test the original and optimized versions

A = torch.randn(10, 5)

# Call the fully vectorized forward method
A_vectorized = fully_vectorized_forward(A.clone())
# print(A_vectorized)

0
tensor(6.1489)
tensor([0.6784, 0.9007, 1.4873, 1.9269, 2.1055])
1
tensor(4.3331)
tensor([0.0431, 0.7521, 1.2345, 1.6047, 1.6487])
2
tensor(2.9022)
tensor([0.3925, 0.5594, 0.7279, 0.7688, 1.4036])
3
tensor(2.5513)
tensor([0.1596, 0.4396, 0.4974, 0.7624, 1.6423])
4
tensor(4.6470)
tensor([0.7581, 0.8008, 1.0783, 1.2791, 1.6806])
5
tensor(2.5650)
tensor([0.0418, 0.2316, 0.6105, 1.2964, 1.3347])
6
tensor(2.4954)
tensor([0.0780, 0.2516, 0.8599, 0.8712, 1.3847])
7
tensor(2.8052)
tensor([0.4880, 0.5258, 0.7360, 0.8140, 1.1914])
8
tensor(1.6464)
tensor([0.0635, 0.0978, 0.6756, 0.8371, 0.9224])
9
tensor(5.3729)
tensor([0.7078, 1.1845, 1.2024, 1.3835, 1.8446])


NameError: name 'mask' is not defined