In [1]:
import numpy as np
import torch
import pandas as pd

# Display settings: pandas output may use up to 400 characters per line and
# show up to 20 columns; NumPy output may use up to 400 characters per line.
pd.options.display.width = 400
pd.options.display.max_columns = 20
np.set_printoptions(linewidth=400)

In [6]:
# INTRODUCING HELPER FUNCTIONS

def nanvar(
        tensor: torch.Tensor,
        dim: int=None,
        keepdim: bool=False,
        correction: int=1
) -> torch.Tensor:
    """
    Computes the variance of a tensor along a specified dimension,
    ignoring NaN values.

    Args:
        tensor: Input tensor; NaN entries are treated as missing.
        dim: Dimension to reduce; forwarded unchanged to the underlying
            ``sum`` and ``torch.nanmean`` calls.
        keepdim: Whether the reduced dimension is retained in the result.
        correction: Value subtracted from the non-NaN count to form the
            divisor (e.g., 1 for sample variance, 0 for population variance).

    Returns:
        The sum of squared deviations from the NaN-ignoring mean, divided by
        ``max(count - correction, 1)``. The divisor is floored at 1, so no
        division by zero occurs; an all-NaN slice therefore yields 0.
    """
    # Count non-NaN elements
    count = (~torch.isnan(tensor)).sum(dim=dim, keepdim=keepdim)

    # Mean over non-NaN entries; keepdim=True so it broadcasts against `tensor`
    mean = torch.nanmean(tensor, dim=dim, keepdim=True)

    # Zero out NaN deviations BEFORE squaring. Masking after `.pow(2)` gives the
    # same forward value, but in the backward pass the (zero) upstream gradient
    # would be multiplied by 2 * NaN, producing NaN that then propagates through
    # `mean` to every element's gradient.
    diff = tensor - mean
    diff = torch.where(
        torch.isnan(diff),
        torch.zeros_like(diff),
        diff)

    # Sum of squared differences
    sum_sq_diff = diff.pow(2).sum(dim=dim, keepdim=keepdim)

    # Degrees of freedom correction (e.g., 1 for sample variance); floored at 1
    divisor = (count - correction).clamp(min=1)

    return sum_sq_diff / divisor.to(sum_sq_diff.dtype)

def nanstd(
        tensor: torch.Tensor,
        dim: int=None,
        keepdim: bool=False,
        correction: int=1
) -> torch.Tensor:
    """
    NaN-ignoring standard deviation of a tensor along a given dimension.

    All arguments are passed straight through to ``nanvar``; the result is
    the element-wise square root of the variance it returns.
    """
    variance = nanvar(tensor, dim=dim, keepdim=keepdim, correction=correction)
    return torch.sqrt(variance)

In [10]:
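# COVARIANCE -- this cell: column-wise z-score standardisation (NaN -> 0), cross-checked across pandas / NumPy / PyTorch / a manual loop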

# Toy input: outer keys become the rows after the transpose, inner keys the
# columns. An entry is missing either as an explicit np.nan or as an absent key;
# both show up as NaN in the frame.
data = {
    1: {1: 4.0, 2: np.nan, 3: 2.0},
    2: {1: 5.0, 2: 3.0},
    3: {1: 3.0, 2: 4.0, 3: 5.0},
}
r = pd.DataFrame(data).T

#    1    2    3
# 1  4  NaN  2.0
# 2  5  3.0  NaN
# 3  3  4.0  5.0

# Pandas: centre each column on its NaN-skipping mean, scale by the population
# std (ddof=0) plus a small epsilon, then turn the remaining NaNs into 0.
R_pandas = (
    r.sub(r.mean(axis=0, skipna=True), axis=1)
     .div(r.std(axis=0, ddof=0, skipna=True) + 1e-8, axis=1)
     .fillna(0.0)
)

# NumPy: same computation on the raw array
r_np = r.values  # shape (n_items, n_users)
means = np.nanmean(r_np, axis=0)
stds = np.nanstd(r_np, axis=0, ddof=0)

R_numpy = np.nan_to_num((r_np - means) / (stds + 1e-8), nan=0.0)

# PyTorch: same standardisation, using torch.nanmean and the nanstd helper
r_torch = torch.tensor(r_np, dtype=torch.float64)

means_t = r_torch.nanmean(dim=0)
stds_t = nanstd(r_torch, dim=0, correction=0)

# Centre, scale by (std + epsilon), then replace NaN entries with 0
R_torch = (r_torch - means_t).div(stds_t + 1e-8).nan_to_num(nan=0.0)



# By hand

def by_hand(r_df):
    """
    Loop-based reference: z-score every column of ``r_df`` separately.

    For each column the mean and population standard deviation (ddof=0) are
    taken over the non-NaN entries, the column is centred and divided by
    ``std + 1e-8``, and any NaN in the result is replaced by 0.

    Returns a 2-D NumPy array with one column per column of ``r_df``.
    """
    def _zscore_column(values):
        # A) statistics over the non-NaN entries only
        center = np.nanmean(values)
        spread = np.nanstd(values, ddof=0)
        # B) standardise, then map NaN -> 0
        scaled = (values - center) / (spread + 1e-8)
        return np.where(np.isnan(scaled), 0.0, scaled)

    # Iterating a DataFrame yields its column labels
    per_column = [_zscore_column(r_df[label].values) for label in r_df]

    # Each list entry is one column; stack as rows and transpose back
    return np.vstack(per_column).T

R_byhand = by_hand(r)

# Largest element-wise deviation of each implementation from the loop reference
print("Max abs diff pandas vs byhand:", np.abs(R_pandas.values - R_byhand).max())
print("Max abs diff numpy  vs byhand:", np.abs(R_numpy - R_byhand).max())
print("Max abs diff torch  vs byhand:", (R_torch - torch.tensor(R_byhand)).abs().max())



Max abs diff pandas vs byhand: 0.0
Max abs diff numpy  vs byhand: 0.0
Max abs diff torch  vs byhand: tensor(0., dtype=torch.float64)
