In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import rc
import seaborn as sns
import sys
from sklearn.random_projection import GaussianRandomProjection
from tqdm import tqdm

sys.path.append('../')
sys.path.append('../src/')
from src.generative import Sphere, Memorizer, Mixture, MultivariateGaussian
from src.detection import DataCopyingDetector, ThreeSampleDetector

In [4]:
# --- Text rendering ---------------------------------------------------------
# Typeset all figure text with LaTeX and use Palatino as the serif face.
rc('text', usetex=True)
rc('font', family='serif', serif='Palatino')

# --- rcParams: LaTeX preamble and font sizes --------------------------------
plt.rcParams.update({
    # dsfont is loaded so double-struck symbols are available in labels
    'text.latex.preamble': r'\usepackage{dsfont}',
    'font.size': 10,         # base font size
    'axes.titlesize': 9,     # subplot titles
    'axes.labelsize': 8,     # axis labels
    'xtick.labelsize': 7,    # x-axis tick labels
    'ytick.labelsize': 7,    # y-axis tick labels
})

# --- Default per-curve styling ----------------------------------------------
# NOTE: the plotting cells further down reassign `line_styles`, `markers` and
# `colors` before using them, so these values only act as notebook defaults.
colors = sns.color_palette('colorblind')
line_styles = ['-', '--', ':']
markers = ['x', '^', 'h', '^', 'v', '>', '<', 'p', '*', 'h']
markersize = [5, 4, 3, 2, 1]

# --- Figure width -----------------------------------------------------------
# Width in inches for figure sizing: a length in TeX points divided by
# 72.27 pt/in (presumably the target document's \textwidth -- confirm).
textwidth_in_inches = 398.33864 / 72.27


# Uniform distribution over the surface of a d-dimensional sphere

In [12]:
# # Small d does not seem to work
# for d in [2, 4, 5, 10, 20, 50]:
#     p = Sphere(d=d)
#     X = p.sample(20000)
#     det = DataCopyingDetector()
#     print(f"Estimated data copying rate for d={d}: {det.estimate_cr(X, p, m=20000)}")

__Analysis:__ 

- The estimated copying rate for small dimensions, i.e. $\mathcal{R}^{d}$ with $d \leq 5$, is not zero, which is unwanted behaviour. Reason: the intrinsic dimension is $d-1$.
- The same goes for $\mathcal{R}^{d}$ with $d \geq 20$: $\gamma$ is too large, so points generated from the true distribution are classified as data copies. $\gamma$ would therefore need to be adjusted.

In [13]:
# # check regularity for different d
# fig, axs = plt.subplots(3, 3, figsize=(15, 15))
# for i, d in enumerate([2, 3, 4, 5, 6, 7, 8, 9, 10]):
#     p = Sphere(d=d) 
#     X = p.sample(2000)
#     radii = np.logspace(-2, 0, 100)
#     counts = np.zeros((len(radii), len(X)))
#     for k, x in enumerate(X):
#         for j, r in enumerate(radii):
#             counts[j, k] = np.sum(np.linalg.norm(X - x, axis=1) < r)

#     axs[i // 3, i % 3].plot(radii, np.mean(counts, axis=1))
#     axs[i // 3, i % 3].set_title(f"d={d}")
#     axs[i // 3, i % 3].fill_between(radii, np.mean(counts, axis=1) - np.std(counts, axis=1), np.mean(counts, axis=1) + np.std(counts, axis=1), alpha=0.3)
# plt.show()

In [14]:
# Experiment grid for the sphere study.
n = 20000                   # sample size; also handed to the underfit Memorizer as n_copying
k = 10                      # repetitions (used as the random-projection loop bound in the commented-out cell below)
ds = [10, 25, 50, 75, 100]  # ambient dimensions
d_projs = [1, 2, 3]         # projection dimensions
rhos = np.linspace(0, 1, 11).round(2)  # mixture weights 0.0, 0.1, ..., 1.0

# The two Memorizer components that are combined via Mixture(rho, q1=copier, q2=underfit).
copier = Memorizer(radius=0.05)
underfit = Memorizer(radius=1, n_copying=n)

In [15]:
# C_S = np.zeros((len(ds), len(rhos)))

# for i, d in enumerate(tqdm(ds)):
#     p = Sphere(d=d)
#     X = p.sample(n)
#     X_test = p.sample(n)
#     for j, rho in enumerate(rhos):
#         q = Mixture(rho=rho, q1=copier, q2=underfit).fit(X)
#         tst = ThreeSampleDetector(num_regions=int(20))
#         C_S[i, j] = tst.C_T(q, X, X_test)[0]


# np.save("../doc/Sphere/C_S.npy", C_S)

In [16]:
# Load the cached C_S statistics; rows are indexed by the position in `ds`,
# columns by the position in `rhos`.
C_S = np.load("../doc/Sphere/C_S.npy")

# For each d, record and print the smallest rho for which C_S < -3.
# If the statistic never drops below -3 for some d, keep NaN for that entry
# instead of failing with an IndexError on the empty match.
rho_min = np.full(len(ds), np.nan)
for i, d in enumerate(ds):
    below_threshold = np.flatnonzero(C_S[i] < -3)
    if below_threshold.size > 0:
        rho_min[i] = rhos[below_threshold[0]]
    print(f"d={d}: {rho_min[i]}")

d=10: 0.6
d=25: 0.7
d=50: 0.5
d=75: 0.3
d=100: 0.3


In [17]:
# results = np.zeros((n, k, len(ds), len(d_projs), len(rhos))).astype(bool)

# for i, d in enumerate(ds):
#     p = Sphere(d=d)
#     X = p.sample(n)
#     for j, r in enumerate(tqdm(rhos)):
#         q = Mixture(rho=r, q1=copier, q2=underfit).fit(X)
#         X1_gen = q.sample(n)
#         X2_gen = q.sample(n)
#         for l, d_proj in enumerate(d_projs):
#             for m in range(k):
#                 rp = GaussianRandomProjection(n_components=d_proj).fit(X)
#                 dcd = DataCopyingDetector()
#                 idx = dcd.get_copying_indices(rp.transform(X), rp.transform(X1_gen), rp.transform(X2_gen))
#                 results[idx, m, i, l, j] = True

# # save results
# np.save('../doc/Sphere/mixed_model_results_new.npy', results)

In [18]:
### plotting
# Load the stored boolean copy flags. Averaging over axis 1 and thresholding at
# 0.5 gives a majority vote; averaging that over axis 0 gives the estimated
# copying rate, indexed below as [d index, d_proj index] over `rhos`.
results = np.load('../doc/Sphere/mixed_model_results_new.npy')
majority_votes = (results.mean(axis=1) > 0.5).mean(axis=0)

# Curve styling for this figure (replaces the notebook-level defaults).
colors = sns.color_palette('colorblind')
line_styles = ['--', '-.', ':']
markers = ['.', '^', 'x', 's']

fig, axs = plt.subplots(2, 3, figsize=(textwidth_in_inches, 3), sharex=True, sharey=True)
axs = axs.flatten()
ticks = np.linspace(0, 1, 6)

for panel_idx, dim in enumerate(ds):
    ax = axs[panel_idx]
    # reference line: the true copying rate equals rho
    ax.plot(rhos, rhos, label='True $cr_q$', linestyle='-', color='black')
    # estimated copying rate, one curve per projection dimension
    for proj_idx, d_proj in enumerate(d_projs):
        curve_style = dict(
            linestyle=line_styles[proj_idx],
            marker=markers[proj_idx],
            markersize=5,
            color=colors[proj_idx],
        )
        ax.plot(
            rhos,
            majority_votes[panel_idx, proj_idx],
            label='$d_{\\mathrm{proj}}=$' + str(d_proj),
            **curve_style,
        )
    # shade the rho range starting at the first rho with C_S < -3
    ax.axvspan(xmin=rho_min[panel_idx], xmax=1.0, color='#FFCCCB', alpha=0.5)
    ax.set_title(f"({chr(97 + panel_idx)}) $d={dim}$")
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)

# y labels on the first column, x labels on the bottom-row panels that hold data
for ax_idx in (0, 3):
    axs[ax_idx].set_ylabel('$cr_q$')
for ax_idx in (3, 4):
    axs[ax_idx].set_xlabel('$\\rho$')

# The sixth panel has no data; use it to host the shared legend.
legend_ax = axs[5]
legend_ax.axis('off')
handles, labels = axs[0].get_legend_handles_labels()
shade_label = '$C_S < -3$'
handles.append(mpatches.Patch(color='#FFCCCB', alpha=0.5, label=shade_label))
labels.append(shade_label)
legend_ax.legend(handles, labels, loc='center')

fig.tight_layout(pad=0.5)
fig.savefig('../doc/Sphere/majority_votes_new.png', dpi=300)
plt.close(fig)

# Multivariate Gaussian

In [5]:
# Experiment grid for the multivariate Gaussian study.
n = 40000                          # sample size; also handed to the underfit Memorizer as n_copying
k = 10                             # repetitions (used as the random-projection loop bound in the commented-out cell below)
ds = [10, 25, 50, 75, 100]         # ambient dimensions
r_underfit = [3, 4.5, 7, 8.5, 10]  # one underfit radius per entry of `ds`
d_projs = [1, 2, 3]                # projection dimensions
rhos = np.linspace(0, 1, 11).round(2)  # mixture weights 0.0, 0.1, ..., 1.0

# The two Memorizer components that are combined via Mixture(rho, q1=copier, q2=underfit).
# NOTE: the commented-out computation cells rebuild `underfit` per dimension
# from `r_underfit[i]`; the radius=7 instance here is only the notebook-level default.
copier = Memorizer(radius=0.1)
underfit = Memorizer(radius=7, n_copying=n)

In [6]:
# C_S = np.zeros((len(ds), len(rhos)))

# for i, d in enumerate(tqdm(ds)):
#     p = MultivariateGaussian(dim=d)
#     X = p.sample(n)
#     X_test = p.sample(n)
#     for j, rho in enumerate(rhos):
#         underfit = Memorizer(radius=r_underfit[i], n_copying=n)
#         q = Mixture(rho=rho, q1=copier, q2=underfit).fit(X)
#         tst = ThreeSampleDetector(num_regions=int(20))
#         C_S[i, j] = tst.C_T(q, X, X_test)[0]

# np.save("../doc/MultivariateGaussian/C_S.npy", C_S)

In [7]:
# Load the cached C_S statistics; rows are indexed by the position in `ds`,
# columns by the position in `rhos`.
C_S = np.load("../doc/MultivariateGaussian/C_S.npy")

# For each d, record and print the smallest rho for which C_S < -3.
# If the statistic never drops below -3 for some d, keep NaN for that entry
# instead of failing with an IndexError on the empty match.
rho_min = np.full(len(ds), np.nan)
for i, d in enumerate(ds):
    below_threshold = np.flatnonzero(C_S[i] < -3)
    if below_threshold.size > 0:
        rho_min[i] = rhos[below_threshold[0]]
    print(f"d={d}: {rho_min[i]}")

d=10: 0.5
d=25: 0.7
d=50: 0.6
d=75: 0.4
d=100: 0.3


In [8]:
# results = np.zeros((n, k, len(ds), len(d_projs), len(rhos))).astype(bool)

# for i, d in enumerate(ds):
#     p = MultivariateGaussian(dim=d)
#     X = p.sample(n)
#     for j, r in enumerate(tqdm(rhos)):
#         underfit = Memorizer(radius=r_underfit[i], n_copying=n)
#         q = Mixture(rho=r, q1=copier, q2=underfit).fit(X)
#         X1_gen = q.sample(n)
#         X2_gen = q.sample(n)
#         for l, d_proj in enumerate(d_projs):
#             for m in range(k):
#                 rp = GaussianRandomProjection(n_components=d_proj).fit(X)
#                 dcd = DataCopyingDetector()
#                 idx = dcd.get_copying_indices(rp.transform(X), rp.transform(X1_gen), rp.transform(X2_gen))
#                 results[idx, m, i, l, j] = True

# # save results
# np.save('../doc/MultivariateGaussian/mixed_model_results_new.npy', results)

In [9]:
### plotting
# Load the stored boolean copy flags. Averaging over axis 1 and thresholding at
# 0.5 gives a majority vote; averaging that over axis 0 gives the estimated
# copying rate, indexed below as [d index, d_proj index] over `rhos`.
results = np.load('../doc/MultivariateGaussian/mixed_model_results_new.npy')
majority_votes = (results.mean(axis=1) > 0.5).mean(axis=0)

# Curve styling for this figure (replaces the notebook-level defaults).
colors = sns.color_palette('colorblind')
line_styles = ['--', '-.', ':']
markers = ['.', '^', 'x', 's']

fig, axs = plt.subplots(2, 3, figsize=(textwidth_in_inches, 3), sharex=True, sharey=True)
axs = axs.flatten()
ticks = np.linspace(0, 1, 6)

for panel_idx, dim in enumerate(ds):
    ax = axs[panel_idx]
    # reference line: the true copying rate equals rho
    ax.plot(rhos, rhos, label='True $cr_q$', linestyle='-', color='black')
    # estimated copying rate, one curve per projection dimension
    for proj_idx, d_proj in enumerate(d_projs):
        curve_style = dict(
            linestyle=line_styles[proj_idx],
            marker=markers[proj_idx],
            markersize=5,
            color=colors[proj_idx],
        )
        ax.plot(
            rhos,
            majority_votes[panel_idx, proj_idx],
            label='$d_{\\mathrm{proj}}=$' + str(d_proj),
            **curve_style,
        )
    ax.set_title(f"({chr(97 + panel_idx)}) $d={dim}$")
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    # shade the rho range starting at the first rho with C_S < -3
    ax.axvspan(xmin=rho_min[panel_idx], xmax=1.01, color='#FFCCCB', alpha=0.5)

# y labels on the first column, x labels on the bottom-row panels that hold data
for ax_idx in (0, 3):
    axs[ax_idx].set_ylabel('$cr_q$')
for ax_idx in (3, 4):
    axs[ax_idx].set_xlabel('$\\rho$')

# The sixth panel has no data; use it to host the shared legend.
legend_ax = axs[5]
legend_ax.axis('off')
handles, labels = axs[0].get_legend_handles_labels()
shade_label = '$C_S < -3$'
handles.append(mpatches.Patch(color='#FFCCCB', alpha=0.5, label=shade_label))
labels.append(shade_label)
legend_ax.legend(handles, labels, loc='center')

fig.tight_layout(pad=0.5)
fig.savefig('../doc/MultivariateGaussian/majority_votes_new.png', dpi=300)
plt.close(fig)