In [263]:
import numpy as np
import scipy as sp
from scipy import linalg as sp_linalg
from sinkhorn_knopp import sinkhorn_knopp as skp

V = 100
rng = np.random.RandomState(0)

power_law_seed = rng.power(a=1.0, size=V)
P_ij = np.triu(rng.lognormal(mean=1, sigma=1, size=[V, V]), k=1)
P_ij *= power_law_seed
P_ij += P_ij.T #P_ij == P_ji
P_ij /= P_ij.sum()

p_i = P_ij.sum(axis=1)
P_i = np.diag(p_i)
P_i_inv = np.diag(1.0 / p_i)
similarities = P_i_inv.dot(P_ij).dot(P_i_inv)

pmis = np.log(similarities)
pmis[pmis == -np.inf] = 0

assert np.allclose(similarities, similarities.T)
assert np.allclose(similarities.dot(P_i).sum(axis=1), 1)
assert np.allclose(P_ij, P_ij.T)
assert np.allclose(pmis, pmis.T)
assert np.allclose(P_i.sum(), 1)
assert np.allclose(P_ij.sum(), 1)



In [264]:
p_i.min(), p_i.max()

(0.00012232142177970225, 0.025151188047223612)

In [265]:
similarities[similarities != 0].min(), similarities.max()

(0.0022734043329924124, 20.83410688459186)

In [266]:
rng = np.random.RandomState(0)
num_vars = V**2
for n in [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]:
    emp_ij = rng.multinomial(n * num_vars, P_ij.flatten()).reshape([V, V])
    emp_ij = emp_ij / emp_ij.sum()
    P_ij_err = ((emp_ij - P_ij)**2).mean()
    
    emp_i = emp_ij.sum(axis=1)
    P_i_err = ((emp_i - p_i)**2).mean()
    
    emp_i_inv = np.diag(1.0 / emp_i)
    emp_sim = emp_i_inv.dot(emp_ij).dot(emp_i_inv)
    sim_err = ((emp_sim - similarities)**2).mean()
    
    emp_pmi = np.log(emp_sim)
    emp_pmi[emp_pmi == -np.inf] = 0
    pmi_error = ((emp_pmi - pmis)**2).mean()
    print(n, P_ij_err, P_i_err, sim_err, pmi_error)

1 1.0265443841464265e-08 9.772192955583414e-07 nan nan
10 1.0409332259158267e-09 1.2131419508590086e-07 1.7195876544205315 0.7776803743490577
100 1.0273522714604914e-10 8.546234406636178e-09 0.04470665924422943 0.135661693163949
1000 9.772329768455331e-12 9.540590481283047e-10 0.004631978564807558 0.021746243928775912
10000 1.0144610429671092e-12 1.0279129976052298e-10 0.00046961534909790564 0.002674273709495486
100000 1.0017957205287814e-13 9.874178361735818e-12 5.066247675350424e-05 0.00010328054829645623
1000000 1.0011063211109766e-14 8.22683312486442e-13 5.3453815474824934e-06 1.1498798612890605e-05
10000000 9.825277364317782e-16 1.0647339845845071e-13 5.218583849162609e-07 1.03953639398122e-06


  # This is added back by InteractiveShellApp.init_path()
  from ipykernel import kernelapp as app
