In [17]:
import numpy as np
import emcee
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import time 
from scipy.stats import norm
from scipy.stats import anderson_ksamp, ks_2samp, shapiro

In [18]:
np.random.seed(0)
file_path = "../csvs/resonant/resonant_features/"
planet_systems = np.loadtxt("../csvs/resonant/order.txt", dtype = str)

def lnprob(p, vec):
    diff = vec-p[0]
    N = len(vec)

    if p[1] <=0:
        return -np.inf
    try:
        probs = -0.5 * N * np.log(2. * np.pi) - N/2. * np.log(np.abs(p[1])**2) - 0.5 \
                                    * np.sum(( (vec - p[0]) / p[1] ) ** 2)
    except:
        probs = 0.00
    return probs
       
def log_prob_normed(mu, sigma, info):
    prob = -np.log(2*np.pi)/2. - np.log(sigma**2.)/2.-(1./(sigma**2.)/2./info.shape[0])*np.nansum((info-mu)**2.)
    return prob

In [19]:

delta_times = np.zeros((planet_systems.shape[0],3))

for jj, planet_sys in enumerate(planet_systems):
    try:
        times = pd.read_csv(file_path + planet_sys)
        delta_times[jj, 1:] = times["t"].values[:2]
        sys_id = planet_sys.split("_")[2]
        delta_times[jj, 0] = sys_id
    except:
        print ("Did not work for {0}".format(planet_sys))

In [21]:
deltatimes=pd.DataFrame(delta_times, columns=["ID","t", "t_shadow"])
deltatimes= deltatimes.sort_values("ID")
deltatimes.tail()

Unnamed: 0,ID,t,t_shadow
245,391.0,28154.14,51420.78
246,393.0,710379.3,322191.2
247,394.0,2527237.0,3198412.0
249,395.0,8670.889,7556.106
250,399.0,2319745.0,3153388.0


In [22]:
deltatimes.to_csv("../csvs/resonant/deltatimes.csv", index=False)

In [23]:
trimmed = deltatimes[(deltatimes["t"]<1e7) & (deltatimes["t"]>1e4)
                    & (deltatimes["t_shadow"]<1e7)& (deltatimes["t_shadow"]>1e4)]
trimmed.shape

(184, 3)

In [24]:
nwalkers = 20
ndim = 2
iterations = 1000

sample_size = nwalkers*iterations*planet_systems.shape[0]
all_walker_samples=np.zeros((int(sample_size), 4)) 
values = np.zeros((planet_systems.shape[0], 7), dtype=str)
covariences = np.zeros((planet_systems.shape[0], ndim, ndim))

In [25]:
trimmed_ids = trimmed["ID"].values

Even with the random seed set, it'll produce values ever so slightly different, but the overall result will still hold

In [9]:
info = pd.DataFrame(values, columns=["ID", "Probability", "Mean", "Sigma", "Mean_Error", "Sigma_Error", "Determinant"])
info.head()

for jj, planet_sys in enumerate(planet_systems):

    np.random.seed(jj)
    data =pd.read_csv(file_path+planet_sys, index_col = 0)
    data = data[data["t"]<1e8]
    data = np.log10(data["t"].values)
    sys_name = planet_sys.split("_")[2]
    print (sys_name)

    p0 = [np.random.rand(ndim) for i in range(nwalkers)]
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=[data], a=5)
    
    # Run 200 steps as a burn-in.
    pos, prob, state = sampler.run_mcmc(p0, 200)

    sampler.reset()
    
    print ("Running MCMC ..."),
    start = time.time()
    pos, prob, state = sampler.run_mcmc(pos, iterations, rstate0=jj)
    end = time.time()
    print ("Done at ...", (end - start), "seconds.")

    maxprob_indice = np.argmax(prob)
    mean_fit, sigma_fit = pos[maxprob_indice]
    sigma_fit = np.abs(sigma_fit)
    mean_samples = sampler.flatchain[:,0]
    sigma_samples = sampler.flatchain[:,1]
    mean_std = mean_samples.std()
    sigma_std =np.std(np.abs(sigma_samples) )
    covarience = np.cov([mean_samples, sigma_samples])
    print (covarience)
    determinant = np.linalg.det(covarience)
    print (determinant)
    covariences[jj, :, :] = covarience

    probability = -np.log(2*np.pi)/2. - np.log(sigma_fit**2.)/2.-(1./(sigma_fit**2.)/2./info.shape[0])*np.nansum((data-mean_fit)**2.)
    samples = sampler.chain[:, :, :].reshape((-1, ndim))
    
    if np.exp(probability)>1:
        info.loc[jj] = [sys_name]+[np.inf]+[mean_fit, sigma_fit] + [np.inf]*3

    else:
        info.loc[jj] = [sys_name, np.exp(probability), mean_fit, sigma_fit, mean_std, sigma_std, determinant]
    print (sys_name , probability, np.exp(probability),"\n")

0
Running MCMC ...
Done at ... 0.723921537399292 seconds.
[[ 2.87846615e-04 -1.25190497e-06]
 [-1.25190497e-06  1.33707988e-04]]
3.8485824393282375e-08
0 -0.7408830634493418 0.4766927796332871 

100
Running MCMC ...
Done at ... 0.7497456073760986 seconds.
[[6.87929449e-06 3.23577061e-08]
 [3.23577061e-08 3.15098529e-06]]
2.1675508716439725e-11
100 1.158226077583728 3.1842795980469845 

101
Running MCMC ...
Done at ... 0.8359930515289307 seconds.
[[ 6.34951521e-05 -3.69184028e-08]
 [-3.69184028e-08  3.09762414e-05]]
1.9668397941200244e-09
101 0.04035326532237038 1.0411785214984206 

102
Running MCMC ...
Done at ... 0.7689776420593262 seconds.
[[8.88562086e-04 2.09735924e-06]
 [2.09735924e-06 4.38219305e-04]]
3.8938066036311526e-07
102 -1.3130503773144047 0.2689982574413694 

103
Running MCMC ...
Done at ... 0.7858541011810303 seconds.
[[2.01411169e-04 1.47963493e-06]
 [1.47963493e-06 9.85141734e-05]]
1.9839665496210836e-08
103 -0.5608764624254949 0.5707086399006779 

104
Running MCMC ..

In [10]:
info.head()

Unnamed: 0,ID,Probability,Mean,Sigma,Mean_Error,Sigma_Error,Determinant
0,0,0.476693,3.78606,0.375144,0.0169656,0.0115629,3.84858e-08
1,100,inf,4.88925,0.0568705,inf,inf,inf
2,101,inf,5.86882,0.174573,inf,inf,inf
3,102,0.268998,6.01117,0.659359,0.029808,0.0209332,3.89381e-07
4,103,0.570709,3.87233,0.31513,0.0141916,0.00992518,1.98397e-08


In [11]:
info = info.astype("float64")
info = info.sort_values("ID")
info.to_csv("../csvs/resonant/Gaussian_Fits_{0}.csv".format(info.shape[0]),index=False)

trimmed_ids = trimmed["ID"].values
print (trimmed_ids[:4], type(trimmed_ids[0]), type(info.loc[0]["ID"]))
trimmed_info = info.loc[ info["ID"].isin(trimmed_ids) ]
print (trimmed_info.shape)

[3. 4. 5. 8.] <class 'numpy.float64'> <class 'numpy.float64'>
(184, 7)


In [12]:
trimmed_info.head()

Unnamed: 0,ID,Probability,Mean,Sigma,Mean_Error,Sigma_Error,Determinant
222,3.0,0.59016,6.841539,0.305631,0.013744,0.009843,1.829682e-08
256,4.0,0.761169,5.687744,0.238708,0.010759,0.007405,6.341003e-09
267,5.0,0.391423,5.792242,0.459218,0.020848,0.014812,9.537167e-08
300,8.0,0.430615,5.885702,0.417955,0.019451,0.013066,6.459192e-08
39,13.0,0.716483,4.640246,0.250958,0.011279,0.008004,8.138431e-09


In [13]:
trimmed_systems = [x for x in planet_systems if int(x.split("_")[2]) in trimmed_ids ]

In [14]:
sig_levels_and = np.zeros(len(trimmed))
sig_levels_kstest = np.zeros(len(trimmed))
sig_levels_shapiro = np.zeros(len(trimmed))
ids = np.zeros(len(trimmed))

for jj, planet_sys in enumerate(trimmed_systems):

    np.random.seed(jj)
    data =pd.read_csv(file_path +planet_sys, index_col = 0)
    data = data[data["t"]<1e8]
    data = np.log10(data["t"].values)
    sys_name = float(planet_sys.split("_")[2])
    ids[jj] = sys_name
    index = np.where(sys_name==trimmed_info["ID"].values)[0][0]
    
    print(trimmed_info.iloc[index])

    test = np.random.normal(loc=trimmed_info.iloc[index]["Mean"], scale=trimmed_info.iloc[index]["Sigma"], size = data.shape[0])

    try:
        statistic, crit_val, sig_level =  anderson_ksamp([data, test])
    except:
        statistic, crit_val, sig_level = 0,0,0
    print (sys_name)
    
    print ("Anderson",  statistic,  crit_val, sig_level)
    sig_levels_and[jj] = sig_level
   
    try:
        statistic, p_val = ks_2samp(data, test)
    except:
        statistic, p_val = 0,0
        
    sig_levels_kstest[jj] = p_val
    print ("KS", statistic, p_val)
    
    try:
        statistic, p_val = shapiro(data)
    except:
        statistic, p_val = 0,0
        
    sig_levels_shapiro[jj] = p_val
    print ("Shapiro", statistic, p_val)

ID             100.000000
Probability           inf
Mean             4.889254
Sigma            0.056870
Mean_Error            inf
Sigma_Error           inf
Determinant           inf
Name: 1, dtype: float64
100.0
Anderson 13.21657568544678 [0.325 1.226 1.961 2.718 3.752] 3.089015495251583e-05
KS 0.148 2.937295067799978e-05
Shapiro 0.8095924258232117 7.678782306613097e-24
ID             101.000000
Probability           inf
Mean             5.868816
Sigma            0.174573
Mean_Error            inf
Sigma_Error           inf
Determinant           inf
Name: 2, dtype: float64
101.0
Anderson 2.8290852741136896 [0.325 1.226 1.961 2.718 3.752] 0.022468161824977358
KS 0.10199999999999998 0.010125210232304454
Shapiro 0.9720158576965332 3.526777447859786e-08
ID             1.080000e+02
Probability    4.971364e-01
Mean           6.588663e+00
Sigma          3.628783e-01
Mean_Error     1.632359e-02
Sigma_Error    1.178140e-02
Determinant    3.694853e-08
Name: 10, dtype: float64
108.0
Anderson -0.17



168.0
Anderson 1.568737991077129 [0.325 1.226 1.961 2.718 3.752] 0.0724545099363668
KS 0.07599999999999996 0.10629177180942284
Shapiro 0.9570897817611694 6.932136809023604e-11
ID             1.690000e+02
Probability    6.324398e-01
Mean           4.600497e+00
Sigma          2.861146e-01
Mean_Error     1.296149e-02
Sigma_Error    8.969229e-03
Determinant    1.345529e-08
Name: 78, dtype: float64
169.0
Anderson -0.3737235711662384 [0.325 1.226 1.961 2.718 3.752] 0.5153633871981671
KS 0.05199999999999999 0.49808258504772013
Shapiro 0.9833963513374329 1.7751068298821338e-05
ID             1.700000e+02
Probability    3.599456e-01
Mean           4.909022e+00
Sigma          4.929562e-01
Mean_Error     2.258382e-02
Sigma_Error    1.609020e-02
Determinant    1.319742e-07
Name: 79, dtype: float64
170.0
Anderson 0.5917490372503219 [0.325 1.226 1.961 2.718 3.752] 0.1897811137489104
KS 0.052000000000000005 0.4980825850477198
Shapiro 0.9689223766326904 8.411981333722451e-09
ID             1.710000e+0



ID             3.280000e+02
Probability    3.238006e-01
Mean           5.719670e+00
Sigma          5.634641e-01
Mean_Error     2.531369e-02
Sigma_Error    1.793261e-02
Determinant    2.055583e-07
Name: 204, dtype: float64
328.0
Anderson 5.753288567212284 [0.325 1.226 1.961 2.718 3.752] 0.0020239438510540166
KS 0.098 0.015202766669802489
Shapiro 0.9073243141174316 6.792660944753164e-17
ID             3.310000e+02
Probability    3.467632e-01
Mean           5.704296e+00
Sigma          5.290126e-01
Mean_Error     2.435690e-02
Sigma_Error    1.618289e-02
Determinant    1.553791e-07
Name: 206, dtype: float64
331.0
Anderson 0.28414583043469976 [0.325 1.226 1.961 2.718 3.752] 0.25957299876874224
KS 0.050000000000000044 0.5491661614735049
Shapiro 0.9539251327514648 2.2225525239671917e-11
ID             3.330000e+02
Probability    3.363385e-01
Mean           5.967806e+00
Sigma          5.394991e-01
Mean_Error     2.442405e-02
Sigma_Error    1.735511e-02
Determinant    1.793425e-07
Name: 207, dty

In [15]:
scores = np.vstack([ ids, sig_levels_and, sig_levels_kstest, sig_levels_shapiro])
scores = pd.DataFrame(scores.T, columns=["ID", "Anderson_P", "KS_P", "ShapiroW"] )
scores.head()

Unnamed: 0,ID,Anderson_P,KS_P,ShapiroW
0,100.0,3.1e-05,2.9e-05,7.678782e-24
1,101.0,0.022468,0.010125,3.526777e-08
2,108.0,0.41946,0.449255,5.864905e-06
3,109.0,0.670769,0.65555,3.596655e-06
4,113.0,0.02488,0.046678,7.397713e-13


In [16]:
scores.to_csv("../csvs/resonant/res_signficance_scores_new_scipy.csv")