In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, anderson, kstest
import os
import math

## Data
cwd = os.getcwd()
fileName = os.path.join(cwd, "randomnumbers.csv")
df = pd.read_csv(fileName, header=None)
df.columns = ["random numbers"]

## Data Statistics
mean = df["random numbers"].mean()
variance = df["random numbers"].var()
std = math.sqrt(variance)
sigma3lower = int(mean - (3.0 * std))
sigma3upper = int(mean + (3.0 * std))
print("mean: ", mean, ", std: ", std)

## Theoretical Statistics
mean_theory = 58070468
std_theory = 1814702
print("mean (thorey): ", mean_theory, ", std (thorey): ", std_theory)

## Test
adFails = 0
ksFails = 0
adMaxRS = 0
ksMaxRS = 0
p_prev = 0.0
p2_prev = 0.0
numSim = 1000
for i in range(numSim):
    samples = df["random numbers"].sample(n=2000, random_state=i)
    # Anderson-Darling Test
    testResult1 = anderson(samples)
    AD = testResult1.statistic
    if AD >= .6:
        p = math.exp(1.2937 - 5.709*AD - .0186*(AD**2))
    elif AD >=.34:
        p = math.exp(.9177 - 4.279*AD - 1.38*(AD**2))
    elif AD >.2:
        p = 1 - math.exp(-8.318 + 42.796*AD - 59.938*(AD**2))
    else:
        p = 1 - math.exp(-13.436 + 101.14*AD - 223.73*(AD**2))

    # Kolmogorov-Smirnov Test 
    testResult2 = kstest(samples, cdf= norm(mean_theory, std_theory).cdf)
    p2 = testResult2.pvalue

    if p < 0.05:
        adFails += 1
        # print("AD test fails at random_state= ", i, ", p-value= ", p)
    if p2 < 0.05:
        ksFails += 1
        # print("KS test fails at random_state= ", i, ", p-value= ", p2)
    # if p > 0.9 and testResult2.pvalue > 0.9:
    #     print("AD test p-value: ", p)
    #     print("AD test fitting result: \n", testResult1.fit_result)
    #     print("KS test p-value: ", testResult2.pvalue)
    #     print("random_state = ", i)
    #     break
    if p > p_prev:
        p_prev = p
        adMaxRS = i
    if p2 > p2_prev:
        p2_prev = p2
        ksMaxRS = i

print("AD test success rate: ", 1.0 - (adFails/numSim))
print("AD test max p-value: ", p_prev, ", at random_state=", adMaxRS)
print("ks test success rate: ", 1.0 - (ksFails/numSim))
print("KS test p-value: ", p2_prev, ", at random state=", ksMaxRS)


# ## Visualization
# numXaxis = 10000
# xRange = np.linspace(start = sigma3lower, stop = sigma3upper, num = numXaxis)
# # Fitting PDF
# plt.figure(1)
# plt.plot(xRange, norm(mean, std).pdf(xRange))
# plt.title("PDF")
# # Fitting CDF
# plt.figure(2)
# plt.plot(xRange, norm(mean, std).cdf(xRange))
# plt.title("CDF")
# # Histogram
# plt.figure(3)
# df.hist()

# plt.show()





mean:  58071713.03943 , std:  1816770.3329484055
mean (thorey):  58070468 , std (thorey):  1814702
AD test success rate:  0.896
AD test max p-value:  0.9960497893551817 , at random_state= 500
ks test success rate:  0.9410000000000001
KS test p-value:  0.9999642734488178 , at random state= 93


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, anderson, kstest
import os
import math

cwd = os.getcwd()
fileName = os.path.join(cwd, "randomnumbers.csv")

df = pd.read_csv(fileName)
print(df.head())

df2 = pd.read_csv(fileName, header=None)
df2.columns = ["random numbers"]
print(df2.head())