In [26]:
import numpy as np
import pandas as pd   
import scipy
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [27]:
cg = 1.78
eps = 1e-2
laplace_scale = 1.0 / (2**.5)

In [28]:
np.random.seed(50)

## Helper Functions

In [38]:
def random_generator(n,a,gamma=(2*cg)**.5):
    """Draws random numbers from normal, laplace and cauchy distributions."""
    distributions = {}
    distributions['normal'] = a + np.random.randn(n)
    distributions['laplace'] = a + np.random.laplace(size = n,scale = laplace_scale)
    distributions['cauchy'] = a + scipy.stats.cauchy.rvs(size=n,loc = 0, scale = gamma)
    return distributions

def cauchy_likelihood(data, gamma, a):
    return -np.sum(np.log(1 + ((data-a)/gamma)**2))

def f(data,gamma,a):
    """Optimization function for the cauchy distribution (given gamma)."""
    return np.sum((a-data)/(1 + ((data-a)/gamma)**2))

def df_da(data,gamma,a):
    """derivative of f with respect to a."""
    return np.sum((1 - ((data-a)/gamma)**2)/((1 + ((data-a)/gamma)**2)**2))

def nr(data,guess,gamma,iterations=1000):
    """Newton-Raphson method for estimating the scale parameter of a Cauchy distribution."""
    for iteration in range(iterations):
        y = f(data,gamma, guess)
        dy_dA = df_da(data, gamma, guess)
        guess -= y/dy_dA
        if abs(y) < eps:
            break
    return guess

def grad_desc(data,guess,gamma,iterations=1000):
    """gradient descent method for estimating the location parameter of a cauchy distribution."""
    # print(guess)
    lr = 1e-3
    pguess = 1e5
    for _ in range(iterations):
        y = f(data,gamma, guess)
        # dy_da = df_da(data, gamma, guess)
        pguess = guess
        guess -= lr*y
        if abs(pguess-guess) < eps:
            break
        # print(guess)
    # print("over")
    return guess

def mle_estimates(distributions, gamma, guess):
    """Calculates the mle estimates for normal, laplace and cauchy distributions."""
    estimates = {}
    for key in distributions.keys():
        if key == 'normal':
            estimates[key] = np.mean(distributions[key])
        elif key == 'laplace':
            estimates[key] = np.median(distributions[key])
        elif key == 'cauchy':
            estimates[key] = nr(distributions[key], guess, gamma=gamma,iterations=10000)
    return estimates



## Experiments

In [64]:
# from tqdm import tqdm

def realisations(n, a, num_realisations):
    """Generates random numbers and calculates MLE estimates for multiple realisations."""
    estimates = {"normal": [], "laplace": [], "cauchy": []}
    
    for _ in tqdm(
        range(num_realisations),
        desc=f"n={n} | Realisations",
        leave=False,
        dynamic_ncols=True,
    ):
        distributions = random_generator(n, a)
        if n%2 != 0:
            estimate = mle_estimates(distributions, gamma=(2 * cg) ** 0.5, guess=np.median(distributions["cauchy"]))
        else:
            estimate1 = mle_estimates(distributions, gamma=(2 * cg) ** 0.5, guess=sorted(distributions["cauchy"])[int(n/2)])
            estimate2 = mle_estimates(distributions, gamma=(2 * cg) ** 0.5, guess=sorted(distributions["cauchy"])[int(n/2)-1])

            if (cauchy_likelihood(distributions["cauchy"],(2 * cg) ** 0.5,estimate1["cauchy"])
               <cauchy_likelihood(distributions["cauchy"],(2 * cg) ** 0.5,estimate2["cauchy"])):
                estimate  = estimate1
            else:
                estimate = estimate2

        for key in estimate:
            estimates[key].append(estimate[key])
    
    return estimates

def get_statistics(estimates):
    """Calculates mean and standard deviation of MLE estimates."""
    statistics = {}
    for key in estimates:
        statistics[key] = {
            'mean': np.mean(estimates[key]),
            'var': np.var(estimates[key])
        }
    return statistics

def iter_over_sample_size(ns, a, num_realisations):
    """Iterates over different sample sizes and calculates MLE estimates."""
    estimates = {}
    stats = {}

    outer = tqdm(ns, desc="Sample Sizes", dynamic_ncols=True)
    for n in outer:
        outer.set_description(f"Sample size: n={n}")
        estimates[n] = realisations(n, a, num_realisations)
        stats[n] = get_statistics(estimates[n])
        outer.write(f"Done: n={n}, num_real={num_realisations}")
    
    return estimates, stats

def give_dataframe(a, sample_sizes, num_realisations):
    data = {}
    for ni in sample_sizes:
        data[f"{ni}"]={}
        for nri in num_realisations:
            data[f"{ni}"][f"normal_{nri}"]={}
            data[f"{ni}"][f"laplace_{nri}"]={}
            data[f"{ni}"][f"cauchy_{nri}"]={}
            
    for nr in num_realisations:
        _, stats = iter_over_sample_size(sample_sizes, a, nr)
        for ni in sample_sizes:
            data[f"{ni}"][f"normal_{nr}"] = rf"{stats[ni]["normal"]["mean"]:.3f} ± {np.sqrt(stats[ni]["normal"]["var"]):.3f}"
            data[f"{ni}"][f"laplace_{nr}"] = rf"{stats[ni]["laplace"]["mean"]:.3f} ± {np.sqrt(stats[ni]["laplace"]["var"]):.3f}"
            data[f"{ni}"][f"cauchy_{nr}"] = rf"{stats[ni]["normal"]["mean"]:.3f} ± {np.sqrt(stats[ni]["cauchy"]["var"]):.3f}"
    df = pd.DataFrame(data).T
    ordered_cols = [f"normal_{nr}" for nr in num_realisations]
    ordered_cols.extend([f"laplace_{nr}" for nr in num_realisations])
    ordered_cols.extend([f"cauchy_{nr}" for nr in num_realisations])
    df = df.loc[:,ordered_cols]
    return df


## Plotting functions

In [65]:
def plot_cdf(data, stat ,label='CDF', color='blue', axis=None):
    """Plots the CDF of the data."""
    if axis is None:
        _, axis = plt.subplots(figsize=(5, 5), layout="constrained")
    sorted_data = np.sort(data)
    cdf = np.linspace(0, 1, len(sorted_data))
    axis.step(sorted_data, cdf, where='post', label=label, color=color)
    axis.set_xlabel(r'Value of $\hat{a}$')
    axis.set_ylabel('Cumulative Probability')
    axis.set_title(f"CDF of {label} with mean: {stat['mean']:.3E} var: {stat['var']:.3E}")
    axis.set_xlim(max(-2,np.min(data)), min(4,np.max(data)))
    # plt.legend()
    # plt.show()

def plot_cdf_pdf(ns,estimates,stats):
    for n in ns:
        _, axs = plt.subplots(3, 2, figsize=(10, 5), layout="constrained")
        for i,key in enumerate(estimates[n].keys()):
            axs[i,0].hist(estimates[n][key], density=True, bins=100, alpha=0.5, color='blue', label='PDF',range=(max(-2,np.min(estimates[n][key])), min(4,np.max(estimates[n][key]))))
            axs[i,0].set_title(f"PDF of {key} with sample size {n}")
            plot_cdf(estimates[n][key], stats[n][key], label=key, color='blue', axis=axs[i,1])
            axs[i,1].set_title(f"CDF of {key} with sample size {n}")
        plt.show()

def plot_mean_var(stats):
    _, axs = plt.subplots(1, 2, figsize=(10, 5), layout="constrained")
    mean, var = {"normal": [], "laplace": [], "cauchy": []},{"normal": [], "laplace": [], "cauchy": []}
    for n in stats.keys():
        for key in mean.keys():
            mean[key].append(stats[n][key]['mean'])
            var[key].append(stats[n][key]['var'])
    for key in mean.keys():
        axs[0].semilogy(stats.keys(), mean[key], label=key)
        axs[1].semilogy(stats.keys(), var[key], label=key)
    axs[0].set_xlabel(r'Sample Size')
    axs[0].set_ylabel(r'Mean of $\hat{a}$')
    axs[0].set_title(r'Mean of $\hat{a}$')
    # axs[0].set_xscale(r'log')
    axs[1].set_xlabel(r'Sample Size')
    axs[1].set_ylabel(r'Variance of $\hat{a}$')
    axs[1].set_title(r'Variance of $\hat{a}$')
    # axs[1].set_xscale(r'log')
    axs[0].legend()
    axs[1].legend()
    plt.show()

In [68]:
n = [1, 10, 100, 1000, 10000]#, 1000, 10000]
num_realisations = [10,100,1000]

In [69]:
a = 1.0
df1 = give_dataframe(a, n, num_realisations)
df1.head()

Sample Sizes:   0%|          | 0/5 [00:00<?, ?it/s]

n=1 | Realisations:   0%|          | 0/10 [00:00<?, ?it/s]

Done: n=1, num_real=10


n=10 | Realisations:   0%|          | 0/10 [00:00<?, ?it/s]

Done: n=10, num_real=10


n=100 | Realisations:   0%|          | 0/10 [00:00<?, ?it/s]

Done: n=100, num_real=10


n=1000 | Realisations:   0%|          | 0/10 [00:00<?, ?it/s]

Done: n=1000, num_real=10


n=10000 | Realisations:   0%|          | 0/10 [00:00<?, ?it/s]

Done: n=10000, num_real=10


Sample Sizes:   0%|          | 0/5 [00:00<?, ?it/s]

n=1 | Realisations:   0%|          | 0/100 [00:00<?, ?it/s]

Done: n=1, num_real=100


n=10 | Realisations:   0%|          | 0/100 [00:00<?, ?it/s]

Done: n=10, num_real=100


n=100 | Realisations:   0%|          | 0/100 [00:00<?, ?it/s]

Done: n=100, num_real=100


n=1000 | Realisations:   0%|          | 0/100 [00:00<?, ?it/s]

Done: n=1000, num_real=100


n=10000 | Realisations:   0%|          | 0/100 [00:00<?, ?it/s]

Done: n=10000, num_real=100


Sample Sizes:   0%|          | 0/5 [00:00<?, ?it/s]

n=1 | Realisations:   0%|          | 0/1000 [00:00<?, ?it/s]

Done: n=1, num_real=1000


n=10 | Realisations:   0%|          | 0/1000 [00:00<?, ?it/s]

Done: n=10, num_real=1000


n=100 | Realisations:   0%|          | 0/1000 [00:00<?, ?it/s]

Done: n=100, num_real=1000


n=1000 | Realisations:   0%|          | 0/1000 [00:00<?, ?it/s]

Done: n=1000, num_real=1000


n=10000 | Realisations:   0%|          | 0/1000 [00:00<?, ?it/s]

Done: n=10000, num_real=1000


Unnamed: 0,normal_10,normal_100,normal_1000,laplace_10,laplace_100,laplace_1000,cauchy_10,cauchy_100,cauchy_1000
1,0.536 ± 1.054,1.002 ± 0.938,1.024 ± 0.958,1.330 ± 1.359,0.982 ± 1.033,0.993 ± 1.024,0.536 ± 43.096,1.002 ± 67.971,1.024 ± 54.052
10,0.862 ± 0.291,0.982 ± 0.326,1.006 ± 0.313,1.071 ± 0.234,0.971 ± 0.283,1.001 ± 0.257,0.862 ± 0.651,0.982 ± 853.603,1.006 ± 1097.508
100,0.943 ± 0.096,0.982 ± 0.110,0.999 ± 0.100,1.005 ± 0.060,0.997 ± 0.085,1.002 ± 0.071,0.943 ± 0.306,0.982 ± 0.274,0.999 ± 0.274
1000,0.986 ± 0.022,1.006 ± 0.030,1.000 ± 0.031,0.988 ± 0.027,1.002 ± 0.019,1.000 ± 0.023,0.986 ± 0.083,1.006 ± 0.081,1.000 ± 0.085
10000,1.002 ± 0.011,0.999 ± 0.010,1.000 ± 0.010,0.999 ± 0.005,1.000 ± 0.007,1.000 ± 0.007,1.002 ± 0.022,0.999 ± 0.024,1.000 ± 0.028


In [70]:
a = 10.0
df10 = give_dataframe(a, n, num_realisations)
df10.head()

Sample Sizes:   0%|          | 0/5 [00:00<?, ?it/s]

n=1 | Realisations:   0%|          | 0/10 [00:00<?, ?it/s]

Done: n=1, num_real=10


n=10 | Realisations:   0%|          | 0/10 [00:00<?, ?it/s]

Done: n=10, num_real=10


n=100 | Realisations:   0%|          | 0/10 [00:00<?, ?it/s]

Done: n=100, num_real=10


n=1000 | Realisations:   0%|          | 0/10 [00:00<?, ?it/s]

Done: n=1000, num_real=10


n=10000 | Realisations:   0%|          | 0/10 [00:00<?, ?it/s]

Done: n=10000, num_real=10


Sample Sizes:   0%|          | 0/5 [00:00<?, ?it/s]

n=1 | Realisations:   0%|          | 0/100 [00:00<?, ?it/s]

Done: n=1, num_real=100


n=10 | Realisations:   0%|          | 0/100 [00:00<?, ?it/s]

Done: n=10, num_real=100


n=100 | Realisations:   0%|          | 0/100 [00:00<?, ?it/s]

Done: n=100, num_real=100


n=1000 | Realisations:   0%|          | 0/100 [00:00<?, ?it/s]

Done: n=1000, num_real=100


n=10000 | Realisations:   0%|          | 0/100 [00:00<?, ?it/s]

Done: n=10000, num_real=100


Sample Sizes:   0%|          | 0/5 [00:00<?, ?it/s]

n=1 | Realisations:   0%|          | 0/1000 [00:00<?, ?it/s]

Done: n=1, num_real=1000


n=10 | Realisations:   0%|          | 0/1000 [00:00<?, ?it/s]

Done: n=10, num_real=1000


n=100 | Realisations:   0%|          | 0/1000 [00:00<?, ?it/s]

Done: n=100, num_real=1000


n=1000 | Realisations:   0%|          | 0/1000 [00:00<?, ?it/s]

Done: n=1000, num_real=1000


n=10000 | Realisations:   0%|          | 0/1000 [00:00<?, ?it/s]

Done: n=10000, num_real=1000


Unnamed: 0,normal_10,normal_100,normal_1000,laplace_10,laplace_100,laplace_1000,cauchy_10,cauchy_100,cauchy_1000
1,9.930 ± 0.816,9.932 ± 0.945,10.008 ± 0.980,10.438 ± 0.423,10.038 ± 1.073,9.956 ± 1.096,9.930 ± 82.050,9.932 ± 14.156,10.008 ± 80.940
10,10.073 ± 0.349,9.928 ± 0.314,10.000 ± 0.308,9.905 ± 0.307,9.932 ± 0.267,10.001 ± 0.263,10.073 ± 1.049,9.928 ± 1654.616,10.000 ± 1163.359
100,10.054 ± 0.105,9.982 ± 0.097,10.001 ± 0.100,9.977 ± 0.082,10.009 ± 0.077,10.002 ± 0.073,10.054 ± 0.207,9.982 ± 0.255,10.001 ± 0.261
1000,10.018 ± 0.035,10.005 ± 0.032,10.000 ± 0.032,10.008 ± 0.017,10.001 ± 0.026,10.000 ± 0.023,10.018 ± 0.065,10.005 ± 0.092,10.000 ± 0.085
10000,10.002 ± 0.008,10.001 ± 0.008,10.000 ± 0.010,9.999 ± 0.007,10.001 ± 0.007,10.000 ± 0.007,10.002 ± 0.038,10.001 ± 0.025,10.000 ± 0.026


In [None]:
plot_mean_var(stats_10)

In [None]:
plot_cdf_pdf(n, estimates_1, stats_1)