In [58]:
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import math
import csv
import pandas as pd

In [59]:
def case_generator():
    """Yield a ``(sample_generator, name)`` pair for every studied distribution.

    ``sample_generator(size)`` draws ``size`` variates from the distribution
    and returns them as a list of values rounded to 4 decimal places.

    Each sampler is bound to its own distribution, so the yielded callables
    stay valid after the generator has advanced (e.g. when the pairs are
    collected with ``list(case_generator())``).

    Yields:
        tuple: ``(sample_generator, name)`` where ``name`` is the display
        name of the distribution.
    """
    # (scipy distribution, positional arguments forwarded to ``rvs``, name).
    # NOTE(review): the Laplace and Uniform parameters look chosen to give
    # unit variance -- confirm against the assignment text.
    cases = (
        (stats.norm, (0, 1), 'Normal'),
        (stats.cauchy, (0, 1), 'Cauchy'),
        (stats.laplace, (0, 1/math.sqrt(2)), 'Laplace'),
        (stats.uniform, (-math.sqrt(3), 2*math.sqrt(3)), 'Uniform'),
        (stats.poisson, (10, 0), 'Poisson')
    )

    def make_sampler(distribution, params):
        # A factory gives every sampler its own closure cell; a lambda created
        # directly in the loop would look up the loop variable at call time
        # and therefore always see the most recent case.
        def sample_generator(size):
            return np.round(distribution.rvs(*params, size=size), 4).tolist()

        return sample_generator

    for distribution, params, name in cases:
        yield make_sampler(distribution, params), name

In [60]:
def draw_boxplot(sample, name, labels=("n=20", "n=100"), zoom_limits=(-10, 10)):
    """Draw, save and show a boxplot of ``sample`` twice: full range and zoomed.

    The full-range figure is written to ``<name>.png`` and the zoomed one to
    ``<name>_zoomed.png``.

    Args:
        sample: Data passed straight to ``boxplot``; with the default labels
            it must contain two data sets (one per label).
        name: Output path without the ``.png`` extension.
        labels: Tick labels, one per data set.
        zoom_limits: ``(low, high)`` limits of the data axis in the zoomed
            figure.
    """
    for suffix, limits in (('', None), ('_zoomed', zoom_limits)):
        fig, ax = plt.subplots()
        ax.boxplot(sample, sym="o", labels=list(labels), vert=True)
        if limits is not None:
            # The boxes are vertical, so the data lives on the y-axis; limiting
            # the x-axis would only squeeze the boxes without zooming the data.
            ax.set_ylim(limits)
        fig.savefig(name + suffix + '.png')
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


In [61]:
def borders(sample, whisker=1.5):
    """Return the lower and upper outlier fences of ``sample``.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` where
    ``Q1``/``Q3`` are the 0.25/0.75 quantiles computed by ``np.quantile`` and
    ``IQR = Q3 - Q1``.

    Args:
        sample: Array-like of numbers.
        whisker: Multiplier applied to the interquartile range; the default
            of 1.5 matches the original hard-coded ``3 / 2``.

    Returns:
        tuple: ``(lower_fence, upper_fence)``.
    """
    # One call yields both quartiles instead of recomputing each of them twice.
    lower_quartile, upper_quartile = np.quantile(sample, [0.25, 0.75])
    reach = whisker * (upper_quartile - lower_quartile)

    return lower_quartile - reach, upper_quartile + reach


def find_outliers(sample):
    """Collect the elements of ``sample`` lying strictly outside its fences.

    The fences come from ``borders(sample)``; an element is kept when it is
    smaller than the lower fence or greater than the upper one.

    Returns:
        list: The outlying elements, in their original order.
    """
    low, high = borders(sample)

    return [value for value in sample if value < low or value > high]


def count_outliers_proportion(sample):
    """Return the fraction of ``sample`` elements reported by ``find_outliers``.

    Raises:
        ZeroDivisionError: If ``sample`` has length zero and ``find_outliers``
            returns without raising.
    """
    outlier_count = len(find_outliers(sample))
    total = len(sample)

    return outlier_count / total

In [64]:
# Estimate the mean outlier ratio for every distribution and sample size,
# then store the table in ``outliers_t.csv``.
# NOTE(review): no random seed is set in the visible cells, so the values
# written to the CSV change from run to run -- consider seeding.
repetitions = 10  # number of samples drawn per (distribution, size) pair
sizes = [10000]   # sample sizes to evaluate

res = dict()
for get_sample, case_name in case_generator():
    for size in sizes:
        test_name = case_name + ' n=' + str(size)
        proportions = [
            count_outliers_proportion(get_sample(size))
            for _ in range(repetitions)
        ]
        # One-element list per key: each key becomes one row of the table.
        res[test_name] = [np.mean(proportions).round(2)]

# ``orient='index'`` turns the dict keys into rows directly, replacing the
# earlier build-by-columns, transpose and column-rename sequence.
f = pd.DataFrame.from_dict(res, orient='index', columns=['Outliers ratio'])
f.index.name = 'Sample'
f.to_csv('outliers_t.csv')