In [1]:
from typing import List, Union

In [2]:
def arithmetic_mean(
    data: List[Union[int, float]]
) -> Union[int, float]:

    """
    Compute the arithmetic mean of a data set

    Args
    data: List[Union[int, float]]
        Values to be averaged

    Returns
    Union[int, float]
        Sum of the values divided by the number of values
    """

    total = sum(data)
    count = len(data)
    return total / count

In [3]:
def arithmetic_mean_with_frequency(
    data: List[Union[int, float]],
    freq_list: List[Union[int, float]]
) -> Union[int, float]:

    """
    Calculate the arithmetic mean of data given as (value, frequency) pairs

    Args
    data: List[Union[int, float]]
        Distinct values of the data set
    freq_list: List[Union[int, float]]
        Frequency of each value, in the same order as data

    Returns
    Union[int, float]
        Frequency-weighted sum of the values divided by the total frequency

    Raises
    ValueError
        If data and freq_list differ in size, or if any frequency is not positive
    """

    if len(data) != len(freq_list):
        raise ValueError("Data and frequency arrays must have the same size")

    if any(f <= 0 for f in freq_list):
        raise ValueError("Every frequency must be positive")

    # The number of observations is the total frequency, not the number of
    # distinct values, so that is what the weighted sum must be divided by.
    total_frequency = sum(freq_list)
    weighted_sum = sum(d * f for d, f in zip(data, freq_list))
    return weighted_sum / total_frequency

In [4]:
def weighted_average(
    data: List[Union[int, float]],
    weights: List[Union[int, float]]
) -> Union[int, float]:

    """
    Compute the weighted average of a data set

    Args
    data: List[Union[int, float]]
        Values to be averaged
    weights: List[Union[int, float]]
        Weight of each value, in the same order as data

    Notes:
        Every weight must be positive
        Data set and weight set must have the same size

    Returns
    Union[int, float]
        Sum of value * weight divided by the sum of the weights
    """

    if len(weights) != len(data):
        raise ValueError("Data and weights arrays must have the same size")

    for weight in weights:
        if weight <= 0:
            raise ValueError("Every weight must be positive")

    numerator = 0
    for value, weight in zip(data, weights):
        numerator += value * weight

    return numerator / sum(weights)

In [5]:
def median(
    data: List[Union[int, float]]
) -> Union[int, float]:

    """
    Compute the median of a data set

    Args
    data: List[Union[int, float]]
        Values of the data set (they do not need to be sorted)

    Returns
    Union[int, float]
        Middle value of the sorted data when its size is odd,
        mean of the two middle values when its size is even
    """

    ordered = sorted(data)
    middle, is_odd = divmod(len(ordered), 2)

    if is_odd:
        return ordered[middle]

    return (ordered[middle - 1] + ordered[middle]) / 2

In [6]:
def percentile(
    data: List[Union[int, float]],
    percent: Union[int, float]
) -> Union[int, float]:

    """
    Pick the value of the data set located at a given percentile

    Args
    data: List[float]
        Values of the data set (they do not need to be sorted)
    percent: float
        Percentile expressed as a fraction (0.25 for 25%);
        anything that is not below 1 selects the largest value

    Returns
    Union[int, float]
        Element of the sorted data at index int(len(data) * percent),
        i.e. the value that has 'percent' of the data set below it
    """

    ordered = sorted(data)

    if percent < 1:
        position = int(len(ordered) * percent)
    else:
        # 100% (or more) always maps to the last element
        position = -1

    return ordered[position]

In [7]:
def amplitude(
    data: List[Union[int, float]]
) -> Union[int, float]:

    """
    Compute the amplitude (range) of a data set

    Args
    data: List[Union[int, float]]
        Values of the data set

    Returns
    Union[int, float]
        Difference between the largest and the smallest value
    """

    highest = max(data)
    lowest = min(data)
    return highest - lowest

In [None]:
def segmentation(
    data: List[Union[int, float]],
    n_segments: int
) -> dict:

    """
    Split the sorted data set into consecutive segments (classes)

    The sorted data is cut into n_segments chunks of len(data) / n_segments
    values each. When the value that closes a chunk is repeated further on,
    every repetition is kept in the same segment, so a segment may grow and
    a following one may shrink or disappear.

    Args
    data: List[Union[int, float]]
        Set of data
    n_segments: int
        Number of classes/segments to split the data set

    Returns
    dict
        Data Segmented = {"lower limit:upper limit": [values of the class], ...}
        Both limits belong to the class and the classes never overlap

    Raises
    ValueError
        If n_segments is not positive or does not divide the data set size
    """

    if n_segments <= 0:
        raise ValueError("The number of segments must be a positive integer")

    n = len(data)

    if n % n_segments != 0:
        raise ValueError("The number of segments must be a perfect divisor of the data set")

    sorted_data = sorted(data)
    segment_size = n // n_segments

    segments = {}
    start = 0
    for i in range(1, n_segments + 1):
        end = i * segment_size  # exclusive end of the i-th equal-sized chunk

        # A previous segment already absorbed this whole chunk through ties
        if end <= start:
            continue

        # Keep every repetition of the upper limit inside the same segment,
        # otherwise two segments would share a value and their keys could clash
        upper_limit = sorted_data[end - 1]
        while end < n and sorted_data[end] == upper_limit:
            end += 1

        segments[f'{sorted_data[start]}:{upper_limit}'] = sorted_data[start:end]
        start = end

    return segments

In [8]:
def median_with_segmented_data(
    data: List[Union[int, float]],
    n_segments: int
) -> Union[int, float]:

    """
    Estimate the data's median from its segmented (grouped) form

    The data set is split with segmentation(), the segment holding the
    middle value of the sorted data is located, and the median is
    interpolated linearly inside that segment:
        lower limit + (n / 2 - values below the segment) * amplitude / segment size

    Args
    data: List[Union[int, float]]
        Set of data
    n_segments: int
        Number of classes/segments to split the data set

    Returns
    Union[int, float]
        Estimated median, rounded to zero decimal places

    Raises
    ValueError
        If the data set is empty or no segment contains the middle value
    """

    n = len(data)
    if n == 0:
        raise ValueError("Data set must not be empty")

    # Value sitting in the middle of the sorted data; it identifies the median segment
    middle_value = sorted(data)[n // 2]

    segments = list(segmentation(data, n_segments).values())

    median_segment_idx = next(
        (i for i, segment in enumerate(segments) if middle_value in segment),
        None
    )
    if median_segment_idx is None:
        raise ValueError("No segment contains the middle value of the data set")

    median_segment = segments[median_segment_idx]
    inf_lim_median_segment = min(median_segment)

    # Number of values that fall in the segments before the median one
    n_below_ilms = sum(len(segment) for segment in segments[:median_segment_idx])
    n_in_median_segment = len(median_segment)
    median_segment_amplitude = amplitude(median_segment)

    return round(
        inf_lim_median_segment
        + (n / 2 - n_below_ilms) * median_segment_amplitude / n_in_median_segment,
        0
    )

In [9]:
def quartiles(
    data: List[Union[int, float]]
) -> dict:

    """
    Compute the three quartiles of a data set

    Q1 and Q3 are taken at rank (n + 1) * p of the sorted data, with linear
    interpolation between the two neighbouring values; Q2 is the median.

    Args
    data: List[Union[int, float]]
        Values of the data set (they do not need to be sorted)

    Returns
    dict: {'Q1': 25%, 'Q2': 50%, 'Q3': 75%}
        Data's quartiles
    """

    ordered = sorted(data)
    size = len(ordered)

    def value_at(
        fraction: Union[int, float]
    ) -> float:

        rank = (size + 1) * fraction
        whole = int(rank)

        # Ranks outside the data are clamped to the first / last value
        if whole < 1:
            return ordered[0]
        if whole >= size:
            return ordered[-1]

        lower = ordered[whole - 1]
        return lower + (rank - whole) * (ordered[whole] - lower)

    first_quartile = value_at(.25)
    second_quartile = median(data)
    third_quartile = value_at(.75)

    return {
        'Q1': first_quartile,
        'Q2': second_quartile,
        'Q3': third_quartile
    }

In [10]:
from collections import Counter

In [11]:
def mode(
        data: List[Union[int, float]],
        all_modes: bool = False
    ) -> Union[List[Union[int, float]], Union[int, float], None]:

    """
    Calculate data's mode

    Args
    data: List[Union[int, float]]
        Set of data
    all_modes: bool (False)
        If true, returns a list with all modes, if there's more than one
        If false, returns the mode (most frequent value); on a tie, the
        tied value that appears first in data

    Returns
    List[Union[int, float]] or Union[int, float] or None
        Most frequent values (Data's mode)
        For an empty data set: an empty list when all_modes is true, None otherwise
    """

    count = Counter(data)

    # max() raises on an empty sequence, so the empty case is handled up front
    if not count:
        return [] if all_modes else None

    max_frequency = max(count.values())
    modes = [value for value, freq in count.items() if freq == max_frequency]

    return modes if all_modes else modes[0]

In [12]:
def frequency(
        data: List[Union[int, float]]
    ) -> dict:

    """
    Calculate data's frequencies

    Args
    data: List[Union[int, float]]
        Set of data

    Returns
    dict: {value_i: count_i, ...}
        Number of occurrences of each distinct value, with the values
        in ascending order
    """

    # Counter keeps first-seen order; sort the pairs so the table comes out ordered by value
    return dict(sorted(Counter(data).items()))

In [13]:
def mean_absolute_deviation(
    data: List[Union[int, float]],
    inference: bool = False
) -> Union[int, float]:

    """
    Calculate data's mean absolute deviation

    Args
    data: List[Union[int, float]]
        Set of data
    inference: bool (False)
        If True, divide by len(data) - 1 (version used for inference)
        If False, divide by len(data) (traditional version)

    Returns
    Union[int, float]
        Data's mean absolute deviation
    """

    # Compute the mean once instead of once per element
    mean = arithmetic_mean(data)
    total_deviation = sum(abs(x - mean) for x in data)

    divisor = len(data) - 1 if inference else len(data)
    return total_deviation / divisor

In [14]:
def variance(
    data: List[Union[int, float]],
    inference: bool = False,
) -> Union[int, float]:

    """
    Calculate data's variance

    Args
    data: List[Union[int, float]]
        Set of data
    inference: bool (False)
        If True, divide by len(data) - 1 (version used for inference)
        If False, divide by len(data) (traditional version)

    Returns
    Union[int, float]
        Data's variance
    """

    # Compute the mean once instead of once per element
    mean = arithmetic_mean(data)
    squared_deviations = sum((x - mean) ** 2 for x in data)

    divisor = len(data) - 1 if inference else len(data)
    return squared_deviations / divisor

In [15]:
def variance_with_frequency(
    data: List[Union[int, float]],
    freq_list: List[Union[int, float]],
    inference: bool = False
) -> Union[int, float]:

    """
    Calculate the variance of data given as (value, frequency) pairs

    Args
    data: List[Union[int, float]]
        Distinct values of the data set
    freq_list: List[Union[int, float]]
        Frequency of each value, in the same order as data
    inference: bool (False)
        If True, divide by total frequency - 1 (version used for inference)
        If False, divide by the total frequency (traditional version)

    Returns
    Union[int, float]
        Data's variance

    Raises
    ValueError
        If data and freq_list differ in size, or if any frequency is not positive
    """

    if len(data) != len(freq_list):
        raise ValueError("Data and frequency arrays must have the same size")

    if any(f <= 0 for f in freq_list):
        raise ValueError("Every frequency must be positive")

    # The number of observations is the total frequency, not len(data)
    total_frequency = sum(freq_list)
    mean = sum(f * x for f, x in zip(freq_list, data)) / total_frequency

    # Summing squared deviations from the mean cannot go negative, unlike
    # the "mean of squares minus squared mean" shortcut
    squared_deviations = sum(f * (x - mean) ** 2 for f, x in zip(freq_list, data))

    divisor = total_frequency - 1 if inference else total_frequency
    return squared_deviations / divisor

In [16]:
import numpy as np

In [17]:
def standard_deviation(
    data: List[Union[int, float]],
    inference: bool = False
) -> Union[int, float]:

    """
    Compute the standard deviation of a data set

    Args
    data: List[Union[int, float]]
        Values of the data set
    inference: bool (False)
        Forwarded to variance(): True selects the version used for
        inference, False the traditional one

    Returns
    Union[int, float]
        Square root of the data's variance
    """

    spread = variance(data, inference)
    return np.sqrt(spread)

In [18]:
def variation_coefficient(
    data: List[Union[int, float]],
    inference: bool = False
) -> Union[int, float]:

    """
    Compute the coefficient of variation of a data set

    Args
    data: List[Union[int, float]]
        Values of the data set
    inference: bool (False)
        Forwarded to standard_deviation(): True selects the version used
        for inference, False the traditional one

    Returns
    Union[int, float]
        Standard deviation divided by the arithmetic mean
    """

    deviation = standard_deviation(data, inference)
    center = arithmetic_mean(data)
    return deviation / center

In [19]:
def Tchebichev_theorem(
    data: List[Union[int, float]],
    k: Union[int, float],
    inference: bool = False
) -> bool:

    """
    Check the Tchebichev theorem on a data set: the proportion of values
    lying within k standard deviations of the arithmetic mean must be
    at least 1 - 1/k**2.

    Args
    data: List[Union[int, float]]
        Set of data
    k: Union[int, float]
        Standard deviation multiple considered to contain the data
    inference: bool (False)
        If True, consider the calculation version used for inference
        If False, consider the traditional calculation version

    Returns
    bool
        True when the observed proportion reaches the 1 - 1/k**2 bound,
        False otherwise

    Raises
    ValueError
        If k is not positive
    """

    if k <= 0:
        raise ValueError("k must be positive")

    mean = arithmetic_mean(data)
    std_dev = standard_deviation(data, inference)

    k_std_dev = k * std_dev

    inf_limit = mean - k_std_dev
    sup_limit = mean + k_std_dev

    # Values strictly inside the (mean - k*std, mean + k*std) interval
    n_inside = sum(1 for d in data if sup_limit > d > inf_limit)
    percent = n_inside / len(data)

    # Lower bound guaranteed by the theorem
    limit_range = 1 - 1 / (k ** 2)

    # Return the verdict instead of asserting it: asserts are stripped under -O
    return bool(percent >= limit_range)