In [1]:
import pandas as pd
import numpy as np
import math
from scipy.stats import norm

In [2]:
df = pd.read_csv("/home/lucasleao/bootstrapping-intervals/data/risk_factors_cervical_cancer.csv")

# Exemplo de uso:
data = df[df["Dx:Cancer"] == 1]["Age"]
num_samples = len(data)
num_iterations = 1000

Boostrapping Normal Interval Confidence

In [3]:
def bootstrapping_mean_std(data, num_samples, num_iterations=1000):
    """
    Perform bootstrapping on the given data to estimate the mean and standard deviation.

    Parameters:
    data (array-like): The input data for bootstrapping.
    num_samples (int): The number of samples to draw in each bootstrap iteration.
    num_iterations (int): The number of bootstrap iterations to perform.

    Returns:
    tuple: A tuple containing the mean and standard deviation of the bootstrap means.
    
    Purpose:
    This function aims to provide a robust estimate of the mean and its variability by performing
    bootstrapping. Bootstrapping is a statistical method that involves repeated random sampling
    with replacement from a dataset to approximate the distribution of a statistic.
    """
    np.random.seed(42)
    bootstrap_means = np.zeros(num_iterations)
    bootstrap_std_dev = np.zeros(num_iterations)

    for i in range(num_iterations):
        bootstrap_sample = np.random.choice(data, size=num_samples, replace=True)
        bootstrap_means[i] = np.mean(bootstrap_sample)
        bootstrap_std_dev[i] = np.std(bootstrap_sample)
    
    return bootstrap_means.mean(), bootstrap_std_dev.mean()

mean, std_dev = bootstrapping_mean_std(data, num_samples, num_iterations)

print(f"Estimated Mean: {mean}")
print(f"Estimated Standard Deviation: {std_dev}")

Estimated Mean: 33.306444444444445
Estimated Standard Deviation: 7.643030665955143


In [4]:
def interval_confidence(confidence_level, std_dev, mean, n):
    """
    Calculates the margin of error and confidence interval for the mean of a sample.

    Parameters:
    - confidence_level (float): Desired confidence level (e.g., 0.95 for 95%).
    - std_dev (float): Standard deviation of the sample.
    - mean (float): Mean of the sample.
    - n (int): Sample size (number of observations).

    Returns:
    - float: The margin of error.
    - list: List containing the lower and upper bounds of the confidence interval rounded to two decimal places.
    """
    def se(std_dev, n):
        """
        Calculates the standard error of the mean.

        Parameters:
        - std_dev (float): Standard deviation of the sample.
        - n (int): Sample size (number of observations).

        Returns:
        - float: Standard error of the mean.
        """
        return std_dev / math.sqrt(n)

    # Calculates the z-score corresponding to the desired confidence level.
    z_score = norm.ppf((1 + confidence_level) / 2)
    se_value = se(std_dev, n) 
    margin_error = z_score * se_value

    return margin_error, [round(mean - margin_error, 2), round(mean + margin_error, 2)]

In [5]:
interval_confidence(0.95, std_dev, mean, len(data))

(3.5308351431897, [29.78, 36.84])

In [6]:
def optimal_sample(confidence_level, std_dev, std_error):
    """
    Calculates the optimal sample size needed to achieve a desired standard error for a given confidence level and standard deviation.

    Parameters:
    - confidence_level (float): Desired confidence level (e.g., 0.95 for 95%).
    - std_dev (float): Standard deviation of the population.
    - std_error (float): Desired standard error of the mean.

    Returns:
    - int: The optimal sample size, rounded to the nearest integer.
    """
    z_score = norm.ppf((1 + confidence_level) / 2)  # Calculates the z-score corresponding to the desired confidence level.
    return round((z_score ** 2) * (std_dev ** 2) / (std_error ** 2))  # Calculates and returns the optimal sample size.

In [7]:
optimal_sample(0.90, std_dev, 1)

158

Boostrapping Percentile Interval Confidence

In [8]:
def bootstraping_percentile_confidence_interval(data, confidence_level, num_iterations=1000):
    """
    Calculates the percentile bootstrap confidence interval for the mean of the given data.

    Parameters:
    - data (list or numpy array): Sample data.
    - confidence_level (float): Desired confidence level (e.g., 0.95 for 95%).
    - num_resamples (int): Number of bootstrap resamples to perform. Default is 1000.

    Returns:
    - list: List containing the lower and upper bounds of the confidence interval rounded to two decimal places.
    """
    np.random.seed(42)
    sample_means = np.zeros(num_iterations)
    
    for i in range(num_iterations):
        resample = np.random.choice(data, size=len(data), replace=True)
        sample_means[i] = np.mean(resample)

    lower_bound = np.percentile(sample_means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(sample_means, (1 + confidence_level) / 2 * 100)
    
    return [round(lower_bound, 2), round(upper_bound, 2)]

In [None]:
bootstraping_percentile_confidence_interval(data, 0.95)

[29.83, 37.17]