In [1]:
import stats_functions as sf
import pandas as pd

In [2]:
# Read alc_sales.csv; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/alc_sales.csv")

# Pull out the second column as a Series (append `.values` to this expression
# when a raw NumPy array is wanted instead), then display it.
arr = data[data.columns[1]]
arr

0      1713
1      1763
2      1753
3      1784
4      1783
       ... 
355    5854
356    5839
357    5846
358    5936
359       .
Name: MRTSSM4453USS_20220114, Length: 360, dtype: object

In [6]:
import numpy as np
import pandas as pd
import scipy.stats as st

import re
import warnings

def clean_to_numeric_array(data):
    """
    Convert an iterable of messy values into a 1-D NumPy array of floats.

    Each element is cleaned independently:
      * missing scalars (None, NaN, pd.NA, ...) become NaN;
      * strings are stripped of whitespace and thousands separators and parsed
        as written when possible, so scientific notation such as "1e3" is
        read as 1000.0;
      * strings that do not parse as written have every character other than
        digits, '.' and '-' removed (currency symbols, percent signs, ...)
        and are parsed again;
      * anything that still cannot be converted (empty strings, "abc", nested
        lists, arbitrary objects, ...) becomes NaN instead of raising.

    NaN entries are kept in place so the output has the same length and order
    as the input; callers that need complete data must filter them out.

    Args:
        data (list, Series, set, tuple, ndarray, generator): Any iterable of
            values to clean.

    Returns:
        np.ndarray: Array of dtype float with one entry per input element.

    Raises:
        ValueError: If ``data`` is not iterable.
    """
    # Every supported container is iterable, so a single check covers them all.
    if not hasattr(data, '__iter__'):
        raise ValueError("Unsupported data type. Provide list, Series, set, or generator.")
    values = list(data)

    def clean_value(val):
        """Return ``val`` as a float, or NaN when it cannot be interpreted."""
        # pd.isna returns an array for list-like elements; only a scalar True
        # means "this single value is missing".
        missing = pd.isna(val)
        if isinstance(missing, (bool, np.bool_)) and missing:
            return np.nan

        if isinstance(val, str):
            # Trim whitespace and drop thousands separators ("1,000" -> "1000").
            text = val.strip().replace(',', '')

            # Parse the text as written first so exponents survive; stripping
            # characters up front would turn "1e3" into "13".
            try:
                parsed = float(text)
            except ValueError:
                parsed = np.nan
            if np.isfinite(parsed):
                return parsed

            # Fall back to removing currency symbols, percent signs and any
            # other character that is not a digit, '.' or '-'.
            text = re.sub(r'[^\d\.\-]', '', text)
            if text == '':
                return np.nan
            val = text

        try:
            return float(val)
        except (TypeError, ValueError, OverflowError):
            # TypeError: non-numeric objects; ValueError: leftovers such as
            # "." or "1-2"; OverflowError: integers too large for a float.
            return np.nan

    cleaned_data = np.array([clean_value(item) for item in values], dtype=float)

    # Optional: filter out NaNs if desired (uncomment below to enable)
    # cleaned_data = cleaned_data[~np.isnan(cleaned_data)]

    return cleaned_data
    
    
def confidence_interval(data, confidence=0.95, pop_std=None):
    """
    Calculates the confidence interval for the population mean.

    The input is cleaned with ``clean_to_numeric_array`` and entries that are
    missing or cannot be parsed as numbers are ignored (a warning reports how
    many were dropped). Without this, one bad entry would make the mean, the
    standard error and therefore both bounds NaN.

    - Uses the t-distribution for n <= 30.
    - Uses the normal distribution for n > 30.
    - If the population standard deviation is provided, the normal
      distribution is used and the standard error of the mean is computed
      from it as ``pop_std / sqrt(n)``.

    Here ``n`` is the number of usable (non-NaN) observations.

    Args:
        data (array-like): Data sample.
        confidence (float): Level of confidence (e.g., 0.95 for 95% confidence).
        pop_std (float, optional): Population standard deviation. ``None``
            (the default) or 0 means "not provided".

    Returns:
        tuple: Lower and upper bounds of the confidence interval. Both bounds
        are NaN when there are no usable observations, or when there is only
        one and ``pop_std`` is not provided (no sample standard error exists).
    """
    sample = clean_to_numeric_array(data)

    # The cleaner marks bad entries as NaN but keeps them; drop them here so
    # they do not poison the mean or inflate the observation count.
    usable = sample[~np.isnan(sample)]
    n_dropped = sample.size - usable.size
    if n_dropped:
        warnings.warn(
            f"confidence_interval: ignoring {n_dropped} missing or non-numeric value(s).",
            stacklevel=2,
        )

    n = usable.size
    if n == 0:
        return (np.nan, np.nan)

    mean = np.mean(usable)

    # Known population standard deviation: normal distribution with the exact
    # standard error, regardless of sample size.
    if pop_std:
        sem = pop_std / np.sqrt(n)
        return st.norm.interval(confidence, loc=mean, scale=sem)

    # A sample standard error needs at least two observations.
    if n < 2:
        return (np.nan, np.nan)

    sem = st.sem(usable)

    # Small samples use the t-distribution; larger ones the normal distribution.
    if n <= 30:
        return st.t.interval(confidence, df=n - 1, loc=mean, scale=sem)
    return st.norm.interval(confidence, loc=mean, scale=sem)


In [8]:
# Call the packaged implementation from stats_functions (imported as `sf`) on
# the raw, uncleaned column.
# NOTE(review): 0.95 is presumably the confidence level, matching the second
# parameter of the notebook-local definition; confirm against sf's signature.
# NOTE(review): the (nan, nan) result is presumably caused by the unparseable
# "." entry in the last row propagating NaN through the mean; verify in sf.
sf.confidence_interval(arr, 0.95)

(np.float64(nan), np.float64(nan))