In [None]:
from abc import ABC, abstractmethod
from scipy.stats import beta

class ConfidenceInterval(ABC):
    """Abstract base for callable confidence-interval estimators.

    A concrete subclass passes the function that computes the interval to
    ``__init__``; calling the instance then forwards to that function.
    """

    @abstractmethod
    def __init__(self, func):
        """Store the callable that computes the interval.

        Declared abstract so that ``ConfidenceInterval`` itself cannot be
        instantiated; subclasses override ``__init__`` and call
        ``super().__init__(func)``.

        Args:
            func: Callable invoked by ``__call__``.
        """
        self.func = func

    def __call__(self, *args, **kwargs):
        """Evaluate the stored function and return its result unchanged.

        Keyword arguments are forwarded as well as positional ones, so a
        caller may name the arguments instead of relying on their order.
        """
        return self.func(*args, **kwargs)


class CPInterval(ConfidenceInterval):
    """Clopper-Pearson confidence interval for a binomial proportion."""

    def __init__(self):
        # Register the bound method as the callable used by __call__.
        super().__init__(self.func)

    def func(self, alpha, n, m):
        """Return the two-sided interval ``(lower, upper)`` from Beta quantiles.

        Args:
            alpha: Total error probability; ``alpha/2`` is allotted to each tail.
            n: Number of trials observed so far.
            m: Number of successes among those trials.

        Returns:
            Tuple ``(lower, upper)``. ``lower`` is exactly 0.0 when ``m == 0``
            and ``upper`` is exactly 1.0 when ``m == n``.
        """
        # A Beta quantile with a shape parameter of 0 is undefined (scipy
        # returns NaN), which is what the general formulas hit at the two
        # boundary counts.  The interval is closed by 0 and 1 there, so those
        # values are returned directly instead of nudging the shape parameter.
        a = 0.0 if m == 0 else beta.ppf(alpha/2, m, n-m+1)
        b = 1.0 if m == n else beta.ppf(1-alpha/2, m+1, n-m)

        return a, b

In [None]:
import numpy as np

def h_absolute(gamma, epsilon):
    """Evaluate the piecewise rate term used when sizing the sample.

    The value is 4.5 divided by a product of two factors that are linear in
    ``gamma`` and ``epsilon``; which product is used depends on whether
    ``gamma`` lies below 0.5.
    """
    three_gamma = 3*gamma
    three_rest = 3*(1-gamma)

    if gamma < 0.5:
        product = (three_gamma + epsilon) * (three_rest - epsilon)
    else:
        product = (three_rest + epsilon) * (three_gamma + epsilon)

    return 4.5 / product

def sequential_massart_absolute(epsilon, delta, alpha, samples, interval=CPInterval()):

    lower_bounds, upper_bounds = [], []
    estimates = []

    M = np.ceil(1 / (2 * epsilon**2) * np.log(2 / delta))
    n = M
    m = 0

    a = 0
    b = 1

    for k, sample in enumerate(samples):
        k += 1
        m += sample
        a, b = interval(alpha, k, m)

        print(f'{k}: Interval: [{a}, {b}] - Samples: {n} - Estimate: {m/k}')
        lower_bounds.append(a)
        upper_bounds.append(b)
        estimates.append(m/k)

        if a <= 0.5 <= b:
            n = M
        elif b < 0.5:
            n = np.ceil(2 / (h_absolute(b, epsilon)*epsilon**2) * np.log(2 / (delta - alpha)))
        else:
            n = np.ceil(2 / (h_absolute(a, epsilon)*epsilon**2) * np.log(2 / (delta - alpha)))

        n = min(n, M)
        if n <= k:
            return m / k, True, lower_bounds, upper_bounds, estimates
    return m / k, False, lower_bounds, upper_bounds, estimates

In [None]:
import pandas as pd

# Model whose verified labels are compared with its evaluation output.
MODEL_NAME = 'llama3-8b'

verified_data = pd.read_csv(f'{MODEL_NAME}-verified.csv')

# Keep only as many evaluation rows (from the top of the file) as there are
# verified rows, and discard the "cr0" column.
predicted_data = (
    pd.read_csv(f'../{MODEL_NAME}/evaluation.csv')
    .iloc[:len(verified_data)]
    .drop(columns=["cr0"])
)

verified_data, predicted_data

In [None]:
# Reshape both tables to long form: one row per (id, criterion) pair.
verified_flattened = verified_data.melt(
    id_vars=['id'],
    var_name='criterion',
    value_name='verified_value',
)
predicted_flattened = predicted_data.melt(
    id_vars=['id'],
    var_name='criterion',
    value_name='predicted_value',
)

# Pair the two values on (id, criterion), then drop the key columns.
merged_df = (
    verified_flattened
    .merge(predicted_flattened, on=['id', 'criterion'])
    .drop(columns=['id', 'criterion'])
)

# 1 where the verified and predicted values are equal, 0 otherwise.
merged_df['comparison'] = merged_df['verified_value'].eq(merged_df['predicted_value']).astype(int)

merged_df, merged_df['comparison'].mean()

In [None]:
# Run the sequential procedure over the per-row agreement indicators.
result, success, lower_bounds, upper_bounds, estimates = sequential_massart_absolute(
    epsilon=0.05,
    delta=0.024,
    alpha=0.001,
    samples=merged_df['comparison'],
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Convert the per-sample history returned by the sequential run to arrays.
k = np.arange(1, len(estimates) + 1)  # x-axis values (1-based sample number)
lower_bounds = np.array(lower_bounds)
upper_bounds = np.array(upper_bounds)
estimates = np.array(estimates)

# Create the plot
plt.figure(figsize=(10, 6))

# NOTE: despite its name, this is the width of the confidence interval after
# each sample, not the epsilon tolerance passed to the estimator.
epsilon = upper_bounds - lower_bounds

# Running estimate with lower and upper bounds
plt.plot(k, estimates, label='Estimated Accuracy', color='darkturquoise', linewidth=2)
plt.plot(k, epsilon, label='Interval Width', color='coral', linewidth=2)
plt.fill_between(k, lower_bounds, upper_bounds, color='lightsalmon', alpha=0.4, label='Confidence Interval')

# Set logarithmic scale for x-axis with base 10
plt.xscale('log', base=10)

# Labels and limits
plt.xlabel('Samples (k)')
plt.ylim([0, 1.05])  # small headroom above 1 so curves at the top stay visible
plt.xlim([1, len(estimates)+10])  # starts at 1: the log axis cannot include 0
plt.legend(loc='lower left')
plt.grid(True, which="both", ls="--")

# Final values
final_estimate = estimates[-1]
final_epsilon = epsilon[-1]

# Plot dotted lines and text for final values
plt.axhline(y=final_estimate, color='darkturquoise', linestyle='--', linewidth=1)
plt.axhline(y=final_epsilon, color='coral', linestyle='--', linewidth=1)

# Position text near the y-axis ticks on the left side: x is in data
# coordinates, just below the lower x-limit of 1.
fontdict = {'size': 11, 'weight': 'bold'}
plt.text(0.93, final_estimate, f'{final_estimate:.2f}', color='darkturquoise', va='center', ha='right', fontdict=fontdict)
plt.text(0.94, final_epsilon, f'{final_epsilon:.3f}', color='coral', va='center', ha='right', fontdict=fontdict)

# Title.  Raw string: "\%" is not a valid Python escape sequence, so the
# non-raw literal triggered an invalid-escape warning; the text is unchanged.
plt.title(r'Sequential Massart Evaluation ($\epsilon=5\%$, $\delta=2.4\%$, $\alpha=0.1\%$)')
plt.savefig('sequential_massart.pdf', format='pdf', bbox_inches='tight', pad_inches=0)
plt.show()