In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
import seaborn as sns
from kneed import KneeLocator
import numpy as np
import matplotlib.pyplot as plt
from kneed import KneeLocator
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
# Load the raw pipeline log; the path is relative to the notebook's working directory.
df_raw = pd.read_parquet(
    "../data/pipeline_log_1.parquet",
    engine="pyarrow",
)

In [None]:
# Display the raw frame as the cell output for a quick visual check.
df_raw

In [None]:
# Distribution of group sizes: for each row count ("no_row"), how many
# question_ids have exactly that many rows, most frequent first.
(
    df_raw
    .groupby('question_id')
    .size()
    .reset_index(name='no_row')
    .groupby('no_row')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

In [None]:
# One row per question_id with the number of rows logged for it ("no_row").
df_no_row = (
    df_raw
    .groupby('question_id')
    .size()
    .reset_index(name="no_row")
)

In [None]:
# Questions that have fewer than 150 logged rows.
df_no_row.loc[df_no_row["no_row"].lt(150)]

In [None]:

def plot_score_distribution(scores):
    """Show two diagnostic figures for a collection of scores.

    The first figure is a 10-bin histogram with a KDE overlay. The second
    plots the scores sorted from highest to lowest against their rank.
    Each figure is rendered with ``plt.show()``; nothing is returned.

    Args:
        scores: Iterable of numeric scores.
    """
    # Figure 1: histogram with a KDE curve on top.
    _, hist_ax = plt.subplots(figsize=(10,5))
    sns.histplot(scores, bins=10, kde=True, color='steelblue', ax=hist_ax)
    hist_ax.set_title("Score Distribution (Histogram + KDE)")
    hist_ax.set_xlabel("Cosine similarity score")
    hist_ax.set_ylabel("Count")
    hist_ax.grid(True)
    plt.show()

    # Figure 2: scores in descending order, one marker per rank.
    descending = sorted(scores, reverse=True)
    _, curve_ax = plt.subplots(figsize=(10,5))
    curve_ax.plot(descending, marker='.')
    curve_ax.set_title("Score Curve (sorted)")
    curve_ax.set_xlabel("Rank")
    curve_ax.set_ylabel("Score")
    curve_ax.grid(True)
    plt.show()


In [None]:

def continuous_entropy(scores, num_samples=10_000):
    """Estimate the differential entropy (in nats) of ``scores`` with a Gaussian KDE.

    A ``gaussian_kde`` is fitted to the scores and evaluated on an evenly
    spaced grid spanning ``[min(scores), max(scores)]``. The integrand
    ``p(x) * log p(x)`` is then integrated over that grid with the
    trapezoidal rule and negated. Density that the KDE places outside the
    observed range is not part of the integral.

    Args:
        scores: 1-D array-like of numeric scores.
        num_samples: Number of grid points used for the numerical integration.

    Returns:
        The entropy estimate as a NumPy float.
    """
    scores = np.asarray(scores)
    kde = gaussian_kde(scores)
    xs = np.linspace(scores.min(), scores.max(), num_samples)
    # Clip so that log() never sees an exact zero density.
    px = np.clip(kde(xs), 1e-12, None)
    # NumPy 2.0 renamed trapz -> trapezoid and deprecated the old name.
    # Prefer the new one and only touch np.trapz on older NumPy versions.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    return -integrate(px * np.log(px), xs)


def histogram_entropy(scores, bins=10):
    """Shannon entropy (in bits) of the histogram of ``scores``.

    Bin probabilities are taken from the raw bin counts. The previous
    version renormalised ``density=True`` values, which divides by the bin
    width and therefore only matches the probability mass when all bins
    have equal width; passing explicit, unequal bin edges gave wrong results.

    Args:
        scores: 1-D array-like of numeric scores.
        bins: Anything accepted by ``np.histogram`` (bin count or bin edges).

    Returns:
        The entropy in bits; 0 when at most one bin is occupied.
    """
    scores = np.asarray(scores)
    counts, _ = np.histogram(scores, bins=bins)
    # Empty bins contribute nothing (0 * log 0 := 0), so drop them up front.
    occupied = counts[counts > 0]
    p = occupied / counts.sum()
    return -np.sum(p * np.log2(p))


def renyi_entropy(scores, alpha=2, bins=10):
    """Rényi entropy (in bits) of order ``alpha`` of the histogram of ``scores``.

    Computes ``log2(sum(p ** alpha)) / (1 - alpha)`` over the occupied bins.
    Bin probabilities come from raw counts, so explicit bin edges of unequal
    width are handled correctly. ``alpha == 1`` is the limiting case where
    the formula divides by zero; the Shannon entropy (its limit) is returned
    instead of raising ``ZeroDivisionError``.

    Args:
        scores: 1-D array-like of numeric scores.
        alpha: Order of the Rényi entropy.
        bins: Anything accepted by ``np.histogram`` (bin count or bin edges).

    Returns:
        The entropy in bits.
    """
    scores = np.asarray(scores)

    counts, _ = np.histogram(scores, bins=bins)

    # Empty bins are dropped; they contribute nothing to either formula.
    occupied = counts[counts > 0]
    p = occupied / counts.sum()

    if alpha == 1:
        # Limit alpha -> 1 of the Rényi entropy is the Shannon entropy.
        return -np.sum(p * np.log2(p))

    return (1 / (1 - alpha)) * np.log2(np.sum(p ** alpha))


def normalize_minmax(values):
    """Rescale ``values`` linearly so the minimum maps to 0 and the maximum to ~1.

    Args:
        values: Array-like of numbers.

    Returns:
        A NumPy array of the rescaled values.
    """
    arr = np.asarray(values)
    lowest = arr.min()
    spread = arr.max() - lowest
    # The tiny epsilon keeps the division defined when all values are equal.
    return (arr - lowest) / (spread + 1e-12)


def compute_entropy_confidence(scores):
    """Combine three entropy estimates of ``scores`` into one confidence value.

    The continuous (KDE), histogram and Rényi entropies are computed,
    min-max scaled against each other, and the confidence is one minus the
    mean of the scaled values.

    NOTE(review): the min-max scaling runs across the three estimators of
    the *same* sample, so the smallest estimate always maps to ~0 and the
    largest to ~1; the confidence then depends only on where the middle
    estimate falls. Confirm this is the intended definition.

    Args:
        scores: 1-D array-like of numeric scores.

    Returns:
        A dict with the raw entropies, their scaled counterparts
        (``*_norm``) and ``confidence_score``.
    """
    continuous = continuous_entropy(scores)
    histogram = histogram_entropy(scores)
    renyi = renyi_entropy(scores)

    scaled = normalize_minmax(np.array([continuous, histogram, renyi]))
    continuous_scaled, histogram_scaled, renyi_scaled = scaled[0], scaled[1], scaled[2]

    report = {}
    report["continuous_entropy"] = continuous
    report["histogram_entropy"] = histogram
    report["renyi_entropy"] = renyi
    report["continuous_entropy_norm"] = continuous_scaled
    report["histogram_entropy_norm"] = histogram_scaled
    report["renyi_entropy_norm"] = renyi_scaled
    report["confidence_score"] = 1 - scaled.mean()
    return report


def compute_steepness(scores):
    """Return the spread between the highest and the lowest score.

    Only the two extremes are needed, so they are read with ``max``/``min``
    in a single pass each instead of sorting the whole array.

    Args:
        scores: Non-empty 1-D array-like of numeric scores.

    Returns:
        ``max(scores) - min(scores)``.

    Raises:
        IndexError: If ``scores`` is empty (same exception type the
            sort-and-index implementation raised).
    """
    scores = np.asarray(scores)
    if scores.size == 0:
        raise IndexError("compute_steepness() requires at least one score")

    return scores.max() - scores.min()


def find_elbow(values):
    """Locate the elbow of the descending-sorted curve of ``values``.

    The values are sorted from highest to lowest and plotted against their
    rank. The elbow is the point with the largest perpendicular distance to
    the straight line joining the first and the last point of that curve.

    Args:
        values: Non-empty 1-D array-like of numbers.

    Returns:
        ``(elbow_idx, elbow_value)``: the rank of the elbow in the sorted
        curve and the value at that rank. A single value is its own elbow.

    Raises:
        IndexError: If ``values`` is empty.
    """
    y = np.sort(np.asarray(values))[::-1]
    n = len(y)

    if n == 0:
        raise IndexError("find_elbow() requires at least one value")
    if n == 1:
        # The reference line degenerates to a point; avoid the 0/0 division.
        return 0, y[0]

    x = np.arange(n)

    # Direction of the reference line from the first to the last point.
    dx = x[-1] - x[0]
    dy = y[-1] - y[0]

    # Perpendicular distance of every point to that line via the 2-D cross
    # product |d x (p - p1)| / |d|, computed for all points at once.
    distances = np.abs(dx * (y - y[0]) - dy * (x - x[0])) / np.hypot(dx, dy)

    elbow_idx = distances.argmax()
    return elbow_idx, y[elbow_idx]


def find_knee_kneed(scores):
    """Locate the knee of the descending-sorted scores with ``kneed.KneeLocator``.

    Args:
        scores: 1-D collection of numeric scores. A pandas Series works as
            before; other array-likes are wrapped in a Series first so the
            function accepts the same inputs as the other elbow finders.

    Returns:
        ``(knee_idx, knee_value)``: the rank of the knee in the sorted curve
        and the score at that rank, or ``(None, None)`` when the locator
        reports no knee.
    """
    y = pd.Series(scores).sort_values(ascending=False).values
    x = range(len(y))

    kneedle = KneeLocator(
        x, y,
        curve='convex',
        direction='decreasing',
        interp_method='polynomial'
    )

    knee_idx = kneedle.knee
    if knee_idx is None:
        # Indexing with None (y[None]) would not fail; it would silently
        # return the whole array with an extra axis. Report "not found".
        return None, None

    return knee_idx, y[knee_idx]


def second_derivative_elbow(values):
    """Pick the elbow as the point of most negative discrete second derivative.

    The values are sorted from highest to lowest, ``np.gradient`` is applied
    twice, and the rank where the result is smallest is returned.

    Args:
        values: 1-D array-like of numbers.

    Returns:
        ``(elbow_idx, elbow_value)``: rank in the sorted curve and the value there.
    """
    ordered = np.sort(values)[::-1]
    curvature = np.gradient(np.gradient(ordered))
    sharpest = np.argmin(curvature)
    return sharpest, ordered[sharpest]


def simple_elbow(alpha=0.5, min_index=1, sigma_factor=0.2):
    """Build an elbow finder based on the largest position-weighted score gap.

    The returned function sorts the scores from highest to lowest, takes the
    gaps between consecutive scores and multiplies each gap by
    ``(1 - alpha) + alpha * gauss``, where ``gauss`` is a Gaussian bump
    centred on the middle gap with standard deviation
    ``number_of_gaps * sigma_factor``. The elbow is the position of the
    largest weighted gap among positions ``>= min_index``.

    Args:
        alpha: Blend between uniform weighting (0) and pure Gaussian
            weighting (1).
        min_index: Smallest gap position that may be chosen.
        sigma_factor: Width of the Gaussian as a fraction of the number of gaps.

    Returns:
        A function ``wrap(scores) -> (elbow_idx, elbow_value)``. The index
        refers to the descending-sorted scores and points at the score just
        before the chosen gap. With exactly two scores it returns
        ``(1, lower_score)``. With fewer than two scores there is no gap to
        inspect and it returns ``(None, None)``, the same "not found" signal
        that ``plot_score_with_elbow_kneed`` already handles.
    """
    def wrap(scores):
        scores = np.sort(np.asarray(scores))[::-1]
        n = len(scores)

        if n < 2:
            # No consecutive pair exists; argmax over zero gaps would raise
            # an opaque "empty sequence" ValueError.
            return None, None

        if n == 2:
            return 1, scores[1]

        # Drop between each score and the next one in the sorted order.
        diffs = scores[:-1] - scores[1:]
        k = len(diffs)

        positions = np.arange(k)
        mu = (k - 1) / 2
        sigma = k * sigma_factor

        gauss = np.exp(-((positions - mu) ** 2) / (2 * sigma ** 2))

        weights = (1 - alpha) + alpha * gauss

        combined = diffs * weights

        # Positions below min_index are excluded by forcing them to -inf.
        valid_range = positions >= min_index
        combined_masked = np.where(valid_range, combined, -np.inf)

        elbow_idx = np.argmax(combined_masked)
        elbow_value = scores[elbow_idx]

        return elbow_idx, elbow_value

    return wrap


def plot_score_with_elbow_kneed(scores, calc_elbow_func, title=None):
    """Plot the descending-sorted scores and mark the elbow found by ``calc_elbow_func``.

    Args:
        scores: 1-D collection of numeric scores; passed unchanged to
            ``calc_elbow_func``.
        calc_elbow_func: Callable returning ``(elbow_idx, elbow_value)``;
            an ``elbow_idx`` of ``None`` means no elbow was found.
        title: Optional figure title; a generic title is used when falsy.

    Returns:
        ``(fig, elbow_idx, elbow_value)``. The figure is neither shown nor
        closed here; that is left to the caller.
    """
    ranked = np.sort(np.asarray(scores))[::-1]
    ranks = np.arange(len(ranked))
    last_rank = len(ranked) - 1

    elbow_idx, elbow_value = calc_elbow_func(scores)

    fig, ax = plt.subplots(figsize=(10,6))
    ax.plot(ranks, ranked, marker='o', label='Scores (sorted)', color='steelblue')

    if elbow_idx is None:
        ax.text(0.5, 0.5, "Not found", transform=ax.transAxes)
    else:
        # Red dot on the elbow plus a vertical guide through its rank.
        ax.scatter(elbow_idx, elbow_value, color='red', s=120, label=f'Elbow @ {elbow_idx}')
        ax.axvline(elbow_idx, color='red', linestyle='--', alpha=0.6)

    # Straight line from the best to the worst score, for visual reference.
    ax.plot([0, last_rank], [ranked[0], ranked[-1]], '--', color='gray', label='Reference line')

    ax.set_title(title if title else "Score Curve with Elbow")
    ax.set_xlabel("Rank")
    ax.set_ylabel("Score")
    ax.grid(True)
    ax.legend()

    return fig, elbow_idx, elbow_value


def save_elbow_plots_to_pdf(df_raw, pdf_filename='elbow_plots.pdf', n_questions=100,
                            calc_elbow_func=None):
    """Write one elbow plot per question into a multi-page PDF.

    For every ``question_id`` in ``range(n_questions)`` the rows whose
    ``step`` is ``"Decision"`` are selected, their ``score`` column is
    plotted with ``plot_score_with_elbow_kneed`` and the figure is appended
    to the PDF. Questions without usable data are skipped, and a failure on
    one question is printed without stopping the export.

    Args:
        df_raw: DataFrame with ``question_id`` and ``step`` columns;
            ``question`` and ``score`` are used when present.
        pdf_filename: Path of the PDF to create.
        n_questions: Question ids ``0 .. n_questions - 1`` are processed.
        calc_elbow_func: Callable ``scores -> (elbow_idx, elbow_value)``;
            defaults to ``simple_elbow(alpha=0.5, min_index=1, sigma_factor=0.2)``.

    Returns:
        A list of ``(question_id, elbow_idx, elbow_value)`` tuples, one per
        page that was written.
    """
    if calc_elbow_func is None:
        calc_elbow_func = simple_elbow(alpha=0.5, min_index=1, sigma_factor=0.2)

    results = []
    with PdfPages(pdf_filename) as pdf:
        for i in range(n_questions):
            # Filter by question once; both the label and the Decision rows
            # are derived from this subset instead of rescanning df_raw.
            question_rows = df_raw[df_raw["question_id"] == i]
            df = question_rows[question_rows["step"] == "Decision"]
            try:
                question_text = question_rows['question'].iloc[0]
            except Exception:
                # Best effort: missing column or unknown id falls back to a generic label.
                question_text = f"question_id {i}"

            print(f"{i} - {question_text}")

            if df.empty or 'score' not in df:
                print(f"  skip {i}: no data")
                continue

            try:
                fig, elbow_idx, elbow_value = plot_score_with_elbow_kneed(
                    df['score'],
                    calc_elbow_func,
                    title=f"{i} - {question_text}"
                )
                try:
                    pdf.savefig(fig)
                finally:
                    # Release the figure even when saving the page fails.
                    plt.close(fig)
                results.append((i, elbow_idx, elbow_value))
            except Exception as e:
                print(f"  error for {i}: {e}")

    print(f"Saved {len(results)} plots to {pdf_filename}")
    return results


In [None]:
# Inline preview of the elbow plots for question ids 0..99.
# The elbow finder does not depend on the question, so build it once.
preview_elbow_func = simple_elbow(alpha=0.5, min_index=1, sigma_factor=0.3)

for i in range(100):
    question_rows = df_raw[df_raw["question_id"] == i]
    if question_rows.empty:
        # .iloc[0] would raise here and abort the whole loop.
        print(f"{i} - no rows for this question_id")
        continue

    label = f"{i} - {question_rows['question'].iloc[0]}"
    print(label)

    df = question_rows[question_rows["step"] == "Decision"]
    if df.empty:
        print("  skip: no Decision rows")
        continue

    try:
        fig = plot_score_with_elbow_kneed(df['score'], preview_elbow_func, title=label)[0]
    except Exception as e:
        print(e)
        continue

    # Render each figure next to its printed label and release it right away,
    # so the loop never keeps dozens of figures open at once.
    plt.show()
    plt.close(fig)

In [None]:
# Export elbow plots for question ids 0..199 using the Gaussian-weighted gap finder.
results = save_elbow_plots_to_pdf(
    df_raw,
    pdf_filename='elbow_gauss_alpha_0.7.pdf',
    n_questions=200,
    calc_elbow_func=simple_elbow(alpha=0.7, min_index=1, sigma_factor=0.2),
)

In [None]:
# Export elbow plots for question ids 0..199 using the second-derivative finder.
results = save_elbow_plots_to_pdf(
    df_raw,
    pdf_filename='second_derivative_elbow.pdf',
    n_questions=200,
    calc_elbow_func=second_derivative_elbow,
)

In [None]:
# Export elbow plots for question ids 0..199 using the kneed-based finder (find_knee_kneed).
results = save_elbow_plots_to_pdf(
    df_raw,
    pdf_filename='knee_kneed_elbow.pdf',
    n_questions=200,
    calc_elbow_func=find_knee_kneed,
)