# Data Import

In [None]:
# Import libraries
import os
import pandas as pd

In [None]:
# Run date (MM-DD-YYYY); it names the data, graph and analysis sub-folders
date = "04-17-2025"
# Loaded DataFrames, keyed by the name of each sub-folder of the data directory
data_dict = {}

In [None]:
# Root folders for this run. Each keeps its trailing "/" because file paths
# are built from these strings by plain concatenation.
graph_str       = "../graphs/" + date + "/"
analysis_str    = "../analysis/" + date + "/"
directory_str   = "../data/" + date + "/"

# os.listdir returns entries in arbitrary order; sort so the position of a
# file inside its chip's list (used to number the graphs) is reproducible.
chip_entries = sorted(os.listdir(directory_str))

# The analysis folder does not depend on the chip, so create it once.
os.makedirs(analysis_str, exist_ok=True)

for chip_name in chip_entries:
    tmp_directory_str = directory_str + chip_name + "/"

    # Skip stray files (e.g. .DS_Store) sitting next to the chip folders
    if not os.path.isdir(tmp_directory_str):
        continue

    tmp_graph_str = graph_str + chip_name + "/"
    os.makedirs(tmp_graph_str, exist_ok=True)

    # Read every regular, non-hidden file of the chip folder as a ";"-separated table
    data_dict[chip_name] = [
        pd.read_csv(tmp_directory_str + file_name, delimiter=";")
        for file_name in sorted(os.listdir(tmp_directory_str))
        if not file_name.startswith(".")
        and os.path.isfile(tmp_directory_str + file_name)
    ]

# Data Cleaning

In [None]:
# Column used as the x variable of every fit and plot
X_COL = "Flow EZ #2 (12806)"
# Column used as the y variable of every fit and plot
Y_COL = "Flow Unit #1 [Flow EZ #1 (11411)]"

# Per-key lists of DataFrames reduced to the X_COL and Y_COL columns
filtered_dict = {}

In [None]:
# Keep only the two columns of interest. Each selection is copied explicitly
# because the fit functions later add columns to these frames in place, and
# writing to a bare column selection triggers pandas' chained-assignment warning.
for key, frames in data_dict.items():
    filtered_dict[key] = [df[[X_COL, Y_COL]].copy() for df in frames]

# Functions

In [None]:
import numpy as np
import scipy.stats as stats

from typing import Tuple
from scipy.optimize import curve_fit

In [None]:
def find_log_fit(df: pd.DataFrame) -> str:
    """
    Fit y = a * ln(x) + b to the X_COL / Y_COL columns of ``df`` using scipy.

    The fitted values are stored in a new ``'log_fit'`` column of ``df``
    (the frame is modified in place). Rows whose x is not strictly positive
    or whose y is missing are left out of the fit, because ln(x) is undefined
    there; those rows get NaN in ``'log_fit'``.

    Returns the annotation text: the fitted equation followed by its R²
    (computed over the rows used for the fit), or "Logarithmic fit failed"
    when no fit could be computed.
    """
    failure_msg = "Logarithmic fit failed"

    x = df[X_COL]
    y = df[Y_COL]

    # Boolean mask of the rows on which the log model is defined
    usable = ((x > 0) & y.notna()).to_numpy()
    x_fit = x[usable]
    y_fit = y[usable]

    # A regression line needs at least two points with distinct x values
    if len(x_fit) < 2 or x_fit.nunique() < 2:
        df['log_fit'] = np.nan
        return failure_msg

    log_x = np.log(x_fit)
    regression = stats.linregress(log_x, y_fit)
    a_log, b_log = regression.slope, regression.intercept

    if np.isnan(a_log):
        df['log_fit'] = np.nan
        return failure_msg

    y_pred_log = a_log * log_x + b_log

    # Write predictions back positionally so excluded rows stay NaN
    fit_values = np.full(len(df), np.nan)
    fit_values[usable] = y_pred_log.to_numpy()
    df['log_fit'] = fit_values

    # Coefficient of determination on the fitted rows
    ss_res_log = np.sum((y_fit - y_pred_log) ** 2)
    ss_tot_log = np.sum((y_fit - np.mean(y_fit)) ** 2)
    r2_log = 1 - (ss_res_log / ss_tot_log)

    return (
        f"Log: y = {a_log:.3f} ln(x) + {b_log:.3f}\n"
        f"R² = {r2_log:.3f}"
    )

In [None]:
def find_logistic_fit(df: pd.DataFrame) -> str:
    """
    Fit a logistic curve y = L / (1 + e^(-k (x - x0))) to the X_COL / Y_COL
    columns of ``df`` using scipy's ``curve_fit``.

    The fitted values are stored in a new ``'logistic_fit'`` column of ``df``
    (the frame is modified in place); the column is all NaN when the fit fails.

    Returns the annotation text: the fitted equation followed by its R², or
    "Logistic fit failed" when no fit could be computed.
    """

    # Logistic function definition
    def logistic(x, L, k, x0):
        return L / (1 + np.exp(-k * (x - x0)))

    failure_msg = "Logistic fit failed"

    # Initial parameter guess: [max y, slope, midpoint]
    initial_guess = [df[Y_COL].max(), 1, df[X_COL].median()]

    # The default least-squares solver cannot fit more parameters than
    # there are observations, so bail out early instead of crashing.
    if len(df) < len(initial_guess):
        df['logistic_fit'] = np.nan
        return failure_msg

    # Only the optimiser call is guarded: RuntimeError means it did not
    # converge, ValueError means the data contained NaN or infinite values.
    try:
        params, _ = curve_fit(
            logistic, df[X_COL], df[Y_COL], p0=initial_guess, maxfev=10000
        )
    except (RuntimeError, ValueError):
        df['logistic_fit'] = np.nan
        return failure_msg

    L, k, x0 = params
    df['logistic_fit'] = logistic(df[X_COL], L, k, x0)

    # Calculate R²
    y_true = df[Y_COL]
    y_pred = df['logistic_fit']
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    r2_logistic = 1 - (ss_res / ss_tot)

    return (
        f"Logistic: y = {L:.2f} / (1 + e^(-{k:.2f}(x - {x0:.2f})))\n"
        f"R² = {r2_logistic:.3f}"
    )

In [None]:
def find_lin_fit(df: pd.DataFrame) -> str:
    """
    Fit y = a * x + b to the X_COL / Y_COL columns of ``df`` using scipy.

    The fitted values are stored in a new ``'lin_fit'`` column of ``df``
    (the frame is modified in place).

    Returns the annotation text: the fitted equation followed by its R².
    """

    regression = stats.linregress(df[X_COL], df[Y_COL])
    a_lin, b_lin = regression.slope, regression.intercept

    df['lin_fit'] = a_lin * df[X_COL] + b_lin

    # Coefficient of determination of the fitted line
    y_true     = df[Y_COL]
    y_pred_lin = df['lin_fit']
    ss_res_lin = np.sum((y_true - y_pred_lin) ** 2)
    ss_tot_lin = np.sum((y_true - np.mean(y_true)) ** 2)
    r2_lin = 1 - (ss_res_lin / ss_tot_lin)

    # Label the model as linear (it was previously printed as a log model)
    lin_eq = (
        f"Linear: y = {a_lin:.3f} x + {b_lin:.3f}\n"
        f"R² = {r2_lin:.3f}"
    )

    return lin_eq

In [None]:
def melt_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reshape the fitted frame to long format so each fit gets a legend entry.

    The X_COL and Y_COL columns are kept as identifiers; the 'log_fit',
    'lin_fit' and 'logistic_fit' columns are stacked into a 'Fit Value'
    column, with a 'Fit Type' column holding the legend label of each fit.
    """

    # Fit column -> label shown in the legend
    legend_names = {
        'log_fit': 'Logarithmic Fit',
        'lin_fit': 'Linear Fit',
        'logistic_fit': 'Logistic Fit',
    }

    long_df = df.melt(
        id_vars=[X_COL, Y_COL],
        value_vars=list(legend_names),
        var_name='Fit Type',
        value_name='Fit Value',
    )

    # Swap the raw column names for their legend labels
    long_df['Fit Type'] = long_df['Fit Type'].map(legend_names)

    return long_df

In [None]:
def chi_square_test(df_melt: pd.DataFrame, chip: str, alpha: float = 0.05) -> None:
    """
    Run a chi-squared test of independence between binned x and binned y
    values and write the outcome to a text file.

    Parameters:
        df_melt: frame holding the X_COL and Y_COL columns. When it also has
            a 'Fit Type' column (a melted frame), only the rows of the first
            fit type are used, so every observation is counted once.
        chip:    prefix of the result file name.
        alpha:   significance level used to interpret the p-value.

    The report is written to ``<analysis_str><chip>_chi_square_results.txt``.
    """
    def calculate_bins(data) -> int:
        """
        Calculate the number of histogram bins using the Freedman-Diaconis rule.
        """
        # Pre-process
        data = np.asarray(data)
        data = data[~np.isnan(data)]
        n = len(data)

        # Fallback: just one bin (checked first so the percentile
        # is never taken over an empty array)
        if n <= 1:
            return 1

        q75, q25 = np.percentile(data, [75, 25])
        iqr = q75 - q25

        if iqr == 0:
            return 1

        bin_width = 2 * iqr / (n ** (1 / 3))

        # Avoid division by zero
        if bin_width == 0:
            return 1

        data_range = data.max() - data.min()
        bins = int(np.ceil(data_range / bin_width))

        # Always at least one bin
        return max(bins, 1)

    # A melted frame repeats every (x, y) observation once per fit type.
    # Counting those repeats would multiply the contingency table and inflate
    # the statistic, so keep the rows of a single fit type only.
    df = df_melt
    if 'Fit Type' in df.columns and len(df) > 0:
        df = df[df['Fit Type'] == df['Fit Type'].iloc[0]]
    df = df.copy()

    x_data = df[X_COL]
    y_data = df[Y_COL]

    # Calculate the number of bins using Freedman-Diaconis rule
    x_bins = calculate_bins(x_data)
    y_bins = calculate_bins(y_data)

    # Bin the x and y data
    df['x_binned'] = pd.cut(x_data, bins=x_bins)
    df['y_binned'] = pd.cut(y_data, bins=y_bins)

    # Create contingency table (cross-tab of binned values)
    contingency_table = pd.crosstab(df['x_binned'], df['y_binned'])

    # Perform chi-squared test
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

    # Write data to file; the encoding is fixed because the report contains
    # a non-ASCII dash
    result_path = analysis_str + f"{chip}_" + "chi_square_results.txt"
    with open(result_path, "w", encoding="utf-8") as f:
        # Log results to file
        f.write(f"Chi-squared statistic: {chi2}\n")
        f.write(f"Degrees of freedom: {dof}\n")
        f.write(f"Optimal bin count: x = {x_bins}, y = {y_bins}\n")
        f.write(f"Expected frequencies:\n{expected}\n")
        f.write(f"P-value: {p}\n")

        # Interpret the result
        if p < alpha:
            f.write("Result: Reject the null hypothesis — association exists between binned x and y.")
        else:
            f.write("Result: Fail to reject the null — no significant association.")

In [None]:
def anova_test(y_vals: list, alpha: float = 0.05) -> None:
    """
    Run a one-way ANOVA over the groups in ``y_vals`` and log the outcome.

    Parameters:
        y_vals: list of groups, each a sequence of observations.
        alpha:  significance level used for the critical F value.

    The F-statistic, p-value, critical F value and the resulting decision
    are written to ``<analysis_str>anova_results.txt``.
    """
    # Group count and overall sample size drive the degrees of freedom
    group_count = len(y_vals)
    total_obs = sum(len(group) for group in y_vals)

    dof_between = group_count - 1
    dof_within = total_obs - group_count

    # One-way ANOVA across all groups
    f_statistic, p_value = stats.f_oneway(*y_vals)

    # Threshold the statistic has to exceed at the given significance level
    f_critical = stats.f.ppf(1 - alpha, dof_between, dof_within)

    if f_statistic > f_critical:
        verdict = "Reject the null hypothesis: At least one group mean is different."
    else:
        verdict = "Fail to reject the null hypothesis: No significant difference between group means."

    report = (
        f"F-statistic: {f_statistic:.5f}\n"
        f"p-value: {p_value:.5f}\n"
        f"F-critical (alpha = {alpha}): {f_critical:.5f}\n"
        + verdict
    )

    # Persist the report
    with open(analysis_str + "anova_results.txt", "w") as f:
        f.write(report)

# Data Analysis

In [None]:
from plotnine import (
    ggplot,
    aes,
    geom_point,
    geom_line,
    ggtitle,
    labs,
    annotate,
    xlim,
    ylim,
    scale_color_manual,
)

In [None]:
# Axis labels used for the plot title and for the x / y axes
X_LABEL = "Pressure In (mb)"
Y_LABEL = "Flow Rate Out (µL/min)"

In [None]:
# Make variables for analysis: one list of y values per processed file,
# later passed as the groups of the ANOVA test
data_list = []

In [None]:
# Graph all the data
# Start from an empty list so re-running this cell does not hand the same
# groups to the ANOVA test twice.
data_list.clear()

for key, chip_list in filtered_dict.items():
    for i, df in enumerate(chip_list):
        # Find applicable fits (each call adds its fit column to df)
        log_eq      = find_log_fit(df)
        lin_eq      = find_lin_fit(df)
        logistic_eq = find_logistic_fit(df)

        # Melt the data
        df_melt = melt_data(df)

        # Make the plot
        plot = (
            ggplot(df_melt, aes(X_COL, Y_COL)) +
            geom_point() +
            geom_line(
                aes(
                    y='Fit Value',
                    color='Fit Type'
                )
            ) +
            scale_color_manual(
                values = {
                    'Logarithmic Fit': 'blue',
                    'Linear Fit': 'red',
                    'Logistic Fit': 'green'
                }
            ) +
            ggtitle(f"{Y_LABEL} vs {X_LABEL}") +
            labs(
                x=X_LABEL,
                y=Y_LABEL,
                color="Model Fit"
            ) +
            xlim(0, None) +
            ylim(0, 400)
        )

        # Equation text, vertical position and colour of each fit annotation,
        # in the colours used for the fitted lines
        fit_notes = [
            (logistic_eq, 400, 'green'),
            (log_eq, 350, 'blue'),
            (lin_eq, 300, 'red'),
        ]

        for note_text, note_y, note_color in fit_notes:
            plot = plot + annotate(
                "text",
                x=0,
                y=note_y,
                label=note_text,
                ha='left',
                va='top',
                size=8,
                color=note_color
            )

        # Save the plot
        plot.save(graph_str + f"{key}/graph_{i}.png", width=10, height=6, dpi=300)

        # Perform Chi-Square test; the file index is part of the name so the
        # results of one file are not overwritten by the next file of the chip
        chi_square_test(df_melt, f"{key}_{i}")

        # Add data to the analysis list. Use the un-melted frame: the melted
        # one repeats every observation once per fit type.
        data_list.append(df[Y_COL].tolist())

In [None]:
# Perform data analysis: one-way ANOVA across the collected lists of y values;
# the report is written to anova_results.txt in the analysis folder
anova_test(data_list)

In [None]:
# Display the melted frame left over from the last file processed by the loop
df_melt