In [11]:
import pulp
import pandas as pd
import numpy as np

def lf_constraint_solve_no_new_lf_multi_class(df, lf_acc_thresh=0.5, 
                        instance_acc_thresh=0.5,
                        min_non_abstain_thresh=0.8,
                        expected_label_col='expected_label',
                        instance_acc_on_valid=False,
                        use_non_abstain=True,
                        class_num=2
                                 ):
    
    # Problem initialization
    prob = pulp.LpProblem("Label_Flip_Minimization", pulp.LpMinimize)

    # Parameters
    labeling_functions = [lf_name for lf_name in df.columns if lf_name != expected_label_col]
    num_instances = len(df)
    # Define P_vars (Decision Variables) with values ranging from -1 to 3
    upBound = class_num-1
    P_vars = pulp.LpVariable.dicts("P", (range(num_instances), labeling_functions), 
                                   lowBound=-1, upBound=upBound, cat='Integer')

    # Define flip variables for all possible transitions
    flip_vars = {}
    value_range = list(range(-1,class_num))
    M = max(value_range) - min(value_range)

    for v1 in value_range:
        for v2 in value_range:
            if v1 != v2:  # No self-flipping
                flip_vars[(v1, v2)] = pulp.LpVariable.dicts(f"flip_{v1}_to_{v2}", 
                                                             (range(num_instances), labeling_functions), cat='Binary')

    # Define abstain indicator variables (is_abstain[i][lf] == 1 if P_vars[i][lf] == -1)
    is_abstain = pulp.LpVariable.dicts("is_abstain", 
                                       (range(num_instances), labeling_functions), 
                                       cat='Binary')

    # Objective: Minimize the number of flips
    flip_cost = pulp.lpSum([flip_vars[(v1, v2)][i][lf] 
                            for v1 in value_range for v2 in value_range if v1 != v2
                            for i in range(num_instances) for lf in labeling_functions])
    
    prob += flip_cost, "Minimize_Flips"

    # Mutual exclusivity: At most one flip per (i, lf)
    for i in range(num_instances):
        for lf in labeling_functions:
            prob += pulp.lpSum([flip_vars[(v1, v2)][i][lf] for v1 in value_range for v2 in value_range if v1 != v2]) <= 1, f"Flip_Exclusivity_{i}_{lf}"

    # Enforce flipping logic constraints
    for i in range(num_instances):
        for lf in labeling_functions:
            original_val = df.loc[i, lf]
            prob += P_vars[i][lf] == pulp.lpSum([v2 * flip_vars[(original_val, v2)][i][lf] for v2 in value_range if v2 != original_val]) + \
                                          original_val * (1 - pulp.lpSum([flip_vars[(original_val, v2)][i][lf] for v2 in value_range if v2 != original_val])), \
                                          f"Flip_From_{original_val}_{i}_{lf}"

    # Define correctness variables
    correctness_vars = pulp.LpVariable.dicts("correct", 
                                             (range(num_instances), labeling_functions), cat='Binary')

    # Accuracy constraints for each labeling function
    for lf in labeling_functions:
        lf_correct_predictions = pulp.lpSum([correctness_vars[i][lf] for i in range(num_instances)])
        prob += lf_correct_predictions >= lf_acc_thresh * num_instances, f"LF_{lf}_Accuracy"


    for i in range(num_instances):
        correct_predictions_per_instance = pulp.lpSum([correctness_vars[i][lf] for lf in labeling_functions])
        instance_abstain_count = pulp.lpSum([is_abstain[i][lf] for lf in labeling_functions])        
        num_labeling_functions_used = len(labeling_functions)
        if(instance_acc_on_valid):
            prob += correct_predictions_per_instance >= instance_acc_thresh * (num_labeling_functions_used-instance_abstain_count), f"Instance_{i}_Accuracy"
        else:
            prob += correct_predictions_per_instance >= instance_acc_thresh * (num_labeling_functions_used), f"Instance_{i}_Accuracy"
        if(use_non_abstain):
            prob += instance_abstain_count <= num_labeling_functions_used *(1- min_non_abstain_thresh), f"Instance_{i}_NonAbastain"


    # Correctness constraints: ensure `P_vars[i][lf] == expected_label` when marked correct
    for i in range(num_instances):
        for lf in labeling_functions:
            true_label = df.loc[i, expected_label_col]
            prob += P_vars[i][lf] - true_label <= M * (1 - correctness_vars[i][lf]), f"Correctness_UpperBound_{i}_{lf}"
            prob += true_label - P_vars[i][lf] <= M * (1 - correctness_vars[i][lf]), f"Correctness_LowerBound_{i}_{lf}"
        

    # Solve the integer program
    solver = pulp.PULP_CBC_CMD(msg=1, timeLimit=600)
    prob.solve(solver)

    # Extract solutions
    p_vars_solution = pd.DataFrame(index=df.index, columns=labeling_functions)
    for i in range(num_instances):
        for lf in labeling_functions:
            p_vars_solution.loc[i, lf] = int(pulp.value(P_vars[i][lf]))

    return p_vars_solution, pulp.value(flip_cost)

In [12]:
def generate_test_dataframe(num_instances=10, num_labeling_functions=4):
    np.random.seed(42)
    
    # Generate random labels in range {-1, 0, 1, 2, 3}
    data = np.random.choice([-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], size=(num_instances, num_labeling_functions))
    
    # Convert to DataFrame
    df = pd.DataFrame(data, columns=[f"lf{i}" for i in range(1, num_labeling_functions + 1)])
    
    # Assign expected labels randomly
    df["expected_label"] = np.random.choice([-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], size=num_instances)
    
    return df

# Generate a test dataframe
test_df = generate_test_dataframe()
print(test_df)

# Run solver on test data
p_vars_solution, total_flips = lf_constraint_solve_no_new_lf_multi_class(df=test_df, lf_acc_thresh=0.5, 
                        instance_acc_thresh=0.7,
                        min_non_abstain_thresh=0.8,
                        expected_label_col='expected_label',
                        instance_acc_on_valid=False,
                        use_non_abstain=False,
                        class_num=10)
print("\nUpdated Label Assignments:")
print(p_vars_solution)
print("\nTotal Flips:", total_flips)

   lf1  lf2  lf3  lf4  expected_label
0    5    2    9    6               3
1    3    5    8    1               7
2    5    9    9    6               5
3    3    2    6    6               0
4    1    4    3    0               2
5    6    4    0    3               7
6   -1    8    4    7               0
7   -1    9    9    8               8
8    1    5    2    7               7
9    1    3    1    5               8
Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/chenjieli/.pyenv/versions/3.10.0/envs/rulecleaner/lib/python3.10/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/sl/1_4_3jg90p50ft92b5k6mgv00000gn/T/3dc517862f1f49899259f1d9f617cc36-pulp.mps -sec 600 -timeMode elapsed -branch -printingOptions all -solution /var/folders/sl/1_4_3jg90p50ft92b5k6mgv00000gn/T/3dc517862f1f49899259f1d9f617cc36-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 179 COLUMNS
At line 18620 RHS
At line 18795 BOUNDS
At 