In [1]:
import pulp
import pandas as pd

# Original Data
original_predictions = pd.DataFrame({
    'lf1': [1, 0, 1, 0, 1], 
    'lf2': [0, -1, 0, 1, 0], 
    'lf3': [1, 1, 1, -1, 1], 
    'tlabel': [1, 0, 1, 0, 1]  # True labels
})

# Parameters
n_instances = len(original_predictions)
n_label_funcs = 3  # We have 3 labeling functions
min_lf_accuracy = 0.7
min_instance_accuracy = 0.6

# Create the Pulp Problem
prob = pulp.LpProblem("Minimize_Label_Changes", pulp.LpMinimize)

# Decision variables for flips (0 means no flip, 1 means flip)
flip_vars = {(i, lf): pulp.LpVariable(f"flip_{i}_{lf}", 0, 1, pulp.LpBinary)
             for i in range(n_instances) for lf in range(n_label_funcs)}

# Post-flip predictions (force binary)
predictions = {(i, lf): pulp.LpVariable(f"pred_{i}_{lf}", 0, 1, pulp.LpBinary)
               for i in range(n_instances) for lf in range(n_label_funcs)}

# Objective: Minimize the number of flips
prob += pulp.lpSum(flip_vars[i, lf] for i in range(n_instances) for lf in range(n_label_funcs))

# Constraints

# Link flip variables to predictions (flip original value if flip_var is 1)
for i in range(n_instances):
    for lf, lf_name in enumerate(['lf1', 'lf2', 'lf3']):
        original_value = original_predictions[lf_name][i]
        
        # If original value is -1 (abstain), introduce flip decision to choose between 0 and 1
        if original_value == -1:
            # We can either assign it to 0 or 1, and this counts as a flip
            prob += predictions[i, lf] == flip_vars[i, lf]
        else:
            # If flip_vars[i, lf] == 1, the value should flip (1 becomes 0, 0 becomes 1).
            # When original_value is 1, the expression is: predictions[i, lf] = 1 - flip_vars[i, lf]
            # When original_value is 0, the expression is: predictions[i, lf] = flip_vars[i, lf]
            prob += predictions[i, lf] == (original_value * (1 - flip_vars[i, lf]) + (1 - original_value) * flip_vars[i, lf])

# Ensure each labeling function accuracy is at least 70%
for lf, lf_name in enumerate(['lf1', 'lf2', 'lf3']):
    correct_predictions = pulp.lpSum((predictions[i, lf] == original_predictions['tlabel'][i])
                                     for i in range(n_instances))
    prob += correct_predictions / n_instances >= min_lf_accuracy

# Ensure instance-wise accuracy is at least 60%
for i in range(n_instances):
    correct_lfs = pulp.lpSum((predictions[i, lf] == original_predictions['tlabel'][i])
                             for lf in range(n_label_funcs))
    prob += correct_lfs / n_label_funcs >= min_instance_accuracy

# Solve the problem
prob.solve()

# Get the results and round them to ensure binary values (0 or 1)
flip_results = pd.DataFrame({
    'lf1': [round(pulp.value(predictions[i, 0])) for i in range(n_instances)],
    'lf2': [round(pulp.value(predictions[i, 1])) for i in range(n_instances)],
    'lf3': [round(pulp.value(predictions[i, 2])) for i in range(n_instances)],
    'flips_lf1': [round(pulp.value(flip_vars[i, 0])) for i in range(n_instances)],
    'flips_lf2': [round(pulp.value(flip_vars[i, 1])) for i in range(n_instances)],
    'flips_lf3': [round(pulp.value(flip_vars[i, 2])) for i in range(n_instances)]
})


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/3a30d5acc6ce4721a10d14155aabe367-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/3a30d5acc6ce4721a10d14155aabe367-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 28 COLUMNS
At line 164 RHS
At line 188 BOUNDS
At line 219 ENDATA
Problem MODEL has 23 rows, 30 columns and 60 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Problem is infeasible - 0.00 seconds
Option for printingOptions changed from normal to all
Total time (CPU seconds):       0.00   (Wallclock seconds):       0.00



In [2]:
flip_results

Unnamed: 0,lf1,lf2,lf3,flips_lf1,flips_lf2,flips_lf3
0,1,1,1,0,0,0
1,2,1,1,0,0,0
2,3,1,1,0,0,0
3,0,2,1,0,0,0
4,1,1,3,0,0,0


In [3]:
prob

Minimize_Label_Changes:
MINIMIZE
1*flip_0_0 + 1*flip_0_1 + 1*flip_0_2 + 1*flip_1_0 + 1*flip_1_1 + 1*flip_1_2 + 1*flip_2_0 + 1*flip_2_1 + 1*flip_2_2 + 1*flip_3_0 + 1*flip_3_1 + 1*flip_3_2 + 1*flip_4_0 + 1*flip_4_1 + 1*flip_4_2 + 0
SUBJECT TO
_C1: flip_0_0 + pred_0_0 = 1

_C2: - flip_0_1 + pred_0_1 = 0

_C3: flip_0_2 + pred_0_2 = 1

_C4: - flip_1_0 + pred_1_0 = 0

_C5: - flip_1_1 + pred_1_1 = 0

_C6: flip_1_2 + pred_1_2 = 1

_C7: flip_2_0 + pred_2_0 = 1

_C8: - flip_2_1 + pred_2_1 = 0

_C9: flip_2_2 + pred_2_2 = 1

_C10: - flip_3_0 + pred_3_0 = 0

_C11: flip_3_1 + pred_3_1 = 1

_C12: - flip_3_2 + pred_3_2 = 0

_C13: flip_4_0 + pred_4_0 = 1

_C14: - flip_4_1 + pred_4_1 = 0

_C15: flip_4_2 + pred_4_2 = 1

_C16: 0.2 pred_0_0 + 0.2 pred_1_0 + 0.2 pred_2_0 + 0.2 pred_3_0 + 0.2 pred_4_0
 >= 1.3

_C17: 0.2 pred_0_1 + 0.2 pred_1_1 + 0.2 pred_2_1 + 0.2 pred_3_1 + 0.2 pred_4_1
 >= 1.3

_C18: 0.2 pred_0_2 + 0.2 pred_1_2 + 0.2 pred_2_2 + 0.2 pred_3_2 + 0.2 pred_4_2
 >= 1.3

_C19: 0.333333333333 pred

In [4]:
flip_vars

{(0, 0): flip_0_0,
 (0, 1): flip_0_1,
 (0, 2): flip_0_2,
 (1, 0): flip_1_0,
 (1, 1): flip_1_1,
 (1, 2): flip_1_2,
 (2, 0): flip_2_0,
 (2, 1): flip_2_1,
 (2, 2): flip_2_2,
 (3, 0): flip_3_0,
 (3, 1): flip_3_1,
 (3, 2): flip_3_2,
 (4, 0): flip_4_0,
 (4, 1): flip_4_1,
 (4, 2): flip_4_2}