  1. A "constraint solver" (params tunable on a validation set?):
  Given a set of instances $S$, and a list of labelling functions $F$
      * $\tau$: the valid label rate: for each sentence $s_i \in S$, we need at least $\tau \times |F|$ valid predictions
      * $\delta$: the accuracy per label: for each sentence $s_i \in S$, out of all the valid predictions, we need at least $\delta \times |F|$ correct predictions.
      * $\kappa$: the labelling function accuracy threshold: for each $f_i \in F$, we need to achieve an accuracy no lower than $\kappa$.
      * $\gamma$: the threshold for deciding whether introducing a "partition" is allowed: if the most frequently occurring label has a frequency higher than $\gamma$, then flipping is not allowed.
      
  
| SentenceID | ExpectedLabel | $LF_1$ | $LF_2$| $LF_3$ | $NewLF_i$ |
| --- | --- | --- | --- | --- | --- |
| 1 | 1 | 0 | 1 | 1 | ? |
| 2 | 0 | -1 | 1 | 1 | ? |
| 3 | 1 | 0 | 1 | 1 | ? |
| 4 | 0 | 1 | -1 | 1 | ? |
| 5 | 1 | 0 | 1 | -1 | ? |


In [13]:
import numpy as np

# Define parameters (thresholds of the constraint solver described in the notes above)
tau = 0.6  # valid label rate: per sentence, at least tau * |F| valid predictions are required
delta = 0.5  # accuracy per label: per sentence, at least delta * |F| correct predictions are required
kappa = 0.6  # labeling function accuracy threshold: each LF must reach at least this accuracy
gamma = 0.1  # dominant label threshold for flipping: an LF may be flipped only while its dominant-label proportion is <= gamma

# Input data (rows are sentences, columns are labeling functions)
# An entry of -1 is not counted as a valid prediction by dominant_label_proportion.
# Two rows are commented out, leaving 3 sentences x 4 labeling functions.
data = np.array([
    [0, 1, 1, 0],  
    [-1, 1, 1, 1],  
    [0, 1, 1, -1],  
#     [1, -1, 1, 0],  
#     [0, 1, -1, 1]  
])

# Expected labels, one per row of `data`
expected_labels = [1, 0, 1]

# Number of labeling functions (columns of `data`); read as a global by the functions below
num_lf = data.shape[1]

def dominant_label_proportion(data):
    """Return, per labeling function, the share of its most frequent label.

    Args:
        data: 2-D array whose rows are sentences and whose columns are
            labeling functions. Entries equal to -1 are not counted as
            valid predictions.

    Returns:
        A list with one entry per column of ``data``: the fraction of that
        column's valid predictions taken by its most common label, or 0
        when the column has no valid prediction at all.
    """
    proportions = []
    # Walk the columns of the argument itself instead of relying on the
    # module-level ``num_lf``, so any matrix width is handled correctly.
    for column in np.asarray(data).T:
        valid = column[column != -1]  # Only count valid predictions
        if valid.size == 0:
            proportions.append(0)  # If no valid data, treat as 0 proportion
            continue
        # The largest label count over the number of valid predictions is
        # the proportion of the dominant label.
        _, counts = np.unique(valid, return_counts=True)
        proportions.append(counts.max() / valid.size)
    return proportions

# Check if a label can be flipped based on gamma constraint
def can_flip_label(lf_index, proportions, gamma):
    """Tell whether the labeling function at ``lf_index`` may be flipped.

    Flipping is permitted only while that function's dominant-label
    proportion (looked up in ``proportions``) does not exceed ``gamma``.
    """
    dominant_share = proportions[lf_index]
    flip_allowed = dominant_share <= gamma
    return flip_allowed


# Phase 1: Minimize flips, respecting gamma constraint
def apply_minimal_flips(data, expected_labels, gamma):
    """Overwrite mismatching predictions wherever the gamma rule permits.

    Args:
        data: 2-D numpy array; rows are sentences, columns are labeling
            functions. It is copied, not modified.
        expected_labels: expected label for each row of ``data``.
        gamma: dominant-label threshold passed on to ``can_flip_label``.

    Returns:
        A ``(flipped_data, total_flips)`` tuple: the modified copy of
        ``data`` and the number of entries that were overwritten.
    """
    # Proportions are computed once from the unmodified input, so earlier
    # flips never change whether a later flip is allowed.
    proportions = dominant_label_proportion(data)
    flipped_data = data.copy()
    total_flips = 0
    # Use the matrix's own width rather than the module-level ``num_lf`` so
    # the function does not silently depend on notebook state.
    num_columns = flipped_data.shape[1]
    for i, expected in enumerate(expected_labels):  # For each sentence
        for j in range(num_columns):  # For each labeling function
            # NOTE: every entry that differs from the expected label is a
            # candidate, including -1 entries; the "valid prediction only"
            # check is deliberately left disabled, as in the original.
            if flipped_data[i, j] != expected and can_flip_label(j, proportions, gamma):
                flipped_data[i, j] = expected
                total_flips += 1

    return flipped_data, total_flips

# Phase 2: Introduce new labeling functions if flips are not enough
def introduce_new_lfs(data, expected_labels, num_new_lfs):
    """Build new labeling-function columns that always predict correctly.

    ``data`` is accepted but never read. The result is a float array of
    shape ``(len(expected_labels), num_new_lfs)`` in which every column
    repeats the expected labels.
    """
    sentence_count = len(expected_labels)
    generated = np.zeros((sentence_count, num_new_lfs))
    # Broadcast the expected labels, laid out as a single column, across
    # all of the new labeling functions at once.
    generated[:, :] = np.asarray(expected_labels).reshape(sentence_count, 1)
    return generated

# Phase 1: Try flipping the labels
# flipped_data is a modified copy of `data`; total_flips counts the overwritten entries.
flipped_data, total_flips = apply_minimal_flips(data, expected_labels, gamma)

# Check if all thresholds are met after flipping (placeholder for detailed validation logic)
# Add detailed logic for verifying tau, delta, and kappa constraints
# TODO: this flag is hard-coded, so the "introduce new LFs" branch further down
# never runs until real validation replaces it.
flips_sufficient = True  # Placeholder: Set to False if constraints aren't met

def check_flip_result(flipped_data, tau, delta, kappa, labels=None):
    """Check whether a prediction matrix satisfies the tau/delta/kappa rules.

    The original definition had no body (a SyntaxError); this implements the
    three thresholds as they are stated in the notes at the top of the file.

    Args:
        flipped_data: 2-D array-like; rows are sentences, columns are
            labeling functions, and -1 marks an entry that is not a valid
            prediction.
        tau: valid label rate. Every sentence needs at least
            ``tau * |F|`` valid predictions, ``|F|`` being the column count.
        delta: accuracy per label. Every sentence needs at least
            ``delta * |F|`` correct predictions.
        kappa: labeling function accuracy threshold. Every labeling
            function needs an accuracy of at least ``kappa``.
        labels: expected label per sentence. When omitted, the module-level
            ``expected_labels`` is used.

    Returns:
        True when all three constraints hold, False otherwise.
    """
    if labels is None:
        labels = expected_labels  # fall back to the notebook-level labels

    predictions = np.asarray(flipped_data)
    truth = np.asarray(labels).reshape(-1, 1)
    n_lf = predictions.shape[1]

    valid = predictions != -1
    correct = valid & (predictions == truth)

    # tau: enough valid predictions for every sentence.
    if np.any(valid.sum(axis=1) < tau * n_lf):
        return False

    # delta: enough correct predictions for every sentence.
    if np.any(correct.sum(axis=1) < delta * n_lf):
        return False

    # kappa: every labeling function is accurate enough.
    # NOTE(review): accuracy is measured over each function's valid
    # predictions, and a function with no valid prediction scores 0 --
    # confirm this is the intended definition.
    valid_per_lf = valid.sum(axis=0)
    correct_per_lf = correct.sum(axis=0)
    accuracy = np.divide(
        correct_per_lf,
        valid_per_lf,
        out=np.zeros(n_lf),
        where=valid_per_lf > 0,
    )
    return bool(np.all(accuracy >= kappa))


# Phase 2: If flips are not sufficient, introduce new LFs
if not flips_sufficient:
    # Append a single always-correct labeling function as an extra column.
    new_lfs = introduce_new_lfs(flipped_data, expected_labels, num_new_lfs=1)
    data_with_new_lfs = np.column_stack([flipped_data, new_lfs])
else:
    # No new column needed: the flipped matrix is the final result.
    data_with_new_lfs = flipped_data

# Output the final dataset
import pandas as pd
# One column per labeling function, named LF_1 .. LF_n, plus the expected label.
df = pd.DataFrame(data_with_new_lfs, columns=[f"LF_{i+1}" for i in range(data_with_new_lfs.shape[1])])
df["ExpectedLabel"] = expected_labels
print(f"Total flips made: {total_flips}")
print(df)


Total flips made: 0
   LF_1  LF_2  LF_3  LF_4  ExpectedLabel
0     0     1     1     0              1
1    -1     1     1     1              0
2     0     1     1    -1              1


In [2]:
data

array([[ 0,  1,  1],
       [-1,  1,  1],
       [ 0,  1,  1],
       [ 1, -1,  1],
       [ 0,  1, -1]])

In [3]:
data.shape

(5, 3)

In [4]:
data1 = np.array([
    [0, 1, 1],  
    [-1, 1, 1],  
    [0, 1, 1],  
    [1, -1, 1],  
    [0, 1, -1]  
])

In [6]:
x = data1[:,1]

In [20]:
import numpy as np
import pulp

# Define parameters
# Only tau and delta are referenced by the optimization code in this cell;
# kappa and gamma are defined but not used by it.
tau = 0.6  # valid label rate
delta = 0.5  # accuracy per label
kappa = 0.6  # labeling function accuracy threshold
gamma = 0.7  # dominant label threshold for flipping

# Input data (rows are sentences, columns are labeling functions)
# -1 marks an abstention (no valid label) from that labeling function.
data = np.array([
    [0, 1, 1],  # Sentence 1
    [-1, 1, 1],  # Sentence 2
    [0, 1, 1],  # Sentence 3
    [1, -1, 1],  # Sentence 4
    [0, 1, -1]   # Sentence 5
])

# Expected labels, one per row of `data`
expected_labels = [1, 0, 1, 0, 1]

# Number of labeling functions and sentences
num_lf = data.shape[1]
num_sentences = len(expected_labels)

# Phase 1: Solve for minimal flips (this part is similar to previous implementation)

# Binary decision variables for each flip (1 = flip, 0 = no flip)
# flip_vars[i][j] refers to sentence i and labeling function j.
flip_vars = pulp.LpVariable.dicts("flip", (range(num_sentences), range(num_lf)), cat="Binary")

# Define the optimization problem (minimizing the total number of flips)
prob = pulp.LpProblem("Minimize_Flips", pulp.LpMinimize)

# Objective: minimize the total number of flips
prob += pulp.lpSum([flip_vars[i][j] for i in range(num_sentences) for j in range(num_lf)]), "TotalFlips"

# Constraints: meet tau, delta, and gamma requirements for flipping (already discussed)
# NOTE(review): no constraint is actually added to `prob` here, so the model
# only contains the objective above.

# Solve the flipping optimization problem
prob.solve()

# Flipped data after phase 1
flipped_data = data.copy()
for i in range(num_sentences):
    for j in range(num_lf):
        if pulp.value(flip_vars[i][j]) == 1:
            # NOTE(review): any entry other than 0 becomes 0 here, so an
            # abstention (-1) would be rewritten to 0 -- confirm intended.
            flipped_data[i, j] = 1 if flipped_data[i, j] == 0 else 0  # Flip the label

# Output the final dataset
import pandas as pd
df = pd.DataFrame(flipped_data, columns=[f"LF_{i+1}" for i in range(flipped_data.shape[1])])
df["ExpectedLabel"] = expected_labels
print("Optimized Dataset with Minimal Flips:")
print(df)

# Output total number of flips
total_flips = pulp.value(pulp.lpSum([flip_vars[i][j] for i in range(num_sentences) for j in range(num_lf)]))
print(f"Total flips made: {total_flips}")

# Phase 2: Add new labeling functions if flipping is not enough

# Decision variables for new labeling functions (LFs)
# new_lf_vars[i][k][v]: binary, 1 if new LF k assigns label v (0 or 1) to sentence i.
# When both are 0 the LF abstains on that sentence (written as -1 in the result below).
num_new_lfs = 10  # Max number of new LFs to consider
new_lf_vars = pulp.LpVariable.dicts("new_lf", (range(num_sentences), range(num_new_lfs), [0, 1]), cat="Binary")

# Second model; new_lf_usage[k] is a binary flag marking new LF k as used
prob2 = pulp.LpProblem("Minimize_New_LFs", pulp.LpMinimize)
new_lf_usage = pulp.LpVariable.dicts("new_lf_usage", range(num_new_lfs), cat="Binary")

# Objective: minimize the total number of new labeling functions used
prob2 += pulp.lpSum(new_lf_usage[k] for k in range(num_new_lfs)), "TotalNewLFs"

# Constraints: Ensure that the new LFs help meet the tau and delta thresholds
for i in range(num_sentences):
    # Non-abstaining existing predictions plus labels assigned by the new LFs.
    valid_preds = pulp.lpSum([1 if flipped_data[i, j] != -1 else 0 for j in range(num_lf)]) + \
                  pulp.lpSum([new_lf_vars[i][k][0] + new_lf_vars[i][k][1] for k in range(num_new_lfs)])
    
    # Existing predictions equal to the expected label plus new LFs assigning that label.
    correct_preds = pulp.lpSum([(1 if flipped_data[i, j] == expected_labels[i] else 0) for j in range(num_lf)]) + \
                    pulp.lpSum([new_lf_vars[i][k][expected_labels[i]] for k in range(num_new_lfs)])
    
    # Ensure enough valid predictions for each sentence (tau constraint)
    # NOTE(review): the threshold is scaled by num_lf + num_new_lfs (the maximum
    # number of new LFs), not by the number of new LFs actually used -- confirm intended.
    prob2 += (valid_preds >= tau * (num_lf + num_new_lfs)), f"Tau_Constraint_Sentence_{i}"
    
    # Ensure enough correct predictions for each sentence (delta constraint)
    prob2 += (correct_preds >= delta * (num_lf + num_new_lfs)), f"Delta_Constraint_Sentence_{i}"

# Linking constraint between label assignments and the per-LF usage flag
for k in range(num_new_lfs):
    for i in range(num_sentences):
        # LF k may assign a label to sentence i only if it is marked as used;
        # the same inequality also allows at most one of the two labels.
        prob2 += (new_lf_vars[i][k][0] + new_lf_vars[i][k][1] <= new_lf_usage[k]), f"NewLFUsage_{k}_for_Sentence_{i}"

# Solve the problem to minimize the number of new LFs
prob2.solve()

# Decode the solution into a label matrix; -1 (the fill value) means no label was assigned
new_lf_results = np.full((num_sentences, num_new_lfs), -1)
for i in range(num_sentences):
    for k in range(num_new_lfs):
        if pulp.value(new_lf_vars[i][k][0]) == 1:
            new_lf_results[i, k] = 0
        elif pulp.value(new_lf_vars[i][k][1]) == 1:
            new_lf_results[i, k] = 1

# Output the final dataset (all num_new_lfs candidate columns are appended)
data_with_new_lfs = np.column_stack([flipped_data, new_lf_results])
df = pd.DataFrame(data_with_new_lfs, columns=[f"LF_{i+1}" for i in range(data_with_new_lfs.shape[1])])
df["ExpectedLabel"] = expected_labels
print(df)

# Output total number of new LFs used
total_new_lfs = pulp.value(pulp.lpSum(new_lf_usage[k] for k in range(num_new_lfs)))
print(f"Total new labeling functions added: {total_new_lfs}")


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/fab7e32e1815456fa373a1906ae4b031-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/fab7e32e1815456fa373a1906ae4b031-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 5 COLUMNS
At line 51 RHS
At line 52 BOUNDS
At line 68 ENDATA
Problem MODEL has 0 rows, 15 columns and 0 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0 - 0.00 seconds
Cgl0004I processed model has 0 rows, 0 columns (0 integer (0 of which binary)) and 0 elements
Cbc3007W No integer variables - nothing to do
Cuts at root node changed objective from 0 to -1.79769e+308
Probing was tried 0 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.000 seconds)
Gomory was tried 0 times an