#  variables 

1. user input size (20, 40, 80)
2. lf_acc_thresh ()
3. instance_acc_thresh
4. min_non_abstain_thresh
5. kwg related params

In [1]:
# from rbbm_src.labelling_func_src.src.utils import lf_constraint_solve
from rbbm_src.labelling_func_src.src.lfs_tree import keyword_labelling_func_builder
from rbbm_src.labelling_func_src.src.TreeRules import SPAM, HAM, ABSTAIN, PredicateNode
from rbbm_src.labelling_func_src.src.LFRepair import populate_violations, fix_rules_with_solver_input
from rbbm_src.labelling_func_src.src.classes import clean_text

import re
import psycopg2
import pandas as pd
from snorkel.labeling import (
	LabelingFunction, 
	labeling_function, 
	PandasLFApplier, 
	LFAnalysis,
	filter_unlabeled_dataframe
	)
from snorkel.labeling.model import MajorityLabelVoter, LabelModel
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import matplotlib.patches as mpatches
import pulp
from sklearn.metrics import accuracy_score, classification_report

from rbbm_src.labelling_func_src.src.KeyWordRuleMiner import KeyWordRuleMiner 
# sample user confirmation and complaints
import random
from collections import deque
import numpy as np
import pickle
import pydot
from IPython.display import Image, display

[nltk_data] Downloading package words to /home/opc/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /home/opc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-10-15 12:35:27.035722: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-15 12:35:27.086747: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-15 12:35:27.088177: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
DEBUG 2024-10-15 12:35:27,913 [tpu_cluster_resolver.py:<module>:32] Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install c

In [2]:
from rbbm_src.labelling_func_src.src.example_tree_rules import gen_amazon_funcs

In [3]:
from collections import defaultdict

In [4]:
def calculate_coverage(df, lf_column):
    total_instances = len(df)
    labeled_instances = df[lf_column] != -1
    coverage = labeled_instances.sum() / total_instances
    return coverage

def calculate_accuracy(df, lf_column, ground_truth_column):
    labeled_instances = df[df[lf_column] != -1]
    correct_labels = (labeled_instances[lf_column] == labeled_instances[ground_truth_column]).sum()
    total_labeled = len(labeled_instances)
    if total_labeled == 0:
        return 0.0  # Avoid division by zero if no instances are labeled
    accuracy = correct_labels / total_labeled
    return accuracy

In [5]:
def run_snorkel_with_funcs(dataset_name, funcs, conn):
    
    sentences_df=pd.read_sql(f'SELECT * FROM {dataset_name}', conn)
    sentences_df = sentences_df.rename(columns={"class": "expected_label", "content": "old_text"})
    sentences_df['text'] = sentences_df['old_text'].apply(lambda s: clean_text(s))
    sentences_df = sentences_df[~sentences_df['text'].isna()]
    applier = PandasLFApplier(lfs=funcs)
    initial_vectors = applier.apply(df=sentences_df, progress_bar=False)
    model = LabelModel(cardinality=2, verbose=True, device='cpu')
    model.fit(L_train=initial_vectors, n_epochs=500, log_freq=100, seed=123)
    probs_test= model.predict_proba(L=initial_vectors)
    df_sentences_filtered, probs_test_filtered, filtered_vectors, df_no_signal  = filter_unlabeled_dataframe(
        X=sentences_df, y=probs_test, L=initial_vectors
    )	

    df_sentences_filtered = df_sentences_filtered.reset_index(drop=True)
    prob_diffs = [abs(t[0]-t[1]) for t in probs_test_filtered]
    prob_diffs_tuples = [(t[0],t[1]) for t in probs_test_filtered]
    df_sentences_filtered['model_pred_diff'] = pd.Series(prob_diffs)
    df_sentences_filtered['model_pred_prob_tuple'] = pd.Series(prob_diffs_tuples)
    df_sentences_filtered['model_pred'] = pd.Series(model.predict(L=filtered_vectors))

    wrong_preds = df_sentences_filtered[(df_sentences_filtered['expected_label']!=df_sentences_filtered['model_pred'])]
    # df_sentences_filtered.to_csv('predictions_shakira.csv', index=False)
    # logger.critical(wrong_preds)
    accuracy=(len(df_sentences_filtered)-len(wrong_preds))/len(df_sentences_filtered)

    print(f"""
        out of {len(sentences_df)} sentences, {len(df_sentences_filtered)} actually got at least one signal to \n
        make prediction. Out of all the valid predictions, we have {len(wrong_preds)} wrong predictions, \n
        accuracy = {(len(df_sentences_filtered)-len(wrong_preds))/len(df_sentences_filtered)} 
    """)
    
    global_accuracy = (len(df_sentences_filtered)-len(wrong_preds))/len(df_sentences_filtered)
    
    
    ground_truth = df_sentences_filtered['expected_label']
    snorkel_predictions = df_sentences_filtered['model_pred']
    snorkel_probs = df_sentences_filtered['model_pred_diff']
    df_sentences_filtered['vectors'] = pd.Series([",".join(map(str, t)) for t in filtered_vectors])
    correct_predictions = (snorkel_predictions == ground_truth)
    incorrect_predictions = (snorkel_predictions != ground_truth)
    correct_preds_by_snorkel = df_sentences_filtered[correct_predictions].reset_index(drop=True)
    wrong_preds_by_snorkel = df_sentences_filtered[incorrect_predictions].reset_index(drop=True)
    
    return df_sentences_filtered, correct_preds_by_snorkel, wrong_preds_by_snorkel, filtered_vectors, correct_predictions, incorrect_predictions, global_accuracy 

In [6]:
def select_user_input(user_confirm_size,
                     user_complaint_size,
                     random_state,
                     filtered_vectors,
                     correct_preds_by_snorkel,
                     wrong_preds_by_snorkel,
                      correct_predictions,
                      incorrect_predictions ):

    user_confirm_df = correct_preds_by_snorkel.sample(n=user_confirm_size, random_state=42)
    user_complaints_df = wrong_preds_by_snorkel.sample(n=user_complaint_size, random_state=42)

    random_confirm_indices = user_confirm_df.index
    random_complaints_indices = user_complaints_df.index
    random_user_confirms_vecs = filtered_vectors[correct_predictions][random_confirm_indices]
    random_user_complaints_vecs = filtered_vectors[incorrect_predictions][random_complaints_indices]
    user_input_df = pd.concat([user_confirm_df, user_complaints_df])
    gts = user_input_df['expected_label'].reset_index(drop=True)
    user_vecs = np.vstack((random_user_confirms_vecs, random_user_complaints_vecs))
    
    return user_vecs, gts, user_input_df


In [7]:
def gather_user_input_signals_on_rules(tree_rules, user_input):
    leaf_nodes = []
    
    for atui in tree_rules:
        rids = set([])
        for i, c in user_input.iterrows():
            leaf_node_with_complaints = populate_violations(atui, c)
            if(leaf_node_with_complaints.number not in rids):
                rids.add(leaf_node_with_complaints.number)
                leaf_nodes.append(leaf_node_with_complaints)
            
    uinput_unsatisfied_counts = defaultdict(int)
    
    for ln in leaf_nodes:
        if(ln.label==ABSTAIN):
            for l in [SPAM, HAM]:
                for u in ln.pairs[l]:
                    uinput_unsatisfied_counts[u['cid']]+=1
    
    return uinput_unsatisfied_counts

In [8]:
def gather_used_keywords(tree_rules):
    
    used_keywords = []
    
    for atui in tree_rules:
        rids = set([])
        queue = deque([atui.root])
        while(queue):
            cur_node = queue.popleft()
            if(isinstance(cur_node, PredicateNode)):
                used_keywords.extend(cur_node.pred.keywords)
            if(cur_node.left):
                queue.append(cur_node.left)
            if(cur_node.right):
                queue.append(cur_node.right)
    
    return used_keywords
#     for i, c in sorted_df.iterrows():
#         leaf_node_with_complaints = populate_violations(atui, c)
#         if(leaf_node_with_complaints.number not in rids):
#             rids.add(leaf_node_with_complaints.number)
#             leaf_nodes.append(leaf_node_with_complaints)

In [9]:
def apply_new_lfs_to_df(new_funcs, user_input_df):
    new_rules_applier = PandasLFApplier(lfs=new_funcs)
    new_rules_vector = new_rules_applier.apply(df=user_input_df, progress_bar=False)
    
    return new_rules_vector


In [10]:
def construct_input_df_to_solver(new_rules_vector,user_vecs, gts):
    
    df_new_vectors = pd.DataFrame(new_rules_vector, columns=[f'nlf_{i+1}' for i in range(new_rules_vector.shape[1])])
    df_user_vectors = pd.DataFrame(user_vecs, columns=[f'lf_{i+1}' for i in range(user_vecs.shape[1])])
    combined_df= pd.concat([df_new_vectors, df_user_vectors, gts], axis=1)
    
    return combined_df

In [11]:
def lf_constraint_solve(df, lf_acc_thresh=0.5, 
                        instance_acc_thresh=0.5,
                        min_non_abstain_thresh=0.8,
                        nlf_prefix='nlf_',
                        expected_label_col='expected_label',
                        new_lf_weight=0.1):
    
    # Problem initialization
    prob = pulp.LpProblem("Label_Flip_Minimization", pulp.LpMinimize)

    # Parameters
#     labeling_functions = df.columns[:-1]
    labeling_functions = [lf_name for lf_name in df.columns if lf_name!=expected_label_col]
    print(f"lf_acc: {lf_acc_thresh}, ins_acc:{instance_acc_thresh}")
    print(f"labeling_functions: {labeling_functions}")
    num_instances = len(df)
    print(f"num_instances: {num_instances}")
    M = 5
    
    nlfs = [lf for lf in labeling_functions if nlf_prefix in lf]
    print(f"nlfs: {nlfs}")
    x_nlfs = pulp.LpVariable.dicts("x_nlf", nlfs, cat='Binary')

    P_vars = pulp.LpVariable.dicts("P", (range(num_instances), labeling_functions), 
                                   lowBound=-1, upBound=1, cat='Integer')
    
    is_abstain = pulp.LpVariable.dicts("is_abstain", 
                               (range(num_instances), labeling_functions), 
                               cat='Binary')

    flip_1_to_0 = pulp.LpVariable.dicts("flip_1_to_0", 
                                        (range(num_instances), labeling_functions), cat='Binary')
    flip_1_to_neg1 = pulp.LpVariable.dicts("flip_1_to_neg1", 
                                           (range(num_instances), labeling_functions), cat='Binary')
    flip_0_to_1 = pulp.LpVariable.dicts("flip_0_to_1", 
                                        (range(num_instances), labeling_functions), cat='Binary')
    flip_0_to_neg1 = pulp.LpVariable.dicts("flip_0_to_neg1", 
                                           (range(num_instances), labeling_functions), cat='Binary')
    flip_neg1_to_1 = pulp.LpVariable.dicts("flip_neg1_to_1", 
                                           (range(num_instances), labeling_functions), cat='Binary')
    flip_neg1_to_0 = pulp.LpVariable.dicts("flip_neg1_to_0", 
                                           (range(num_instances), labeling_functions), cat='Binary')

    # Binary variables to track correctness of predictions (1 if correct, 0 if not)
    correctness_vars = pulp.LpVariable.dicts("correct", 
                                             (range(num_instances), labeling_functions), cat='Binary')
    
    # Create auxiliary variables to represent active nLF abstains
    active_abstain = pulp.LpVariable.dicts("active_abstain", 
                                           (range(num_instances), nlfs), 
                                           cat='Binary')
    
    correct_and_active = pulp.LpVariable.dicts("correct_and_active", 
                                           (range(num_instances), nlfs), 
                                           cat='Binary')


    # Objective: Minimize the number of flips
    flip_cost = pulp.lpSum([flip_1_to_0[i][lf] + flip_1_to_neg1[i][lf] + 
                            flip_0_to_1[i][lf] + flip_0_to_neg1[i][lf] + 
                            flip_neg1_to_1[i][lf] + flip_neg1_to_0[i][lf] 
                            for i in range(num_instances) for lf in labeling_functions])

    prob += flip_cost + pulp.lpSum([new_lf_weight * x_nlfs[lf] for lf in nlfs]), "Minimize_Flips"


    # Mutual exclusivity
    for i in range(num_instances):
        for lf in labeling_functions:
            prob += (flip_1_to_0[i][lf] + flip_1_to_neg1[i][lf] + 
                     flip_0_to_1[i][lf] + flip_0_to_neg1[i][lf] + 
                     flip_neg1_to_1[i][lf] + flip_neg1_to_0[i][lf]) <= 1, f"Flip_Exclusivity_{i}_{lf}"

    for i in range(num_instances):
        for lf in labeling_functions:
            original_val = df.loc[i, lf]
            if original_val == 1:
                prob += P_vars[i][lf] == 0 * flip_1_to_0[i][lf] + \
                (-1) * flip_1_to_neg1[i][lf] + 1 * (1 - flip_1_to_0[i][lf] - flip_1_to_neg1[i][lf]), f"Flip_From_1_{i}_{lf}"
                
            elif original_val == 0:                
                prob += P_vars[i][lf] == 1 * flip_0_to_1[i][lf] + \
                (-1) * flip_0_to_neg1[i][lf] + 0 * (1 - flip_0_to_1[i][lf] - flip_0_to_neg1[i][lf]), f"Flip_From_0_{i}_{lf}"
                
            elif original_val == -1:
                prob += P_vars[i][lf] == 1 * flip_neg1_to_1[i][lf] + 0 * flip_neg1_to_0[i][lf] + (-1) * (1 - flip_neg1_to_1[i][lf] - flip_neg1_to_0[i][lf]), f"Flip_From_neg1_{i}_{lf}"
    
    for i in range(num_instances):
        for lf in labeling_functions:
            prob += P_vars[i][lf] >= -1 - (1 - is_abstain[i][lf]) * M, f"Abstain_LowerBound_{i}_{lf}"
            prob += P_vars[i][lf] <= -1 + (1 - is_abstain[i][lf]) * M, f"Abstain_UpperBound_{i}_{lf}"

            # If is_abstain[i][lf] == 0, P_vars[i][lf] can only be 0 or 1
            prob += P_vars[i][lf] >= 0 - is_abstain[i][lf] * M, f"Non_Abstain_LowerBound_{i}_{lf}"
            prob += P_vars[i][lf] <= 1 + is_abstain[i][lf] * M, f"Non_Abstain_UpperBound_{i}_{lf}"
    
    # Set up the constraints for the auxiliary variables
    for i in range(num_instances):
        for lf in nlfs:
            # Ensure active_abstain[i][lf] is 1 only if both is_abstain[i][lf] == 1 and x_nlfs[lf] == 1
            prob += active_abstain[i][lf] <= is_abstain[i][lf], f"ActiveAbstain_LF_{lf}_Instance_{i}_1"
            prob += active_abstain[i][lf] <= x_nlfs[lf], f"ActiveAbstain_LF_{lf}_Instance_{i}_2"
            prob += active_abstain[i][lf] >= is_abstain[i][lf] + x_nlfs[lf] - 1, f"ActiveAbstain_LF_{lf}_Instance_{i}_3"

    for i in range(num_instances):
        for lf in nlfs:
            # correct_and_active[i][lf] should be 1 only if both correctness_vars[i][lf] == 1 and x_nlfs[lf] == 1
            prob += correct_and_active[i][lf] <= correctness_vars[i][lf], f"CorrectAndActive_UpperBound_1_{i}_{lf}"
            prob += correct_and_active[i][lf] <= x_nlfs[lf], f"CorrectAndActive_UpperBound_2_{i}_{lf}"
            prob += correct_and_active[i][lf] >= correctness_vars[i][lf] + x_nlfs[lf] - 1, f"CorrectAndActive_LowerBound_{i}_{lf}"
        
    
    for lf in labeling_functions:
        num_instances_abstain = pulp.lpSum([is_abstain[i][lf] for i in range(num_instances)])
        if lf in nlfs:
            lf_correct_predictions = pulp.lpSum([correctness_vars[i][lf] for i in range(num_instances)])
            prob += lf_correct_predictions >= lf_acc_thresh * (num_instances-num_instances_abstain) - M * (1 - x_nlfs[lf]), f"LF_{lf}_Accuracy"
        else:
            lf_correct_predictions = pulp.lpSum([correctness_vars[i][lf] for i in range(num_instances)])
            prob += lf_correct_predictions >= lf_acc_thresh * (num_instances-num_instances_abstain), f"LF_{lf}_Accuracy"



    for i in range(num_instances):
        for lf in nlfs:
            # Ensure that correctness_vars[i][lf] is counted only if x_nlf[lf] = 1
            prob += correctness_vars[i][lf] <= M * x_nlfs[lf], f"{lf}_active_{i}"
            
        correct_predictions_per_instance = pulp.lpSum([correctness_vars[i][lf] for lf in labeling_functions if lf not in nlfs]) + \
                               pulp.lpSum([correct_and_active[i][lf] for lf in nlfs])
        instance_abstain_count = pulp.lpSum([is_abstain[i][lf] for lf in labeling_functions if lf not in nlfs]) + \
                                 pulp.lpSum([active_abstain[i][lf] for lf in nlfs]) 
        
        num_labeling_functions_used = len(labeling_functions) - len(nlfs) + pulp.lpSum(x_nlfs.values())
        prob += correct_predictions_per_instance >= instance_acc_thresh * num_labeling_functions_used, f"Instance_{i}_Accuracy"
        prob += instance_abstain_count <= num_labeling_functions_used *(1- min_non_abstain_thresh), f"Instance_{i}_NonAbastain"

        
    for i in range(num_instances):
        for lf in labeling_functions:
            true_label = df[expected_label_col][i]
            # Ensure that correctness_vars[i][lf] is 1 if P_vars[i][lf] equals true_label, else 0
            prob += P_vars[i][lf] - true_label <= M * (1 - correctness_vars[i][lf]),\
                                     f"Correctness_UpperBound_{i}_{lf}"
            prob += true_label - P_vars[i][lf] <= M * (1 - correctness_vars[i][lf]), \
                                     f"Correctness_LowerBound_{i}_{lf}"


    # Solve the integer program
    prob.solve()

    p_vars_solution = pd.DataFrame(index=df.index, columns=labeling_functions)
    active_abstain_df = pd.DataFrame(index=df.index, columns=labeling_functions)
    is_abstain_df = pd.DataFrame(index=df.index, columns=labeling_functions)
    
    for i in range(num_instances):
        for lf in labeling_functions:
            p_vars_solution.loc[i, lf] = int(pulp.value(P_vars[i][lf]))
    
    correctness_solution = pd.DataFrame(index=df.index, columns=labeling_functions)
    for i in range(num_instances):
        for lf in labeling_functions:
            correctness_solution.loc[i, lf] = int(pulp.value(correctness_vars[i][lf]))
    
    x_nlfs_solution = {lf: pulp.value(x_nlfs[lf]) for lf in nlfs}
    
    print(f"Status: {pulp.LpStatus[prob.status]}")
    print(f"pulp.value(num_labeling_functions_used) : {pulp.value(num_labeling_functions_used)}")
    
    for i in range(num_instances):
        for lf in labeling_functions:
            is_abstain_df.loc[i, lf] = int(pulp.value(is_abstain[i][lf]))
    for i in range(num_instances):
        for lf in nlfs:
            active_abstain_df.loc[i, lf] = int(pulp.value(active_abstain[i][lf]))
    
    return p_vars_solution, x_nlfs_solution, pulp, prob, active_abstain_df, is_abstain_df


In [12]:
# for c in list(combined_df):
#     print(f"{c}: {combined_df[c].value_counts().to_dict()}")

In [13]:
def create_solver_input_df_copies(lf_names_after_fix, user_input_df, res_df):
    df_copies = {}

    cols_needed = ['text', 'expected_label', 'cid']

    # Loop through each column in df2 and create a copy of df1 with modified 'expected_label'
    for lf in lf_names_after_fix:
        # Create a deep copy of df1
        df_copy = user_input_df.copy(deep=True)

        # Update the 'expected_label' column based on the corresponding column in df2
        df_copy['expected_label'] = res_df[lf].values

        # Store the modified dataframe in the dictionary with key as the labeling function name
        df_copies[lf] = df_copy[cols_needed]
    
    return df_copies



In [14]:
# import numpy as np

# test_res = {}

# for la in np.arange(0.2, 1, 0.2):
#     for ia in np.arange(0.25, 1, 0.25):
#         print(f"la: {la}, ia: {ia}")


#         res_df, inclusion_dict = lf_constraint_solve(df=combined_df, lf_acc_thresh=la,
#                         instance_acc_thresh=ia,
#                         nlf_prefix='nlf_',
#                         expected_label_col='expected_label',
#                         new_lf_weight=1)
        
# #         inclusion_dict['nlf_3']=1
# #         # testing purpose, remember to delete
        
#         lfs_witan = [l for l in list(combined_df) if ('nlf' not in l and l!='expected_label')]
#         lfs_manual_added =  [x for x in inclusion_dict if inclusion_dict[x]==1]
#         lf_names_after_fix = lfs_witan +lfs_manual_added
        
#         print(f"lf_names_after_fix: {lf_names_after_fix}")
#         df_copies = create_solver_input_df_copies(lf_names_after_fix=lf_names_after_fix,
#                                          user_input_df=user_input_df,
#                                          res_df=res_df)
        
#         fix_book_keeping_dict = {k.id:{'rule':k, 'deleted':False,
#                                'pre_fix_size':k.size, 
#                                'after_fix_size':k.size, 
#                                'pre-deleted': False} for k in amazon_treerules}
        
#         max_rule_id = max(list(fix_book_keeping_dict))+1
        
#         if(lfs_manual_added):
#             for lma in lfs_manual_added:
#                 fix_book_keeping_dict[max_rule_id]={'rule':manual_func_dict[lma], 'deleted':False,
#                                'pre_fix_size':manual_func_dict[lma], 
#                                'after_fix_size':manual_func_dict[lma], 
#                                'pre-deleted': False}
#                 max_rule_id+=1
        
#         df_list = list(df_copies.values())
        
#         print(f"df_list: {df_list}")
        
#         for i in range(len(df_list)):
#             fix_book_keeping_dict[i]['user_input'] = df_list[i]

#         test_res[f'random_sample_state42_amazon_full_la_{la}_ia_{ia}_input.pkl']=fix_book_keeping_dict


In [15]:
# test_res = {}
# inclusion_dicts = {}

# for la in np.arange(0.1, 1, 0.2):
#     for ia in np.arange(0.1, 1, 0.2):
#         print(f"la: {la}, ia: {ia}")

#         fix_book_keeping_dict = {k.id:{'rule':k, 'deleted':False,
#                                'pre_fix_size':k.size, 
#                                'after_fix_size':k.size, 
#                                'pre-deleted': False} for k in amazon_treerules}

#         res_df, inclusion_dict, res_pulp, prob, active_abstain_df, is_abstain_df  = lf_constraint_solve(df=combined_df, lf_acc_thresh=la,
#                         instance_acc_thresh=ia,
#                         nlf_prefix='nlf_',
#                         expected_label_col='expected_label',
#                         new_lf_weight=1)

#         lfs_witan = [l for l in list(combined_df) if ('nlf' not in l and l!='expected_label')]
#         lfs_manual_added =  [x for x in inclusion_dict if inclusion_dict[x]==1]
#         lf_names_after_fix = lfs_witan +lfs_manual_added

#         df_copies = create_solver_input_df_copies(lf_names_after_fix=lf_names_after_fix,
#                                          user_input_df=user_input_df,
#                                          res_df=res_df)
#         df_list = list(df_copies.values())\


#         max_rule_id = max(list(fix_book_keeping_dict))+1

#         if(lfs_manual_added):
#             for lma in lfs_manual_added:
#                 fix_book_keeping_dict[max_rule_id]={'rule':manual_func_dict[lma], 'deleted':False,
#                                'pre_fix_size':manual_func_dict[lma], 
#                                'after_fix_size':manual_func_dict[lma], 
#                                'pre-deleted': False}
#                 max_rule_id+=1

#         for i in range(len(df_list)):
#             fix_book_keeping_dict[i]['user_input'] = df_list[i]
        
#         test_res[f'random_sample_state42_amazon_full_with_kwg_la_{la}_ia_{ia}_input.pkl']=fix_book_keeping_dict
#         inclusion_dicts[f'random_sample_state42_amazon_full_with_kwg_la_{la}_ia_{ia}_input.pkl'] = inclusion_dict

In [16]:
# retrain 

In [17]:
# sentences_df=pd.read_sql(f'SELECT * FROM amazon', conn)
# sentences_df = sentences_df.rename(columns={"class": "expected_label", "content": "old_text"})
# sentences_df['text'] = sentences_df['old_text'].apply(lambda s: clean_text(s))
# sentences_df = sentences_df[~sentences_df['text'].isna()]
# print(f"--------------------------------------------------------")
# # print(f"results for {r}")
# # new_trees = [x['rule'] for x in res[r].values()]
# #     print(new_funcs)
# new_rules = [x.gen_label_rule() for x in new_trees]
# applier = PandasLFApplier(lfs=new_rules)
# initial_vectors = applier.apply(df=sentences_df, progress_bar=False)
# model = LabelModel(cardinality=2, verbose=True, device='cpu')
# model.fit(L_train=initial_vectors, n_epochs=500, log_freq=100, seed=123)

# probs_test= model.predict_proba(L=initial_vectors)
# df_sentences_filtered, probs_test_filtered, filtered_vectors, df_no_signal  = filter_unlabeled_dataframe(
#     X=sentences_df, y=probs_test, L=initial_vectors
# )	

# df_sentences_filtered = df_sentences_filtered.reset_index(drop=True)
# prob_diffs = [abs(t[0]-t[1]) for t in probs_test_filtered]
# prob_diffs_tuples = [(t[0],t[1]) for t in probs_test_filtered]
# df_sentences_filtered['model_pred_diff'] = pd.Series(prob_diffs)
# df_sentences_filtered['model_pred_prob_tuple'] = pd.Series(prob_diffs_tuples)
# df_sentences_filtered['model_pred'] = pd.Series(model.predict(L=filtered_vectors))

# wrong_preds = df_sentences_filtered[(df_sentences_filtered['expected_label']!=df_sentences_filtered['model_pred'])]
# # df_sentences_filtered.to_csv('predictions_shakira.csv', index=False)
# # logger.critical(wrong_preds)
# accuracy=(len(df_sentences_filtered)-len(wrong_preds))/len(df_sentences_filtered)

# print(f"""
#     out of {len(sentences_df)} sentences, {len(df_sentences_filtered)} actually got at least one signal to \n
#     make prediction. Out of all the valid predictions, we have {len(wrong_preds)} wrong predictions, \n
#     accuracy = {(len(df_sentences_filtered)-len(wrong_preds))/len(df_sentences_filtered)} 
#     """)
# print('\n')

In [18]:
def lf_constraint_solve_no_new_lf(df, lf_acc_thresh=0.5, 
                        instance_acc_thresh=0.5,
                        min_non_abstain_thresh=0.8,
                        nlf_prefix='nlf_',
                        expected_label_col='expected_label',
#                         new_lf_weight=0.1
                       ):
    
    # Problem initialization
    prob = pulp.LpProblem("Label_Flip_Minimization", pulp.LpMinimize)

    # Parameters
#     labeling_functions = df.columns[:-1]
    labeling_functions = [lf_name for lf_name in df.columns if lf_name!=expected_label_col]
    print(f"lf_acc: {lf_acc_thresh}, ins_acc:{instance_acc_thresh}")
    print(f"labeling_functions: {labeling_functions}")
    num_instances = len(df)
    print(f"num_instances: {num_instances}")
    M = 5
    
#     nlfs = [lf for lf in labeling_functions if nlf_prefix in lf]
#     print(f"nlfs: {nlfs}")
#     x_nlfs = pulp.LpVariable.dicts("x_nlf", nlfs, cat='Binary')

    P_vars = pulp.LpVariable.dicts("P", (range(num_instances), labeling_functions), 
                                   lowBound=-1, upBound=1, cat='Integer')
    
    is_abstain = pulp.LpVariable.dicts("is_abstain", 
                               (range(num_instances), labeling_functions), 
                               cat='Binary')

    flip_1_to_0 = pulp.LpVariable.dicts("flip_1_to_0", 
                                        (range(num_instances), labeling_functions), cat='Binary')
    flip_1_to_neg1 = pulp.LpVariable.dicts("flip_1_to_neg1", 
                                           (range(num_instances), labeling_functions), cat='Binary')
    flip_0_to_1 = pulp.LpVariable.dicts("flip_0_to_1", 
                                        (range(num_instances), labeling_functions), cat='Binary')
    flip_0_to_neg1 = pulp.LpVariable.dicts("flip_0_to_neg1", 
                                           (range(num_instances), labeling_functions), cat='Binary')
    flip_neg1_to_1 = pulp.LpVariable.dicts("flip_neg1_to_1", 
                                           (range(num_instances), labeling_functions), cat='Binary')
    flip_neg1_to_0 = pulp.LpVariable.dicts("flip_neg1_to_0", 
                                           (range(num_instances), labeling_functions), cat='Binary')

    # Binary variables to track correctness of predictions (1 if correct, 0 if not)
    correctness_vars = pulp.LpVariable.dicts("correct", 
                                             (range(num_instances), labeling_functions), cat='Binary')
    
#     # Create auxiliary variables to represent active nLF abstains
#     active_abstain = pulp.LpVariable.dicts("active_abstain", 
#                                            (range(num_instances), nlfs), 
#                                            cat='Binary')
    
#     correct_and_active = pulp.LpVariable.dicts("correct_and_active", 
#                                            (range(num_instances), nlfs), 
#                                            cat='Binary')


    # Objective: Minimize the number of flips
    flip_cost = pulp.lpSum([flip_1_to_0[i][lf] + flip_1_to_neg1[i][lf] + 
                            flip_0_to_1[i][lf] + flip_0_to_neg1[i][lf] + 
                            flip_neg1_to_1[i][lf] + flip_neg1_to_0[i][lf] 
                            for i in range(num_instances) for lf in labeling_functions])

#     prob += flip_cost + pulp.lpSum([new_lf_weight * x_nlfs[lf] for lf in nlfs]), "Minimize_Flips"
    prob += flip_cost, "Minimize_Flips"


    # Mutual exclusivity
    for i in range(num_instances):
        for lf in labeling_functions:
            prob += (flip_1_to_0[i][lf] + flip_1_to_neg1[i][lf] + 
                     flip_0_to_1[i][lf] + flip_0_to_neg1[i][lf] + 
                     flip_neg1_to_1[i][lf] + flip_neg1_to_0[i][lf]) <= 1, f"Flip_Exclusivity_{i}_{lf}"

    for i in range(num_instances):
        for lf in labeling_functions:
            original_val = df.loc[i, lf]
            if original_val == 1:
                prob += P_vars[i][lf] == 0 * flip_1_to_0[i][lf] + \
                (-1) * flip_1_to_neg1[i][lf] + 1 * (1 - flip_1_to_0[i][lf] - flip_1_to_neg1[i][lf]), f"Flip_From_1_{i}_{lf}"
                
            elif original_val == 0:                
                prob += P_vars[i][lf] == 1 * flip_0_to_1[i][lf] + \
                (-1) * flip_0_to_neg1[i][lf] + 0 * (1 - flip_0_to_1[i][lf] - flip_0_to_neg1[i][lf]), f"Flip_From_0_{i}_{lf}"
                
            elif original_val == -1:
                prob += P_vars[i][lf] == 1 * flip_neg1_to_1[i][lf] + 0 * flip_neg1_to_0[i][lf] + (-1) * (1 - flip_neg1_to_1[i][lf] - flip_neg1_to_0[i][lf]), f"Flip_From_neg1_{i}_{lf}"
    
    for i in range(num_instances):
        for lf in labeling_functions:
            prob += P_vars[i][lf] >= -1 - (1 - is_abstain[i][lf]) * M, f"Abstain_LowerBound_{i}_{lf}"
            prob += P_vars[i][lf] <= -1 + (1 - is_abstain[i][lf]) * M, f"Abstain_UpperBound_{i}_{lf}"

            # If is_abstain[i][lf] == 0, P_vars[i][lf] can only be 0 or 1
            prob += P_vars[i][lf] >= 0 - is_abstain[i][lf] * M, f"Non_Abstain_LowerBound_{i}_{lf}"
            prob += P_vars[i][lf] <= 1 + is_abstain[i][lf] * M, f"Non_Abstain_UpperBound_{i}_{lf}"
    
    # Set up the constraints for the auxiliary variables
#     for i in range(num_instances):
#         for lf in nlfs:
#             # Ensure active_abstain[i][lf] is 1 only if both is_abstain[i][lf] == 1 and x_nlfs[lf] == 1
#             prob += active_abstain[i][lf] <= is_abstain[i][lf], f"ActiveAbstain_LF_{lf}_Instance_{i}_1"
#             prob += active_abstain[i][lf] <= x_nlfs[lf], f"ActiveAbstain_LF_{lf}_Instance_{i}_2"
#             prob += active_abstain[i][lf] >= is_abstain[i][lf] + x_nlfs[lf] - 1, f"ActiveAbstain_LF_{lf}_Instance_{i}_3"

#     for i in range(num_instances):
#         for lf in nlfs:
#             # correct_and_active[i][lf] should be 1 only if both correctness_vars[i][lf] == 1 and x_nlfs[lf] == 1
#             prob += correct_and_active[i][lf] <= correctness_vars[i][lf], f"CorrectAndActive_UpperBound_1_{i}_{lf}"
#             prob += correct_and_active[i][lf] <= x_nlfs[lf], f"CorrectAndActive_UpperBound_2_{i}_{lf}"
#             prob += correct_and_active[i][lf] >= correctness_vars[i][lf] + x_nlfs[lf] - 1, f"CorrectAndActive_LowerBound_{i}_{lf}"
        
    
    for lf in labeling_functions:
        num_instances_abstain = pulp.lpSum([is_abstain[i][lf] for i in range(num_instances)])
#         if lf in nlfs:
#             lf_correct_predictions = pulp.lpSum([correctness_vars[i][lf] for i in range(num_instances)])
#             prob += lf_correct_predictions >= lf_acc_thresh * (num_instances-num_instances_abstain) - M * (1 - x_nlfs[lf]), f"LF_{lf}_Accuracy"
#         else:
        lf_correct_predictions = pulp.lpSum([correctness_vars[i][lf] for i in range(num_instances)])
        prob += lf_correct_predictions >= lf_acc_thresh * (num_instances-num_instances_abstain), f"LF_{lf}_Accuracy"



    for i in range(num_instances):
#         for lf in nlfs:
#             # Ensure that correctness_vars[i][lf] is counted only if x_nlf[lf] = 1
#             prob += correctness_vars[i][lf] <= M * x_nlfs[lf], f"{lf}_active_{i}"
            
#         correct_predictions_per_instance = pulp.lpSum([correctness_vars[i][lf] for lf in labeling_functions if lf not in nlfs]) + \
#                                pulp.lpSum([correct_and_active[i][lf] for lf in nlfs])
        correct_predictions_per_instance = pulp.lpSum([correctness_vars[i][lf] for lf in labeling_functions])
            
#         instance_abstain_count = pulp.lpSum([is_abstain[i][lf] for lf in labeling_functions if lf not in nlfs]) + \
#                                  pulp.lpSum([active_abstain[i][lf] for lf in nlfs]) 
        instance_abstain_count = pulp.lpSum([is_abstain[i][lf] for lf in labeling_functions])
        
#         num_labeling_functions_used = len(labeling_functions) - len(nlfs) + pulp.lpSum(x_nlfs.values())
        num_labeling_functions_used = len(labeling_functions)

        prob += correct_predictions_per_instance >= instance_acc_thresh * (num_labeling_functions_used-instance_abstain_count), f"Instance_{i}_Accuracy"
        prob += instance_abstain_count <= num_labeling_functions_used *(1- min_non_abstain_thresh), f"Instance_{i}_NonAbastain"

        
    for i in range(num_instances):
        for lf in labeling_functions:
            true_label = df[expected_label_col][i]
            # Ensure that correctness_vars[i][lf] is 1 if P_vars[i][lf] equals true_label, else 0
            prob += P_vars[i][lf] - true_label <= M * (1 - correctness_vars[i][lf]),\
                                     f"Correctness_UpperBound_{i}_{lf}"
            prob += true_label - P_vars[i][lf] <= M * (1 - correctness_vars[i][lf]), \
                                     f"Correctness_LowerBound_{i}_{lf}"


    # Solve the integer program
    prob.solve()

    p_vars_solution = pd.DataFrame(index=df.index, columns=labeling_functions)
    active_abstain_df = pd.DataFrame(index=df.index, columns=labeling_functions)
    is_abstain_df = pd.DataFrame(index=df.index, columns=labeling_functions)
    
    for i in range(num_instances):
        for lf in labeling_functions:
            p_vars_solution.loc[i, lf] = int(pulp.value(P_vars[i][lf]))
    
    correctness_solution = pd.DataFrame(index=df.index, columns=labeling_functions)
    for i in range(num_instances):
        for lf in labeling_functions:
            correctness_solution.loc[i, lf] = int(pulp.value(correctness_vars[i][lf]))
    
#     x_nlfs_solution = {lf: pulp.value(x_nlfs[lf]) for lf in nlfs}
    
    print(f"Status: {pulp.LpStatus[prob.status]}")
    print(f"pulp.value(num_labeling_functions_used) : {pulp.value(num_labeling_functions_used)}")
    
#     for i in range(num_instances):
#         for lf in labeling_functions:
#             is_abstain_df.loc[i, lf] = int(pulp.value(is_abstain[i][lf]))
#     for i in range(num_instances):
#         for lf in nlfs:
#             active_abstain_df.loc[i, lf] = int(pulp.value(active_abstain[i][lf]))
    
#     return p_vars_solution, x_nlfs_solution, pulp, prob, active_abstain_df, is_abstain_df

    return p_vars_solution, pulp, prob

In [19]:
import math

In [20]:
def main(user_input_size,
         lf_acc_thresh,
         instance_acc_thresh,
         min_non_abstain_thresh,
        dataset_name,
        random_state):
    
    user_complaint_size = math.floor(user_input_size * 0.5)
    user_confirm_size = user_input_size - user_complaint_size
    
    conn = psycopg2.connect(dbname='label', user='postgres')
 
    amazon_treerules_for_user_input = gen_amazon_funcs()
    
    amazon_treerules = gen_amazon_funcs()
    
    amazon_funcs = [f.gen_label_rule() for f in amazon_treerules]
    
    df_sentences_filtered, correct_preds_by_snorkel, wrong_preds_by_snorkel, filtered_vectors, correct_predictions, incorrect_predictions, global_accuracy =run_snorkel_with_funcs(dataset_name=dataset_name, funcs=amazon_funcs, conn=conn)
    
    user_vecs, gts, user_input_df = select_user_input(user_confirm_size, user_complaint_size, random_state,
                      filtered_vectors,correct_preds_by_snorkel,
                      wrong_preds_by_snorkel, correct_predictions, incorrect_predictions)
    
    used_keywords =  gather_used_keywords(amazon_treerules_for_user_input)
    
    funcs_for_signal_collection = gen_amazon_funcs()
    
    uinput_unsatisfied_counts = gather_user_input_signals_on_rules(funcs_for_signal_collection, user_input_df)

    sorted_textids = sorted(uinput_unsatisfied_counts, key=uinput_unsatisfied_counts.get, reverse=True)
    sorted_df = user_input_df.set_index('cid').loc[sorted_textids].reset_index()
    
    used_keyword_set = set(used_keywords)
    
    kwrm = KeyWordRuleMiner(df=sorted_df)
    
    new_funcs = kwrm.gen_funcs(count=15, 
                           apply_to_sentence_percentage_thresh=0.05, 
                           label_accuracy_thresh=0.7,
                           label_accuracy_cap=1,
                           pickle_it=False, 
                           pickle_file_name=None, 
                           checked_words=used_keyword_set, 
                           is_good=False, 
                           cardinality_thresh=1)
    
    new_trees_from_kwg = new_funcs[0]
    new_funcs = [x.gen_label_rule() for x in new_trees_from_kwg]
    
    new_func_names = [f'nlf_{i}' for i in range(len(new_trees_from_kwg))]
    
    new_func_dict = dict(zip(new_func_names,new_trees_from_kwg))

    
    new_rules_vector = apply_new_lfs_to_df(new_funcs, user_input_df)
        
    combined_df = construct_input_df_to_solver(new_rules_vector,user_vecs, gts)

    res_df, inclusion_dict, res_pulp, prob, active_abstain_df, is_abstain_df = lf_constraint_solve(df=combined_df, 
                lf_acc_thresh=lf_acc_thresh,
                instance_acc_thresh=instance_acc_thresh,
                min_non_abstain_thresh=min_non_abstain_thresh,      
                nlf_prefix='nlf_',
                expected_label_col='expected_label',
                new_lf_weight=1)
    
    fix_book_keeping_dict = {'original_'+str(k.id):{'rule':k, 'deleted':False,
                       'pre_fix_size':k.size, 
                       'after_fix_size':k.size, 
                       'pre-deleted': False} for k in amazon_treerules}
    
    lfs_witan = [l for l in list(combined_df) if ('nlf' not in l and l!='expected_label')]
    lfs_manual_added =  [x for x in inclusion_dict if inclusion_dict[x]==1]
    lf_names_after_fix = lfs_witan +lfs_manual_added

    df_copies = create_solver_input_df_copies(lf_names_after_fix=lf_names_after_fix,
                                     user_input_df=user_input_df,
                                     res_df=res_df)
    df_list = list(df_copies.values())

    if(lfs_manual_added):
        for lma in lfs_manual_added:
            fix_book_keeping_dict[new_func_dict[lma].id]={'rule':new_func_dict[lma], 'deleted':False,
                           'pre_fix_size':new_func_dict[lma], 
                           'after_fix_size':new_func_dict[lma], 
                           'pre-deleted': False}
            
    book_keeping_dict_list = list(fix_book_keeping_dict)
    
    for i in range(len(df_list)):
        fix_book_keeping_dict[book_keeping_dict_list[i]]['user_input'] = df_list[i]
        fix_book_keeping_dict[book_keeping_dict_list[i]]['user_input']['id'] = \
        fix_book_keeping_dict[book_keeping_dict_list[i]]['user_input'].reset_index().index
    
    
#     for i in fix_book_keeping_dict:
#         print(f"rule {i}: rule_id: {fix_book_keeping_dict[i]['rule'].id}")
#         print(f"rule details")
#         print(f"{fix_book_keeping_dict[i]['rule']}")
#         print(f"expected labels {fix_book_keeping_dict[i]['user_input']['expected_label'].to_list()}")
#         print("\n\n")

    fix_rules_with_solver_input(fix_book_keeping_dict=fix_book_keeping_dict)
    
#     for i in fix_book_keeping_dict:
#         print(f"tree {i}")
#         dot_string = fix_book_keeping_dict[i]['rule'].gen_dot_string('')
#         graph = pydot.graph_from_dot_data(dot_string)[0]
#         plt = Image(graph.create_png())
#         display(plt)
#         print('\n')
    
    new_trees = [x['rule'] for x in fix_book_keeping_dict.values()]
    funcs_after_fix = [f.gen_label_rule() for f in new_trees]

    df_sentences_filtered, correct_preds_by_snorkel, wrong_preds_by_snorkel, filtered_vectors, correct_predictions, incorrect_predictions, new_global_accuracy =run_snorkel_with_funcs(dataset_name=dataset_name, funcs=funcs_after_fix, conn=conn) 
    
    complaints = user_input_df[user_input_df['expected_label']!=user_input_df['model_pred']]
    complant_ids = complaints['cid'].to_list()
    confirms = user_input_df[user_input_df['expected_label']==user_input_df['model_pred']]
    confirm_ids = confirms['cid'].to_list()
    
    df_confirms_after_fix = df_sentences_filtered[(df_sentences_filtered['cid'].isin(confirm_ids))]
    df_complaints_after_fix = df_sentences_filtered[(df_sentences_filtered['cid'].isin(complant_ids))]
    
    confirm_preserv_rate = len(df_confirms_after_fix[df_confirms_after_fix['expected_label']==df_confirms_after_fix['model_pred']])/len(df_confirms_after_fix)
    complain_fix_rate = len(df_complaints_after_fix[df_complaints_after_fix['expected_label']==df_complaints_after_fix['model_pred']])/len(df_complaints_after_fix)
    
    ret = {'before_fix_global_accuracy':global_accuracy,
           'user_input_size':user_input_size,
           'lf_acc_thresh':lf_acc_thresh,
           'instance_acc_thresh':instance_acc_thresh,
           'min_non_abstain_thresh':min_non_abstain_thresh,
           'dataset_name':dataset_name,
           'random_state':random_state,
           'confirm_prev_rate':confirm_preserv_rate,
           'complain_fix_rate':complain_fix_rate,
           'new_global_accuracy':new_global_accuracy
           }
    
    
    res_to_save = {'summary': ret, 'fix_details': fix_book_keeping_dict}

    with open(f'new_solver_amazon_sample_params_{user_input_size}-{lf_acc_thresh}-{instance_acc_thresh}-{min_non_abstain_thresh}-{random_state}.pkl', 'wb') as resf:
        pickle.dump(res_to_save, resf)
    
    
    return inclusion_dict, fix_book_keeping_dict, res_df, gts, user_input_df, df_sentences_filtered, ret

In [21]:
# instance accuracy: |correct_predictions_from_included_lfs|/|included_lfs|
# lf accuracy: |correct_predictions_from_each_lf|/|non_abstain_preds_from_the_lf|
# instance_non_abstain_thresh: each instance cant have more than (instance_non_abstain_thresh*100)% abstains

In [22]:
# import signal
# import time 

# class TimeoutException(Exception):
#     pass

# def timeout_handler(signum, frame):
#     raise TimeoutException

# def run_with_params(params):
#     time.sleep(params)
#     return f"Finished params: {params}"

# def test_params_with_timeout(params_list, time_limit_minutes):
#     time_limit_seconds = int(time_limit_minutes * 60)
#     signal.signal(signal.SIGALRM, timeout_handler)
#     results = []

#     for params in params_list:
#         signal.alarm(time_limit_seconds)  # Set the timeout
#         try:
#             result = run_with_params(params)
#             print(result)
#             results.append(result)
#         except TimeoutException:
#             print(f"Params {params} exceeded time limit, moving to next.")
#         finally:
#             signal.alarm(0)  # Reset the alarm

#     return results

# Example usage
# params_list = [1, 5, 10, 2]  # Parameters that would be passed to the function
# results = test_params_with_timeout(params_list, time_limit_minutes=0.1)

In [23]:
# import concurrent.futures
# import time


In [24]:
def run_main_with_params(user_input_size, lf_acc_thresh, instance_acc_thresh, 
                         min_non_abstain_thresh, random_state, dataset_name):

    # Run the main function with the provided parameters
    inclusion_dict, fix_book_keeping_dict, res_df, gts, user_input_df, df_sentences_filtered, summary = main(
        user_input_size=user_input_size,
        lf_acc_thresh=lf_acc_thresh,
        instance_acc_thresh=instance_acc_thresh,
        min_non_abstain_thresh=min_non_abstain_thresh,
        random_state=random_state,
        dataset_name=dataset_name
    )
    # Save the results
    res_to_save = {'summary': summary, 'fix_details': fix_book_keeping_dict}
    return res_to_save

In [25]:
def test_main_with_timeout(params_list, time_limit_minutes):
    time_limit_seconds = time_limit_minutes * 60
    results = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for params in params_list:
            future = executor.submit(run_main_with_params, *params)
            try:
                result = future.result(timeout=time_limit_seconds)
                print(f"Params {params} finished successfully.")
                results.append(result)
            except concurrent.futures.TimeoutError:
                print(f"Params {params} exceeded the time limit, moving to the next set.")
    
    return results


In [26]:

# inclusion_dict, fix_book_keeping_dict, res_df, gts, user_input_df, df_sentences_filtered, summary = main(user_input_size=40,
#  lf_acc_thresh=0.6,
#  instance_acc_thresh=0.6,
#  min_non_abstain_thresh=0.3,
#  random_state=42,
#  dataset_name='amazon01')



In [27]:
def frange(start, stop, step):
    while start < stop:
        yield round(start, 10)  # Rounding to avoid floating-point precision issues
        start += step

In [28]:
import itertools
import random

# Define the parameter ranges
user_input_size_range = range(20, 81, 20)  # From 20 to 80 with step 20
lf_acc_thresh_range = [round(x, 1) for x in frange(0.2, 1.0, 0.2)]  # From 0.2 to 0.8 with step 0.2
instance_acc_thresh_range = [round(x, 1) for x in frange(0.2, 1.0, 0.2)]
min_non_abstain_thresh_range = [round(x, 1) for x in frange(0.2, 1.0, 0.2)]
random_states = [42, 100]
dataset_names = ['amazon']

# Select 2 random states (you can choose others if preferred)

# Generate all combinations of the parameter values
params_list = list(itertools.product(
    user_input_size_range,
    lf_acc_thresh_range,
    instance_acc_thresh_range,
    min_non_abstain_thresh_range,
    random_states,
    dataset_names,
))


# Now, randomly sample around 25 parameter combinations from the full list
sampled_params_list = random.sample(params_list, 15)


In [29]:
sampled_params_list

[(60, 0.2, 0.8, 0.2, 100, 'amazon'),
 (80, 0.4, 0.8, 0.8, 100, 'amazon'),
 (80, 0.8, 0.8, 0.8, 100, 'amazon'),
 (80, 0.4, 0.6, 0.2, 100, 'amazon'),
 (80, 0.4, 0.2, 0.4, 42, 'amazon'),
 (20, 0.2, 0.4, 0.6, 42, 'amazon'),
 (20, 0.6, 0.2, 0.6, 100, 'amazon'),
 (20, 0.8, 0.2, 0.6, 100, 'amazon'),
 (20, 0.2, 0.8, 0.8, 42, 'amazon'),
 (40, 0.6, 0.8, 0.6, 100, 'amazon'),
 (20, 0.6, 0.8, 0.4, 100, 'amazon'),
 (20, 0.6, 0.8, 0.4, 42, 'amazon'),
 (60, 0.8, 0.4, 0.4, 42, 'amazon'),
 (80, 0.2, 0.2, 0.2, 42, 'amazon'),
 (60, 0.8, 0.8, 0.8, 42, 'amazon')]

In [30]:
# results = test_main_with_timeout(sampled_params_list, time_limit_minutes=20)

In [31]:
# inclusion_dict, fix_book_keeping_dict, res_df, gts, user_input_df, df_sentences_filtered, summary = main(
#     user_input_size=20,
#     lf_acc_thresh=0.7,
#     instance_acc_thresh=0.8,
#     min_non_abstain_thresh=0.1,
#     random_state=42,
#     dataset_name='amazon01'
# )

In [32]:
# import pandas as pd
# import numpy as np

In [33]:
def gen_dummy_solver_input():
    # Set random seed for reproducibility
    np.random.seed(42)

    # Number of rows and columns
    num_instances = 20
    lf_columns = [f'lf_{i}' for i in range(1, 11)] + [f'nlf_{i}' for i in range(1, 6)]

    # Generate random values (-1, 0, 1) ensuring unique patterns for each LF
    data = {col: np.random.choice([-1, 0, 1], size=num_instances, replace=True) for col in lf_columns}

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Calculate expected_label based on majority vote pattern
    # Majority vote: if majority is -1, label is -1, if majority is 1, label is 1, otherwise label is 0
    df['expected_label'] = df.apply(lambda row: 1 if (row == 1).sum() >= (row == 0).sum() else 0, axis=1)
    
    return df

In [34]:
df_dummy_solver_input = gen_dummy_solver_input()

In [35]:
df_dummy_solver_input

Unnamed: 0,lf_1,lf_2,lf_3,lf_4,lf_5,lf_6,lf_7,lf_8,lf_9,lf_10,nlf_1,nlf_2,nlf_3,nlf_4,nlf_5,expected_label
0,1,-1,-1,0,1,1,-1,1,1,-1,-1,-1,-1,-1,1,1
1,-1,-1,1,-1,-1,1,1,1,1,1,0,1,-1,0,1,1
2,1,0,1,0,0,1,0,0,0,0,-1,0,1,1,1,0
3,1,0,-1,0,-1,-1,1,0,0,-1,0,0,0,1,0,0
4,-1,-1,-1,0,-1,1,1,-1,0,-1,0,-1,-1,0,-1,0
5,-1,-1,1,0,0,1,-1,1,-1,-1,1,0,1,0,1,1
6,1,-1,0,0,1,-1,1,1,0,1,0,0,1,1,-1,1
7,0,1,-1,0,-1,1,-1,1,-1,1,1,1,1,1,0,1
8,1,1,0,0,0,-1,1,-1,-1,0,-1,1,0,0,0,0
9,1,1,0,-1,-1,0,0,-1,0,1,-1,1,1,-1,1,1


In [36]:
lfs_no_new_lfs= [x for x in list(df_dummy_solver_input) if 'nlf_' not in x]

In [37]:
df_dummy_solver_input_no_new_lf = df_dummy_solver_input[lfs_no_new_lfs]

In [38]:
df_dummy_solver_input_no_new_lf

Unnamed: 0,lf_1,lf_2,lf_3,lf_4,lf_5,lf_6,lf_7,lf_8,lf_9,lf_10,expected_label
0,1,-1,-1,0,1,1,-1,1,1,-1,1
1,-1,-1,1,-1,-1,1,1,1,1,1,1
2,1,0,1,0,0,1,0,0,0,0,0
3,1,0,-1,0,-1,-1,1,0,0,-1,0
4,-1,-1,-1,0,-1,1,1,-1,0,-1,0
5,-1,-1,1,0,0,1,-1,1,-1,-1,1
6,1,-1,0,0,1,-1,1,1,0,1,1
7,0,1,-1,0,-1,1,-1,1,-1,1,1
8,1,1,0,0,0,-1,1,-1,-1,0,0
9,1,1,0,-1,-1,0,0,-1,0,1,1


In [39]:
solutions = []

In [40]:
solutions_no_new_lf = []

In [41]:
for i in [0.1,0.5, 0.8]:
    dummy_p_vars_solution, dummy_x_nlfs_solution, dummy_pulp, dummy_prob, dummy_active_abstain_df, \
    dummy_is_abstain_df = lf_constraint_solve(df=df_dummy_solver_input, lf_acc_thresh=0.5, 
                            instance_acc_thresh=0.8,
                            min_non_abstain_thresh=i,
                            nlf_prefix='nlf_',
                            expected_label_col='expected_label',
                            new_lf_weight=1)
    
    print(f"dummy_x_nlfs_solution: {dummy_x_nlfs_solution}")
#     lfs_in_the_result = [x for x in cols if ('nlf_' not in x and x!='expected_label')] + [k for k,v in dummy_x_nlfs_solution.items() if v==1]
    solutions.append(dummy_p_vars_solution)

DEBUG 2024-10-15 12:35:31,218 [coin_api.py:solve_CBC:165] /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/268f35ea6555412f94e585c253129ecd-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/268f35ea6555412f94e585c253129ecd-pulp.sol 


lf_acc: 0.5, ins_acc:0.8
labeling_functions: ['lf_1', 'lf_2', 'lf_3', 'lf_4', 'lf_5', 'lf_6', 'lf_7', 'lf_8', 'lf_9', 'lf_10', 'nlf_1', 'nlf_2', 'nlf_3', 'nlf_4', 'nlf_5']
num_instances: 20
nlfs: ['nlf_1', 'nlf_2', 'nlf_3', 'nlf_4', 'nlf_5']
Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/268f35ea6555412f94e585c253129ecd-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/268f35ea6555412f94e585c253129ecd-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 3160 COLUMNS
At line 20081 RHS
At line 23237 BOUNDS
At line 26443 ENDATA
Problem MODEL has 3155 rows, 2905 columns and 9305 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 2.5 - 0.03 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 807 strengthened rows, 618 substituti

DEBUG 2024-10-15 12:35:32,574 [coin_api.py:solve_CBC:165] /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/2c6a2bd5403a4267bd88570c11b1cda0-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/2c6a2bd5403a4267bd88570c11b1cda0-pulp.sol 


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/2c6a2bd5403a4267bd88570c11b1cda0-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/2c6a2bd5403a4267bd88570c11b1cda0-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 3160 COLUMNS
At line 20081 RHS
At line 23237 BOUNDS
At line 26443 ENDATA
Problem MODEL has 3155 rows, 2905 columns and 9305 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 2.5 - 0.03 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 819 strengthened rows, 618 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 503 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 67 strengthened rows, 0 substitutions
Cgl0004I processed model has 1633 rows, 1362 columns (1362 integer (1104 of which bin

DEBUG 2024-10-15 12:35:33,459 [coin_api.py:solve_CBC:165] /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/9757c17895fe45abac623a3bf24bb807-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/9757c17895fe45abac623a3bf24bb807-pulp.sol 


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/9757c17895fe45abac623a3bf24bb807-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/9757c17895fe45abac623a3bf24bb807-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 3160 COLUMNS
At line 20081 RHS
At line 23237 BOUNDS
At line 26443 ENDATA
Problem MODEL has 3155 rows, 2905 columns and 9305 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 2.5 - 0.03 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 816 strengthened rows, 618 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 506 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 64 strengthened rows, 0 substitutions
Cgl0004I processed model has 1633 rows, 1362 columns (1362 integer (1104 of which bin

In [42]:
for i in [0.1, 0.1, 0.5, 0.8]:
    no_new_lf_p_vars_solution, no_new_lf_pulp, no_new_lf_prob= lf_constraint_solve_no_new_lf(df=df_dummy_solver_input_no_new_lf, lf_acc_thresh=0.5, 
                            instance_acc_thresh=0.8,
                            min_non_abstain_thresh=i,
                            nlf_prefix='nlf_',
                            expected_label_col='expected_label')
    
#     print(f"dummy_x_nlfs_solution: {dummy_x_nlfs_solution}")
#     lfs_in_the_result = [x for x in cols if ('nlf_' not in x and x!='expected_label')] + [k for k,v in dummy_x_nlfs_solution.items() if v==1]
    solutions_no_new_lf.append(no_new_lf_p_vars_solution)

lf_acc: 0.5, ins_acc:0.8
labeling_functions: ['lf_1', 'lf_2', 'lf_3', 'lf_4', 'lf_5', 'lf_6', 'lf_7', 'lf_8', 'lf_9', 'lf_10']
num_instances: 20


DEBUG 2024-10-15 12:35:34,454 [coin_api.py:solve_CBC:165] /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/ecfe2a2147a94649891877a2236a221b-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/ecfe2a2147a94649891877a2236a221b-pulp.sol 


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/ecfe2a2147a94649891877a2236a221b-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/ecfe2a2147a94649891877a2236a221b-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 1655 COLUMNS
At line 11656 RHS
At line 13307 BOUNDS
At line 15308 ENDATA
Problem MODEL has 1650 rows, 1800 columns and 5200 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0 - 0.01 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 395 strengthened rows, 271 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 257 strengthened rows, 0 substitutions
Cgl0004I processed model has 901 rows, 837 columns (837 integer (666 of which binary)) and 3461 elements
Cutoff increment increased from 1e-05 to 0.9999
Cbc0038I I

DEBUG 2024-10-15 12:35:34,874 [coin_api.py:solve_CBC:165] /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/7fa99b778f2d438caf3caace0131de08-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/7fa99b778f2d438caf3caace0131de08-pulp.sol 


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/7fa99b778f2d438caf3caace0131de08-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/7fa99b778f2d438caf3caace0131de08-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 1655 COLUMNS
At line 11656 RHS
At line 13307 BOUNDS
At line 15308 ENDATA
Problem MODEL has 1650 rows, 1800 columns and 5200 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0 - 0.01 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 395 strengthened rows, 271 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 257 strengthened rows, 0 substitutions
Cgl0004I processed model has 901 rows, 837 columns (837 integer (666 of which binary)) and 3461 elements
Cutoff increment increased from 1e-05 to 0.9999
Cbc0038I I

DEBUG 2024-10-15 12:35:35,296 [coin_api.py:solve_CBC:165] /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/7c567c9bf39b4c1b9301068f76e19393-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/7c567c9bf39b4c1b9301068f76e19393-pulp.sol 


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/7c567c9bf39b4c1b9301068f76e19393-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/7c567c9bf39b4c1b9301068f76e19393-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 1655 COLUMNS
At line 11656 RHS
At line 13307 BOUNDS
At line 15308 ENDATA
Problem MODEL has 1650 rows, 1800 columns and 5200 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0 - 0.01 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 395 strengthened rows, 271 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 257 strengthened rows, 0 substitutions
Cgl0004I processed model has 901 rows, 837 columns (837 integer (666 of which binary)) and 3461 elements
Cutoff increment increased from 1e-05 to 0.9999
Cbc0038I I

DEBUG 2024-10-15 12:35:35,711 [coin_api.py:solve_CBC:165] /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/160f951095ac429f8ab106187a3c8446-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/160f951095ac429f8ab106187a3c8446-pulp.sol 


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/opc/.pyenv/versions/3.8.0/envs/label/lib/python3.8/site-packages/pulp/solverdir/cbc/linux/64/cbc /tmp/160f951095ac429f8ab106187a3c8446-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /tmp/160f951095ac429f8ab106187a3c8446-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 1655 COLUMNS
At line 11656 RHS
At line 13307 BOUNDS
At line 15308 ENDATA
Problem MODEL has 1650 rows, 1800 columns and 5200 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0 - 0.01 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 395 strengthened rows, 271 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 257 strengthened rows, 0 substitutions
Cgl0004I processed model has 901 rows, 837 columns (837 integer (666 of which binary)) and 3461 elements
Cutoff increment increased from 1e-05 to 0.9999
Cbc0038I I

In [43]:
solutions_no_new_lf[0]

Unnamed: 0,lf_1,lf_2,lf_3,lf_4,lf_5,lf_6,lf_7,lf_8,lf_9,lf_10
0,1,-1,-1,0,1,1,-1,1,1,-1
1,-1,-1,1,-1,-1,1,1,1,1,1
2,1,0,1,0,0,0,0,0,0,0
3,0,0,-1,0,-1,-1,1,0,0,-1
4,-1,-1,-1,0,-1,0,0,-1,0,-1
5,-1,-1,1,0,1,1,-1,1,-1,-1
6,1,-1,0,1,1,-1,1,1,1,1
7,0,1,-1,1,-1,1,-1,1,-1,1
8,1,0,0,0,0,-1,0,-1,-1,0
9,1,1,0,-1,-1,1,-1,-1,1,1


In [44]:
solutions_no_new_lf[2]

Unnamed: 0,lf_1,lf_2,lf_3,lf_4,lf_5,lf_6,lf_7,lf_8,lf_9,lf_10
0,1,-1,-1,0,1,1,-1,1,1,-1
1,-1,-1,1,-1,-1,1,1,1,1,1
2,1,0,1,0,0,0,0,0,0,0
3,1,0,-1,0,-1,-1,0,0,0,-1
4,-1,-1,-1,0,0,0,1,-1,0,-1
5,-1,-1,1,0,1,1,-1,1,-1,-1
6,1,-1,0,1,1,-1,1,1,1,1
7,0,1,-1,1,-1,1,-1,1,-1,1
8,1,0,0,0,0,-1,0,-1,-1,0
9,1,1,0,-1,-1,1,-1,-1,1,1


In [45]:
solutions_no_new_lf[3]

Unnamed: 0,lf_1,lf_2,lf_3,lf_4,lf_5,lf_6,lf_7,lf_8,lf_9,lf_10
0,1,1,1,0,1,1,-1,1,1,-1
1,1,-1,1,-1,1,1,1,1,1,1
2,1,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,-1,1,0,0,-1
4,0,0,-1,0,0,1,0,0,0,-1
5,1,1,1,0,1,1,1,1,-1,-1
6,1,-1,1,0,1,-1,1,1,1,1
7,0,1,-1,1,1,1,1,1,-1,1
8,1,0,0,0,0,-1,0,0,-1,0
9,1,1,1,-1,-1,0,1,1,1,1


In [46]:
df_dummy_solver_input

Unnamed: 0,lf_1,lf_2,lf_3,lf_4,lf_5,lf_6,lf_7,lf_8,lf_9,lf_10,nlf_1,nlf_2,nlf_3,nlf_4,nlf_5,expected_label
0,1,-1,-1,0,1,1,-1,1,1,-1,-1,-1,-1,-1,1,1
1,-1,-1,1,-1,-1,1,1,1,1,1,0,1,-1,0,1,1
2,1,0,1,0,0,1,0,0,0,0,-1,0,1,1,1,0
3,1,0,-1,0,-1,-1,1,0,0,-1,0,0,0,1,0,0
4,-1,-1,-1,0,-1,1,1,-1,0,-1,0,-1,-1,0,-1,0
5,-1,-1,1,0,0,1,-1,1,-1,-1,1,0,1,0,1,1
6,1,-1,0,0,1,-1,1,1,0,1,0,0,1,1,-1,1
7,0,1,-1,0,-1,1,-1,1,-1,1,1,1,1,1,0,1
8,1,1,0,0,0,-1,1,-1,-1,0,-1,1,0,0,0,0
9,1,1,0,-1,-1,0,0,-1,0,1,-1,1,1,-1,1,1


In [47]:
# lfs_in_the_result = [x for x in cols if ('nlf_' not in x and x!='expected_label')] + [k for k,v in dummy_x_nlfs_solution.items() if v==1]

In [48]:
# df_dummy_solver_input['expected_label']

In [49]:
# df_dummy_solver_input[lfs_in_the_result]

In [50]:
# dummy_p_vars_solution[lfs_in_the_result]

In [51]:
for i in range(len(solutions)-1):
    comparison = solutions[i]!= solutions[i+1]

    # Get the indices (row, column) where the values are different
    diff_indices = np.where(comparison)

#     Print the indices
    for row, col in zip(diff_indices[0], diff_indices[1]):
        print(f"Different value at row {row}, column {col}")
    num_differences = comparison.sum().sum()
    print(f"number of differentces between {i} and {i+1}: {num_differences}")

Different value at row 0, column 2
Different value at row 0, column 6
Different value at row 1, column 1
Different value at row 1, column 4
Different value at row 1, column 10
Different value at row 2, column 11
Different value at row 2, column 13
Different value at row 3, column 4
Different value at row 3, column 9
Different value at row 3, column 13
Different value at row 3, column 14
Different value at row 4, column 0
Different value at row 4, column 1
Different value at row 4, column 5
Different value at row 4, column 7
Different value at row 5, column 11
Different value at row 5, column 13
Different value at row 6, column 2
Different value at row 6, column 3
Different value at row 7, column 0
Different value at row 7, column 2
Different value at row 7, column 10
Different value at row 7, column 11
Different value at row 7, column 14
Different value at row 8, column 7
Different value at row 8, column 8
Different value at row 8, column 12
Different value at row 8, column 14
Differen

In [52]:
solutions_no_new_lf[0]

Unnamed: 0,lf_1,lf_2,lf_3,lf_4,lf_5,lf_6,lf_7,lf_8,lf_9,lf_10
0,1,-1,-1,0,1,1,-1,1,1,-1
1,-1,-1,1,-1,-1,1,1,1,1,1
2,1,0,1,0,0,0,0,0,0,0
3,0,0,-1,0,-1,-1,1,0,0,-1
4,-1,-1,-1,0,-1,0,0,-1,0,-1
5,-1,-1,1,0,1,1,-1,1,-1,-1
6,1,-1,0,1,1,-1,1,1,1,1
7,0,1,-1,1,-1,1,-1,1,-1,1
8,1,0,0,0,0,-1,0,-1,-1,0
9,1,1,0,-1,-1,1,-1,-1,1,1


In [53]:
for i in range(len(solutions_no_new_lf)-1):
    comparison = solutions_no_new_lf[i]!= solutions_no_new_lf[i+1]

    # Get the indices (row, column) where the values are different
    diff_indices = np.where(comparison)

#     Print the indices
    for row, col in zip(diff_indices[0], diff_indices[1]):
        print(f"Different value at row {row}, column {col}")
    num_differences = comparison.sum().sum()
    print(f"number of differentces between {i} and {i+1}: {num_differences}")

number of differentces between 0 and 1: 0
Different value at row 3, column 0
Different value at row 3, column 6
Different value at row 4, column 4
Different value at row 4, column 6
Different value at row 14, column 0
Different value at row 14, column 3
number of differentces between 1 and 2: 6
Different value at row 0, column 1
Different value at row 0, column 2
Different value at row 1, column 0
Different value at row 1, column 4
Different value at row 3, column 0
Different value at row 3, column 2
Different value at row 3, column 4
Different value at row 3, column 6
Different value at row 4, column 0
Different value at row 4, column 1
Different value at row 4, column 5
Different value at row 4, column 6
Different value at row 4, column 7
Different value at row 5, column 0
Different value at row 5, column 1
Different value at row 5, column 6
Different value at row 6, column 2
Different value at row 6, column 3
Different value at row 7, column 4
Different value at row 7, column 6
Diff

In [63]:
for i in range(len(solutions)-1):
    comparison = solutions[i][list(solutions_no_new_lf[0])]!= solutions[i+1][list(solutions_no_new_lf[0])]

    # Get the indices (row, column) where the values are different
    diff_indices = np.where(comparison)

#     Print the indices
#     for row, col in zip(diff_indices[0], diff_indices[1]):
#         print(f"Different value at row {row}, column {col}")
    num_differences = comparison.sum().sum()
    print(f"number of differentces between {i} and {i+1}: {num_differences}")

number of differentces between 0 and 1: 36
number of differentces between 1 and 2: 38


In [69]:
sol1 = solutions[0][list(solutions_no_new_lf[0])]
sol1['expected_label']=df_dummy_solver_input['expected_label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sol1['expected_label']=df_dummy_solver_input['expected_label']


In [71]:
sol1.to_csv('sol1.csv',index=False)

In [72]:
sol2 = solutions[1][list(solutions_no_new_lf[0])]
sol2['expected_label']=df_dummy_solver_input['expected_label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sol2['expected_label']=df_dummy_solver_input['expected_label']


In [74]:
sol2.to_csv('sol2.csv',index=False)

In [75]:
sol2

Unnamed: 0,lf_1,lf_2,lf_3,lf_4,lf_5,lf_6,lf_7,lf_8,lf_9,lf_10,expected_label
0,1,1,1,0,1,1,-1,1,1,1,1
1,1,-1,1,-1,1,1,1,1,1,1,1
2,0,0,1,0,0,1,0,0,0,0,0
3,0,0,0,0,0,-1,0,0,0,-1,0
4,0,0,0,0,0,1,0,-1,0,0,0
5,1,1,1,0,0,1,1,1,1,1,1
6,1,1,1,0,1,1,1,1,0,1,1
7,1,1,-1,0,1,1,1,1,1,1,1
8,0,0,0,0,0,-1,0,-1,0,0,0
9,1,1,0,1,1,1,1,1,0,1,1
