In [3]:
import random
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from cleanlab.multilabel_classification.filter import find_label_issues
from cleanlab.multilabel_classification.rank import get_label_quality_scores
from cleanlab.internal.multilabel_utils import int2onehot, onehot2int
from cleanlab.multiannotator import get_majority_vote_label, get_label_quality_multiannotator, get_active_learning_scores

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset = pd.read_csv("data/datafiles/annotationRecords1.csv")

In [5]:
pred_probs_df = pd.read_csv("logs/2023-10-13T16-49-47_four/Predcition_prob.csv")
labels_df=pd.read_csv("logs/2023-10-13T16-49-47_four/Predcition_pred.csv")

In [6]:
#get image name from path
pred_probs_df['image_name'] = pred_probs_df['path'].apply(lambda x: x.split('/')[-1])

In [7]:
labels_df['image_name'] = labels_df['path'].apply(lambda x: x.split('/')[-1])

In [8]:
#checking if same images exist in both 
is_same_values = (pred_probs_df['image_name'] == labels_df['image_name']).all()

In [9]:
is_same_values

True

In [10]:
#only retaining the images that are in the probs.csv
df1 = dataset[dataset['image_name'].isin(pred_probs_df['image_name'])]

In [11]:
df1.columns

Index(['_id', 'annotator', 'au10_raising_of_upper_lip',
       'au12_pulling_at_corner_lip', 'au20_lip_stretcher', 'au24_lip_presser',
       'au25_parting_lips', 'au26_jaw_drop', 'au27_mouth_stretch',
       'au43_eyes_closed', 'au4_brow_lowering', 'au6_cheek_raising',
       'au7_tightning_of_eyelids', 'au9_wrinkling_of_nose', 'comments', 'date',
       'end_time', 'filepath', 'image_name', 'no_particular_expression',
       'patientID', 'smile', 'start_time', 'time_in_seconds',
       'wrinkled_forehead', 'unclear'],
      dtype='object')

In [12]:
df1=df1.drop(columns=['comments','_id','date','end_time', 'filepath','start_time', 'time_in_seconds','no_particular_expression',
       'patientID', 'smile', 'start_time', 'time_in_seconds',
       'wrinkled_forehead', 'unclear'])

In [13]:
#renaming columns for consistency
name_map={'au10_raising_of_upper_lip':'AU10',
       'au12_pulling_at_corner_lip':'AU12', 'au20_lip_stretcher':'AU20', 'au24_lip_presser':'AU24',
       'au25_parting_lips':'AU25', 'au26_jaw_drop':'AU26', 'au27_mouth_stretch':'AU27',
       'au43_eyes_closed':'AU43', 'au4_brow_lowering':'AU4', 'au6_cheek_raising':'AU6',
       'au7_tightning_of_eyelids':'AU7', 'au9_wrinkling_of_nose':'AU9'}
df1.rename(columns=name_map, inplace=True)

In [14]:
#grouping by image_name
grouped_df = df1.groupby(['image_name','annotator'], as_index=False).first()
grouped_df

Unnamed: 0,image_name,annotator,AU10,AU12,AU20,AU24,AU25,AU26,AU27,AU43,AU4,AU6,AU7,AU9
0,2021-06-01 15-01-02_000000000519.jpg,babatundeshofolu,0,0,0,0,1,0,0,1,0,0,0,0
1,2021-06-01 15-01-02_000000000519.jpg,hannahweisman,0,0,0,0,1,0,0,1,0,0,0,0
2,2021-06-01 15-01-02_000000000519.jpg,jennifer.noa,0,0,0,0,1,0,0,1,0,0,0,0
3,2021-06-01 15-01-02_000000000519.jpg,rishika.patel@ufl.edu\health,0,0,0,0,1,0,0,1,0,0,0,0
4,2021-06-01 15-01-02_000000000525.jpg,babatundeshofolu,0,0,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128047,2022-03-30 17-14-24_000000001763.jpg,aribahali,0,0,0,0,1,0,0,0,0,0,0,0
128048,2022-03-30 17-14-24_000000001764.jpg,aribahali,0,0,0,0,0,1,0,0,0,0,0,0
128049,2022-03-30 17-14-24_000000001765.jpg,aribahali,0,0,0,0,0,1,0,0,0,0,0,0
128050,2022-03-30 17-14-24_000000001767.jpg,aribahali,0,0,0,0,1,0,0,0,0,0,0,0


In [15]:
labels_df.columns

Index(['path', 'AU1', 'AU2', 'AU4', 'AU6', 'AU7', 'AU9', 'AU10', 'AU12',
       'AU14', 'AU15', 'AU17', 'AU20', 'AU23', 'AU24', 'AU25', 'AU26', 'AU27',
       'AU43', 'image_name'],
      dtype='object')

In [16]:
labels_df=labels_df.drop(columns=['path','AU1','AU2', 'AU14','AU15', 'AU17','AU23'])

In [17]:
order_mapping = {value: index for index, value in enumerate(labels_df['image_name'])}

grouped_df['order'] = grouped_df['image_name'].map(order_mapping)

In [18]:
grouped_df = grouped_df.sort_values(by='order').drop(columns='order')

In [19]:
grouped_df

Unnamed: 0,image_name,annotator,AU10,AU12,AU20,AU24,AU25,AU26,AU27,AU43,AU4,AU6,AU7,AU9
79919,2022-01-29 08-54-02_000000000260.jpg,hannahweisman,0,0,0,0,1,0,0,1,0,0,0,0
79918,2022-01-29 08-54-02_000000000260.jpg,ezequielbautista,0,0,0,0,1,0,0,1,0,0,0,0
79920,2022-01-29 08-54-02_000000000260.jpg,kmaisuria,0,0,0,0,0,1,0,1,0,0,0,0
79921,2022-01-29 08-54-02_000000000260.jpg,wkratochvil,0,0,0,0,0,1,0,1,0,0,0,0
79924,2022-01-29 09-09-07_000000000184.jpg,kmaisuria,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71516,2022-01-20 08-37-38_000000000430.jpg,hannahweisman,0,0,0,0,1,0,0,1,0,0,0,0
71517,2022-01-20 08-37-38_000000000459.jpg,hannahweisman,0,0,0,0,0,0,0,1,0,0,0,0
71518,2022-01-20 08-37-38_000000000460.jpg,hannahweisman,0,0,0,0,1,0,0,1,0,0,0,0
71519,2022-01-20 08-37-38_000000000462.jpg,hannahweisman,0,0,0,0,1,0,0,1,0,0,0,0


### reshaping to (N,K) 

In [20]:
df4= grouped_df[['image_name','annotator','AU4']]

In [21]:
multi_annotator_labels_au4 = df4.pivot(index='image_name', columns='annotator', values='AU4')

In [22]:
multi_annotator_labels_au4.reset_index(inplace=True)

In [23]:
order_mapping = {value: index for index, value in enumerate(labels_df['image_name'])}
multi_annotator_labels_au4['order'] = multi_annotator_labels_au4['image_name'].map(order_mapping)
multi_annotator_labels_au4 = multi_annotator_labels_au4.sort_values(by='order').drop(columns='order')

In [24]:
multi_annotator_labels_au4.shape

(54714, 10)

In [25]:
pred_probs_df = pred_probs_df.drop_duplicates(subset='image_name')


In [26]:
labels_df = labels_df.drop_duplicates(subset='image_name')


In [27]:
multi_annotator_labels_au4=multi_annotator_labels_au4.drop(columns=['image_name'])
multi_annotator_labels_au4.index.name = None

In [28]:
pred_prob = pred_probs_df.drop(columns=[ 'path','AU1', 'AU2', 'AU6', 'AU7', 'AU9', 'AU10', 'AU12', 'AU14',
       'AU15', 'AU17', 'AU20', 'AU23', 'AU24', 'AU25', 'AU26', 'AU27', 'AU43'])
pred_probs = pred_prob.iloc[:, 0:].values

In [29]:
labels_df.columns
label=labels_df.drop(columns=['AU6', 'AU7', 'AU9', 'AU10', 'AU12', 'AU20', 'AU24', 'AU25',
       'AU26', 'AU27', 'AU43'])

In [30]:
pred_prob.rename(columns={'AU4': 'probability'}, inplace=True)
prob = pd.merge(pred_prob, label, on='image_name', how='inner')

In [31]:
df_au4_0 = prob[prob['AU4'] == 0]
df_au4_1 = prob[prob['AU4'] == 1]

In [32]:
df_au4_0.rename(columns={'AU4': 'NotAU4'}, inplace=True)
df_au4_0.rename(columns={'probability': 'Notprobability'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_au4_0.rename(columns={'AU4': 'NotAU4'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_au4_0.rename(columns={'probability': 'Notprobability'}, inplace=True)


In [33]:
df_au4_0['AU4']=1.0
df_au4_0['probability'] = 1 - df_au4_0['Notprobability']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_au4_0['AU4']=1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_au4_0['probability'] = 1 - df_au4_0['Notprobability']


In [34]:
df_au4_1['NotAU4']=0.0
df_au4_1['Notprobability'] = 1 - df_au4_1['probability']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_au4_1['NotAU4']=0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_au4_1['Notprobability'] = 1 - df_au4_1['probability']


In [35]:
probability = pd.concat([df_au4_0, df_au4_1], ignore_index=True)

In [36]:
probability = probability.drop(columns=['image_name','AU4','NotAU4'])

In [37]:
multi_annotator_labels_au4.index.name = None

In [38]:
probability=probability.iloc[:, 0:].values

### clean lab

In [39]:
results = get_label_quality_multiannotator(multi_annotator_labels_au4, probability, consensus_method=['majority_vote','best_quality'],verbose=False)

In [42]:
results["label_quality"]

Unnamed: 0,consensus_label,consensus_quality_score,annotator_agreement,num_annotations,consensus_label_best_quality,consensus_quality_score_best_quality,annotator_agreement_best_quality
38489,0,0.898590,1.0,4,0,0.895343,1.0
38490,0,0.898590,1.0,4,0,0.895342,1.0
38491,0,0.898590,1.0,4,0,0.895342,1.0
38492,0,0.898590,1.0,4,0,0.895343,1.0
38493,0,0.898590,1.0,4,0,0.895343,1.0
...,...,...,...,...,...,...,...
36794,0,0.898595,1.0,1,0,0.895355,1.0
36795,0,0.898595,1.0,1,0,0.895355,1.0
36796,0,0.898594,1.0,1,0,0.895354,1.0
36797,0,0.898594,1.0,1,0,0.895354,1.0


In [43]:
results["label_quality"]


Unnamed: 0,consensus_label,consensus_quality_score,annotator_agreement,num_annotations,consensus_label_best_quality,consensus_quality_score_best_quality,annotator_agreement_best_quality
38489,0,0.898590,1.0,4,0,0.895343,1.0
38490,0,0.898590,1.0,4,0,0.895342,1.0
38491,0,0.898590,1.0,4,0,0.895342,1.0
38492,0,0.898590,1.0,4,0,0.895343,1.0
38493,0,0.898590,1.0,4,0,0.895343,1.0
...,...,...,...,...,...,...,...
36794,0,0.898595,1.0,1,0,0.895355,1.0
36795,0,0.898595,1.0,1,0,0.895355,1.0
36796,0,0.898594,1.0,1,0,0.895354,1.0
36797,0,0.898594,1.0,1,0,0.895354,1.0


In [44]:
results["annotator_stats"]

Unnamed: 0_level_0,annotator_quality,agreement_with_consensus,worst_class,num_examples_labeled
annotator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
wkratochvil,0.666942,0.666944,1,1204
ezequielbautista,0.773808,0.773809,0,16181
kmaisuria,0.858978,0.858978,1,3269
kaelynnrodriguez,0.864455,0.864456,1,9923
aribahali,0.885981,0.897144,1,17714
babatundeshofolu,0.886172,0.886172,1,5930
hannahweisman,0.920045,0.947756,1,45039
jennifer.noa,0.965931,0.965931,1,8160
rishika.patel@ufl.edu\health,0.969367,0.969368,1,20632


In [46]:
consensus_quality_score = results["label_quality"]["consensus_quality_score"]

In [262]:
true_label = []

for value in pred:
    true_label.append(value)
truth_labels = np.array(true_label)
truth_labels

array(['dataset', 'path', 'AU1', 'AU2', 'AU4', 'AU6', 'AU7', 'AU9',
       'AU10', 'AU12', 'AU14', 'AU15', 'AU17', 'AU20', 'AU23', 'AU24',
       'AU25', 'AU26', 'AU27', 'AU43', 'image_name'], dtype='<U10')

In [199]:
multi_annotator_labels_au4 = multi_annotator_labels_au4[multi_annotator_labels_au4.notna().any(axis=1)]

In [231]:
probability.shape

(12549, 2)

In [201]:
empty_array = np.full((12549, 2), None, dtype=object)

In [69]:
active_learning_scores = get_active_learning_scores(multi_annotator_labels_au4, probability,None)

In [65]:
active_learning_scores=np.array(active_learning_scores[0],dtype=object)

In [66]:
def get_idx_to_label(
    active_learning_scores,
    batch_size_to_label=100,
    active_learning_scores_unlabeled=None,
):
    if active_learning_scores_unlabeled is None:
        active_learning_scores_unlabeled = np.array([])
    
    num_labeled = len(active_learning_scores)

    active_learning_scores_combined = np.concatenate((active_learning_scores, active_learning_scores_unlabeled))

    if batch_size_to_label > len(active_learning_scores_combined):

        raise ValueError("num_examples_to_relabel is larger than the total number of examples available")

    to_label_idx_combined = np.argsort(active_learning_scores_combined)[:batch_size_to_label]
    to_label_idx = to_label_idx_combined[to_label_idx_combined < num_labeled]
    to_label_idx_unlabeled = (to_label_idx_combined[to_label_idx_combined >= num_labeled] - num_labeled)

    return to_label_idx, to_label_idx_unlabeled

In [67]:
relabel_idx, relabel_idx_unlabeled = get_idx_to_label(
        active_learning_scores=active_learning_scores,
        active_learning_scores_unlabeled=None,
        batch_size_to_label=100,
    )

In [68]:
relabel_idx

array([25136, 25108, 25137, 25134, 24309, 25109, 48055, 48046, 48052,
       48053, 48047, 48049, 48048, 48051, 48043, 48056, 48045, 48038,
       48036, 48039, 48037, 48050, 48042, 48044, 48040, 48054, 48041,
       18443, 21033, 20583, 18187, 18452, 18480, 18181, 18482, 18299,
       18444, 18186, 18486, 18483, 21036, 21037, 21039, 18481, 18474,
       21456, 18357, 23222, 18358, 23223, 18310, 18182, 20210, 17988,
         946,  1204,  3742,   947,   753,  1345,  1241,  3256,  3265,
        3257,  2565,  2665,  2664,  1250,  2650,  2723,  1251,  3264,
         658,  2689,  3258,   714,   715,  2566,  1239,  1249,  2721,
         384,  3263,   550,   551,  3255,  3205,  2649,  1240,  1448,
        1473,   231,  2595,  3274,  1242,  1247,  1248,  3267,  1253,
        3472])