In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
from collections import Counter
import ipywidgets as widgets
from IPython.display import display, clear_output
import os as sys
import io
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import seaborn as sns
import matplotlib.pyplot as plt
import ast

#@title Calculating inter-annotator agreement
#@markdown # **Instructions**
#@markdown Please make sure a csv file is available in the main folder of your Google Drive ("MyDrive").
#@markdown Make sure it is named "screenshot.csv".

#@markdown Alternatively, you can upload a csv file directly by pressing Upload.  **Note:**
#@markdown * You'll need to press the Play button first.
#@markdown * This process might take some time.


#@markdown  # Please press on the Play button on the left to begin.
#@markdown After pressing Play and providing the csv, press Run to start.

# --- First menu: widgets for providing the CSV file ---
# Module-level Output widget. It is not referenced again in the visible code:
# displaySecondMenu creates and displays its own local Output.
output = widgets.Output()
# Disabled (read-only) text box that serves purely as an on-screen instruction.
name_box = widgets.Text(
    value="Please click Upload to upload your CSV file",
    continuous_update=False,
    disabled=True,
    layout=widgets.Layout(width='auto')
)
# Upload widget restricted to a single .csv file; on_run_click reads its value.
uploader = widgets.FileUpload(
    accept='.csv',  # Accepted file type
    multiple=False  # True to accept multiple files upload else False
)
# Button that triggers loading the CSV (wired to on_run_click further down).
run_button = widgets.Button(description='Run')

def on_run_click(b):
    """Load the annotations CSV and switch to the agreement menu.

    Click callback for the Run button. If a file was uploaded through the
    FileUpload widget it is parsed from memory; otherwise Google Drive is
    mounted and ``/content/drive/MyDrive/screenshot.csv`` is read. The parsed
    table is stored in the module-level ``data`` and handed to
    ``displaySecondMenu``.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    global data
    uploaded = uploader.value
    if len(uploaded) > 0:
        # ipywidgets 7 exposes ``value`` as a dict keyed by file name, whereas
        # ipywidgets 8 exposes a tuple of per-file dicts. Accept both so the
        # upload path does not fail with AttributeError on the newer API.
        if isinstance(uploaded, dict):
            uploaded_file = next(iter(uploaded.values()))
        else:
            uploaded_file = uploaded[0]
        content = uploaded_file['content']
        # Convert the uploaded bytes to a dataframe
        data = pd.read_csv(io.BytesIO(content))
    else:
        drive.mount('/content/drive')
        csv_path = '/content/drive/MyDrive/screenshot.csv'
        data = pd.read_csv(csv_path)

    # Hide the first menu before showing the second one
    name_box.layout.display = 'none'
    uploader.layout.display = 'none'
    run_button.layout.display = 'none'
    clear_output()

    displaySecondMenu(data)

# Wire the Run button to the CSV-loading callback defined above
run_button.on_click(on_run_click)
# Show the first menu: instruction text, upload widget and Run button
display(name_box, uploader, run_button)


#secondary menu
def displaySecondMenu(data):
    """Show the annotator / task selection menu for the loaded annotations.

    Builds two annotator dropdowns, a task dropdown and a
    'Calculate Agreement' button. Each click clears the cell output,
    re-displays the menu and calls ``calc_agreement`` with the selection.

    Args:
        data: The annotations table; passed to ``load_annotators``,
            ``create_tasks`` and ``calc_agreement``.
    """
    # Task names shown in the dropdown -> task keys passed to calc_agreement
    task_keys = {
        "Sentiment": 'sentiment',
        "Emotions": 'emotions',
        "NER": 'label',
        'Stats': 'Stats',
    }

    annotator_names = load_annotators(data)
    task_names = create_tasks(data)

    annotator1_dropdown = widgets.Dropdown(options=annotator_names, description='Annotator1:')
    annotator2_dropdown = widgets.Dropdown(options=annotator_names, description='Annotator2:')
    task_dropdown = widgets.Dropdown(options=task_names, description='Task:')
    calc_button = widgets.Button(description='Calculate Agreement')

    def show_menu():
        # ``output`` is bound further down, before this helper is first called.
        display(annotator1_dropdown, annotator2_dropdown,
                task_dropdown, calc_button, output)

    def on_calc_agreement_click(b):
        clear_output()
        annotator_a = annotator1_dropdown.value
        annotator_b = annotator2_dropdown.value
        # Unknown selections map to None, as in the dropdown-to-key table above
        task = task_keys.get(task_dropdown.value)
        show_menu()
        calc_agreement(data, annotator_a, annotator_b, task)

    calc_button.on_click(on_calc_agreement_click)
    output = widgets.Output()
    show_menu()






#Utility functions
def load_annotators(data):
    """Return the sorted, de-duplicated non-null values of the 'annotator' column."""
    named_annotators = data['annotator'].dropna()
    return np.unique(named_annotators.values)

def create_tasks(data):
    """List the task names that can be offered for ``data``.

    'Stats' is always offered; 'Sentiment', 'Emotions' and 'NER' are added
    (in that order) when the 'sentiment', 'emotions' and 'label' columns are
    present, respectively.
    """
    optional_tasks = (
        ('sentiment', 'Sentiment'),
        ('emotions', 'Emotions'),
        ('label', 'NER'),
    )
    available = [title for column, title in optional_tasks if column in data.columns]
    return ['Stats'] + available

def filter_following_sentences(data):
    """Keep only the rows whose 'Connected_sentence' value is missing."""
    not_connected = data['Connected_sentence'].isna()
    return data.loc[not_connected]

def filter_by_annotator(data, annotator):
    """Return one row per sample id for the given annotator.

    Rows are restricted to ``annotator``, duplicates on ('annotator', 'id')
    are dropped keeping the first occurrence, and only the id, sentiment,
    label, emotions and Connected_sentence columns are returned (as a copy).
    """
    kept_columns = ['id', 'sentiment', 'label', 'emotions', 'Connected_sentence']
    own_rows = data.loc[data['annotator'] == annotator]
    first_per_sample = own_rows.drop_duplicates(subset=['annotator', 'id'])
    return first_per_sample[kept_columns].copy()

def get_vmin_vmax(confusion_matrix, spread_threshold=200):
    """Choose colour-scale limits for a confusion-matrix heatmap.

    Args:
        confusion_matrix: DataFrame of counts.
        spread_threshold: When ``max - min`` exceeds this value the limits
            are derived from min / mean / max; otherwise from the
            inter-quartile range. Defaults to 200.

    Returns:
        A ``(vmin, vmax)`` tuple.
    """
    min_val = confusion_matrix.min().min()
    max_val = confusion_matrix.max().max()

    if max_val > min_val + spread_threshold:
        # Wide spread: cap the scale at the mean of (average, min, max) so a
        # few very large cells do not wash out the rest of the matrix.
        avg_val = confusion_matrix.mean().mean()
        vmin = min_val
        vmax = (avg_val + min_val + max_val) / 3
    else:
        # Narrow spread: use the IQR fences, clamping the lower one at zero.
        q1, q3 = np.percentile(confusion_matrix, [25, 75])
        iqr = q3 - q1
        vmin = max(0, q1 - 1.5 * iqr)
        vmax = q3 + 1.5 * iqr

    # A sparse matrix (mostly zeros) has q1 == q3, which collapses the IQR
    # range to a single value and would render the heatmap in one flat
    # colour. Fall back to the plain data range in that case.
    if vmax <= vmin:
        vmin, vmax = min_val, max_val

    return vmin, vmax

def transform_column(id_column, label_column):
    """Explode stringified span annotations into one row per annotation.

    Args:
        id_column: Iterable of sample ids.
        label_column: Iterable, parallel to ``id_column``, of strings that
            ``ast.literal_eval`` parses into a list of dicts with the keys
            'text', 'labels', 'start' and 'end'.

    Returns:
        DataFrame with the columns 'id', 'label1' (the raw annotation dict),
        'text', 'task1' (first entry of 'labels') and 'Interval'
        (``(start, end)``). The columns are present even when there are no
        annotations, so callers can index ``['id']`` on an empty result.
    """
    columns = ['id', 'label1', 'text', 'task1', 'Interval']
    rows = []
    for id_value, label_value in zip(id_column, label_column):
        # literal_eval only accepts Python literals, so no code is executed.
        for annotation in ast.literal_eval(label_value):
            rows.append({
                'id': id_value,
                'label1': annotation,
                'text': annotation['text'],
                'task1': annotation['labels'][0],
                'Interval': (annotation['start'], annotation['end']),
            })
    # Passing ``columns`` keeps the schema stable for an empty ``rows`` list.
    return pd.DataFrame(rows, columns=columns)

def interval_intersection(a, b):
    """Return the overlap of two ``(start, end)`` intervals, or None.

    Intervals that merely touch (overlap of zero length) count as
    non-overlapping.
    """
    overlap = (max(a[0], b[0]), min(a[1], b[1]))
    return overlap if overlap[0] < overlap[1] else None


## For verification: pass the annotator's filtered data together with the new data frame built for that same annotator
def verify_counts(data, df):
    """Check that every sample has as many rows in ``df`` as annotations.

    For each non-null id in ``data`` the first 'label' string is parsed and
    its length compared with the number of rows carrying that id in ``df``.
    The first mismatch is printed and False is returned; True otherwise.
    """
    for sample_id in data['id'].dropna().unique():
        raw_label = data.loc[data['id'] == sample_id, 'label'].iloc[0]
        expected = len(ast.literal_eval(raw_label))
        actual = int((df['id'] == sample_id).sum())
        if expected == actual:
            continue
        print(f"Error: id {sample_id} has {expected} tasks but {actual} rows")
        return False
    return True

def calculate_metrics(annotator_a_data, annotator_b_data, partial_ratio_threshold=0.5):
    """Compare two annotators' spans sample by sample.

    Both inputs need the columns 'id', 'text', 'task1' and 'Interval'. For
    every sample id of annotator A, each A span is paired with the first
    overlapping B span (if any) and graded under four schemas; B spans that
    overlap no A span are reported as spurious.

    Args:
        annotator_a_data: Spans of annotator A (treated as the reference).
        annotator_b_data: Spans of annotator B.
        partial_ratio_threshold: Minimum ``2 * overlap / (len_a + len_b)``
            ratio for the 'Partial' schema to be graded 'PAR'.

    Returns:
        DataFrame with one row per graded span and the columns 'id',
        the A/B entity types and surface strings, and 'Type', 'Partial',
        'Exact', 'Strict' holding 'COR', 'INC', 'PAR', 'MIS' or 'SPU'.
    """
    rows = []

    for sample_id in annotator_a_data['id'].unique():
        a_spans = annotator_a_data[annotator_a_data['id'] == sample_id]
        b_spans = annotator_b_data[annotator_b_data['id'] == sample_id]

        # Grade every span of annotator A; it stays 'MIS' unless B overlaps it.
        for _, a_span in a_spans.iterrows():
            row = {
                'id': sample_id,
                'Annotator A Entity Type': a_span['task1'],
                'Annotator A Surface String': a_span['text'],
                'Annotator B Entity Type': '',
                'Annotator B Surface String': '',
                'Type': 'MIS',
                'Partial': 'MIS',
                'Exact': 'MIS',
                'Strict': 'MIS'
            }

            for _, b_span in b_spans.iterrows():
                shared = interval_intersection(a_span['Interval'], b_span['Interval'])
                if shared is None:
                    continue

                same_type = a_span['task1'] == b_span['task1']
                same_text = a_span['text'] == b_span['text']

                row['Annotator B Entity Type'] = b_span['task1']
                row['Annotator B Surface String'] = b_span['text']
                row['Type'] = 'COR' if same_type else 'INC'
                row['Exact'] = 'COR' if same_text else 'INC'
                # Strict requires both the entity type and the text to match.
                row['Strict'] = 'COR' if (same_type and same_text) else 'INC'

                # Overlap ratio relative to the combined span lengths
                shared_length = shared[1] - shared[0]
                a_length = a_span['Interval'][1] - a_span['Interval'][0]
                b_length = b_span['Interval'][1] - b_span['Interval'][0]
                ratio = 2 * shared_length / (a_length + b_length)
                row['Partial'] = 'PAR' if ratio >= partial_ratio_threshold else row['Exact']

                # Only the first overlapping B span is considered.
                break

            rows.append(row)

        # B spans with no overlapping A span are spurious.
        for _, b_span in b_spans.iterrows():
            overlaps_a = any(
                interval_intersection(a_span['Interval'], b_span['Interval']) is not None
                for _, a_span in a_spans.iterrows()
            )
            if overlaps_a:
                continue
            rows.append({
                'id': sample_id,
                'Annotator A Entity Type': '',
                'Annotator A Surface String': '',
                'Annotator B Entity Type': b_span['task1'],
                'Annotator B Surface String': b_span['text'],
                'Type': 'SPU',
                'Partial': 'SPU',
                'Exact': 'SPU',
                'Strict': 'SPU'
            })

    return pd.DataFrame(rows)

def calculate_scores(results_df):
    """Aggregate per-span grades into counts and precision / recall / F1.

    Args:
        results_df: DataFrame with the columns 'Type', 'Partial', 'Exact'
            and 'Strict', each holding 'COR', 'INC', 'PAR', 'MIS' or 'SPU'
            (the output of ``calculate_metrics``).

    Returns:
        DataFrame with a 'Measure' column and one column per schema. Rows
        are, in order: Correct (formatted as a string), Incorrect, Partial,
        Missed, Spurius, Precision, Recall, F1. A ratio whose denominator is
        zero is reported as 0.0 instead of NaN.
    """
    def _safe_ratio(numerator, denominator):
        # No annotations / no matches means a score of zero, not NaN.
        return numerator / denominator if denominator else 0.0

    # Reference annotations: everything except spurious ones
    pos = (results_df['Type'] != 'SPU').sum()

    # Produced annotations: everything except missed ones
    act = (results_df['Type'] != 'MIS').sum()

    scores = {
        'Measure': ['Correct', 'Incorrect', 'Partial', 'Missed', 'Spurius', 'Precision', 'Recall', 'F1']
    }

    for schema in ['Type', 'Partial', 'Exact', 'Strict']:
        grades = results_df[schema]
        cor = (grades == 'COR').sum()
        inc = (grades == 'INC').sum()
        par = (grades == 'PAR').sum()
        mis = (grades == 'MIS').sum()
        spu = (grades == 'SPU').sum()

        # Only the 'Partial' schema credits partial matches.
        hits = cor + par if schema == 'Partial' else cor
        precision = _safe_ratio(hits, act)
        recall = _safe_ratio(hits, pos)
        f1 = _safe_ratio(2 * precision * recall, precision + recall)

        scores[schema] = ['{:.0f}'.format(cor), inc, par, mis, spu, precision, recall, f1]

    return pd.DataFrame(scores)



def cal_average_NER(scores_dfA, scores_dfB):
    """Average two score tables and return their last three rows.

    The value columns (everything after the first column) of the first
    eight rows are averaged element-wise; with the layout produced by
    ``calculate_scores`` the returned rows are Precision, Recall and F1.
    """
    values_a = scores_dfA.iloc[:8, 1:].copy().astype(float)
    values_b = scores_dfB.iloc[:8, 1:].copy().astype(float)

    average = scores_dfA.copy()
    average.iloc[:8, 1:] = (values_a + values_b) / 2
    # The first five rows (the count measures) are stored back as strings.
    counts_as_text = average.iloc[:5, 1:].astype(str)
    average.iloc[:5, 1:] = counts_as_text
    return average.tail(3)



def labelcalc():
    """Compute and display the averaged NER agreement scores.

    Reads the module-level ``annotator_a_task`` / ``annotator_b_task``
    (frames with 'id' and 'label' columns), expands them into per-span
    frames stored in the globals ``dfa`` / ``dfb``, grades A against B and
    B against A, and displays the average of the two score tables.
    """
    global dfa
    global dfb
    # Drop incomplete rows as a unit. Dropping nulls from 'id' and 'label'
    # separately could leave columns of different lengths, and zipping those
    # would pair an id with another row's label.
    rows_a = annotator_a_task.dropna(subset=['id', 'label'])
    dfa = transform_column(rows_a['id'], rows_a['label'])
    rows_b = annotator_b_task.dropna(subset=['id', 'label'])
    dfb = transform_column(rows_b['id'], rows_b['label'])

    # Grade in both directions so neither annotator is privileged as the
    # reference, then average the two score tables.
    results_dfA = calculate_metrics(dfa, dfb)
    results_dfB = calculate_metrics(dfb, dfa)
    scores_dfA = calculate_scores(results_dfA)
    scores_dfB = calculate_scores(results_dfB)
    display(cal_average_NER(scores_dfA, scores_dfB))

def display_stats(data):
    """Print dataset-wide counts and display a per-annotator summary table.

    Uses the module-level ``annotator_a_data`` / ``annotator_b_data`` frames
    (set by ``calc_agreement``) for the per-annotator figures.

    Args:
        data: The full annotations table; needs an 'id' column.
    """
    # Count each sample once. Rows without an id cannot be de-duplicated,
    # so every one of them is counted as its own sample.
    null_data = data[data['id'].isna()]
    nonnull_data = data[data['id'].notna()].drop_duplicates(subset='id')
    overall_samples = pd.concat([null_data, nonnull_data]).shape[0]
    id_counts = data['id'].value_counts()
    annotated_more_than_one = sum(id_counts > 1)
    print(f"\n\n There are overall {data.shape[0]} annotations over {overall_samples} samples.\n {annotated_more_than_one} samples were annotated by more than a single annotator. \n\n")

    def _annotator_summary(annotator_data):
        # Total, total without connected sentences, connected sentences,
        # emotion annotations, sentiment annotations.
        total = annotator_data.shape[0]
        standalone = filter_following_sentences(annotator_data).shape[0]
        emotions = annotator_data[annotator_data['emotions'].notnull()].shape[0]
        sentiment = annotator_data[annotator_data['sentiment'].notnull()].shape[0]
        return [total, standalone, total - standalone, emotions, sentiment]

    joined_data = pd.merge(annotator_a_data, annotator_b_data, on='id', how='inner', suffixes=('_a', '_b'))
    # A sample only counts as an overlap for a task when BOTH annotators
    # actually provided a value for it (the same rule calc_agreement applies).
    # Counting all joined rows would make the two figures always identical.
    num_overlap_sentiment = int(joined_data[['sentiment_a', 'sentiment_b']].notna().all(axis=1).sum())
    num_overlap_emotion = int(joined_data[['emotions_a', 'emotions_b']].notna().all(axis=1).sum())
    overlaps = [num_overlap_sentiment, num_overlap_emotion]

    string_dropped = "Annotations after droping conneced sentences"
    combined_df = pd.DataFrame({
        'Metric': ['Total annotations', string_dropped, 'Number of connected sentences marked', 'Emotion annotations', 'Sentiment annotations', 'Overlap Sentiment', 'Overlap Emotion'],
        'Annotator A': _annotator_summary(annotator_a_data) + overlaps,
        'Annotator B': _annotator_summary(annotator_b_data) + overlaps,
    })

    # Styling: centred cells and headers, index column hidden
    styled_df = combined_df.style.set_properties(**{'text-align': 'center'}).set_table_styles([
        {'selector': 'th.col_heading', 'props': 'text-align: center'},
        {'selector': 'caption', 'props': [('text-align', 'center'), ('font-size', '11pt'), ('font-weight', 'bold')]}
    ])
    styled_df = styled_df.hide(axis="index")

    display(styled_df)

#Logic Functions:
def task_agreement(task, annotator_a_data, annotator_b_data):
    """Compute and present the agreement between two annotators for a task.

    For ``task == 'label'`` the id/label frames are stored in the globals
    ``annotator_a_task`` / ``annotator_b_task`` and ``labelcalc`` is run.
    For any other task, Cohen's kappa is computed on the two annotators'
    values for the samples they share and passed to
    ``visualize_agreement_matrix``.

    Args:
        task: Column name of the task ('label', 'sentiment', 'emotions').
        annotator_a_data: Annotator A's rows; needs 'id' and ``task``.
        annotator_b_data: Annotator B's rows; needs 'id' and ``task``.
    """
    global annotator_a_task
    global annotator_b_task
    if task == 'label':
        annotator_a_task = annotator_a_data[['id', task]].copy()
        annotator_b_task = annotator_b_data[['id', task]].copy()
        labelcalc()
    else:
        # cohen_kappa_score compares its inputs position by position, so the
        # two columns must be aligned on the sample id first. The annotators'
        # rows are not guaranteed to be in the same order.
        paired = pd.merge(
            annotator_a_data[['id', task]],
            annotator_b_data[['id', task]],
            on='id', how='inner', suffixes=('_a', '_b'),
        )
        annotator_a_task = paired[f'{task}_a']
        annotator_b_task = paired[f'{task}_b']
        agreement = cohen_kappa_score(annotator_a_task, annotator_b_task)
        visualize_agreement_matrix(task, agreement)

def calc_agreement(data, annotator_a, annotator_b, task):
    """Run the selected task for a pair of annotators.

    Stores each annotator's rows in the globals ``annotator_a_data`` /
    ``annotator_b_data``. For ``task == 'Stats'`` the statistics table is
    shown. Otherwise both frames are restricted to the sample ids for which
    both annotators have a non-null ``task`` value (kept in the global
    ``overlap_ids``) and ``task_agreement`` is called, or a message is
    printed when there is no overlap.

    Args:
        data: The full annotations table.
        annotator_a: Name of the first annotator.
        annotator_b: Name of the second annotator.
        task: 'Stats', 'sentiment', 'emotions' or 'label'.
    """
    global annotator_a_data
    global annotator_b_data
    global overlap_ids

    # NOTE(review): an earlier "remove connected sentences" flag computed
    # filter_following_sentences(data) here and then immediately overwrote
    # the result with the unfiltered ``data``. The dead computation has been
    # removed; the effective behaviour (no filtering) is unchanged. Confirm
    # whether connected sentences should be excluded for non-NER tasks.
    newdata = data
    annotator_a_data = filter_by_annotator(newdata, annotator_a)
    annotator_b_data = filter_by_annotator(newdata, annotator_b)

    if task == 'Stats':
        display_stats(newdata)
        return

    # Samples for which both annotators provided a value for this task
    annotator_a_data_notnull = annotator_a_data[annotator_a_data[task].notnull()]
    annotator_b_data_notnull = annotator_b_data[annotator_b_data[task].notnull()]
    overlap_ids = set(annotator_a_data_notnull['id']).intersection(set(annotator_b_data_notnull['id']))

    annotator_a_data = annotator_a_data[annotator_a_data['id'].isin(overlap_ids)]
    annotator_b_data = annotator_b_data[annotator_b_data['id'].isin(overlap_ids)]
    # Categorical columns are compared as strings downstream.
    annotator_a_data = annotator_a_data.astype({'sentiment': str, 'emotions': str})
    annotator_b_data = annotator_b_data.astype({'sentiment': str, 'emotions': str})

    if len(overlap_ids) > 0:
        task_agreement(task, annotator_a_data, annotator_b_data)
    else:
        print("Selected annotators have no overlapping annotations.")


def basic_agreement(task):
    """Return the fraction of shared samples on which both annotators agree.

    Joins the module-level ``annotator_a_data`` and ``annotator_b_data`` on
    'id' and compares the ``task`` column of each side ('sentiment' or
    'emotions').
    """
    merged = pd.merge(annotator_a_data, annotator_b_data, on='id', how='inner', suffixes=('_a', '_b'))
    comparable = merged[['id', 'sentiment_a', 'sentiment_b', 'emotions_a', 'emotions_b']]
    column_a, column_b = f'{task}_a', f'{task}_b'
    pairs = comparable[['id', column_a, column_b]]
    same_answer = pairs[pairs[column_a] == pairs[column_b]]
    return same_answer.shape[0] / pairs.shape[0]

def visualize_agreement_matrix(task, kappa):
    """Plot the agreement (confusion) matrix of two annotators for a task.

    Joins the module-level ``annotator_a_data`` and ``annotator_b_data`` on
    'id', counts every (A value, B value) pair and draws the counts as a
    heatmap with rows for annotator A and columns for annotator B. The title
    shows Cohen's kappa and the plain agreement ratio.

    Args:
        task: Task column name, e.g. 'sentiment' or 'emotions'.
        kappa: Pre-computed Cohen's kappa, shown in the title.
    """
    joined_data = pd.merge(annotator_a_data, annotator_b_data, on='id', how='inner', suffixes=('_a', '_b'))
    column_a, column_b = f'{task}_a', f'{task}_b'
    df = joined_data[['id', column_a, column_b]]

    # Use the values seen by EITHER annotator as the axes. Taking only
    # annotator A's values would silently drop every pair in which B chose a
    # value that A never used.
    values = np.flip(np.sort(pd.concat([df[column_a], df[column_b]]).unique()))

    confusion_matrix = pd.DataFrame(0, index=values, columns=values)
    basic_agreement_val = basic_agreement(f'{task}')
    for i in values:
        for j in values:
            confusion_matrix.loc[i, j] = len(df[(df[column_a] == i) & (df[column_b] == j)])

    # Local names chosen so the built-in min / max are not shadowed.
    vmin, vmax = get_vmin_vmax(confusion_matrix)
    sns.heatmap(confusion_matrix, cmap='RdYlBu', annot=True, fmt='.5g', vmin=vmin, vmax=vmax)
    title = f'Agreement Matrix for {task}\n\n Cohen’s Kappa: = {kappa:.2f}\n IAA Basic score: = {basic_agreement_val:.2f}'
    title = '\n'.join(line.center(20) for line in title.split('\n'))
    plt.title(title)
    plt.show()


Text(value='Please click Upload to upload your CSV file', continuous_update=False, disabled=True, layout=Layou…

FileUpload(value={}, accept='.csv', description='Upload')

Button(description='Run', style=ButtonStyle())