# Phase 1 - Evaluate annotators and get true label

## Imports

In [719]:
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

## Part 1

### One hot encoding

In [720]:
def get_encoded_annotations(df: pd.DataFrame, num_annotators: int):
	"""One-hot encode the labels given by every annotator

	The annotations are read from the columns at positions 1 to
	`num_annotators` of `df`. Missing labels are replaced by the extra label
	"missing" before encoding, and the categories found are printed.

	Args:
		df (pd.DataFrame): The original data from the csv file
		num_annotators (int): The number of annotators

	Returns:
		tuple(2): (
			OneHotEncoder: The encoder object used to encode the labels
			list: The encoded annotations for each annotator
		)
	"""
	cols = df.columns

	# Fill the missing labels with the new label "missing"
	# (fillna returns a copy, the caller's dataframe is left untouched)
	df = df.fillna("missing")

	# Encode each label using the One-Hot-Encoding technique.
	# The `sparse` keyword was renamed `sparse_output` in scikit-learn 1.2 and
	# later removed, so try the new name first and fall back to the old one.
	try:
		encoder = OneHotEncoder(sparse_output=False)
	except TypeError:
		encoder = OneHotEncoder(sparse=False)

	# Stack the annotations of all annotators into a single column, one
	# annotator after the other, so that a single encoder sees every label
	stacked = np.array([
		df[cols[1 + i]] for i in range(num_annotators)
		]).reshape(-1, 1)
	res = encoder.fit_transform(stacked)

	# Give each annotator its own slice of the encoded rows back
	chunk_size = len(res) // num_annotators

	res_annot = [
		res[i * chunk_size : (i + 1) * chunk_size]
		for i in range(num_annotators)
		]

	print(f"Categories: {list(encoder.categories_[0])}")

	return (encoder, res_annot)

### Kappa score

In [721]:
def get_kappa(annot_1, annot_2):
	"""Compute Cohen's kappa between the annotations of two annotators

	Both inputs are flattened before being compared, so the score is computed
	over the individual encoded values.
	NOTE(review): for one-hot matrices this measures agreement per 0/1 cell
	rather than per label - confirm this is the intended statistic.

	Args:
		annot_1 (np.array): Encoded annotations of annotator 1
		annot_2 (np.array): Encoded annotations of annotator 2

	Returns:
		float: The kappa score between the two annotators
	"""
	# Both annotators must have annotated the same number of samples
	assert len(annot_1) == len(annot_2)

	flat_first = annot_1.flatten()
	flat_second = annot_2.flatten()

	return cohen_kappa_score(flat_first, flat_second)

In [722]:
def get_scores(df: pd.DataFrame):
	"""Compute, for one file, the average kappa of each annotator against all the others

	Every column of `df` except the first one is counted as an annotator.
	With a single annotator there is no pair to average over and the final
	division raises ZeroDivisionError.

	Args:
		df (pd.DataFrame): The data

	Returns:
		tuple(3): (
			OneHotEncoder: The encoder object used to encode the labels
			list: The encoded annotations
			list: The average kappa score for each annotator (list[0] == average kappa for annotator 1)
		)
	"""
	annotator_count = len(df.columns) - 1

	# Encode the labels
	encoder, annotators = get_encoded_annotations(df, annotator_count)

	# Average, for each annotator, its kappa with every other annotator
	scores_annot = []
	for current in range(annotator_count):
		kappa_total = sum(
			get_kappa(annotators[current], annotators[other])
			for other in range(annotator_count)
			if other != current
		)
		scores_annot.append(kappa_total / (annotator_count - 1))

	return (encoder, annotators, scores_annot)

In [750]:
def get_categories(encoder: OneHotEncoder):
	"""List the categories the encoder learned for its first (only) feature

	Args:
		encoder (OneHotEncoder): The encoder used to encode the labels

	Returns:
		list: The decoded labels
	"""
	first_feature_categories = encoder.categories_[0]

	return [category for category in first_feature_categories]

In [724]:
def reverse_encoding(encoder: OneHotEncoder, input_encoded: np.array):
	"""Turn a single one-hot encoded label back into its string

	Args:
		encoder (OneHotEncoder): The encoder used to encode the labels
		input_encoded (np.array): The encoded label

	Returns:
		str: The decoded label
	"""
	labels = get_categories(encoder)

	# The encoded vector must have exactly one slot per known category
	expected_shape = (len(labels),)
	assert input_encoded.shape == expected_shape

	# The position of the first 1 is the position of the label
	hot_position = list(input_encoded).index(1)

	return labels[hot_position]

In [751]:
def reverse_encoding_list(encoder: OneHotEncoder, input_encoded: np.array):
	"""Reverse the encoding for a list of encoded labels

	Args:
		encoder (OneHotEncoder): The encoder used to encode the labels
		input_encoded (np.array): The list of encoded labels

	Returns:
		list: The list of decoded labels, in the same order as the input
	"""
	# Each row is decoded (and shape-checked) by reverse_encoding
	return [
		reverse_encoding(encoder, encoded_label)
		for encoded_label in input_encoded
		]

## Part 2

In [726]:
def get_keys_from_value(dict: dict, value: object):
	"""Return the keys associated to a value in a dictionary

	Args:
		dict (dict): The dictionary to get the keys from
		value (object): The value to look for

	Returns:
		list: The keys whose value equals `value`, in the dictionary's order
			(empty if there is none)
	"""
	# NOTE: the parameter is named `dict` (kept for backward compatibility),
	# so the builtin of the same name is not reachable inside this function
	return [
		key for key, current in dict.items()
		if current == value
		]

In [742]:
def process(file: str, kappa_threshold: float = 0.2):
	"""Associate text => true label for a given file

	Annotators whose average kappa is below `kappa_threshold` are ignored.
	The label of each text is the majority vote of the remaining annotators.
	A tie is broken in favour of the tied label chosen by the annotator with
	the highest average kappa.

	Args:
		file (str): The file to process
		kappa_threshold (float): Annotators whose average kappa is strictly
			below this value are considered unreliable. Defaults to 0.2

	Returns:
		pd.DataFrame: The dataframe containing text => true label

	Raises:
		ValueError: If every annotator is considered unreliable
	"""
	# Load data from file. The header line is read as row 0 and then dropped,
	# so the remaining rows are labelled 1, 2, ... (hence `row + 1` below)
	df = pd.read_csv(file, header=None).drop(0)

	encoder, encoded_annotations, kappa_values = get_scores(df)

	unreliable_annot = [
		i for i, kappa in enumerate(kappa_values)
		if kappa < kappa_threshold
	]

	print(f"Unreliable annotators (starts at 0): {unreliable_annot}")

	reliable_annot = [
		i for i in range(len(kappa_values))
		if i not in unreliable_annot
	]

	if not reliable_annot:
		raise ValueError(
			f"No reliable annotator in {file}: every average kappa is below {kappa_threshold}"
		)

	# Keep the kappa scores and the decoded annotations of reliable annotators only
	kappa_values = [kappa_values[i] for i in reliable_annot]
	annotations = [
		reverse_encoding_list(encoder, encoded_annotations[i])
		for i in reliable_annot
		]

	# Positions of the reliable annotators, best kappa first. Ranking positions
	# (instead of looking a kappa value up again) keeps two annotators with the
	# same kappa distinct; the sort is stable, so equal scores keep their order
	ranked_annot = sorted(
		range(len(kappa_values)),
		key=lambda i: kappa_values[i],
		reverse=True,
	)

	# Init dataframe with the text and empty label
	# (astype(str) turns the empty labels into the string "nan")
	final_df = pd.DataFrame()
	final_df = final_df.reindex(columns = ["text", "label"])
	final_df["text"] = df[df.columns[0]]
	final_df["label"] = final_df["label"].astype(str)

	categories = get_categories(encoder)

	# Fill in the label for each text
	for row in range(len(annotations[0])):

		# Init each label to 0 occurrence
		res = {key: 0 for key in categories}

		# Count the number of occurrence of each label for a given sample
		for annotator_labels in annotations:
			res[annotator_labels[row]] += 1

		# Get the majority labels
		max_val = max(res.values())
		keys = get_keys_from_value(res, max_val)

		# A distinct majority
		if len(keys) == 1:
			final_df.at[row + 1, "label"] = keys[0]
			continue

		# Unclear => go through the annotators by decreasing kappa and take the
		# first one whose label is among the tied ones. Every tied label has at
		# least one vote, so one of the annotators always matches
		for index_annot in ranked_annot:
			label_annot = annotations[index_annot][row]

			if label_annot in keys:
				final_df.at[row + 1, "label"] = label_annot
				break

	return final_df

In [740]:
def verif_no_na(df: pd.DataFrame):
	"""Check if a text has no assigned label and print the result

	A label is considered missing when the second column holds the string
	"nan". The index labels of the offending rows are printed, whatever the
	index of `df` looks like.

	Args:
		df (pd.DataFrame): The dataframe to look at
	"""
	labels = df[df.columns[1]]

	# The labels are strings, so a missing one shows up as the text "nan",
	# which df.isna() does not detect
	indexes_na = labels[labels == "nan"].index.tolist()

	# Print the indexes of the missing labels, if any
	if indexes_na:
		print("Na values detected at indexes (start at 1):")
		print(indexes_na)
		print("\n")
	else:
		print("All good, no na value\n")

In [729]:
def concat_and_write(list: list, output_file: str):
	"""Stack several dataframes on top of each other and save the result as csv

	The rows are renumbered from 0 in the concatenated dataframe.

	Args:
		list (list): The list of dataframes to concatenate
		output_file (str): The name of the output file
	"""
	frames = [frame for frame in list]
	merged = pd.concat(frames, ignore_index=True)

	merged.to_csv(output_file)
	

### Main

#### changeorg_stance


_I modified one file that contained 2 labels for a single annotator / sample pair, since the labels in these documents are mutually exclusive._

In [747]:
# Process the changeorg_stance files one after the other, checking each
# result for texts left without a label as soon as it is built
changeorg_results = []

for changeorg_path in [
	"./Datasets/changeorg_stance/changeorg_stance_0.csv",
	"./Datasets/changeorg_stance/changeorg_stance_1.csv",
	"./Datasets/changeorg_stance/changeorg_stance_2.csv",
	"./Datasets/changeorg_stance/changeorg_stance_3.csv",
	"./Datasets/changeorg_stance/changeorg_stance_4.csv",
]:
	changeorg_result = process(changeorg_path)
	verif_no_na(changeorg_result)
	changeorg_results.append(changeorg_result)

# Keep one name per file, the following cells rely on them
(
	df0_changeorg,
	df1_changeorg,
	df2_changeorg,
	df3_changeorg,
	df4_changeorg,
) = changeorg_results

Categories: ['anti-mitigation', 'missing', 'pro-mitigation', 'unclear']
Unreliable annotators (starts at 0): [1]
All good, no na value

Categories: ['anti-mitigation', 'missing', 'pro-mitigation', 'unclear']
Unreliable annotators (starts at 0): [3]
All good, no na value

Categories: ['anti-mitigation', 'missing', 'pro-mitigation', 'unclear']
Unreliable annotators (starts at 0): []
All good, no na value

Categories: ['anti-mitigation', 'missing', 'pro-mitigation', 'unclear']
Unreliable annotators (starts at 0): []
All good, no na value

Categories: ['anti-mitigation', 'missing', 'pro-mitigation', 'unclear']
Unreliable annotators (starts at 0): []
All good, no na value



In [733]:
# Merge the labelled changeorg_stance files into a single csv
changeorg_parts = [
	df0_changeorg,
	df1_changeorg,
	df2_changeorg,
	df3_changeorg,
	df4_changeorg,
]
concat_and_write(changeorg_parts, "changeorg_stance.csv")

#### nyt_stance

I modified 2 files (_1 and _2) that contained 2 labels for a single annotator / sample pair, since the labels in these documents are mutually exclusive.

In [748]:
# Process the nyt_stance files one after the other, checking each result
# for texts left without a label as soon as it is built
nyt_results = []

for nyt_path in [
	"./Datasets/nyt_stance/nyt_stance_0.csv",
	"./Datasets/nyt_stance/nyt_stance_1.csv",
	"./Datasets/nyt_stance/nyt_stance_2.csv",
]:
	nyt_result = process(nyt_path)
	verif_no_na(nyt_result)
	nyt_results.append(nyt_result)

# Keep one name per file, the following cells rely on them
df0_nyt, df1_nyt, df2_nyt = nyt_results

Categories: ['anti-mitigation', 'missing', 'pro-mitigation', 'unclear']
Unreliable annotators (starts at 0): [2, 3]
All good, no na value

Categories: ['anti-mitigation', 'pro-mitigation', 'unclear']
Unreliable annotators (starts at 0): []
All good, no na value

Categories: ['anti-mitigation', 'missing', 'pro-mitigation', 'unclear']
Unreliable annotators (starts at 0): [1]
All good, no na value



In [736]:
# Merge the labelled nyt_stance files into a single csv
nyt_parts = [df0_nyt, df1_nyt, df2_nyt]
concat_and_write(nyt_parts, "nyt_stance.csv")