In [None]:
# Imports and libs
import os
import os.path
import pathlib

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import io
import scipy.misc
import numpy as np
import pandas as pd

import tensorflow as tf
tf.debugging.set_log_device_placement(False)
tf.get_logger().setLevel('ERROR')

import tensorflow_hub as hub

import PIL.Image as Image
import PIL.ImageColor as ImageColor
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont

import six

from concurrent.futures import ThreadPoolExecutor

import collections

import ipywidgets as widgets
from IPython.display import display
from wordcloud import WordCloud

import time

# Load object detection lib
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as viz_utils
from object_detection.utils import ops as utils_ops

from bs4 import BeautifulSoup

from urllib import request, parse
import json
import gzip

from io import BytesIO

from copy import deepcopy

import re

from statsmodels.stats.inter_rater import fleiss_kappa

%matplotlib inline
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# Switch matplotlib to the PGF backend and configure fonts/figure size for all later plots.
# NOTE(review): this replaces the backend selected by `%matplotlib inline` in the imports
# cell, so figures are presumably no longer shown inline -- confirm this is intended.
# NOTE(review): 'pgf.texsystem': 'pdflatex' and 'text.usetex': True presumably require a
# working LaTeX installation on the machine running the notebook -- confirm.
matplotlib.use('pgf')
matplotlib.rcParams.update({
    'pgf.texsystem': 'pdflatex',
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
    'font.size': 14,
    'figure.figsize': (9,8)
})

In [None]:
# GLOBAL VARIABLES
# Whether the legacy code path is active; changed through set_legacy().
USE_LEGACY = False
# Label map used to translate class ids into names; also changed through set_legacy().
PATH_TO_LABELS = './tensorflow_models/research/object_detection/data/mscoco_label_map.pbtxt'
# Class id -> category mapping; stays None until set_legacy() has been called.
CATEGORY_INDEX = None
# Display name -> TensorFlow Hub handle of every supported model.
ALL_MODELS = {
    'EfficientDet' : 'https://tfhub.dev/tensorflow/efficientdet/d7/1',
    'Faster R-CNN' : 'https://tfhub.dev/tensorflow/faster_rcnn/inception_resnet_v2_1024x1024/1',
    'Faster R-CNN (OpenImages)' : 'https://tfhub.dev/google/faster_rcnn/openimages_v4/inception_resnet_v2/1',
    'CenterNet' : 'https://tfhub.dev/tensorflow/centernet/resnet101v1_fpn_512x512/1'
}
# Currently selected model (must be a key of ALL_MODELS).
MODEL_NAME = 'EfficientDet'
# Derived from MODEL_NAME so the handle cannot drift out of sync with the selected model.
MODEL_HANDLE = ALL_MODELS[MODEL_NAME]
# Minimum detection score for a detection to be counted and drawn.
THRESHOLD = .5
IMAGE_BATCH_SIZE = 500

In [None]:
# The Faster R-CNN (Open Images) model requires use of legacy code, so set the value accordingly
def set_legacy(use_legacy):
    """Switch the notebook between the legacy and the default code path.

    Updates the module-level USE_LEGACY flag, toggles eager execution of
    TensorFlow functions to match it, selects the corresponding label map
    (Open Images v4 when legacy, MS COCO otherwise) and rebuilds
    CATEGORY_INDEX from that label map.

    @param use_legacy: truthy to enable the legacy path, falsy to disable it
    """
    global USE_LEGACY, PATH_TO_LABELS, CATEGORY_INDEX

    USE_LEGACY = use_legacy
    tf.config.run_functions_eagerly(bool(USE_LEGACY))
    PATH_TO_LABELS = (
        './tensorflow_models/research/object_detection/data/oid_v4_label_map.pbtxt'
        if USE_LEGACY
        else './tensorflow_models/research/object_detection/data/mscoco_label_map.pbtxt'
    )
    CATEGORY_INDEX = label_map_util.create_category_index_from_labelmap(
        PATH_TO_LABELS,
        use_display_name=True
    )

In [None]:
# Load model
def load_model(model_handle):
    """Load a model from TensorFlow Hub and return the object used for inference.

    When USE_LEGACY is set, the 'default' signature of the loaded module is
    returned; otherwise the loaded module itself is returned.

    @param model_handle: handle passed on to hub.load
    """
    loaded = hub.load(model_handle)
    hub_model = loaded.signatures['default'] if USE_LEGACY else loaded

    print('Model loaded!')

    return hub_model

In [None]:
# Get all valid image paths
# NOTE(review): earlier comments referred to an `image_type` option ('colorized_artistic',
# 'grayscale'), but the function only takes `path`; presumably those variants live in
# sibling input directories that are selected through `path` -- confirm.
def get_valid_image_paths(path='inputs/colorized_stable/'):
    """Return the names of all '.jpg' files directly inside `path`, sorted by name.

    Only the file names are returned (not joined with `path`). The suffix check
    is case-sensitive. Sorting makes the order deterministic, because the order
    of os.listdir is arbitrary.

    @param path: directory to scan
    @return: sorted list of file names ending in '.jpg'
    """
    return sorted(name for name in os.listdir(path) if name.endswith('.jpg'))

# Load given image into numpy array using matplotlib imread
def load_valid_image(filename, path='inputs/colorized_stable/'):
    """Read one JPEG file and wrap it in the image dict used throughout the notebook.

    The directory and file name are combined with os.path.join, so `path` works
    with or without a trailing slash.

    @param filename: name of the file inside `path`
    @param path: directory containing the file
    @return: dict with 'image' (the pixel array with a leading axis of length 1),
             'filename' and an empty 'detection' dict
    @raises ValueError: if the decoded array does not have exactly three dimensions
    """
    img = mpimg.imread(os.path.join(path, filename), format='jpg')
    # Reshape image array to fit specifications required by tensorflow model
    (rows, columns, channels) = img.shape
    image = {
        'image': img.reshape((1, rows, columns, channels)),
        'filename': filename,
        'detection': {}
    }
    return image

# Every loaded image is a dict of the shape:
# {
#     'image': Actual Image Object,
#     'filename': Name of the original image,
#     'detection': {} (empty until inference has been run)
# }
def load_images(path='inputs/colorized_stable/'):
    """Load every valid image found in `path` while showing a progress bar.

    @param path: directory to load the images from
    @return: list of image dicts as produced by load_valid_image
    """
    filenames = get_valid_image_paths(path)
    total = str(len(filenames))

    # Progress bar plus an "x / total" counter next to it
    progress_load = widgets.IntProgress(min=0, max=len(filenames), description='Loaded: ')
    load_label = widgets.Label(value='0 / ' + total)
    display(widgets.HBox([progress_load, load_label]))

    images = []
    for filename in filenames:
        loaded = load_valid_image(filename, path)
        images.append(loaded)

        progress_load.value += 1
        load_label.value = str(progress_load.value) + ' / ' + total

    return images

def load_image(filename, path='inputs/colorized_stable/'):
    """Load a single image; thin convenience wrapper around load_valid_image."""
    single_image = load_valid_image(filename, path)
    return single_image

In [None]:
def run_inference_for_image(image, model):
    """Run object detection on one image dict and return the raw model outputs.

    In legacy mode the pixel data is first converted to float32 via
    tf.image.convert_image_dtype; otherwise it is passed to the model unchanged.

    @param image: image dict; only its 'image' entry is read
    @param model: callable returning a mapping of output name -> tensor
    @return: dict mapping every output name to the tensor's `.numpy()` value
    """
    pixels = image['image']
    model_input = tf.image.convert_image_dtype(pixels, tf.float32) if USE_LEGACY else pixels
    outputs = model(model_input)

    return dict((name, tensor.numpy()) for name, tensor in outputs.items())

# Run inference with selected model on given images
# After this call every entry of `images` is an object of the shape:
# {
#     'image': Actual Image Object,
#     'filename': Name of the original image
#     'detection': Detected objects on the image
# }
def run_inference(images, model):
    """Run the model on every image and store the result in the image dict itself.

    The list is modified in place (each image's 'detection' entry is replaced);
    nothing is returned. A progress bar is displayed while the images are processed.

    @param images: list of image dicts as produced by load_images
    @param model: model passed on to run_inference_for_image
    """
    total = len(images)

    # How many images were used for inference so far
    progress_inference = widgets.IntProgress(min=0, max=total, description='Inferred: ')
    inference_label = widgets.Label(value='0 / ' + str(total))
    display(widgets.HBox([progress_inference, inference_label]))

    for image in images:
        # `image` is the dict itself; indexing it with the loop counter raised a KeyError.
        image['detection'] = run_inference_for_image(image, model)

        progress_inference.value += 1
        inference_label.value = str(progress_inference.value) + ' / ' + str(total)

In [None]:
# This is a shortened version of the tensorflow library function to visualize detections on images.
# It has been shortened/adapted to support variable font sizes based on the image's dimensions.
#
# Color names used for bounding boxes; a detection's class id modulo the length of this
# list selects its color.
STANDARD_COLORS = [
    'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
    'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
    'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
    'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
    'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
    'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
    'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
    'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
    'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
    'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
    'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
    'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
    'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
    'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
    'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
    'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
    'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
    'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
    'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
    'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
    'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
    'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
    'WhiteSmoke', 'Yellow', 'YellowGreen'
]

def visualize_boxes_and_labels_on_image_array(
    image,
    boxes,
    classes,
    scores,
    category_index,
    use_normalized_coordinates=False,
    max_boxes_to_draw=20,
    min_score_thresh=.5,
    agnostic_mode=False,
    line_thickness=4,
    font_size=24,
    mask_alpha=.4,
    groundtruth_box_visualization_color='black'):
  """Draw the detections that pass the score threshold onto `image` in place.

  Boxes sharing the same coordinates are grouped: they are drawn once and their
  labels are stacked. Labels consist of the class name only (no score).

  Args:
    image: pixel array that is drawn on (and also returned).
    boxes: array whose rows are [ymin, xmin, ymax, xmax].
    classes: class ids, indexed like `boxes`.
    scores: scores indexed like `boxes`, or None; with None every box is
      drawn in `groundtruth_box_visualization_color` without a label.
    category_index: mapping of class id -> dict with a 'name' entry.
    use_normalized_coordinates: forwarded to the box drawing helper.
    max_boxes_to_draw: maximum number of distinct boxes; a falsy value means all.
    min_score_thresh: a box is drawn only if its score is strictly greater.
    agnostic_mode: if True, labels are empty and all boxes are 'DarkOrange'.
    line_thickness: forwarded to the box drawing helper.
    font_size: forwarded to the box drawing helper.
    mask_alpha: unused in this shortened version; kept for call compatibility.
    groundtruth_box_visualization_color: color used when `scores` is None.

  Returns:
    The same `image` object that was passed in.
  """
  # Create a display string (and color) for every box location, group any boxes
  # that correspond to the same location.
  box_to_display_str_map = collections.defaultdict(list)
  box_to_color_map = collections.defaultdict(str)
  if not max_boxes_to_draw:
    max_boxes_to_draw = boxes.shape[0]
  for i in range(boxes.shape[0]):
    if max_boxes_to_draw == len(box_to_color_map):
      break
    if scores is None or scores[i] > min_score_thresh:
      box = tuple(boxes[i].tolist())
      if scores is None:
        box_to_color_map[box] = groundtruth_box_visualization_color
      else:
        display_str = ''
        if not agnostic_mode:
          # Unknown class ids are labelled 'N/A' instead of raising.
          if classes[i] in category_index:
            class_name = category_index[classes[i]]['name']
          else:
            class_name = 'N/A'
          display_str = str(class_name) + ' '
        box_to_display_str_map[box].append(display_str)
        if agnostic_mode:
          box_to_color_map[box] = 'DarkOrange'
        else:
          box_to_color_map[box] = STANDARD_COLORS[
              classes[i] % len(STANDARD_COLORS)]

  # Draw all boxes onto image.
  for box, color in box_to_color_map.items():
    ymin, xmin, ymax, xmax = box
    draw_bounding_box_on_image_array(
        image,
        ymin,
        xmin,
        ymax,
        xmax,
        color=color,
        thickness=line_thickness,
        font_size=font_size,
        display_str_list=box_to_display_str_map[box],
        use_normalized_coordinates=use_normalized_coordinates)

  return image

def draw_bounding_box_on_image_array(
    image,
    ymin,
    xmin,
    ymax,
    xmax,
    color='red',
    thickness=4,
    font_size=24,
    display_str_list=(),
    use_normalized_coordinates=True):
  """Draw one labelled bounding box onto a numpy image array in place.

  The array is converted to an RGB PIL image, the box is drawn on that image
  and the resulting pixels are copied back into `image`.
  """
  as_uint8 = np.uint8(image)
  pil_image = Image.fromarray(as_uint8).convert('RGB')
  draw_bounding_box_on_image(
      pil_image,
      ymin,
      xmin,
      ymax,
      xmax,
      color,
      thickness,
      font_size,
      display_str_list,
      use_normalized_coordinates)
  drawn_pixels = np.array(pil_image)
  np.copyto(image, drawn_pixels)

def draw_bounding_box_on_image(
    image,
    ymin,
    xmin,
    ymax,
    xmax,
    color='red',
    thickness=4,
    font_size=24,
    display_str_list=(),
    use_normalized_coordinates=True):

  draw = ImageDraw.Draw(image)
  im_width, im_height = image.size
  if use_normalized_coordinates:
    (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
                                  ymin * im_height, ymax * im_height)
  else:
    (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
  if thickness > 0:
    draw.line([(left, top), (left, bottom), (right, bottom), (right, top),
               (left, top)],
              width=thickness,
              fill=color)
  try:
    font = ImageFont.truetype('arial.ttf', font_size)
  except IOError:
    font = ImageFont.load_default()

  # If the total height of the display strings added to the top of the bounding
  # box exceeds the top of the image, stack the strings below the bounding box
  # instead of above.
  display_str_heights = [font.getsize(ds)[1] for ds in display_str_list]
  # Each display_str has a top and bottom margin of 0.05x.
  total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)

  if top > total_display_str_height:
    text_bottom = top
  else:
    text_bottom = bottom + total_display_str_height
  # Reverse list and print from bottom to top.
  for display_str in display_str_list[::-1]:
    text_width, text_height = font.getsize(display_str)
    margin = np.ceil(0.05 * text_height)
    draw.rectangle(
        [(left, text_bottom - text_height - 2 * margin), (left + text_width,
                                                          text_bottom)],
        fill=color)
    draw.text(
        (left + margin, text_bottom - text_height - margin),
        display_str,
        fill='black',
        font=font)
    text_bottom -= text_height - 2 * margin

In [None]:
# Determine how many objects were inferred and which object types
# The object that is returned will be of the following shape:
# {
#     'FILENAME': {
#         'class': {
#             'amount': int,
#             'scores': [float],
#             'boxes': [[ymin, xmin, ymax, xmax]]
#         },
#         'otherclass': {...},
#         ...
#     }
# }
def get_detections_for_image(image):
    """Group the detections of one image by class name.

    Only detections whose score is strictly greater than THRESHOLD are kept.
    Class ids missing from CATEGORY_INDEX are grouped under 'N/A', matching the
    fallback used when the boxes are drawn.

    @param image: image dict with 'filename' and a populated 'detection' entry
    @return: dict with the file name as its single key (see shape above)
    """
    detections = image['detection']
    boxes = detections['detection_boxes'] if USE_LEGACY else detections['detection_boxes'][0]
    scores = detections['detection_scores'] if USE_LEGACY else detections['detection_scores'][0]
    coordinates = {}
    for i in range(boxes.shape[0]):
        if scores[i] > THRESHOLD:
            if USE_LEGACY:
                class_id = int(detections['detection_class_labels'][i])
            else:
                class_id = int(detections['detection_classes'][0][i])

            if class_id in CATEGORY_INDEX:
                class_name = CATEGORY_INDEX[class_id]['name']
            else:
                class_name = 'N/A'

            entry = coordinates.setdefault(class_name, {'amount': 0, 'scores': [], 'boxes': []})
            entry['amount'] += 1
            entry['scores'].append(scores[i])
            entry['boxes'].append(boxes[i])
    return {
        image['filename']: coordinates
    }

# Collect the per-class detections of several images in one dict:
# {
#     'IMAGE1_FILENAME': {
#         'class': {
#             'amount': int,
#             'scores': [float],
#             'boxes': [[ymin, xmin, ymax, xmax]]
#         },
#         'otherclass': {...},
#         ...
#     },
#     'IMAGE2_FILENAME': ...
# }
def get_detections_for_images(images):
    """Merge the result of get_detections_for_image for every image into one dict.

    If two images share a file name, the later one replaces the earlier one.
    """
    merged = {}
    for per_image in map(get_detections_for_image, images):
        merged.update(per_image)
    return merged

In [None]:
def font_size_for_width(width):
    """Return the label font size for an image of the given width (2% of it, truncated)."""
    doubled = width * 2
    scaled = doubled / 100
    return int(scaled)

def line_width_for_width(width):
    """Return the bounding-box line thickness for an image of the given width.

    The thickness grows logarithmically with the width and is never smaller
    than 1. Without the lower bound, widths below 500 produced a thickness of
    zero or less (so no box was drawn) and a width of 0 raised an error.

    @param width: image width in pixels
    @return: line thickness as an int >= 1
    """
    if width <= 0:
        return 1
    return max(1, int(np.log((width * 2) / 1000) * 5) + 1)

def draw_detections_on_image(image):
    """Draw the stored detections of one image dict onto its pixel data in place.

    Font size and line thickness are derived from the image width.

    @param image: image dict with 'image' and a populated 'detection' entry
    """
    # The pixel array is laid out as (rows, columns, channels), i.e. the height
    # comes first; the width used for scaling is the second entry.
    (height, width, channels) = image['image'][0].shape

    detection = image['detection']
    if USE_LEGACY:
        boxes = detection['detection_boxes']
        classes = detection['detection_class_labels'].astype(int)
        scores = detection['detection_scores']
    else:
        boxes = detection['detection_boxes'][0]
        classes = detection['detection_classes'][0].astype(int)
        scores = detection['detection_scores'][0]

    # Draw detections
    visualize_boxes_and_labels_on_image_array(
          image['image'][0],
          boxes,
          classes,
          scores,
          CATEGORY_INDEX,
          use_normalized_coordinates=True,
          max_boxes_to_draw=200,
          # Change THRESHOLD if the confidence score should be higher or lower
          min_score_thresh=THRESHOLD,
          line_thickness=line_width_for_width(width),
          font_size=font_size_for_width(width),
          agnostic_mode=False
    )

# Draw detections on every given image (the images are modified in place, no copy is made)
def draw_detections_on_images(images_with_detections):
    """Visualize the detections of all images while showing a progress bar."""
    total = str(len(images_with_detections))

    # Progress bar plus an "x / total" counter next to it
    progress_save = widgets.IntProgress(min=0, max=len(images_with_detections), description='Visualized: ')
    save_label = widgets.Label(value='0 / ' + total)
    display(widgets.HBox([progress_save, save_label]))

    for current in images_with_detections:
        draw_detections_on_image(current)

        progress_save.value += 1
        save_label.value = str(progress_save.value) + ' / ' + total

In [None]:
def save_image(output_dir, image, filetype='jpg'):
    """Write the pixel data of one image dict to `output_dir` under its file name.

    The directory is created if it does not exist yet. Directory and file name
    are combined with os.path.join, so `output_dir` works with or without a
    trailing slash.

    @param output_dir: directory to write the file into
    @param image: image dict; 'filename' and the first entry of 'image' are used
    @param filetype: format passed on to plt.imsave
    """
    # Create directory if not exists
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    plt.imsave(os.path.join(output_dir, image['filename']), image['image'][0], format=filetype)

# Add output sub-directory named after the model that was used (if it doesn't already exist)
def save_images(images_with_detections, model_name):
    """Save every image to 'outputs/<model_name>/' while showing a progress bar.

    @param images_with_detections: list of image dicts to save
    @param model_name: name of the sub-directory below 'outputs/'
    """
    total = len(images_with_detections)

    # How many images were saved so far
    progress_save = widgets.IntProgress(min=0, max=total, description='Saved: ')
    save_label = widgets.Label(value='0 / ' + str(total))
    display(widgets.HBox([progress_save, save_label]))

    output_dir = 'outputs/' + model_name + '/'

    # Save images
    for image in images_with_detections:
        # Pass only the directory: save_image appends the file name itself. Passing
        # directory + file name created one directory per image and a doubled file name.
        save_image(output_dir, image)

        progress_save.value += 1
        save_label.value = str(progress_save.value) + ' / ' + str(total)

In [None]:
# Run this if you want statistics on all inferences run for each model
# @param detections: detection data that is written to the CSV file
# @param image_count: number of images that were used
# @param single: if True the file 'detections_single.csv' is written, otherwise 'detections_all.csv'
def get_statistics_for_detections(detections, image_count, single=False):
    """Write the given detections as a CSV file into 'outputs/statistics/'.

    NOTE(review): `detections` is wrapped in a one-element list while the index
    has `image_count` entries; pandas presumably rejects this unless
    image_count == 1 -- confirm the intended frame layout.
    """
    dataFrame = pd.DataFrame([detections], index=range(1, image_count + 1))
    
    if single:
        # NOTE(review): this sets `.name` on the Series returned by the column lookup;
        # it presumably does not rename the column inside the DataFrame -- confirm intent.
        dataFrame[dataFrame.columns[0]].name = MODEL_NAME

    # Make sure the output directory exists before writing
    pathlib.Path('outputs/statistics/').mkdir(parents=True, exist_ok=True)
    
    if single:
        dataFrame.to_csv('outputs/statistics/detections_single.csv')
    else:
        dataFrame.to_csv('outputs/statistics/detections_all.csv')

In [1]:
def save_fig(name):
    """Save the current figure as '<name>.svg', '<name>.jpg' and '<name>.pgf' (in that order)."""
    for extension in ('svg', 'jpg', 'pgf'):
        plt.savefig(name + '.' + extension, format=extension, bbox_inches='tight')

# Run this for charts comparing several models
# @param all_detections: dict of model name -> detections as returned by get_detections_for_images
# @param image_count: number of images that were used
def generate_detection_charts(all_detections, image_count):
    """Plot detections per image and average detections per image for every model.

    The total number of detections of each model is printed, and both figures
    are written below 'outputs/statistics/' via save_fig.
    """
    plt.figure()

    # Total detections per image, one line per model
    total_detections = {}
    for model, detections in all_detections.items():
        model_detections = [
            np.sum([info['amount'] for info in stats.values()])
            for stats in detections.values()
        ]
        model_total = np.sum(model_detections)
        total_detections[model] = model_total
        print(model + ':', model_total)
        plt.plot(model_detections)
    plt.legend(list(total_detections), loc='upper left')

    save_fig('outputs/statistics/statistics_by_image')

    # Average detections per image, one bar per model
    plt.figure()
    model_names = list(total_detections)
    averages = [total / image_count for total in total_detections.values()]
    plt.barh(model_names, averages)

    save_fig('outputs/statistics/statistics_average')

# Run this for charts for a single model
# @param detections: detections as returned by get_detections_for_images
# @param image_count: number of images that were used
def generate_detection_chart(detections, image_count):
    """Plot the number of detections per image for the currently selected model.

    Prints the average number of detections per image and saves the figure as
    'outputs/statistics/statistics_single'.
    """
    plt.figure()

    # Total detections per image. Uses the `detections` argument; the previous
    # version read a notebook-level variable that only existed if another cell
    # had been run first.
    model_detections = [
        np.sum([info['amount'] for label, info in stats.items()])
        for filename, stats in detections.items()
    ]
    plt.plot(model_detections)

    total_detections = np.sum(model_detections)
    print('Average detections per image: %.2f' % (total_detections / image_count))

    plt.ylabel('Detections per Image')
    plt.xlabel('Image Index')
    plt.legend([MODEL_NAME], loc='upper right')

    plt.savefig('outputs/statistics/statistics_single')

In [None]:
# Query parameters for the BabelNet search page; 'word' is filled in per request.
search_params = {
    'lang': 'EN',
    'word': ''
}
# Query parameters for the BabelNet getSynset endpoint; 'id' is filled in per request.
# SECURITY NOTE: API keys should not live in the notebook. The key can be supplied
# through the BABELNET_API_KEY environment variable; the literal below is only kept
# as a fallback so existing runs keep working and should be rotated and removed.
synset_params = {
    'searchLang': 'EN',
    'targetLang': 'EN',
    'key': os.environ.get('BABELNET_API_KEY', 'efeb755b-7c79-41f9-abd3-22d25254c9c2'),
    'id': ''
}

# Get all detected labels, without duplicates
# The returned array will contain objects of the following structure:
#{
#    'class': string,
#    'babelnetid': string,
#    'wikidataid': string,
#    'babelnet': url,
#    'wikidata': url,
#    'labelMatch': boolean  # True if label of mapped Wikidata Entity exactly matches the class name
#}
def get_detected_labels(detections):
    """Return one empty mapping record per distinct class name found in `detections`.

    Duplicates are removed before capitalizing; the order of first appearance is kept.
    """
    # dict.fromkeys drops duplicates while preserving insertion order
    distinct_labels = dict.fromkeys(
        label for stats in detections.values() for label in stats
    )
    return [
        {
            'class': name.capitalize(),
            'babelnetid': '',
            'wikidataid': '',
            'babelnet': '',
            'wikidata': '',
            'labelMatch': False
        }
        for name in distinct_labels
    ]

def make_http_request(url, params, encoding):
    """Send a GET request and return the decoded response.

    @param url: base URL; `params` are URL-encoded and appended after '?'
    @param params: dict of query parameters
    @param encoding: 'html' to get the body back as text; any other value is sent
                     as 'Accept-encoding' header and the body is parsed as JSON
    @return: str for 'html', parsed JSON otherwise

    The response is always closed. A JSON reply is parsed whether or not the
    server actually compressed it; previously an uncompressed reply came back as
    a raw string.
    """
    http_request = request.Request(url + '?' + parse.urlencode(params))
    expects_json = encoding != 'html'
    if expects_json:
        http_request.add_header('Accept-encoding', encoding)

    # Read everything inside the context manager so the connection is released.
    with request.urlopen(http_request) as response:
        content_encoding = response.info().get('Content-Encoding')
        body = response.read()

    if not expects_json:
        return body.decode('utf8')

    if content_encoding == encoding:
        # Only gzip is supported here, as before.
        body = gzip.decompress(body)
    return json.loads(body)

    
def get_search_page_for(label):
    """Fetch the BabelNet search page for `label` and return its HTML as text.

    The request uses a copy of the shared `search_params`, so the module-level
    dict is left untouched even if the request fails.
    """
    params = dict(search_params, word=label)
    return make_http_request('https://babelnet.org/search', params, 'html')

# Get synset for a given synset id
def get_synset_for(synset_id):
    """Fetch one synset from the BabelNet getSynset endpoint.

    The request uses a copy of the shared `synset_params`, so the module-level
    dict is left untouched even if the request fails.

    @param synset_id: value sent as the 'id' query parameter
    @return: whatever make_http_request returns for the 'gzip' encoding
    """
    params = dict(synset_params, id=synset_id)
    return make_http_request('https://babelnet.io/v6/getSynset', params, 'gzip')
        
# Optimize relation checking: entries sometimes have multiple wikidata sources which are
# sorted alphabetically (not by importance), so a later wikidata source can be a more
# accurate representation than the first one
def find_exact_match(senses, label):
    """Return the first sense whose lemma contains `label`, or None if there is none.

    Both strings are reduced to their ASCII letters and compared case-insensitively.
    Note that the comparison is a containment check, not an equality check.
    """
    for candidate in senses:
        # Remove all non-alphabetic characters
        lemma_letters = re.sub('[^a-zA-Z]', '', candidate['properties']['fullLemma'])
        label_letters = re.sub('[^a-zA-Z]', '', label)

        # Compare lowercase representations of label and lemma
        if label_letters.casefold() in lemma_letters.casefold():
            return candidate

    return None

# Read information from sense into given label
def update_label_from_sense(label, sense):
    """Fill the mapping fields of `label` in place from a BabelNet sense.

    With `sense` set to None every id/url field becomes 'No match' and
    'labelMatch' becomes False. Otherwise ids and URLs are taken from the sense,
    and 'labelMatch' tells whether the class name equals the sense's lemma when
    both are reduced to their ASCII letters and compared case-insensitively.
    """
    if sense is None:
        for field in ('babelnetid', 'babelnet', 'wikidataid', 'wikidata'):
            label[field] = 'No match'
        label['labelMatch'] = False
        return

    properties = sense['properties']

    synset_id = properties['synsetID']['id']
    label['babelnetid'] = synset_id
    label['babelnet'] = 'http://babelnet.org/synset?lang=EN&id=' + synset_id.replace(':', '%3A')

    sense_key = properties['senseKey']
    label['wikidataid'] = sense_key
    label['wikidata'] = 'http://www.wikidata.org/wiki/' + sense_key

    lemma_letters = re.sub('[^a-zA-Z]', '', properties['fullLemma'])
    class_letters = re.sub('[^a-zA-Z]', '', label['class'])
    label['labelMatch'] = class_letters.casefold() == lemma_letters.casefold()

# Use page crawler to get BableNet senses for detections, based on ranking on the BableNet search page
def get_entity_mapping(detection_labels):
    for label in detection_labels:
        print('Label:', label['class'])
        soup = BeautifulSoup(get_search_page_for(label['class']), 'html.parser')
        synsets_html = soup.select('div[data-type="CONCEPT"]')
        synset_detail_html = soup.select_one('.synset-wrapper')
        
        first = None
        sense = None
        # Search page
        if len(synsets_html) > 0:
            # Limit amount of searched synsets to a maximum of 3
            for synset_html in synsets_html[:min(4, len(synsets_html))]:
                try:
                    synset = get_synset_for(synset_html['data-id'])
                    senses = [sense for sense in synset['senses'] if sense['properties']['source'] == 'WIKIDATA']
                    
                    # Remember first set of senses and use first of these senses as fallback if no exact match can
                    # be found
                    if first is None and len(senses) > 0:
                        first = senses[0]

                    sense = find_exact_match(senses, label['class'])
                    
                    # We have found an exact match and do not have to look further
                    if sense is not None:
                        break
                except:
                    print("Unexpected error:", sys.exc_info()[0])
                    print(synset_html)
        # Detail page
        elif synset_detail_html is not None:
            synset_id = soup.select_one('.id')
            try:
                synset = get_synset_for(synset_id.text)
                senses = [sense for sense in synset['senses'] if sense['properties']['source'] == 'WIKIDATA']
                
                if len(senses) > 0:
                    first = senses[0]
                sense = find_exact_match(senses, label['class'])
            except:
                print("Unexpected error:", sys.exc_info()[0])
                print(synset_html)
        
        # If no exact match could be found, just use the first sense of the first result
        if sense is None and first is not None:
            sense = first
        
        update_label_from_sense(label, sense)
        
def save_entity_mapping(mapped_entities, path):
    """Write the mapped entities as a CSV file to `path` (one row per entity)."""
    mapping_frame = pd.DataFrame(mapped_entities)
    mapping_frame.to_csv(path)

In [None]:
def plot_participant_statistics(series, separated, name):
    """Plot one participant statistic as a horizontal bar chart and save it.

    @param series: the values to plot
    @param separated: if falsy, the series is wrapped in a list before the frame
                      is built, and a legend is shown while the bottom ticks and
                      their labels are hidden
    @param name: suffix of the output file 'outputs/statistics/survey_<name>'
    """
    combined = not separated
    frame = pd.DataFrame([series] if combined else series)
    axes = frame.plot(kind='barh', legend=False)

    if combined:
        axes.legend(prop={'size': 14})
        axes.tick_params(bottom=False, labelbottom=False)

    save_fig('outputs/statistics/survey_' + name)
    
# Generate statistics about study participants
def generate_participant_statistics(results, separated=True):
    """Plot completion, gender, employment, age and education statistics.

    The columns of `results` are addressed by position: column 0 is binned into
    completion ranges, column 1 is plotted as 'Gender', columns 2-7 are counted
    for 'Yes' answers per field of employment, column 8 is counted as 'Other',
    column 9 is binned into age groups and column 10 is plotted as
    'Educational background'. Every statistic is handed to
    plot_participant_statistics.

    @param results: survey results as a DataFrame with at least 11 columns
    @param separated: forwarded to plot_participant_statistics; also selects
                      whether the completion labels contain line breaks
    """
    # How many participants have completed the survey
    if separated:
        # Separate bars: labels may wrap onto two lines
        labels = ['None', '1-5\n(Background)', '6-100', '101-200', '201-324', '325\n(Completed)']
    else:
        # Combined bars: single-line labels
        labels = ['None', '1-5 (Background)', '6-100', '101-200', '201-324', '325 (Completed)']
    lastpage = pd.cut(
        results[results.columns[0]].fillna(0),
        bins=[0, 1, 5, 100, 200, 324, 327],
        labels=labels,
        include_lowest=True
    ).value_counts(dropna=False).sort_index()
    lastpage.name = 'Completion'
    plot_participant_statistics(lastpage, separated, 'questions')

    # Gender
    gender = results[results.columns[1]].value_counts().sort_index()
    gender.name = 'Gender'
    plot_participant_statistics(gender, separated, 'gender')

    # Field of employment
    employment = {}
    for column_name in results.iloc[:, 2:8]:
        answer_counts = results[column_name].value_counts()
        # The field name is the text between the square brackets of the column name
        field = column_name[column_name.find("[")+1:column_name.find("]")]
        # A field nobody selected has no 'Yes' entry at all; count it as 0
        employment[field] = answer_counts.get('Yes', 0)
    employment['Other'] = results.iloc[:, 8:9].value_counts().sum()
    employment_series = pd.Series(employment)
    employment_series.name = 'Field of employment'
    plot_participant_statistics(employment_series, separated, 'employment')

    # Age
    age = pd.cut(
        results[results.columns[9]].dropna().apply(
            lambda x: 0 if type(x) == str and x == 'Prefer not to answer' else int(x)
        ),
        bins=[0, 18, 24, 30, 40, 50, 1000],
        labels=['Prefer not to answer', '18-24', '25-30', '31-40', '40-50', 'Over 50'],
        include_lowest=True
    ).value_counts().sort_index()
    age.name = 'Age groups'
    plot_participant_statistics(age, separated, 'age')

    # Educational background
    education = results[results.columns[10]].value_counts()
    education.name = 'Educational background'
    plot_participant_statistics(education, separated, 'education')

In [None]:
# The subjects in our case are the survey questions and there are 320 of them
# The categories in our case are the 5 options of the Likert scale we chose
# Therefore we want a matrix that looks like this:
#
# Header:       "Very inaccurate", "Mostly inaccurate", "Neither", "Mostly accurate", "Very accurate"
# Question 1:  [       0                   1               0              1                 10       ]
# Question 2:  [                                          ...                                        ]
# ...
#
# M[0,0] = Number of participants (0), that assigned the first question to the first category ("Very inaccurate")
def calculate_iaa(detection_results):
    """Compute Fleiss' kappa over the survey answers.

    Rows containing missing values are dropped first. For every column of
    `detection_results` the answers are counted per category; answers that are
    not one of the five known categories are ignored.

    @param detection_results: DataFrame of answers, one column per question
    @return: the value returned by fleiss_kappa for the count matrix
    """
    # Position of every answer option within a row of the count matrix
    category_column = {
        'Very inaccurate': 0,
        'Mostly inaccurate': 1,
        'Neither accurate nor inaccurate': 2,
        'Mostly accurate': 3,
        'Very accurate': 4,
    }

    answers_per_question = detection_results.dropna().to_numpy().T
    iaa = []

    for answers in answers_per_question:
        values, occurrences = np.unique(answers, return_counts=True)

        tallies = np.zeros(5)
        for value, occurrence in zip(values, occurrences):
            column = category_column.get(value)
            if column is not None:
                tallies[column] = occurrence

        iaa.append(tallies)

    return fleiss_kappa(np.asmatrix(iaa))