# Discourse Marker Distribution

### Import Statements

In [1]:
import ipywidgets as widgets
from IPython.display import display

import matplotlib.pyplot as plt
import statistics
import pandas as pd
import numpy as np
import ast
import json
from collections import Counter

### Helper functions for computing statistics and collecting markers

In [2]:
def compute_statistics(values):
    """
    Computes the min, max, arithmetic mean, harmonic mean, median and mode
    of an iterable set of values.

    The input is copied into a list first. Six aggregates are computed from
    it, so a one-shot iterable (e.g. a generator) would otherwise be empty
    after the first one, and every aggregate should see the same plain,
    position-indexed sequence.

    :param values: the iterable set of values to compute the statistics of.
    :return: returns a list with [min, a_mean, h_mean, median, mode, max] values
    :raises ValueError: if values is empty (raised by min()).
    """
    data = list(values)

    return [min(data),
            statistics.mean(data),
            statistics.harmonic_mean(data),
            statistics.median(data),
            statistics.mode(data),
            max(data)]


def percentage(part, whole):
    """
    Expresses part as a percentage of whole.
    :param part: the share; anything float() accepts
    :param whole: the total the share is measured against; anything float() accepts
    :return: part * 100 / whole as a float
    """
    scaled_part = float(part) * 100
    divisor = float(whole)
    return scaled_part / divisor


def show_dataframe(data1, data2, data3, data4):
    """
    Placeholder for printing a pandas dataframe; it currently does nothing
    with its arguments.
    :param data1: unused
    :param data2: unused
    :param data3: unused
    :param data4: unused
    :return: None
    """
    return None


def list_all_markers(data1, data2=None, data3=None, data4=None):
    """
    Collects the markers of up to four datasets into one list.

    The markers of data1 come first, in the order get_markers() returns them;
    each further dataset only contributes the markers that are not in the
    list yet. Datasets passed as None are skipped.

    :param data1: first dataset; must provide get_markers()
    :param data2: optional second dataset
    :param data3: optional third dataset
    :param data4: optional fourth dataset
    :return: a new list of markers
    """
    # Copy, so that the list handed out by data1 is never modified.
    markers = list(data1.get_markers())
    # Set used for the membership test instead of scanning the list each time.
    seen = set(markers)

    for dataset in (data2, data3, data4):
        if dataset is None:
            continue
        for marker in dataset.get_markers():
            if marker not in seen:
                seen.add(marker)
                markers.append(marker)

    return markers

### DatasetScores class
Creates one object for each set of scores and provides functions to retrieve these scores.

In [3]:
class DatasetScores:
    """
    Holds the scores of one dataset and provides functions to retrieve them.

    The score table is read from a CSV file into a pandas dataframe; the
    dataset-wide totals and the per-marker values are read from a JSON file.
    """

    # Maps a position code to (key of that position inside a marker's entry
    # in marker_dict, name of the attribute with the dataset-wide total).
    _POSITIONS = {
        "sb": ("sent_begin", "total_sb"),
        "sm": ("sent_middle", "total_sm"),
        "se": ("sent_end", "total_se"),
        "db": ("doc_begin", "total_db"),
        "dm": ("doc_middle", "total_dm"),
        "de": ("doc_end", "total_de"),
    }

    def __init__(self, scorefile, jsonfile):
        """
        Loads the score table and the dataset statistics.
        :param scorefile: CSV file containing the score columns read by the methods below
        :param jsonfile: JSON file with the top-level keys 'stats' and 'marker'
        """
        self.scores = pd.read_csv(scorefile)
        self.total_sentences = sum(self.scores['sentence_count_doc'])

        with open(jsonfile, 'r', encoding='utf-8') as data_json:
            content = json.load(data_json)

        stats = content['stats']
        self.total_docs = stats['total_docs']
        self.total_markers = stats['total_markers']
        self.different_markers = stats['different_markers']
        self.total_sb = stats['total_sb']
        self.total_sm = stats['total_sm']
        self.total_se = stats['total_se']
        self.total_db = stats['total_db']
        self.total_dm = stats['total_dm']
        self.total_de = stats['total_de']
        self.marker_dict = content['marker']

    def _column(self, name):
        """
        :param name: name of a column of the score table
        :return: that column without its missing values
        """
        return self.scores[name].dropna()

    def get_total_dm_count_statistics(self):
        """
        Computes the statistics of the column 'dm_count_doc'
        (total number of DM per text)
        :return: a list of [min, a_mean, h_mean, median, mode, max] values
        """
        return compute_statistics(self._column('dm_count_doc'))

    def get_percent_dm_count_statistics(self):
        """
        Computes the statistics of the column 'dm_words_perc'
        (percentage share that the DM have in a text)
        :return: a list of [min, a_mean, h_mean, median, mode, max] values
        """
        return compute_statistics(self._column('dm_words_perc'))

    def get_total_dm_sentences_statistics(self):
        """
        Computes the statistics of the column 'dm_sentences'
        (number of sentences that contain at least one DM per text)
        :return: a list of [min, a_mean, h_mean, median, mode, max] values
        """
        return compute_statistics(self._column('dm_sentences'))

    def get_percent_dm_sentences_statistics(self):
        """
        Computes the statistics of the column 'dm_sentences_perc'
        (percentage share that sentences containing at least one DM have of a text)
        :return: a list of [min, a_mean, h_mean, median, mode, max] values
        """
        return compute_statistics(self._column('dm_sentences_perc'))

    def get_total_dm_per_sentence_statistics(self):
        """
        Computes the statistics of the number of DM per sentence, based on
        the expanded list from get_sentence_counts()
        :return: a list of [min, a_mean, h_mean, median, mode, max] values
        """
        return compute_statistics(self.get_sentence_counts())

    def get_total_dm_positions_sentence(self):
        """
        Computes the total number of DM at the beginning, the middle and the end
        of a sentence
        :return: a list [count_begin, count_middle, count_end]
        """
        return [sum(column) for column in self.get_sentence_position_values()]

    def get_percent_dm_positions_sentence(self):
        """
        Computes the percentage share of DM that stand at the beginning, the middle
        or the end of a sentence
        :return: a list [percent_begin, percent_middle, percent_end]
        """
        values = self.get_total_dm_positions_sentence()
        whole = sum(values)

        return [percentage(value, whole) for value in values]

    def get_sentence_position_values(self):
        """
        :return: the columns 'dm_pos_sent_begin', 'dm_pos_sent_middle' and
                 'dm_pos_sent_end' (in this order) without missing values
        """
        return [self._column('dm_pos_sent_begin'),
                self._column('dm_pos_sent_middle'),
                self._column('dm_pos_sent_end')]

    def get_total_dm_positions_document(self):
        """
        Computes the total number of DM at the beginning, the middle and the end
        of a document
        :return: a list [count_begin, count_middle, count_end]
        """
        return [sum(column) for column in self.get_document_position_values()]

    def get_percent_dm_positions_document(self):
        """
        Computes the percentage share of DM that stand at the beginning, the middle
        or the end of a document
        :return: a list [percent_begin, percent_middle, percent_end]
        """
        values = self.get_total_dm_positions_document()
        whole = sum(values)

        return [percentage(value, whole) for value in values]

    def get_document_position_values(self):
        """
        :return: the columns 'dm_pos_doc_begin', 'dm_pos_doc_middle' and
                 'dm_pos_doc_end' (in this order) without missing values
        """
        return [self._column('dm_pos_doc_begin'),
                self._column('dm_pos_doc_middle'),
                self._column('dm_pos_doc_end')]

    def get_sentence_counts(self):
        """
        Returns a list of counts that indicates how many sentences in this dataset contain
        as many discourse markers.
        E.g.: if three sentences each contain 2 DM, 2 is added to the list 3 times
        :return: a list of all the counts
        """
        values = []

        for doc in self._column('dm_count_sent'):
            # Each cell holds the text of a Python literal that maps a DM
            # count to the number of sentences with that count.
            doc_counts = ast.literal_eval(doc)

            for dm_count in doc_counts:
                values.extend([dm_count] * int(doc_counts[dm_count]))

        return values

    def compute_dm_per_sentence(self):
        """
        Create two lists, one of which contains the number of dm per sentence (x)
        and the other one (y) contains the number of sentences that contain as many dms.
        :return: a list of [[x_values],[y_values]], ordered by ascending x
        """
        totals = {}

        for doc in self._column('dm_count_sent'):
            doc_counts = ast.literal_eval(doc)

            for dm_count in doc_counts:
                totals[dm_count] = totals.get(dm_count, 0) + int(doc_counts[dm_count])

        ordered = sorted(totals.items())
        # x values are the number of dms per sentence
        x_values = [dm_count for dm_count, _ in ordered]
        # y values are the number of sentences that contain as many dms.
        y_values = [sentences for _, sentences in ordered]

        return [x_values, y_values]

    # ------- Functionality concerning the marker dictionary with the single markers

    def get_total_marker_values(self):
        """
        Creates a dictionary with the markers as keys and their total number of occurrence in this dataset as value
        :return: dict marker -> total
        """
        return {marker: entry['total'] for marker, entry in self.marker_dict.items()}

    def get_total_marker_percents(self):
        """
        Creates a dictionary with the markers as keys
        and their percentage-share in all the markers in this dataset as value
        :return: dict marker -> total * 100 / total_markers
        """
        return {marker: (entry['total'] * 100) / self.total_markers
                for marker, entry in self.marker_dict.items()}

    def get_total_marker_statistics(self):
        """
        Creates a dictionary with the markers as keys and their average number of occurrences
        (a_mean, h_mean, median, mode) over all the documents in this dataset as value-dict
        :return: dict marker -> {'a_mean': ..., 'h_mean': ..., 'median': ..., 'mode': ...}
        """
        # NOTE(review): h_mean presumably relies on 'inverse_sum_total' being the
        # sum of the reciprocals of the per-document counts; confirm with the
        # code that writes the JSON file.
        return {
            marker: {
                'a_mean': entry['total'] / self.total_docs,
                'h_mean': self.total_docs / entry['inverse_sum_total'],
                'median': entry['median_total'],
                'mode': entry['mode_total'][0][0],
            }
            for marker, entry in self.marker_dict.items()
        }

    def get_markers(self):
        """
        :return: a list of all the markers in this dataset
        """
        return list(self.marker_dict)

    def get_marker_values_at_position(self, position, average=False, perc=False):
        """
        Creates a dict with the marker as key and the position value as value
        :param position: the position to get the values for: sb, sm, se, db, dm, de
        :param average: if True, each value is divided by the total number of sentences;
                        takes precedence over perc
        :param perc: if True, each value is given as percentage of the dataset-wide
                     total for that position
        :return: dict marker -> value; an empty dict for an unknown position
        """
        if position not in self._POSITIONS:
            return {}

        key, total_attribute = self._POSITIONS[position]
        entries = self.marker_dict.items()

        if average:
            # NOTE(review): the divisor is total_sentences for the document
            # positions (db, dm, de) as well; confirm this is intended.
            return {marker: entry[key] / self.total_sentences
                    for marker, entry in entries}
        if perc:
            return {marker: entry[key] * 100 / getattr(self, total_attribute)
                    for marker, entry in entries}
        return {marker: entry[key] for marker, entry in entries}

    def get_all_marker_values(self, marker):
        """
        Creates a List with the average values for a marker:
        [a_mean, h_mean, median, mode]
        :param marker: the marker to look up
        :return: the four values, or four zeros if the marker is not in this dataset
        """
        if marker not in self.marker_dict:
            return [0] * 4

        entry = self.marker_dict[marker]
        return [entry['total'] / self.total_docs,
                self.total_docs / entry['inverse_sum_total'],
                entry['median_total'],
                entry['mode_total'][0][0]]

    def get_marker_total(self, marker):
        """
        Gets the total number of occurrences for the given marker in the dataset.
        :param marker: the marker to look up
        :return: total number of occurrences in the dataset (0 if none)
        """
        if marker in self.marker_dict:
            return self.marker_dict[marker]['total']
        return 0

    def get_most_common_markers(self, number, position=None, perc=False, average=False):
        """
        Ranks the markers by value and returns the highest ranked ones.
        :param number: how many markers to return
        :param position: optional position code (see get_marker_values_at_position);
                         without it the totals over all positions are ranked
        :param perc: rank percentage values instead of absolute values
        :param average: only used together with position, passed on unchanged
        :return: a tuple (markers, marker_values) of two lists in ranking order
        """
        if position:
            values = self.get_marker_values_at_position(position, average=average, perc=perc)
        elif perc:
            values = self.get_total_marker_percents()
        else:
            values = self.get_total_marker_values()

        ranked = Counter(values).most_common(number)
        markers = [marker for marker, _ in ranked]
        marker_values = [value for _, value in ranked]

        return markers, marker_values

### Functions to Plot the data

In [4]:
def draw_simple_barchart(figuretitle, titles, data, colors):
    """
    Draws one figure with a 2x2 grid of simple barcharts that share the y-axis,
    one chart per dataset, filled row by row.
    :param figuretitle: title shown above the whole figure
    :param titles: list of titles, one per chart
    :param data: list of [[xvalues1, yvalues1],[xvalues2, yvalues2], ...] for each dataset
    :param colors: list of colors, one per chart
    :return:
    """
    plt.style.use('fivethirtyeight')

    fig, axes = plt.subplots(ncols=2, nrows=2, sharey=True)
    for position, series in enumerate(data):
        grid_row, grid_column = divmod(position, 2)
        chart = axes[grid_row][grid_column]

        chart.bar(series[0], series[1], color=colors[position])
        chart.set_title(titles[position])

        # only the charts in the left column carry the y-axis label
        if grid_column == 0:
            chart.set_ylabel("Number Sentences")

    fig.suptitle(figuretitle)
    plt.tight_layout()
    plt.show()

### Vertical Barcharts

In [5]:
def draw_barchart(title, x, y_1, y_1_label,
                  y_2=None, y_2_label=None, y_3=None, y_3_label=None, y_4=None, y_4_label=None,
                  color_1='k', color_2='k', color_3='k', color_4='k',
                  x_label=None, y_label=None, x_ticks=None, y_ticks=None):
    """
    Draws a Barchart of the given Data
    :param title: the title of the plot
    :param x: the array of x values
    :param y_1: the array of the first set of y values
    :param y_1_label: the label of the first dataset
    :param y_2: the array of the second set of y values
    :param y_2_label: the label of the second dataset
    :param y_3: the array of the third set of y values
    :param y_3_label: the label of the third dataset
    :param y_4: the array of the fourth set of y values
    :param y_4_label: the label of the fourth dataset
    :param color_1: color for the first set of data
    :param color_2: color for the second set of data
    :param color_3: color for the third set of data
    :param color_4: color for the fourth set of data
    :param x_label: the label for the x-axis
    :param y_label: the label for the y-axis
    :param x_ticks: array of [[x-ticks], [labels for those ticks]]; if omitted,
                    the grouped bars are placed relative to x and no tick labels are set
    :param y_ticks: array of [[y-ticks], [labels for those ticks]]
    :return: nothing
    """

    def has_data(values):
        # Explicit None/length test: the truth value of a numpy array or a
        # pandas Series with more than one element is ambiguous and raises.
        return values is not None and len(values) > 0

    plt.style.use('fivethirtyeight')
    width = 0.15

    # For more than one set of data the bars are shifted relative to these
    # positions, otherwise they would overlap. An array is needed so that the
    # offsets below can be added element-wise.
    x_indexes = np.asarray(x if x_ticks is None else x_ticks[0])

    with_2 = has_data(y_2)
    with_3 = has_data(y_3)
    with_4 = has_data(y_4)

    if not (with_2 or with_3 or with_4):
        plt.bar(x, y_1, color=color_1)

    elif with_2 and not with_3 and not with_4:
        plt.bar(x_indexes - (width / 4) * 3, y_1, width=width, color=color_1, label=y_1_label)
        plt.bar(x_indexes + (width / 4) * 3, y_2, width=width, color=color_2, label=y_2_label)

    elif not with_4:
        plt.bar(x_indexes - (width / 4) * 6, y_1, width=width, color=color_1, label=y_1_label)
        plt.bar(x_indexes, y_2, width=width, color=color_2, label=y_2_label)
        plt.bar(x_indexes + (width / 4) * 6, y_3, width=width, color=color_3, label=y_3_label)

    else:
        plt.bar(x_indexes - (width / 4) * 9, y_1, width=width, color=color_1, label=y_1_label)
        plt.bar(x_indexes - (width / 4) * 3, y_2, width=width, color=color_2, label=y_2_label)
        plt.bar(x_indexes + (width / 4) * 3, y_3, width=width, color=color_3, label=y_3_label)
        plt.bar(x_indexes + (width / 4) * 9, y_4, width=width, color=color_4, label=y_4_label)

    plt.title(title)

    # Set the labels for the x- and the y-axis
    if x_label:
        plt.xlabel(x_label)
    if y_label:
        plt.ylabel(y_label)

    # Set the ticks and their labels for x and y
    if x_ticks is not None:
        plt.xticks(ticks=x_indexes, labels=x_ticks[1])
    if y_ticks is not None:
        plt.yticks(ticks=y_ticks[0], labels=y_ticks[1])

    if with_2:
        # Add a legend for more than one dataset to distinguish which color
        # stands for which dataset
        plt.legend()

    plt.tight_layout()

    plt.show()

# ------------ PREPARE DATA ----------
def plot_vertical_barchart(title, y_values, x_labels, y_label,
                           label_1, label_2=None, label_3=None, label_4=None,
                           color_1='k', color_2='k', color_3='k', color_4='k'):
    """
    Splits the list of y-value sets into the single sets and hands them to
    draw_barchart together with evenly spaced x positions.
    :param title: Title of the plot
    :param y_values: Array of the y-value sets that are to be plotted: [data_1,data_2,etc]
    :param x_labels: Array of Labels for the x-ticks
    :param y_label: Label for the y-axis
    :param label_1: Label for the first set of y-values
    :param label_2: Label for the second set of y-values
    :param label_3: Label for the third set of y-values
    :param label_4: Label for the fourth set of y-values
    :param color_1: Color for the first set of y-values
    :param color_2: Color for the second set of y-values
    :param color_3: Color for the third set of y-values
    :param color_4: Color for the fourth set of y-values
    :return:
    """

    first = y_values[0]
    set_count = len(y_values)
    second = y_values[1] if set_count > 1 else None
    third = y_values[2] if set_count > 2 else None
    # the fourth set is only used when exactly four sets were passed
    fourth = y_values[3] if set_count == 4 else None

    # one x position per value of the first set
    positions = np.arange(len(first))

    draw_barchart(title, positions,
                  first, label_1, second,
                  y_2_label=label_2,
                  y_3=third, y_3_label=label_3,
                  y_4=fourth, y_4_label=label_4,
                  color_1=color_1, color_2=color_2, color_3=color_3, color_4=color_4,
                  y_label=y_label, x_ticks=[positions, x_labels])

### Horizontal Barcharts

In [6]:
def draw_horizontal_barchart(title, y_data, x_1, label_1, x_label, y_ticks, x_2=None, label_2=None,
                             x_3=None, label_3=None, x_4=None, label_4=None,
                             color_1='k', color_2='k', color_3='k', color_4='k'):
    """
    Plots up to four datasets as grouped horizontal bars in a single chart
    :param title: Title of the barchart
    :param y_data: The Data to be shown (e.g. a list of all the markers)
    :param x_1: The Numbers for the first dataset (e.g. a list of occurrence-numbers of the markers)
    :param label_1: Name of the first dataset
    :param x_label: Label for the x values (e.g. "Number of occurrences")
    :param y_ticks: array of [[y-ticks], [labels for those ticks]]
    :param x_2: The Numbers for the second dataset
    :param label_2: Name of the second dataset
    :param x_3: The Numbers for the third dataset
    :param label_3: Name of the third dataset
    :param x_4: The Numbers for the fourth dataset
    :param label_4: Name of the fourth dataset
    :param color_1: the color for the bars of the first dataset
    :param color_2: the color for the bars of the second dataset
    :param color_3: the color for the bars of the third dataset
    :param color_4: the color for the bars of the fourth dataset
    :return:
    """

    plt.style.use('fivethirtyeight')
    width = 0.15

    y_indexes = y_ticks[0]

    if not (x_2 or x_3 or x_4):
        # a single dataset is drawn directly against the y data
        plt.barh(y_data, x_1, height=width, color=color_1, label=label_1)
    else:
        # several datasets: each one gets its own bar position around the tick
        if not (x_3 or x_4):
            bars = [(y_indexes - (width / 4) * 3, x_1, color_1, label_1),
                    (y_indexes + (width / 4) * 3, x_2, color_2, label_2)]
        elif not x_4:
            bars = [(y_indexes - (width / 4) * 6, x_1, color_1, label_1),
                    (y_indexes, x_2, color_2, label_2),
                    (y_indexes + (width / 4) * 6, x_3, color_3, label_3)]
        else:
            bars = [(y_indexes - (width / 4) * 9, x_1, color_1, label_1),
                    (y_indexes - (width / 4) * 3, x_2, color_2, label_2),
                    (y_indexes + (width / 4) * 3, x_3, color_3, label_3),
                    (y_indexes + (width / 4) * 9, x_4, color_4, label_4)]

        for bar_positions, bar_values, bar_color, bar_label in bars:
            plt.barh(bar_positions, bar_values, height=width, color=bar_color, label=bar_label)

    plt.title(title)

    plt.xlabel(x_label)
    plt.yticks(ticks=y_ticks[0], labels=y_ticks[1])

    plt.legend()

    plt.tight_layout()
    plt.show()
    

def draw_horizontal_subplots(title, y_data, x_1, label_1, x_label, x_2=None, label_2=None,
                             x_3=None, label_3=None, x_4=None, label_4=None,
                             color_1='k', color_2='k', color_3='k', color_4='k'):
    """
    Plots up to four datasets as horizontal barcharts, each in its own subplot
    :param title: title, set on the first subplot
    :param y_data: the data on the y-axis, used for every subplot
    :param x_1: the numbers for the first dataset
    :param label_1: name of the first dataset
    :param x_label: label for the x-axis
    :param x_2: the numbers for the second dataset
    :param label_2: name of the second dataset
    :param x_3: the numbers for the third dataset
    :param label_3: name of the third dataset
    :param x_4: the numbers for the fourth dataset
    :param label_4: name of the fourth dataset
    :param color_1: color for the first dataset
    :param color_2: color for the second dataset
    :param color_3: color for the third dataset
    :param color_4: color for the fourth dataset
    :return:
    """
    plt.style.use('fivethirtyeight')
    width = 0.15

    def fill(axis, numbers, color, label):
        # draws one dataset into the given subplot
        axis.barh(y_data, numbers, height=width, color=color, label=label)

    if not (x_2 or x_3 or x_4):
        fig, ax = plt.subplots()
        fill(ax, x_1, color_1, label_1)

        ax.legend()
        ax.set_title(title)
        ax.set_xlabel(x_label)

    elif not (x_3 or x_4):
        fig, (left, right) = plt.subplots(nrows=1, ncols=2, sharey=True)
        fill(left, x_1, color_1, label_1)
        fill(right, x_2, color_2, label_2)

        left.legend()
        left.set_title(title)
        left.set_xlabel(x_label)

        right.legend()
        right.set_xlabel(x_label)

    elif not x_4:
        # 2x2 grid of which the fourth subplot stays empty
        fig, ((top_left, top_right), (bottom_left, bottom_right)) = plt.subplots(
            nrows=2, ncols=2, sharey=True)
        fill(top_left, x_1, color_1, label_1)
        fill(top_right, x_2, color_2, label_2)
        fill(bottom_left, x_3, color_3, label_3)

        top_left.legend()
        top_left.set_title(title)

        top_right.legend()
        top_right.set_xlabel(x_label)

        bottom_left.legend()
        bottom_left.set_xlabel(x_label)

    else:
        fig, ((top_left, top_right), (bottom_left, bottom_right)) = plt.subplots(
            nrows=2, ncols=2, sharey=True, sharex=True)
        fill(top_left, x_1, color_1, label_1)
        fill(top_right, x_2, color_2, label_2)
        fill(bottom_left, x_3, color_3, label_3)
        fill(bottom_right, x_4, color_4, label_4)

        top_left.set_title(title)
        top_left.legend()

        top_right.legend()

        # the x-axis is shared, so only the bottom row is labelled
        bottom_left.legend()
        bottom_left.set_xlabel(x_label)

        bottom_right.legend()
        bottom_right.set_xlabel(x_label)

    plt.tight_layout()
    plt.show()
    

# ------------ PREPARE DATA ----------
def plot_horizontal_barchart(title, y_values, x_values, x_label,
                             label_1, label_2=None, label_3=None, label_4=None,
                             color_1='k', color_2='k', color_3='k', color_4='k',
                             subplot=False):
    """
    Splits the list of x-value sets into the single sets and hands them to
    one of the two horizontal drawing functions.
    :param title: Title of the Barchart
    :param y_values: list of y_values (e.g. all the markers),
    :param x_values: the values for the x-axis (e.g. the different numbers of occurrences)
    :param x_label: the label for the x-axis
    :param label_1: label for the first dataset
    :param label_2: label for the second dataset
    :param label_3: label for the third dataset
    :param label_4: label for the fourth dataset
    :param color_1: color for the first dataset
    :param color_2: color for the second dataset
    :param color_3: color for the third dataset
    :param color_4: color for the fourth dataset
    :param subplot: indicates whether the data is to be displayed in a single plot (False, default),
                    or in different subplots(True)
    :return:
    """

    first = x_values[0]
    set_count = len(x_values)
    second = x_values[1] if set_count > 1 else None
    third = x_values[2] if set_count > 2 else None
    # the fourth set is only used when exactly four sets were passed
    fourth = x_values[3] if set_count == 4 else None

    # one tick per value of the first set, labelled with the y values
    y_ticks = [np.arange(len(first)), y_values]

    # arguments that both drawing functions take
    shared = dict(x_label=x_label,
                  x_2=second, label_2=label_2,
                  x_3=third, label_3=label_3,
                  x_4=fourth, label_4=label_4,
                  color_1=color_1, color_2=color_2, color_3=color_3, color_4=color_4)

    if subplot:
        draw_horizontal_subplots(title, y_values, first, label_1, **shared)
    else:
        draw_horizontal_barchart(title, y_values, first, label_1, y_ticks=y_ticks, **shared)

### Piecharts

In [7]:
def draw_piecharts(figuretitle, titles, slices, labels, colors, angle):
    """
    Draws one figure with a 2x2 grid of piecharts, filled row by row.
    :param figuretitle: title shown above the whole figure
    :param titles: list of titles, one per piechart
    :param slices: list of slice values, one entry per piechart
    :param labels: labels of the slices, used for every piechart
    :param colors: list of slice colors, one entry per piechart
    :param angle: start angle of the first slice
    :return:
    """

    fig, axes = plt.subplots(ncols=2, nrows=2, sharey=True)
    for position, chart_title in enumerate(titles):
        grid_row, grid_column = divmod(position, 2)
        chart = axes[grid_row][grid_column]

        chart.pie(slices[position], labels=labels, colors=colors[position],
                  startangle=angle, autopct='%1.1f%%')
        chart.set_title(chart_title)

    fig.suptitle(figuretitle)

    plt.tight_layout()
    plt.show()


# ------------ PREPARE DATA ----------
def plot_dm_position_piechart(title, data, labels, colors):
    """
    Prepares the Data for Piecharts:
    One for each Dataset with slices=[counter_begin, counter_middle, counter_end]
    :param title: title of the whole figure
    :param data: array that contains the datasets; each dataset holds the
                 begin, middle and end values (in this order), which are summed up
    :param labels: array of labels for the datasets, used as piechart titles
    :param colors: array of colors for the datasets, passed on to draw_piecharts
    :return:
    """

    set_labels = ["begin", "middle", "end"]
    titles = []
    slices = []
    # Each position is summed exactly once per dataset.
    for dataset, label in zip(data, labels):
        slices.append([sum(dataset[0]), sum(dataset[1]), sum(dataset[2])])
        titles.append(label)

    draw_piecharts(title, titles, slices, set_labels, colors, 0)

### Plots for Discourse Markers

In [8]:
def draw_marker_subplots(figuretitle, start, markerlist, dataset1, dataset2, dataset3, dataset4,
                         label1, label2, label3, label4,
                         color1, color2, color3, color4,
                         x_ticks):
    """
    Plots a figure with up to 12 subplots (3 rows, 4 columns), one for each marker
    from index start on; each subplot shows the values of all four datasets.
    :param figuretitle: title shown above the whole figure
    :param start: index of the first marker to plot
    :param markerlist: list of all markers, used as subplot titles
    :param dataset1: values of the first dataset, indexed like markerlist
    :param dataset2: values of the second dataset, indexed like markerlist
    :param dataset3: values of the third dataset, indexed like markerlist
    :param dataset4: values of the fourth dataset, indexed like markerlist
    :param label1: label of the first dataset
    :param label2: label of the second dataset
    :param label3: label of the third dataset
    :param label4: label of the fourth dataset
    :param color1: color of the first dataset
    :param color2: color of the second dataset
    :param color3: color of the third dataset
    :param color4: color of the fourth dataset
    :param x_ticks: array of [[x-ticks], [labels for those ticks]]
    :return:
    """
    plt.style.use('fivethirtyeight')
    y_indexes = x_ticks[0]

    width = 0.15

    fig, axes = plt.subplots(ncols=4, nrows=3, sharey=True)

    # stop after 12 markers or at the end of the marker list
    stop = min(start + 12, len(markerlist))
    for slot, marker_index in enumerate(range(start, stop)):
        chart = axes[slot // 4][slot % 4]

        bars = [(y_indexes - (width / 4) * 9, dataset1, color1, label1),
                (y_indexes - (width / 4) * 3, dataset2, color2, label2),
                (y_indexes + (width / 4) * 3, dataset3, color3, label3),
                (y_indexes + (width / 4) * 9, dataset4, color4, label4)]
        for bar_positions, dataset, bar_color, bar_label in bars:
            chart.bar(bar_positions, dataset[marker_index], width=width,
                      color=bar_color, label=bar_label)

        chart.set_title(markerlist[marker_index])
        chart.set_xticks(x_ticks[0])
        chart.set_xticklabels(x_ticks[1])

    fig.suptitle(figuretitle)
    plt.tight_layout()
    plt.show()


def plot_marker_subplots(title, markerlist, y_values, x_labels,
                         label1, label2, label3, label4,
                         color1, color2, color3, color4):
    """
    Processes the data and calls the plot function once for every group of
    12 markers, as only 12 markers fit in one figure.
    :param title: title of every figure
    :param markerlist: list of all markers
    :param y_values: list of the four datasets' values; each dataset holds one
                     list of values per marker, indexed like markerlist
    :param x_labels: labels for the x-ticks
    :param label1: label of the first dataset
    :param label2: label of the second dataset
    :param label3: label of the third dataset
    :param label4: label of the fourth dataset
    :param color1: color of the first dataset
    :param color2: color of the second dataset
    :param color3: color of the third dataset
    :param color4: color of the fourth dataset
    :return:
    """
    # Nothing to plot, and y_values_1[0] below would not exist.
    if not markerlist:
        return

    y_values_1 = y_values[0]
    y_values_2 = y_values[1]
    y_values_3 = y_values[2]
    y_values_4 = y_values[3]

    # one x position per value of a marker
    x_values = np.arange(len(y_values_1[0]))

    # The number of figures follows the length of the marker list, so no
    # marker is left out and no empty figure is drawn.
    for start in range(0, len(markerlist), 12):
        draw_marker_subplots(title, start, markerlist, y_values_1, y_values_2, y_values_3, y_values_4,
                             label1, label2, label3, label4,
                             color1, color2, color3, color4,
                             [x_values, x_labels])


def prepare_marker_subplots(data1, data2, data3, data4):
    """
    Builds the combined list of markers of all four datasets and, for each
    dataset, the list of its values for every marker in that list
    :param data1: first dataset
    :param data2: second dataset
    :param data3: third dataset
    :param data4: fourth dataset
    :return: the list of markers and a list of lists of the dataset values
    """
    datasets = (data1, data2, data3, data4)
    markers = list_all_markers(data1, data2, data3, data4)

    # one result list per dataset, filled in the order of the markers
    collected = [[], [], [], []]
    for marker in markers:
        for dataset_values, dataset in zip(collected, datasets):
            dataset_values.append(dataset.get_all_marker_values(marker))

    return markers, collected


def most_common_markers_plot(figuretitle, xlabel,
                             data1, label1, color1, data2=None, label2=None, color2=None,
                             data3=None, label3=None, color3=None, data4=None, label4=None, color4=None,
                             share=False):
    """
    Draws one horizontal bar chart per dataset and shows them in a single figure.

    The datasets have to be supplied in order (data1, then data2, ...). How many
    are given decides the layout: a single plot, two or three plots in one row,
    or a 2x2 grid for four datasets.
    :param figuretitle: title placed above the whole figure
    :param xlabel: label for the x axis (the direction of the bar lengths)
    :param data1: indexable with the bar positions/names at [0] and the bar lengths at [1]
    :param label1: title (and legend label) of the first plot
    :param color1: bar color of the first plot
    :param data2: optional second dataset, same layout as data1 (label2/color2 accordingly)
    :param data3: optional third dataset (label3/color3 accordingly)
    :param data4: optional fourth dataset (label4/color4 accordingly)
    :param share: if truthy, the subplots share their x axis (no effect for a single plot)
    :return: None; the figure is displayed via plt.show()
    """
    plt.style.use('fivethirtyeight')
    width = 0.15
    plt.rc('ytick', labelsize=10)
    plt.rc('xtick', labelsize=9)

    # sharex=False is the matplotlib default, so this covers both original branches.
    share_x = bool(share)

    if not data2 and not data3 and not data4:
        fig, ax = plt.subplots()
        ax.barh(data1[0], data1[1], height=width, color=color1, label=label1)

        ax.legend()
        ax.set_title(label1)
        ax.set_xlabel(xlabel)

    elif not data3 and not data4:
        # Two datasets need two columns: with ncols=1 plt.subplots returns a single
        # Axes object, which cannot be unpacked into (ax1, ax2).
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharex=share_x)
        ax1.barh(data1[0], data1[1], height=width, color=color1, label=label1)
        ax2.barh(data2[0], data2[1], height=width, color=color2, label=label2)

        ax1.set_title(label1)
        ax1.set_xlabel(xlabel)

        ax2.set_title(label2)
        ax2.set_xlabel(xlabel)

    elif not data4:
        fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharex=share_x)
        ax1.barh(data1[0], data1[1], height=width, color=color1, label=label1)
        ax2.barh(data2[0], data2[1], height=width, color=color2, label=label2)
        ax3.barh(data3[0], data3[1], height=width, color=color3, label=label3)

        ax1.set_title(label1)

        # Only the middle plot carries the x label in the three-plot row.
        ax2.set_title(label2)
        ax2.set_xlabel(xlabel)

        ax3.set_title(label3)

    else:
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, sharex=share_x)
        ax1.barh(data1[0], data1[1], height=width, color=color1, label=label1)
        ax2.barh(data2[0], data2[1], height=width, color=color2, label=label2)
        ax3.barh(data3[0], data3[1], height=width, color=color3, label=label3)
        ax4.barh(data4[0], data4[1], height=width, color=color4, label=label4)

        ax1.set_title(label1)

        ax2.set_title(label2)

        # Only the bottom row of the grid carries the x label.
        ax3.set_title(label3)
        ax3.set_xlabel(xlabel)

        ax4.set_title(label4)
        ax4.set_xlabel(xlabel)

    fig.suptitle(figuretitle)

    plt.tight_layout()
    plt.show()

## Class that creates and holds the dataset objects

In [9]:
class CorpusData:
    """
    Bundles the DatasetScores objects of the four corpora (Spotify, TED, NYTimes,
    Gigaword) together with the colors used to plot each of them.
    """

    def __init__(self,
                 spotify_scores, spotify_dict,
                 ted_scores, ted_dict,
                 ny_scores, ny_dict,
                 gig_scores, gig_dict):
        """
        :param spotify_scores: scores argument handed to DatasetScores for Spotify
        :param spotify_dict: dict argument handed to DatasetScores for Spotify
        :param ted_scores: scores argument for TED
        :param ted_dict: dict argument for TED
        :param ny_scores: scores argument for NYTimes
        :param ny_dict: dict argument for NYTimes
        :param gig_scores: scores argument for Gigaword
        :param gig_dict: dict argument for Gigaword
        """
        # One DatasetScores object per corpus, created in argument order.
        self.spotify = DatasetScores(spotify_scores, spotify_dict)
        self.ted = DatasetScores(ted_scores, ted_dict)
        self.ny = DatasetScores(ny_scores, ny_dict)
        self.gig = DatasetScores(gig_scores, gig_dict)

        # Palettes are ordered [base, darker, lighter]; the base shade is also
        # exposed on its own as the corpus color.
        self.spotify_shades = ['#1DB954', '#0e5c2a', '#8edca9']
        self.spotify_color = self.spotify_shades[0]

        self.ted_shades = ['#e62b1e', '#73150f', '#f2958e']
        self.ted_color = self.ted_shades[0]

        self.ny_shades = ['#FFA700', '#7f5300', '#ffd37f']
        self.ny_color = self.ny_shades[0]

        self.gig_shades = ['#227DFB', '#113e7d', '#90befd']
        self.gig_color = self.gig_shades[0]


# Load all four corpora; every corpus needs its scores CSV and its dict JSON.
data = CorpusData(
    spotify_scores="../bigData/listenability-tools/scores/spotify-scores_short.csv",
    spotify_dict="../bigData/listenability-tools/dict/spotify-dict.json",
    ted_scores="../bigData/listenability-tools/scores/ted-scores_short.csv",
    ted_dict="../bigData/listenability-tools/dict/ted-dict.json",
    ny_scores="../bigData/listenability-tools/scores/nytimes-scores_short.csv",
    ny_dict="../bigData/listenability-tools/dict/nytimes-dict.json",
    gig_scores="../bigData/listenability-tools/scores/gigaword-scores_short.csv",
    gig_dict="../bigData/listenability-tools/dict/gigaword-dict.json",
)

# Discourse Markers per Text
---
### DM per Text in Percent

In [14]:
%matplotlib widget
# Compare the percent-DM-count statistics of all four corpora in one bar chart.
# The six tick labels have to follow the order in which the statistics getters
# return their values (presumably that of compute_statistics -- verify).
plot_vertical_barchart("Percent Discourse Markers per Text",
                              [data.spotify.get_percent_dm_count_statistics(),
                               data.ted.get_percent_dm_count_statistics(),
                               data.ny.get_percent_dm_count_statistics(),
                               data.gig.get_percent_dm_count_statistics()],
                              ["Min", "A_Mean", "H_Mean", "Median", "Mode", "Max"],
                              "Percent Markers",
                              label_1="Spotify", label_2="TED", label_3="NYTimes", label_4="Gigaword",
                              color_1=data.spotify_color, color_2=data.ted_color,
                              color_3=data.ny_color, color_4=data.gig_color
                              )

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Total DM per Text

In [15]:
%matplotlib widget
# Compare the total-DM-count statistics of all four corpora in one bar chart.
# The fourth series is labelled "Gigaword", so it has to come from data.gig
# (the NYTimes statistics were previously passed twice).
plot_vertical_barchart("Number Discourse Markers per Text",
                              [data.spotify.get_total_dm_count_statistics(),
                               data.ted.get_total_dm_count_statistics(),
                               data.ny.get_total_dm_count_statistics(),
                               data.gig.get_total_dm_count_statistics()],
                              ["Min", "A_Mean", "H_Mean", "Median", "Mode", "Max"],
                              "Number Markers",
                              label_1="Spotify", label_2="TED", label_3="NYTimes", label_4="Gigaword",
                              color_1=data.spotify_color, color_2=data.ted_color,
                              color_3=data.ny_color, color_4=data.gig_color
                              )

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Sentences Containing Discourse Markers

### Percent of Sentences of the Texts containing at least one DM

In [12]:
%matplotlib widget
# Sentence-level statistic: only Spotify, NYTimes and Gigaword are passed here;
# TED is not part of this comparison.
plot_vertical_barchart("Percent of Sentences with DM per Text",
                              [data.spotify.get_percent_dm_sentences_statistics(),
                               data.ny.get_percent_dm_sentences_statistics(),
                               data.gig.get_percent_dm_sentences_statistics()],
                              ["Min", "A_Mean", "H_Mean", "Median", "Mode", "Max"],
                              "% Sentences containing DM",
                              label_1="Spotify", label_2="NYTimes", label_3="Gigaword",
                              color_1=data.spotify_color, color_2=data.ny_color, color_3=data.gig_color)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Total Number of Sentences containing at least one DM

In [13]:
%matplotlib widget
# Absolute counterpart of the percentage chart: three corpora only
# (Spotify, NYTimes, Gigaword); TED is not passed.
plot_vertical_barchart("Number of Sentences with DM per Text",
                              [data.spotify.get_total_dm_sentences_statistics(),
                               data.ny.get_total_dm_sentences_statistics(),
                               data.gig.get_total_dm_sentences_statistics()],
                              ["Min", "A_Mean", "H_Mean", "Median", "Mode", "Max"],
                              "# Sentences containing DM",
                              label_1="Spotify", label_2="NYTimes", label_3="Gigaword",
                              color_1=data.spotify_color, color_2=data.ny_color, color_3=data.gig_color)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Discourse Markers per Sentence
---
### Number of DM per sentence

In [16]:
%matplotlib widget
# Statistics of the number of discourse markers per sentence for Spotify,
# NYTimes and Gigaword; TED is not passed.
plot_vertical_barchart("Number of Discourse Markers per Sentence",
                       [data.spotify.get_total_dm_per_sentence_statistics(),
                        data.ny.get_total_dm_per_sentence_statistics(),
                        data.gig.get_total_dm_per_sentence_statistics()],
                       ["Min", "A_Mean", "H_Mean", "Median", "Mode", "Max"],
                       "# Markers per Sentence",
                       label_1="Spotify", label_2="NYTimes", label_3="Gigaword",
                       color_1=data.spotify_color, color_2=data.ny_color, color_3=data.gig_color)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Histogram of number of DM per sentence

In [32]:
%matplotlib widget
# The three lists (labels, values, colors) are parallel: one entry per corpus,
# in the order Spotify, NYTimes, Gigaword.
draw_simple_barchart("Number of Discourse Markers per Sentence",
                            ["Spotify", "NYTimes", "Gigaword"],
                            [data.spotify.compute_dm_per_sentence(),
                             data.ny.compute_dm_per_sentence(),
                             data.gig.compute_dm_per_sentence()],
                            [data.spotify_color, data.ny_color, data.gig_color])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Sentence Positions
---
### Percentage of DM at certain positions in a sentence

In [17]:
%matplotlib widget
# Share of discourse markers at the begin/middle/end of a sentence for Spotify,
# NYTimes and Gigaword; TED is not passed. Axis label spelling corrected.
plot_vertical_barchart("% of DM in a Position in a Sentence",
                              [data.spotify.get_percent_dm_positions_sentence(),
                               data.ny.get_percent_dm_positions_sentence(),
                               data.gig.get_percent_dm_positions_sentence()],
                              ["begin", "middle", "end"],
                              "% DM at Position",
                              label_1="Spotify", label_2="NYTimes", label_3="Gigaword",
                              color_1=data.spotify_color, color_2=data.ny_color, color_3=data.gig_color)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [18]:
%matplotlib widget
# One pie chart input per corpus (Spotify, NYTimes, Gigaword). Each corpus passes
# its three-shade palette [base, darker, lighter] -- presumably one shade per
# position; verify against plot_dm_position_piechart.
plot_dm_position_piechart("DM in a Sentence at Position:",
                                 [data.spotify.get_sentence_position_values(),
                                  data.ny.get_sentence_position_values(),
                                  data.gig.get_sentence_position_values()
                                  ],
                                 ["Spotify Data", "NYTimes Data", "Gigaword Data"],
                                 [data.spotify_shades,
                                  data.ny_shades,
                                  data.gig_shades])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Number of DM at certain position in a sentence

In [19]:
%matplotlib widget
# Absolute number of discourse markers at the begin/middle/end of a sentence for
# Spotify, NYTimes and Gigaword; TED is not passed. Axis label spelling corrected.
plot_vertical_barchart("Number of DM at a certain Position in a Sentence",
                              [data.spotify.get_total_dm_positions_sentence(),
                               data.ny.get_total_dm_positions_sentence(),
                               data.gig.get_total_dm_positions_sentence()],
                              ["begin", "middle", "end"],
                              "# DM at Position",
                              label_1="Spotify", label_2="NYTimes", label_3="Gigaword",
                              color_1=data.spotify_color, color_2=data.ny_color, color_3=data.gig_color)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Document Positions
---
### Percentage of DM at certain positions in a document

In [20]:
%matplotlib widget
# Share of discourse markers at the begin/middle/end of a document for all four
# corpora. Axis label spelling corrected.
plot_vertical_barchart("% of DM in a Position in a Document",
                              [data.spotify.get_percent_dm_positions_document(),
                               data.ted.get_percent_dm_positions_document(),
                               data.ny.get_percent_dm_positions_document(),
                               data.gig.get_percent_dm_positions_document()],
                              ["begin", "middle", "end"],
                              "% DM at Position",
                              label_1="Spotify", label_2="TED", label_3="NYTimes", label_4="Gigaword",
                              color_1=data.spotify_color, color_2=data.ted_color,
                              color_3=data.ny_color, color_4=data.gig_color)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [21]:
%matplotlib widget
# One pie chart input per corpus, all four corpora included. Each corpus passes
# its three-shade palette [base, darker, lighter] -- presumably one shade per
# position; verify against plot_dm_position_piechart.
plot_dm_position_piechart("Positions of Discourse Markers in the Documents",
                                 [data.spotify.get_document_position_values(),
                                  data.ted.get_document_position_values(),
                                  data.ny.get_document_position_values(),
                                  data.gig.get_document_position_values()
                                  ],
                                 ["Spotify Data", "TED Data", "NYTimes Data", "Gigaword Data"],
                                 [data.spotify_shades,
                                  data.ted_shades,
                                  data.ny_shades,
                                  data.gig_shades])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Number of DM at certain position in a document

In [22]:
%matplotlib widget
# Absolute number of discourse markers at the begin/middle/end of a document for
# all four corpora. Axis label spelling corrected.
plot_vertical_barchart("Number of DM at a certain Position in a Document",
                              [data.spotify.get_total_dm_positions_document(),
                               data.ted.get_total_dm_positions_document(),
                               data.ny.get_total_dm_positions_document(),
                               data.gig.get_total_dm_positions_document()],
                              ["begin", "middle", "end"],
                              "# DM at Position",
                              label_1="Spotify", label_2="TED", label_3="NYTimes", label_4="Gigaword",
                              color_1=data.spotify_color, color_2=data.ted_color,
                              color_3=data.ny_color, color_4=data.gig_color)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Most Common Discourse Markers

### In Percent

In [23]:
%matplotlib widget
# Requests 15 markers per corpus with perc=True; four datasets select the 2x2
# layout, and share=True makes the subplots share their x axis.
most_common_markers_plot("Most Common Markers per Dataset in %", "Share in all Markers",
                                data.spotify.get_most_common_markers(15, perc=True),
                                "Spotify", data.spotify_color,
                                data2=data.ted.get_most_common_markers(15, perc=True),
                                label2="TED", color2=data.ted_color,
                                data3=data.ny.get_most_common_markers(15, perc=True),
                                label3="NYTimes",
                                color3=data.ny_color,
                                data4=data.gig.get_most_common_markers(15, perc=True),
                                label4="Gigaword",
                                color4=data.gig_color, share=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Total Numbers

In [27]:
%matplotlib widget
# Requests 15 markers per corpus as absolute numbers; share is left at its
# default, so every subplot keeps its own x axis. Axis label spelling corrected.
most_common_markers_plot("Most Common Markers per Dataset", "Number of Occurrences",
                                data.spotify.get_most_common_markers(15),
                                "Spotify", data.spotify_color,
                                data2=data.ted.get_most_common_markers(15),
                                label2="TED", color2=data.ted_color,
                                data3=data.ny.get_most_common_markers(15),
                                label3="NYTimes", color3=data.ny_color,
                                data4=data.gig.get_most_common_markers(15),
                                label4="Gigaword", color4=data.gig_color)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Most Common in Sentence Positions

In [28]:
# position="sb" is used for the sentence-begin figure (see the title). Three
# datasets (no TED) select the one-row, three-column layout with a shared x axis.
most_common_markers_plot("Most Common Markers: Sentence Begin", "Share in all Markers at Sent. Begin",
                                data.spotify.get_most_common_markers(15, position="sb", perc=True),
                                "Spotify", data.spotify_color,
                                data2=data.ny.get_most_common_markers(15, position="sb", perc=True),
                                label2="NYTimes", color2=data.ny_color,
                                data3=data.gig.get_most_common_markers(15, position="sb", perc=True),
                                label3="Gigaword", color3=data.gig_color, share=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [29]:
# position="sm" is used for the sentence-middle figure (see the title). Three
# datasets (no TED) select the one-row, three-column layout with a shared x axis.
most_common_markers_plot("Most Common Markers: Sentence Middle", "Share in all Markers at Sent. Middle",
                                data.spotify.get_most_common_markers(15, position="sm", perc=True),
                                "Spotify", data.spotify_color,
                                data2=data.ny.get_most_common_markers(15, position="sm", perc=True),
                                label2="NYTimes", color2=data.ny_color,
                                data3=data.gig.get_most_common_markers(15, position="sm", perc=True),
                                label3="Gigaword", color3=data.gig_color, share=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [30]:
# position="se" is used for the sentence-end figure (see the title). Three
# datasets (no TED) select the one-row, three-column layout with a shared x axis.
most_common_markers_plot("Most Common Markers: Sentence End", "Share in all Markers at Sent. End",
                                data.spotify.get_most_common_markers(15, position="se", perc=True),
                                "Spotify", data.spotify_color,
                                data2=data.ny.get_most_common_markers(15, position="se", perc=True),
                                label2="NYTimes", color2=data.ny_color,
                                data3=data.gig.get_most_common_markers(15, position="se", perc=True),
                                label3="Gigaword", color3=data.gig_color, share=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Most Common in Document Positions

In [31]:
# position="db" is used for the document-begin figure (see the title). All four
# corpora are passed, which selects the 2x2 layout with a shared x axis.
most_common_markers_plot("Most Common Markers: Document Begin", "Share in all Markers at Doc. Begin",
                                data.spotify.get_most_common_markers(15, position="db", perc=True),
                                "Spotify", data.spotify_color,
                                data2=data.ted.get_most_common_markers(15, position="db", perc=True),
                                label2="TED", color2=data.ted_color,
                                data3=data.ny.get_most_common_markers(15, position="db", perc=True),
                                label3="NYTimes", color3=data.ny_color,
                                data4=data.gig.get_most_common_markers(15, position="db", perc=True),
                                label4="Gigaword", color4=data.gig_color, share=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [25]:
# position="dm" is used for the document-middle figure (see the title). All four
# corpora are passed, which selects the 2x2 layout with a shared x axis.
most_common_markers_plot("Most Common Markers: Document Middle", "Share in all Markers at Doc. Middle",
                                data.spotify.get_most_common_markers(15, position="dm", perc=True),
                                "Spotify", data.spotify_color,
                                data2=data.ted.get_most_common_markers(15, position="dm", perc=True),
                                label2="TED", color2=data.ted_color,
                                data3=data.ny.get_most_common_markers(15, position="dm", perc=True),
                                label3="NYTimes", color3=data.ny_color,
                                data4=data.gig.get_most_common_markers(15, position="dm", perc=True),
                                label4="Gigaword", color4=data.gig_color, share=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [26]:
# position="de" is used for the document-end figure (see the title). All four
# corpora are passed, which selects the 2x2 layout with a shared x axis.
most_common_markers_plot("Most Common Markers: Document End", "Share in all Markers at Doc. End",
                                data.spotify.get_most_common_markers(15, position="de", perc=True),
                                "Spotify", data.spotify_color,
                                data2=data.ted.get_most_common_markers(15, position="de", perc=True),
                                label2="TED", color2=data.ted_color,
                                data3=data.ny.get_most_common_markers(15, position="de", perc=True),
                                label3="NYTimes", color3=data.ny_color,
                                data4=data.gig.get_most_common_markers(15, position="de", perc=True),
                                label4="Gigaword", color4=data.gig_color, share=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …