#### DEPENDENCIES

In [None]:
import os
import sys
from collections import Counter
from datetime import datetime
import math

# data analysis
import pandas as pd
import numpy as np
from numpy import isneginf
from sklearn.metrics.pairwise import cosine_distances
from sklearn.manifold import TSNE
import scipy.stats

# plots
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns

# notebook settings
# Switch off pandas' chained-assignment check: the preprocessing functions
# below add columns to a dataframe that was obtained by filtering another one.
pd.set_option('mode.chained_assignment', None)
# Tell numpy to ignore floating point division-by-zero instead of warning.
np.seterr(divide = 'ignore')

#### PREPARATION

In [None]:
def read_data():
    """
    Load every turn log of the music task found below the folder "data".

    The directory tree under "data" is searched for files whose name starts
    with "turns" and ends with ".txt". Each file is read as a headerless,
    pipe-separated table. The files are processed in sorted path order and
    stacked into one dataframe with a fresh 0..n-1 index.

    A column "pair" is put in front of every table. Its value is the slice
    [5:11] of the file path, i.e. the six characters right after "data" and
    the path separator (presumably the "pairNN" folder name - verify against
    the folder layout).
    @return
        data: pandas dataframe with all data from pairs concatenated
    """

    column_names = [
        "experiment_id",  # The timestamp when the experiment started
        "game_number",    # Game number. Incremented by 1 each time they succeed OR timeout
        "level",          # IGNORE THIS COLUMN
        "score",          # IGNORE THIS COLUMN
        "level_2",        # IGNORE THIS COLUMN
        "participant",    # The actual, physical participant. P1 OR P2
        "role",           # The role of the participant. director OR matcher OR NEWGAME
        "timestamp",      # The timestamp when the data on that row was saved. Unix time
        "note_p1",        # NEWGAME, NaN, C3ON, C3OFF, E3ON, E3OFF, G3ON, G3OFF
        "note_p2",        # A3ON, A3OFF, D3ON, D3OFF, F3ON, F3OFF
        "pressure",       # The "pressure" of the keypress (between 1 and 100)
        "note_check",     # Whether the note press/release was correct. NEWGAME, CORRECT, COMPLETE, ERROR, TIMEOUT
        "succes_status",  # Number of chords completed so far, and the length of the sequence (in chords)
        "report_status",  # Per note of each chord: when the key was pressed and when it was released
        "difficulty",     # Difficulty settings of the random chord generator. Can be ignored.
    ]

    # Collect the turn logs; sorting makes the row order reproducible
    turn_files = sorted(
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk("data", topdown=False)
        for name in names
        if name.startswith("turns") and name.endswith(".txt")
    )

    def load(path):
        # A single blank in a note column means "no note" and is read as missing
        frame = pd.read_csv(
            path,
            sep='|',
            na_values={'note_p1': [' '], "note_p2": [' ']},
            names=column_names,
            dtype={"note_p1": pd.StringDtype(), "note_p2": pd.StringDtype()},
        )
        frame.insert(0, "pair", path[5:11], True)  # pair number
        return frame

    # Stack all logs and renumber the rows
    stacked = pd.concat([load(path) for path in turn_files])

    return stacked.reset_index(drop=True)

In [None]:
def preprocess_succesful_sequences(data):
    """
    Filter out the rows that are part of the successful sequences that determine the end of the game round.

    For every 'NEWGAME' row except the very first row of the dataframe, the
    uninterrupted run of 'COMPLETE' / 'CORRECT' rows directly above it is removed.
    All other rows are kept, together with their original index labels.

    Rows are addressed by position only, so the dataframe does not need a
    default 0..n-1 index. The backward scan stops at the first row and never
    wraps around to the end of the dataframe.
    @params
        data: pandas dataframe with data from the music task (column used: note_check)
    @return
        data: new dataframe (boolean selection of the input) without the succesful rows
    """

    note_check = list(data['note_check'])
    keep = [True] * len(note_check)  # filter: True = keep, False = remove

    # Position 0 is skipped: there is nothing above the first 'NEWGAME'
    for position in range(1, len(note_check)):

        if note_check[position] != 'NEWGAME':
            continue

        # Walk upwards over the successful sequence that ended the previous game
        previous = position - 1
        while previous >= 0 and note_check[previous] in ('COMPLETE', 'CORRECT'):
            keep[previous] = False
            previous -= 1

    # apply filter
    return data[keep]

In [None]:
def preprocess_key_setup(data):
    """
    Label every row with the keyboard setup of its pair: 'parallel' or 'overlap'.

    The column 'setup' is written into the dataframe that is passed in. Rows
    whose pair is in the overlap list get 'overlap', every other row
    (including pairs that are not listed anywhere) gets 'parallel'.
    @params
        data: pandas dataframe with data from the music task (column used: pair)
    @return
        data: the same dataframe object, now with the extra column 'setup'
    """

    # Pairs that used the parallel setup (for reference only, 'parallel' is the fallback):
    # pair02, pair03, pair06, pair07, pair09, pair11, pair13, pair15, pair16,
    # pair18, pair19, pair20, pair23, pair24, pair27
    overlap_pairs = frozenset([
        'pair04', 'pair05', 'pair08', 'pair10', 'pair12', 'pair14',
        'pair17', 'pair21', 'pair22', 'pair25', 'pair26',
    ])

    data['setup'] = [
        'overlap' if pair in overlap_pairs else 'parallel'
        for pair in data['pair']
    ]

    return data

In [None]:
def preprocess_key_numbers(data):
    """
    Translate note letters to key numbers depending on participant and key setup.

    Two columns, 'key_p1' and 'key_p2', are written into the dataframe that is passed in:
        - 'NEWGAME' / 'TIMEOUT' rows (column note_check) get that marker in both columns.
        - For a row of participant P1 the note in 'note_p1' is stripped of whitespace and
          translated into 'key_p1'; 'key_p2' is NaN. Rows of P2 work the other way around.
        - A note that is missing or not part of the translation table of the setup is
          translated as 'ERR' and reported on stdout. In pair 13 and 14, some key presses
          were logged wrong; those end up here.
        - A row whose participant is neither P1 nor P2 is reported on stdout and gets NaN
          in both columns, so that the new columns always have one entry per row.
    @params
        data: pandas dataframe with data from the music task
              (columns used: pair, setup, participant, note_check, note_p1, note_p2)
    @return
        data: orginal dataframe plus an extra column for each participant containing the translated keys
    """

    # translation for parallel key setup
    t_dict_p = {
        'note_p1': {
            'C3ON': '1ON', 'C3OFF': '1OFF',
            'E3ON': '2ON', 'E3OFF': '2OFF',
            'G3ON': '3ON', 'G3OFF': '3OFF'
        },
        'note_p2': {
            'D3ON': '4ON', 'D3OFF': '4OFF',
            'F3ON': '5ON', 'F3OFF': '5OFF',
            'A3ON': '6ON', 'A3OFF': '6OFF'
        }
    }

    # translation for overlapping key setup
    # NOTE(review): for P2, G3 maps to key 5 and F3 to key 6 - confirm this order is intended
    t_dict_o = {
        'note_p1': {
            'C3ON': '1ON', 'C3OFF': '1OFF',
            'D3ON': '2ON', 'D3OFF': '2OFF',
            'E3ON': '3ON', 'E3OFF': '3OFF'
        },
        'note_p2': {
            'E3ON': '4ON', 'E3OFF': '4OFF',
            'G3ON': '5ON', 'G3OFF': '5OFF',
            'F3ON': '6ON', 'F3OFF': '6OFF'
        }
    }

    # Column in which each participant's own notes are logged
    note_column = {'P1': 'note_p1', 'P2': 'note_p2'}

    key_p1 = []
    key_p2 = []

    rows = zip(data.index, data['pair'], data['setup'], data['participant'],
               data['note_check'], data['note_p1'], data['note_p2'])

    for i, pair, setup, participant, note_check, note_p1, note_p2 in rows:

        # Game markers are copied to both participants
        if note_check in ('NEWGAME', 'TIMEOUT'):
            key_p1.append(note_check)
            key_p2.append(note_check)
            continue

        # Unknown participant: still add one entry per column to keep them aligned with the rows
        if participant not in note_column:
            print("issue here")
            key_p1.append(np.nan)
            key_p2.append(np.nan)
            continue

        # Everything that is not 'parallel' is treated as the overlapping setup
        translations = t_dict_p if setup == 'parallel' else t_dict_o
        column = note_column[participant]
        keypress = note_p1 if participant == 'P1' else note_p2

        try:
            translation = translations[column][keypress.strip()]
        except (AttributeError, KeyError):
            # AttributeError: note is missing (no .strip()); KeyError: key not allowed in this setup
            translation = "ERR"
            print("ERROR: Prohibited key at %s, %s." % (i, pair))

        if participant == 'P1':
            key_p1.append(translation)
            key_p2.append(np.nan)
        else:
            key_p1.append(np.nan)
            key_p2.append(translation)

    # Append columns
    data['key_p1'] = key_p1
    data['key_p2'] = key_p2

    return data

In [None]:
def preprocess(data):
    """
    Run the complete preprocessing chain on the raw music task data.

    The chain consists of five steps, applied in this order:
        1. Remove rows that are part of successful sequences, before a game is completed
        2. Add a column with the key setup
        3. Add columns for keynumbers of P1 and P2
        4. Drop unused columns
        5. Reorder columns
    @params
        data: pandas dataframe with data from the music task
    @return
        data: dataframe with the columns pair, setup, score, participant, role, key_p1, key_p2
    """

    # Steps 1-3: each step takes the dataframe and hands back the next version
    steps = (
        preprocess_succesful_sequences,
        preprocess_key_setup,
        preprocess_key_numbers,
    )
    for step in steps:
        data = step(data)

    # Step 4: columns that are not needed by the analysis
    data = data.drop(columns=[
        'experiment_id',
        'level',
        'level_2',
        'game_number',
        'timestamp',
        'pressure',
        'note_check',
        'succes_status',
        'report_status',
        'difficulty',
        'note_p1',
        'note_p2',
    ])

    # Step 5: fixed column order
    return data[[
        'pair',
        'setup',
        'score',
        'participant',
        'role',
        'key_p1',
        'key_p2',
    ]]

#### A. PARSING

In [None]:
def structure_stream(data):
    """
    Structure the keypresses based on game rounds and turns between participants.
    The distinguishable splits in the stream are represented in a nested list and tuple structure.

    Rules, applied row by row (rows are addressed by position, so any index works):
        - The first row is always skipped (it is expected to be the opening 'NEWGAME').
        - A row with key_p1 'NEWGAME' or 'TIMEOUT' closes the open turn and the open game.
        - A row with 'ERR' in key_p1 or key_p2 is ignored.
        - A row of P1 / P2 adds that participant's key to the open turn. The turn is
          closed as soon as the next row belongs to the other participant.
        - After the last row the open turn and game are closed. The last row sets
          participant and role like any other row, so a turn that only starts on
          the last row is labelled correctly.
    NOTE: an ignored 'ERR' row cannot close a turn. If it is the last row of a turn, the
    keypresses collected so far are carried over into the turn of the next participant.
    @params
        data: pandas dataframe of a pair (relevant columns are: participant, role, key_p1, key_p2)
    @return
        experiment: nested list and tuple [game: [turn: (participant, role, [keypresses])]]
    """

    participants = list(data['participant'])
    roles = list(data['role'])
    keys = {'P1': list(data['key_p1']), 'P2': list(data['key_p2'])}
    other = {'P1': 'P2', 'P2': 'P1'}
    last = len(participants) - 1

    # experiment = [game, game]
    experiment = []

    # game = [turn, turn]
    game = []

    # turn = (participant, role, [keypresses]), keypresses = ['1ON', '1OFF', etc.]
    participant, role, keypresses = '', '', []

    for position in range(1, len(participants)):

        key_p1 = keys['P1'][position]
        key_p2 = keys['P2'][position]
        current = participants[position]

        # End of game
        if key_p1 in ('NEWGAME', 'TIMEOUT'):

            # End of game: add turn to game
            if keypresses:
                game.append((participant, role, keypresses))
            participant, role, keypresses = '', '', []

            # End of game: add game to experiment
            if game:
                experiment.append(game)
            game = []

        # Wrongly logged key: ignore
        elif key_p1 == 'ERR' or key_p2 == 'ERR':
            continue

        # Within game
        elif current in other:

            participant = current
            role = roles[position]
            keypresses.append(keys[current][position])

            # End of turn: the next row belongs to the other participant
            if position < last and participants[position + 1] == other[current]:
                game.append((participant, role, keypresses))
                participant, role, keypresses = '', '', []

    # End of experiment: add turn to game
    if keypresses:
        game.append((participant, role, keypresses))

    # End of experiment: add game to stream
    if game:
        experiment.append(game)

    return experiment

In [None]:
def parse(data):
    """
    Turn the keypress stream of one pair into sentences of terms.

    The stream is first structured into games and turns with structure_stream.
    Each game becomes one sentence. Parsing happens on chord level: key events are
    collected until every key has an even number of events (all keys are released),
    then the collected keys are reduced to the distinct key numbers in sorted order.
    This means that overlapping terms, e.g. when participants interrupt one another,
    are taken as one term.
    Example: 4ON, 6ON, 4OFF, 1ON, 1OFF, 6OFF > 146

    A term is prefixed with the role that produced it: 'd_' when only the director
    contributed, no prefix when both director and matcher contributed, and 'm_' in
    every other case. Key events that are still open at the end of a game are dropped.

    @params
        data: pandas dataframe of a pair (passed on to structure_stream)
    @return
        stream: list of lists of tokens
    """

    # one sentence per game
    stream = []

    for game in structure_stream(data):

        sentence = []

        pressed = set()        # keys with an odd number of events so far
        term_keys = set()      # distinct keys of the term under construction
        term_roles = set()     # roles that contributed to the term under construction

        for participant, role, keypresses in game:

            for keypress in keypresses:

                # Only the key number counts, not whether it is an ON or OFF event
                key = keypress[0]

                term_roles.add(role)
                term_keys.add(key)

                # Toggle the key: first event presses it, the next one releases it
                if key in pressed:
                    pressed.remove(key)
                else:
                    pressed.add(key)

                # Some key is still down: the term is not finished yet
                if pressed:
                    continue

                term = ''.join(sorted(term_keys))

                # Mixed terms stay unannotated
                if not ('director' in term_roles and 'matcher' in term_roles):
                    prefix = 'd_' if 'director' in term_roles else 'm_'
                    term = prefix + term

                sentence.append(term)

                # Start a new term
                term_keys = set()
                term_roles = set()

        stream.append(sentence)

    return stream

#### B. CO-OCCURRENCES

In [None]:
def sort_voc(voc):
    """
    Order the vocabulary by length of term, and alphabetically within the same length.
    The order of the vocabulary is relevant once the corresponding matrices are visualized or further computed.

    @ params
        voc: list of individual terms
    @ return
        ordered_voc: new list of terms, ordered in such a way that the heatmap of the matrix will be better readable
    """

    return sorted(voc, key=lambda term: (len(term), term))

In [None]:
def matrix(stream, n=3):
    """
    Compute co-occurrence numpy matrix based on n context window at both sides.

    For every occurrence of a term, the up to n terms before it and the up to n
    terms after it in the same sentence are counted as context. Near the start
    or end of a sentence the window is truncated to the terms that exist; it is
    never dropped as a whole. The resulting matrix is therefore symmetric.

    @ params
        stream: list of sentences, each sentence a list of terms
        n: size of context window
    @ return
        matrix: numpy matrix size len(voc) x len(voc) with counts how many times term with term in vicinity of n terms occurs
        voc_dict: with as key all terms in the vocabulary and as value their index corresponding to the matrix
    """

    # Vocabulary
    voc = sort_voc(list(set(term for sentence in stream for term in sentence)))
    voc_dict = {term: index for index, term in enumerate(voc)}

    # One row per term, one column per context term
    counts = [[0] * len(voc) for _ in voc]

    for sentence in stream:

        for i, word in enumerate(sentence):

            # max() keeps the start of the slice from turning negative (which would
            # count from the end of the sentence); slicing past the end is harmless
            left = sentence[max(0, i - n):i]
            right = sentence[i + 1:i + 1 + n]

            row = counts[voc_dict[word]]
            for context_word in left + right:
                row[voc_dict[context_word]] += 1

    return np.array(counts), voc_dict

In [None]:
def project_embeddings(title, matrix, voc_dict, save_fig=False, show_fig=False):
    """
    Visualize reduced matrix with a scatterplot.

    The rows of the matrix are reduced to two dimensions with t-SNE and drawn as dots,
    each labelled with its term. Terms with a 'd_' or 'm_' prefix are coloured by their
    first key: keys 1-3 as P1, keys 4-6 as P2. All other terms keep the default colour ('both').
    Nothing is computed when neither save_fig nor show_fig is set.
    @ params
        title: title for plot, also used in the file name
        matrix: numpy co-occurrence matrix
        voc_dict: all terms corresponding to indexes of matrix {term: index} to annotate plot
        save_fig: write the plot to graphs/word_vectors_tsne_<title>.png (folder must exist)
        show_fig: display the plot
    @ output
        embeddings: 2d graph scatterplot of reduced vector space
    """

    if not (save_fig or show_fig):
        return

    # t-SNE requires the perplexity to be smaller than the number of samples,
    # so the default of 30 is lowered for small vocabularies
    perplexity = min(30, len(matrix) - 1)

    # Get reduced matrix
    tsne = TSNE(n_components=2, perplexity=perplexity, early_exaggeration=12.0, random_state=0)
    reduced_matrix = tsne.fit_transform(matrix)

    # Get x- and y-choords
    x = [point[0] for point in reduced_matrix]
    y = [point[1] for point in reduced_matrix]

    # Plot (x and y have to be passed by keyword in current seaborn versions)
    plt.figure(figsize=(10, 8))
    ax = sns.scatterplot(x=x, y=y)

    # Different colors for terms of P1 and P2, based on the first key of the term
    for term, index in voc_dict.items():
        if not term.startswith(('d_', 'm_')):
            continue
        first_key = term[2:3]
        if first_key in ('1', '2', '3'):
            ax.plot(x[index], y[index], color='C1', marker="o")
        elif first_key in ('4', '5', '6'):
            ax.plot(x[index], y[index], color='C2', marker="o")

    # Label dots with terms
    for term, index in voc_dict.items():
        ax.annotate(term, (x[index], y[index]))

    # Legend
    legend_elements = [Line2D([0], [0], marker='o', color='w', label='P1', markerfacecolor='C1', markersize=8),
                       Line2D([0], [0], marker='o', color='w', label='P2', markerfacecolor='C2', markersize=8),
                       Line2D([0], [0], marker='o', color='w', label='both', markerfacecolor='C0')]

    ax.legend(handles=legend_elements)

    plt.title("word vectors (t-SNE)\n%s" % title)

    if save_fig:
        plt.savefig("graphs/word_vectors_tsne_%s.png" % title, bbox_inches='tight')

    if show_fig:
        plt.show()

    # A figure that is only saved is closed again to free it
    if save_fig and not show_fig:
        plt.close()

#### C. STATEMENTS

In [None]:
def table_distances(title, matrix, voc_dict, save_fig=False, show_fig=False):
    """
    Calculate cosine distances between all terms based on co-occurrence matrix.

    Optionally a heatmap is drawn of a selection of the distances: director terms
    ('d_' prefix) on the rows and matcher terms ('m_' prefix) on the columns, both
    limited to terms of fewer than 5 characters. The heatmap is skipped when the
    selection has no rows or no columns.
    params:
        title: title for plot, also used in the file name
        matrix: numpy co-occurrence matrix
        voc_dict: all terms corresponding to indexes of matrix {term: index} to annotate plot
        save_fig: write the heatmap to graphs/cosine_distance_<title>.png (folder must exist)
        show_fig: display the heatmap
    return:
        distances: full matrix of cosine distances between all terms, indexed like voc_dict
                   (not only the selection shown in the heatmap)
    todo:
        add mask to clean 0.99?
    """

    # Cosine distance function from sklearn
    distances = cosine_distances(matrix)

    if not (save_fig or show_fig):
        return distances

    # Select terms for table
    director_terms = [term for term in voc_dict if term.startswith('d_') and len(term) < 5]
    matcher_terms = [term for term in voc_dict if term.startswith('m_') and len(term) < 5]

    # Nothing to draw; a figure of size zero cannot be created
    if not director_terms or not matcher_terms:
        return distances

    # Selection with indices
    rows = [voc_dict[term] for term in director_terms]
    columns = [voc_dict[term] for term in matcher_terms]
    table = distances[np.ix_(rows, columns)]

    # Show results in heatmap; figsize is (width, height), so the width follows
    # the number of columns (matcher) and the height the number of rows (director)
    plt.figure(figsize=(len(matcher_terms) * 0.7, len(director_terms) * 0.7))
    ax = sns.heatmap(table, xticklabels=matcher_terms, yticklabels=director_terms, annot=True, cbar=False,
                     linewidths=0.05, linecolor='white', cmap="Blues_r", vmin=0, vmax=1)
    ax.xaxis.tick_top()
    ax.title.set_position([.5, 1.05])
    plt.yticks(rotation=0)
    plt.title("cosine distance\n%s" % title)

    if save_fig:
        plt.savefig("graphs/cosine_distance_%s.png" % title, bbox_inches='tight')

    if show_fig:
        plt.show()

    # A figure that is only saved is closed again to free it
    if save_fig and not show_fig:
        plt.close()

    return distances

In [None]:
def verbatim_translation_measure(table, voc_dict):
    """
    Calculate m_vt: the mean of the distances d_3 <to> m_4 and d_4 <to> m_3.
    params:
        table: distance matrix, indexed [row, column] with the indices from voc_dict
        voc_dict: {term: index}; must contain d_3, d_4, m_3 and m_4 (KeyError otherwise)
    return:
        m_vt: average of the two distances
    """

    term_pairs = (("d_3", "m_4"), ("d_4", "m_3"))

    distances = [table[voc_dict[director], voc_dict[matcher]]
                 for director, matcher in term_pairs]

    return sum(distances) / len(distances)

In [None]:
def proximity_translation_measure(table, voc_dict):
    """
    Calculate m_pt: the mean of the distances d_12 <to> m_4, d_23 <to> m_5,
    d_45 <to> m_2 and d_56 <to> m_3.
    params:
        table: distance matrix, indexed [row, column] with the indices from voc_dict
        voc_dict: {term: index}; must contain all eight terms above (KeyError otherwise)
    return:
        m_pt: average of the four distances
    """

    term_pairs = (
        ("d_12", "m_4"),
        ("d_23", "m_5"),
        ("d_45", "m_2"),
        ("d_56", "m_3"),
    )

    distances = [table[voc_dict[director], voc_dict[matcher]]
                 for director, matcher in term_pairs]

    return sum(distances) / len(distances)

In [None]:
def keys_mapping_measure(table, voc_dict):
    """
    Calculate m_km: for every single-key director term (d_ plus one key) take the
    smallest distance to any single-key matcher term (m_ plus one key), and average
    these minima.
    params:
        table: numpy distance matrix, indexed with the indices from voc_dict
        voc_dict: {term: index}
    return:
        m_km: average over the director terms of the lowest distance
    """

    def single_key_indices(prefix):
        # prefix (2 characters) + one key number = 3 characters
        return [index for term, index in voc_dict.items()
                if term.startswith(prefix) and len(term) == 3]

    # Rows: director keys, columns: matcher keys
    selection = table[np.ix_(single_key_indices('d_'), single_key_indices('m_'))]

    # Calculate average lowest over rows
    lowest = [min(distances) for distances in selection]

    return sum(lowest) / len(lowest)

In [None]:
def chords_mapping_measure(table, voc_dict):
    """
    Calculate m_cm: for every director term of at most two keys (d_ plus one or two
    keys) take the smallest distance to any matcher term of at most two keys, and
    average these minima.
    params:
        table: numpy distance matrix, indexed with the indices from voc_dict
        voc_dict: {term: index}
    return:
        m_cm: average over the director terms of the lowest distance
    """

    def short_term_indices(prefix):
        # prefix (2 characters) + at most two key numbers = at most 4 characters
        return [index for term, index in voc_dict.items()
                if term.startswith(prefix) and len(term) <= 4]

    # Rows: director terms, columns: matcher terms
    selection = table[np.ix_(short_term_indices('d_'), short_term_indices('m_'))]

    # Calculate average lowest over rows
    lowest = [min(distances) for distances in selection]

    return sum(lowest) / len(lowest)

In [None]:
def calculate_measures(table, voc_dict, print_results=False):
    """
    Calculate all four measures for one distance matrix.
    params:
        table: distance matrix, indexed with the indices from voc_dict
        voc_dict: {term: index}
        print_results: print one line per measure, rounded to three decimals
    return:
        tuple (m_vt, m_pt, m_km, m_cm)
    """

    # Measures, in the order in which they are returned
    results = (
        verbatim_translation_measure(table, voc_dict),
        proximity_translation_measure(table, voc_dict),
        keys_mapping_measure(table, voc_dict),
        chords_mapping_measure(table, voc_dict),
    )

    if print_results:
        labels = (
            "verbatim translation measure",
            "proximity translation measure",
            "keys mapping measure",
            "chords mapping measure",
        )
        for label, value in zip(labels, results):
            print("%-24s: %-10.3f" % (label, value))

    return results

#### RESULTS

In [None]:
def write_results(n, pair, key_setup, score, measures):
    """
    Write the measures of all pairs to a csv, to be further processed in R.

    The csv has one row per pair with the columns n, pair, key_setup and score,
    followed by one column per measure (m_vt, m_pt, m_km, m_cm). The number of
    measure columns follows the length of the first tuple in measures.
    params:
        n: context window with which co-occurrences were calculated
        pair: list of pairs
        key_setup: list of key setups
        score: list of scores
        measures: list of tuples of all measures, one tuple per pair
    results:
        r/data/measures_n=<n>_<dd_mm_YYYY>.csv written (the folder r/data must exist)
    """

    # Column titles, in the order of the values inside a measures tuple
    titles_measures = ["m_vt", "m_pt", "m_km", "m_cm"]

    # General information per pair; n is repeated on every row
    df = pd.DataFrame(data={
        "n": n,
        "pair": pair,
        "key_setup": key_setup,
        "score": score,
    })

    # One column per measure: gather the value at the same position of every tuple
    for position in range(len(measures[0])):
        df[titles_measures[position]] = [values[position] for values in measures]

    # Save df to csv, stamped with the context window and the current date
    title_df = "measures_n=%d_%s" % (n, datetime.now().strftime("%d_%m_%Y"))
    df.to_csv('r/data/' + title_df + '.csv', index=False)

#### PIPELINE

In [None]:
# Load all turn logs found below the folder "data" into one dataframe
data = read_data()

In [None]:
# Filter rows, add key setup and key numbers, and keep only the columns used below
data = preprocess(data)

In [None]:
# Display the preprocessed dataframe as cell output
data

In [None]:
# Settings pipeline
show_fig = True
save_fig = True
print_results = False

# General info
# Number of terms per game, one list per pair (collected for inspection only)
length_sequences = []

# Data
# n: size of the context window for the co-occurrence counts.
# The four lists below get one entry per pair, all in the same order.
n = 5
pairs = []
setups = []    
score = []
measures = []

# Pipeline for each pair, sorted by key setup
for setup, data_setup in data.groupby('setup'):

    for pair, data_pair in data_setup.groupby('pair'):
    
        # Reset index for each pair
        # (the title and score lookups below address rows by the labels 0 and len-1)
        data_pair = data_pair.reset_index(drop=True)
    
        # Title for graphs: pair number, key setup, score, date
        # The score is read from the last row of the pair
        title = "n=%d-%s-%s-%s-%s"%(n, pair, data_pair.loc[0, 'setup'], data_pair.loc[len(data_pair.index)-1, 'score'], datetime.now().strftime("%d_%m_%Y"))
        print("\n%s %s" % (title, "-"*78))
    
        # Measures
        pairs.append(pair)
        setups.append(setup)
        score.append(data_pair.loc[len(data_pair.index)-1, 'score'])
        
        # A. PARSING
        # stream: one list of terms per game
        stream = parse(data_pair)
        length_sequences.append([len(sequence) for sequence in stream])
    
        # B. CO-OCCURRENCES
        m, voc_dict = matrix(stream, n)
            
        # Visualization embeddings
        project_embeddings(title, m, voc_dict, save_fig, show_fig)
    
        # C. STATEMENTS
        # table holds the distances between all terms, indexed like voc_dict
        table = table_distances(title, m, voc_dict, save_fig, show_fig)
        measures.append(calculate_measures(table, voc_dict, print_results))

In [None]:
# Write results to CSV's for further analysis in R
# (one row per pair, using the lists that were filled by the pipeline loop)
write_results(n, pairs, setups, score, measures)