In [1]:
# Render our plots inline
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Make the graphs a bit prettier. The pandas option 'display.mpl_style' was
# deprecated in pandas 0.18 and has since been removed, so setting it raises
# on current pandas; matplotlib's own style sheets are the replacement.
plt.style.use('ggplot')
# Default figure size (width, height) in inches for every plot in the notebook.
plt.rcParams['figure.figsize'] = (16, 6)

In [2]:
# Adjust to your local directories, either by editing the defaults below or by
# setting the EMBEM_DATA_DIR / EMBEM_OUTPUT_DIR environment variables before
# starting the kernel (the environment variable wins when it is set).
embem_data_dir = os.environ.get('EMBEM_DATA_DIR', '/home/jvdzwaan/data/embem/')
output_dir = os.environ.get('EMBEM_OUTPUT_DIR', '/home/jvdzwaan/data/tmp/')

In [36]:
# load data
def load_data(corpus, column_names, corpus_metadata, label_counts, body_parts, emotion_bodypart_pairs):
    """Read a corpus table plus four companion tables and join them column-wise.

    The corpus file is read as tab-separated text without a header row, so its
    columns are named from ``column_names``. The four companion files are read
    with pandas' default separator and their own header row. In every file the
    first column becomes the index, and the frames are concatenated along
    axis 1, which aligns rows on that index.

    Parameters
    ----------
    corpus : path or file-like
        Header-less, tab-separated corpus table.
    column_names : list of str
        Names for the corpus columns; the first one names the index column.
    corpus_metadata, label_counts, body_parts, emotion_bodypart_pairs : path or file-like
        Companion tables, each with a header row.

    Returns
    -------
    pandas.DataFrame
        Columns of all five tables side by side, in the order given above.
    """
    frames = [pd.read_csv(corpus, header=None, sep='\t', index_col=0, names=column_names)]
    for companion in (corpus_metadata, label_counts, body_parts, emotion_bodypart_pairs):
        frames.append(pd.read_csv(companion, index_col=0))
    return pd.concat(frames, axis=1)

def do_split1(row):
    """Return the title part of a row's combined 'title+author' value.

    The value is split on '/'. When that yields exactly two pieces the first
    piece is returned; otherwise the whole value is returned unchanged.
    """
    combined = row['title+author']
    pieces = combined.split('/')
    return pieces[0] if len(pieces) == 2 else combined

def do_split2(row):
    """Return the author part of a row's combined 'title+author' value.

    The value is split on '/'. When that yields exactly two pieces the second
    piece is returned; in every other case the result is an empty string.
    """
    pieces = row['title+author'].split('/')
    if len(pieces) != 2:
        return ''
    return pieces[1]

def _load_corpus(corpus_name, dict_prefix, column_names, source, dataset):
    """Load one corpus with its companion tables and tag where it came from.

    Parameters
    ----------
    corpus_name : str
        File name (without '.csv') of the corpus table under
        ``embem_data_dir``/corpus.
    dict_prefix : str
        Prefix shared by the four companion tables under
        ``embem_data_dir``/dict.
    column_names : list of str
        Column names for the header-less corpus file.
    source : str
        Value written to the 'source' column of every row.
    dataset : str
        Value written to the 'set' column of every row.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``load_data`` with 'source' and 'set' added.
    """
    # Same order as load_data's parameters: metadata, label counts,
    # body parts, emotion/body-part pairs.
    dict_files = [os.path.join(embem_data_dir, 'dict/{}_{}.csv'.format(dict_prefix, suffix))
                  for suffix in ('additional_metadata', 'label_counts',
                                 'heem_expanded_body_parts', 'emotion_bodypart_pairs')]
    frame = load_data(os.path.join(embem_data_dir, 'corpus/{}.csv'.format(corpus_name)),
                      column_names, *dict_files)
    # A scalar is broadcast to every row; no per-row list is needed.
    frame['source'] = source
    frame['set'] = dataset
    return frame


# Column layout shared by every corpus file except edbo.
_standard_columns = ['id', 'year', 'genre', 'title', 'authors']

corpus_big = _load_corpus('corpus_big', 'corpus_big', _standard_columns, 'nederlab', 'predictions')
annotation = _load_corpus('annotation_corpus', 'annotation', _standard_columns, 'nederlab', 'annotations')
ceneton = _load_corpus('ceneton', 'ceneton', _standard_columns, 'ceneton', 'predictions')

# edbo keeps title and author in a single 'title+author' column; split it on
# '/' so that it has the same 'title' and 'authors' columns as the others.
edbo = _load_corpus('edbo', 'edbo', ['id', 'year', 'genre', 'title+author'], 'edbo', 'predictions')
edbo['title'] = edbo.apply(do_split1, axis=1)
edbo['authors'] = edbo.apply(do_split2, axis=1)

# Stack all corpora; cells missing after the concatenation become ''.
complete = pd.concat([annotation, corpus_big, ceneton, edbo]).fillna('')

# Write only the descriptive columns, tab-separated, to the output directory.
_metadata_columns = ['year', 'genre', 'title', 'authors', 'source', 'set', 'period']
complete[_metadata_columns].to_csv(os.path.join(output_dir, 'corpus_metadata.csv'), encoding='utf-8', sep='\t')