### This directory contains woodcut metadata for the English Broadside Ballad Archive (EBBA). A team at EBBA developed a tag taxonomy for a set of ~16,000 woodcut images

Each woodcut was tagged with one or two genre terms and then with a variable number of descriptive tags, depending on the image.

| Tag type	  | Tag description																		|
|-------------|-------------------------------------------------------------------------------------|
| Genre		  | a type of woodcut impression (portrait, landscape, etc)								|
| Descriptive | specific representations in the woodcut impression (animals, objects, people, etc.) |

In [None]:
# Data Exploration
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Enhanced table viewing
%load_ext google.colab.data_table

# **Gain insight into how the tables are related**



In [None]:
# Load the Data
# All five CSVs live in the same GitHub folder, so the shared prefix is kept in one place.
DATA_BASE_URL = 'https://raw.githubusercontent.com/KyleCodes/blockprints/main/'

# Lookup tables: ID -> label
desc_tags = pd.read_csv(DATA_BASE_URL + 'descriptiveTags.csv')
genre_terms = pd.read_csv(DATA_BASE_URL + 'genreTerms.csv')
# Link tables: one row per (impression, tag) / (impression, genre) association
imp_desc_tags = pd.read_csv(DATA_BASE_URL + 'impDescriptiveTags.csv')
imp_genre_terms = pd.read_csv(DATA_BASE_URL + 'impGenreTerms.csv')
# Impression ID -> image file name
impression_ids = pd.read_csv(DATA_BASE_URL + 'impressions.csv')

<hr>

### **GENRE TERMS** 
Check the number of unique items and observe their mapping to IDs.

In [None]:
# Preview the first rows of the genre lookup table (BGT_ID -> BGT_Term)
genre_terms.head()

Unnamed: 0,BGT_ID,BGT_Term
0,17,map
1,3,allegory
2,14,maritime
3,5,portrait
4,16,architecture


In [None]:
# Count distinct genre terms (12 in this dataset).
# dropna=False counts a missing term as its own value, exactly as len(unique()) would.
genre_terms['BGT_Term'].nunique(dropna=False)

12

<hr>

### **DESCRIPTIVE TAGS**
Check the number of unique items and observe their mapping to IDs.


In [None]:
# Preview the first rows of the descriptive-tag lookup table (DT_ID -> DT_Tag)
desc_tags.head()

Unnamed: 0,DT_ID,DT_Tag
0,1,ax
1,2,candle
2,3,carriage
3,4,knife / dagger
4,5,loom


In [None]:
# Count distinct descriptive tags.
# dropna=False counts a missing tag as its own value, exactly as len(unique()) would.
desc_tags['DT_Tag'].nunique(dropna=False)

411

<hr>

### **IMPRESSION IDS (image file names)**
Check the number of unique items and observe their mapping to IDs.

In [None]:
# Preview the first rows of the impression table (IMP_ID -> IMP_File image name)
impression_ids.head()

Unnamed: 0,IMP_ID,IMP_File
0,1,20148-40.jpg
1,2,20982-10.jpg
2,3,31714-10.jpg
3,4,30230-40.jpg
4,5,21213-10.jpg


In [None]:
# Count distinct woodblock impression file names.
# dropna=False counts a missing file name as its own value, exactly as len(unique()) would.
impression_ids['IMP_File'].nunique(dropna=False)

16103

<hr>

### **IMP DESCRIPTIVE TAGS (maps descriptive tags to images)**
* `IDT_ID`: id (linking/association of tags to photo) 
* `IDT_IMP_ID`: impression id (photo) -> `IMP_ID` in impression_ids df
* `IDT_DT_ID`: descriptive tag id (tag)-> `DT_ID` in desc_tags df


In [None]:
# Preview the first rows of the impression <-> descriptive-tag link table
imp_desc_tags.head()

Unnamed: 0,IDT_ID,IDT_IMP_ID,IDT_DT_ID
0,64500,1090,264
1,45790,4418,638
2,45792,4418,252
3,68057,3587,330
4,45788,4418,302


In [None]:
# Total number of impression <-> descriptive-tag association rows
imp_desc_tags.shape[0]

46431

In [None]:
# Unique identifier for a particular image-descriptive_tag mapping.
# value_counts() tallies how often each IDT_ID occurs; a count of 1 for every
# ID means no association row is duplicated.
imp_desc_tags['IDT_ID'].value_counts()

67583    1
48509    1
36219    1
34170    1
38264    1
        ..
84700    1
31694    1
72406    1
66261    1
67585    1
Name: IDT_ID, Length: 46431, dtype: int64

In [None]:
'''
  IDT_IMP_ID is a particular image ID within impression_ids df
  Notice that there can be many DTs associated to an impression, up to 50 per image
'''

# Rows per impression ID = number of descriptive tags attached to that image
# (value_counts() lists the most heavily tagged images first)
imp_desc_tags['IDT_IMP_ID'].value_counts()

1548     50
5428     39
1024     33
4125     33
4037     33
         ..
6375      1
6374      1
11465     1
6563      1
2993      1
Name: IDT_IMP_ID, Length: 6295, dtype: int64

In [None]:
'''
  IDT_DT_ID is a particular descriptor tag within desc_tags df
  Notice that some descriptive tags occur more frequently than others
'''

# Tally tag usage once; value_counts() orders IDs from most to least frequent
desc_tag_counts = imp_desc_tags['IDT_DT_ID'].value_counts()
five_most_counts = desc_tag_counts.head(5).values
five_least_counts = desc_tag_counts.tail(5).values


# Print 5 most frequently occurring descriptive tags
print('=================================\n5 most common descriptors\n=================================')
for tag_id, count in desc_tag_counts.head(5).items():
  # .values yields an array, so the label prints in list form, e.g. ['man']
  tag_name = str(desc_tags[desc_tags['DT_ID'] == tag_id]['DT_Tag'].values)
  print('COUNT: ' + str(count) + '\tTAG: ' + tag_name)

# Print 5 least frequently occurring tags
print('\n=================================\n5 least common descriptors\n=================================')
for tag_id, count in desc_tag_counts.tail(5).items():
  tag_name = str(desc_tags[desc_tags['DT_ID'] == tag_id]['DT_Tag'].values)
  print('COUNT: ' + str(count) + '\tTAG: ' + tag_name)


5 most common descriptors
COUNT: 4008	TAG: ['man']
COUNT: 3727	TAG: ['number of figures - 1']
COUNT: 3020	TAG: ['outdoor']
COUNT: 2467	TAG: ['woman']
COUNT: 1303	TAG: ['1625-1660, Caroline / Interregnum']

5 least common descriptors
COUNT: 1	TAG: ['Numeral 0']
COUNT: 1	TAG: ['storm']
COUNT: 1	TAG: ['chevron']
COUNT: 1	TAG: ["printing press / printer's tools"]
COUNT: 1	TAG: ['Numeral 7']


<hr>

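### **IMP GENRE TERMS (maps genre terms to images)**
* `IGT_ID`: id (linking/association of genre terms to photo)
* `IGT_IMP_ID`: impression id (photo) -> `IMP_ID` in impression_ids df
* `IGT_BGT_ID`: genre term id -> `BGT_ID` in genre_terms df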


In [None]:
# Preview the first rows of the impression <-> genre-term link table
imp_genre_terms.head()

Unnamed: 0,IGT_ID,IGT_IMP_ID,IGT_BGT_ID
0,7072,4418,9
1,7073,1961,5
2,1550,2479,5
3,2205,3885,5
4,7074,223,9


In [None]:
# Total number of impression <-> genre-term association rows
imp_genre_terms.shape[0]

6523

In [None]:
# Unique identifier for a particular image-genre_term mapping.
# value_counts() tallies how often each IGT_ID occurs; a count of 1 for every
# ID means no association row is duplicated.
imp_genre_terms['IGT_ID'].value_counts()

6141     1
7585     1
5520     1
7569     1
1426     1
        ..
10920    1
10928    1
8881     1
6838     1
6147     1
Name: IGT_ID, Length: 6523, dtype: int64

In [None]:
'''
  IGT_IMP_ID is a particular image ID within impression_ids df
  Notice that there can be many GTs associated to an impression, up to 3 per image
'''

imp_genre_terms['IGT_IMP_ID'].value_counts()

1777     3
526      3
9928     3
5911     3
4527     3
        ..
9763     1
7425     1
3331     1
5380     1
10241    1
Name: IGT_IMP_ID, Length: 6171, dtype: int64

In [None]:
'''
  IGT_BGT_ID is a particular genre term ID within genre_terms df
  Notice that some Genre tags occur more frequently than others
'''

# Tally genre usage once; value_counts() orders IDs from most to least frequent
genre_term_counts = imp_genre_terms['IGT_BGT_ID'].value_counts()
five_most_counts = genre_term_counts.head(5).values
five_least_counts = genre_term_counts.tail(5).values


# Print 5 most frequently occurring genre tags
print('=================================\n5 most common genres\n=================================')
for genre_id, count in genre_term_counts.head(5).items():
  # .values yields an array, so the label prints in list form, e.g. ['portrait'];
  # an ID with no matching row in genre_terms prints as []
  tag_name = str(genre_terms[genre_terms['BGT_ID'] == genre_id]['BGT_Term'].values)
  print('COUNT: ' + str(count) + '\tTAG: ' + tag_name)

# Print 5 least frequently occurring genre tags
print('\n=================================\n5 least common genres\n=================================')
for genre_id, count in genre_term_counts.tail(5).items():
  tag_name = str(genre_terms[genre_terms['BGT_ID'] == genre_id]['BGT_Term'].values)
  print('COUNT: ' + str(count) + '\tTAG: ' + tag_name)


5 most common genres
COUNT: 4154	TAG: ['portrait']
COUNT: 1445	TAG: ['narrative']
COUNT: 325	TAG: ['emblem / symbol']
COUNT: 180	TAG: ['maritime']
COUNT: 145	TAG: ['still life']

5 least common genres
COUNT: 50	TAG: ['architecture']
COUNT: 39	TAG: ['landscape']
COUNT: 3	TAG: ['map']
COUNT: 3	TAG: ["printer's mark"]
COUNT: 2	TAG: []


<hr>

## Now the tables have been explored and we understand the way IDs create associations between the tables. 

### It is time to build a master dataframe to link the tables together. It will have one row per impression ID, with columns `['IMP_ID', 'filename', 'genre_terms', 'num_genres', 'desc_tags', 'num_desc_tags']`, where the `genre_terms` and `desc_tags` columns will be permitted to store lists.

**Note: This will be a pretty densely nested table solution. For our analysis we can definitely consider expanding into a sparse representation by multi-hot-encoding the categories**


Use this guide to create a cleaned dataframe

https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173

In [None]:
# Get the list of impression IDs
imp_id_list = impression_ids['IMP_ID'].values.tolist()

# Build the ID -> label lookups once, instead of re-filtering whole frames
# with a boolean mask for every impression and every tag.
# drop_duplicates() keeps the first row per ID, matching the old `.values[0]`.
filename_by_imp_id = impression_ids.drop_duplicates('IMP_ID').set_index('IMP_ID')['IMP_File'].to_dict()
tag_by_dt_id = desc_tags.drop_duplicates('DT_ID').set_index('DT_ID')['DT_Tag'].to_dict()
term_by_bgt_id = genre_terms.drop_duplicates('BGT_ID').set_index('BGT_ID')['BGT_Term'].to_dict()

# Impression ID -> list of tag / genre IDs, in original row order
dt_ids_by_imp_id = imp_desc_tags.groupby('IDT_IMP_ID')['IDT_DT_ID'].apply(list).to_dict()
gt_ids_by_imp_id = imp_genre_terms.groupby('IGT_IMP_ID')['IGT_BGT_ID'].apply(list).to_dict()

filename_list = []
desc_tags_by_impID = []
desc_tag_num_by_impID = []
genr_tags_by_impID = []
genr_tag_num_by_impID = []


# Loop through each impression and collect information about it
for curr_imp_id in imp_id_list:
  # Collect the filename for the current impression
  filename_list.append(filename_by_imp_id[curr_imp_id])

  # Collect the list of tags associated with the current impression.
  # An ID missing from desc_tags keeps the `[]` placeholder the previous
  # bare `except:` produced, so the per-impression counts are unchanged;
  # any other error now surfaces instead of being silently swallowed.
  curr_dt_values = [tag_by_dt_id.get(dt_id, []) for dt_id in dt_ids_by_imp_id.get(curr_imp_id, [])]

  desc_tags_by_impID.append(curr_dt_values)
  desc_tag_num_by_impID.append(len(curr_dt_values))

  # Collect the list of genres associated with the current impression
  # (same `[]` placeholder for genre IDs missing from genre_terms)
  curr_gt_values = [term_by_bgt_id.get(gt_id, []) for gt_id in gt_ids_by_imp_id.get(curr_imp_id, [])]

  genr_tags_by_impID.append(curr_gt_values)
  genr_tag_num_by_impID.append(len(curr_gt_values))


In [None]:
# Column order mirrors the order in which the per-impression lists are zipped
master_columns = ['IMP_ID', 'filename', 'genre_terms', 'num_genres', 'desc_tags', 'num_desc_tags']
master_rows = list(zip(
    imp_id_list,
    filename_list,
    genr_tags_by_impID,
    genr_tag_num_by_impID,
    desc_tags_by_impID,
    desc_tag_num_by_impID,
))

# One row per impression; the genre_terms / desc_tags cells hold Python lists
df = pd.DataFrame(master_rows, columns=master_columns)

df

Unnamed: 0,IMP_ID,filename,genre_terms,num_genres,desc_tags,num_desc_tags
0,1,20148-40.jpg,[],0,[],0
1,2,20982-10.jpg,[portrait],1,"[bird, unspecified, church, steeple, window, 1...",13
2,3,31714-10.jpg,[],0,"[1680-onward, late Stuart / Georgian, man, num...",3
3,4,30230-40.jpg,[portrait],1,"[fan, number of figures - 1, outdoor, 1603-162...",6
4,5,21213-10.jpg,[],0,[],0
...,...,...,...,...,...,...
16098,16108,36412-20.jpg,[],0,[],0
16099,16109,36413-10.jpg,[],0,[],0
16100,16110,36413-20.jpg,[],0,[],0
16101,16111,36414-10.jpg,[],0,[],0
