# Creating a manuscript catalogue

In this proof of concept, we are building an online catalogue of French manuscripts.

In [1]:
from collections import defaultdict
from pathlib import Path

import pandas as pd

In [13]:
# Load the contents table (one row per text item) into a DataFrame.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
contents_csv = '/Users/companjenba/surfdrive/Shared/French_manuscripts_project/contents_124.csv'
mss = pd.read_csv(contents_csv)
mss

Unnamed: 0,item,title,language,start_folio,start_side,end_folio,end_side
0,1,"Sigillum S. Marie super Cantica, by Honorius ...",Latin,1,r,18,v
1,2,Sermo S. Anselmi Ep. de conceptu uirginali et ...,Latin,18,v,19,r
2,3,Sermons in French (Dean no. 607),English,19,v,24,r
3,4,Sermons in English,English,24,r,24,v
4,5,Sermons in Latin,Latin,25,r,41,r
5,6,Sermon in English,English,41,r,42,r
6,7,Ne have thou no God buten one (DIMEV 3682),English,42,r,42,r
7,8,A note de elemosina,Latin,42,r,42,r
8,9,Regula S. Augustini exposita,Latin,43,r,71,r
9,10,Omni tempore benedicam deum.,Latin,72,r,73,v


Going from folio side ID to an ordinal page number helps calculate the number of pages.

- 1r = 1
- 1v = 2
- 2r = 3
- 2v = 4

etc. So multiply the folio number by 2, and subtract 1 if the side is a recto (`r`).

In [14]:
def count_sides(ser):
    """Count the sides a text occupies, first and last side included.

    ``ser`` is one table row providing ``start_folio``, ``start_side``,
    ``end_folio`` and ``end_side``. Each folio/side pair is converted to an
    ordinal side number (1r = 1, 1v = 2, 2r = 3, ...); the result is the
    length of the inclusive range between the two ordinals.
    """
    def to_ordinal(folio, side):
        # The verso is side 2 * folio; the recto is the side just before it.
        doubled = folio * 2
        return doubled - 1 if side == 'r' else doubled

    first_side = to_ordinal(ser['start_folio'], ser['start_side'])
    last_side = to_ordinal(ser['end_folio'], ser['end_side'])
    # Inclusive range: a text that starts and ends on one side counts as 1.
    return last_side - (first_side - 1)

# Row-wise: naive number of sides per text (shared sides are counted in full).
mss.apply(count_sides, axis='columns')

0     36
1      2
2     10
3      2
4     33
5      3
6      1
7      1
8     57
9      4
10     1
11     1
12     1
13     1
14     6
15     3
16     4
17     9
18     6
dtype: int64

But these numbers do not account for sides shared by two or more texts: such a side is counted as a full side for each text on it, so the totals are inflated.

Instead, for every side on which a text starts or ends, we can calculate what fraction of that side should count towards each text, and hence towards each language. Let's assume all texts on a side take up equal parts of it. Then if a side holds (parts of) two English texts and one Latin text, the side counts as $1/3$ for each text, i.e. $2/3$ for English and $1/3$ for Latin.

We need to create an index for all sides that contain the start and/or end of a text.

In [15]:
def folio_side_to_ordinal(folio, recto_verso):
    """Convert a folio number and side marker to an ordinal side number.

    Sides are numbered 1r = 1, 1v = 2, 2r = 3, 2v = 4, ... Only the exact
    marker ``'r'`` selects the recto; any other value is treated as the verso.
    """
    verso_ordinal = folio * 2
    return verso_ordinal - 1 if recto_verso == 'r' else verso_ordinal

def fs2o(ser):
    """Return a copy of a table row with ordinal side numbers added.

    Reads ``start_folio``/``start_side`` and ``end_folio``/``end_side`` from
    ``ser`` and stores the matching ordinal side numbers under
    ``ordinal_start`` and ``ordinal_end``.

    The row handed in by ``DataFrame.apply`` is left untouched: pandas does
    not support functions that mutate the object they are passed, so the new
    fields are written to a copy, which is returned.
    """
    row = ser.copy()
    row['ordinal_start'] = folio_side_to_ordinal(ser['start_folio'], ser['start_side'])
    row['ordinal_end'] = folio_side_to_ordinal(ser['end_folio'], ser['end_side'])
    return row

# Row-wise: extend every text with its ordinal start and end side.
mss = mss.apply(fs2o, axis='columns')
mss

Unnamed: 0,item,title,language,start_folio,start_side,end_folio,end_side,ordinal_start,ordinal_end
0,1,"Sigillum S. Marie super Cantica, by Honorius ...",Latin,1,r,18,v,1,36
1,2,Sermo S. Anselmi Ep. de conceptu uirginali et ...,Latin,18,v,19,r,36,37
2,3,Sermons in French (Dean no. 607),English,19,v,24,r,38,47
3,4,Sermons in English,English,24,r,24,v,47,48
4,5,Sermons in Latin,Latin,25,r,41,r,49,81
5,6,Sermon in English,English,41,r,42,r,81,83
6,7,Ne have thou no God buten one (DIMEV 3682),English,42,r,42,r,83,83
7,8,A note de elemosina,Latin,42,r,42,r,83,83
8,9,Regula S. Augustini exposita,Latin,43,r,71,r,85,141
9,10,Omni tempore benedicam deum.,Latin,72,r,73,v,143,146


In [16]:
# Index every side on which at least one text starts or ends:
# ordinal side number -> languages of the texts touching that side
# (one list entry per text, so the list length is the number of texts).
sides_languages = defaultdict(list)
for start, end, language in zip(mss['ordinal_start'], mss['ordinal_end'], mss['language']):
    sides_languages[start].append(language)
    # A text that starts and ends on the same side is listed only once.
    if start != end:
        sides_languages[end].append(language)

sorted(sides_languages.items())

[(1, ['Latin']),
 (36, ['Latin', 'Latin']),
 (37, ['Latin']),
 (38, ['English']),
 (47, ['English', 'English']),
 (48, ['English']),
 (49, ['Latin']),
 (81, ['Latin', 'English']),
 (83, ['English', 'English', 'Latin']),
 (85, ['Latin']),
 (141, ['Latin']),
 (143, ['Latin']),
 (146, ['Latin', 'English']),
 (147, ['Latin', 'Latin']),
 (148, ['Latin']),
 (149, ['Latin']),
 (154, ['Latin', 'Latin']),
 (156, ['Latin', 'Latin']),
 (159, ['Latin', 'Latin']),
 (167, ['Latin', 'Latin']),
 (172, ['Latin'])]

In [17]:
# Spot check: languages of the texts that start or end on side 36 (folio 18v).
sides_languages[36]

['Latin', 'Latin']

In [18]:
# Number of texts that start or end on side 36; count_sides_better divides 1 by this.
len(sides_languages[36])

2

In [19]:
def count_sides_better(ser, side_index=None):
    """Return a copy of a table row with fractional side counts added.

    Parameters
    ----------
    ser : row with ``ordinal_start`` and ``ordinal_end``.
    side_index : optional mapping from ordinal side number to the list of
        languages of the texts that start or end on that side. When omitted,
        the notebook-level ``sides_languages`` index is used.

    Added fields
    ------------
    count_in_between : sides strictly between the start and end side; each
        counts as one full side.
    count_start, count_end : share of the start / end side, ``1 / n`` where
        ``n`` is the number of texts listed for that side.
    correction : minus ``count_end`` when the text starts and ends on the
        same side (that side would otherwise be counted twice), else 0.
    corrected_total_sides : sum of the four fields above.

    The input row is not modified: pandas does not support functions passed
    to ``DataFrame.apply`` mutating their argument, so a copy is returned.

    NOTE(review): sides strictly in between are always counted in full, even
    if another text starts or ends on one of them.
    """
    if side_index is None:
        side_index = sides_languages

    start = ser['ordinal_start']
    end = ser['ordinal_end']

    row = ser.copy()
    row['count_in_between'] = max(end - 1 - start, 0)
    row['count_start'] = 1 / len(side_index[start])
    row['count_end'] = 1 / len(side_index[end])
    # Start and end on the same side: cancel the second count of that side.
    row['correction'] = -row['count_end'] if start == end else 0
    row['corrected_total_sides'] = row['count_in_between'] + row['count_start'] + row['count_end'] + row['correction']
    return row

# Row-wise: add the fractional side counts to every text.
mss = mss.apply(count_sides_better, axis='columns')
mss

Unnamed: 0,item,title,language,start_folio,start_side,end_folio,end_side,ordinal_start,ordinal_end,count_in_between,count_start,count_end,correction,corrected_total_sides
0,1,"Sigillum S. Marie super Cantica, by Honorius ...",Latin,1,r,18,v,1,36,34,1.0,0.5,0.0,35.5
1,2,Sermo S. Anselmi Ep. de conceptu uirginali et ...,Latin,18,v,19,r,36,37,0,0.5,1.0,0.0,1.5
2,3,Sermons in French (Dean no. 607),English,19,v,24,r,38,47,8,1.0,0.5,0.0,9.5
3,4,Sermons in English,English,24,r,24,v,47,48,0,0.5,1.0,0.0,1.5
4,5,Sermons in Latin,Latin,25,r,41,r,49,81,31,1.0,0.5,0.0,32.5
5,6,Sermon in English,English,41,r,42,r,81,83,1,0.5,0.333333,0.0,1.833333
6,7,Ne have thou no God buten one (DIMEV 3682),English,42,r,42,r,83,83,0,0.333333,0.333333,-0.333333,0.333333
7,8,A note de elemosina,Latin,42,r,42,r,83,83,0,0.333333,0.333333,-0.333333,0.333333
8,9,Regula S. Augustini exposita,Latin,43,r,71,r,85,141,55,1.0,1.0,0.0,57.0
9,10,Omni tempore benedicam deum.,Latin,72,r,73,v,143,146,2,1.0,0.5,0.0,3.5


These results can be written back to a CSV file.

In [20]:
# Export the enriched table. The results folder is created first so that the
# export does not fail when the folder does not exist yet.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
results_csv = Path('/Users/companjenba/surfdrive/Shared/French_manuscripts_project/results/contents_124.csv')
results_csv.parent.mkdir(parents=True, exist_ok=True)
mss.to_csv(results_csv, index=False)

To calculate the total sides and percentages for each language we need to group the rows by language. We also need the total number of sides.

In [22]:
# Grand total of corrected sides over all texts, plus the per-language
# grouping that the following cells aggregate.
total_sides = mss['corrected_total_sides'].sum()
grouped_by_language = mss.groupby(by='language')
total_sides

170.0

In [23]:
# Sum the corrected side counts of all texts within each language group.
grouped_by_language['corrected_total_sides'].sum()

language
English     13.666667
Latin      156.333333
Name: corrected_total_sides, dtype: float64

In [25]:
# Total corrected sides per language. The aggregation is named by string:
# passing the builtin ``sum`` to ``agg`` is deprecated in recent pandas.
sides_per_language = grouped_by_language.agg({'corrected_total_sides': 'sum'})
# Share of all sides per language; despite the column name this is a percentage (0-100).
sides_per_language['ratio'] = sides_per_language['corrected_total_sides'] / total_sides * 100
sides_per_language

Unnamed: 0_level_0,corrected_total_sides,ratio
language,Unnamed: 1_level_1,Unnamed: 2_level_1
English,13.666667,8.039216
Latin,156.333333,91.960784
