In [5]:
# Finds and returns all scribal abbreviations in a text by looking for overbars
def find(text):
    shorts = []
    words = text.split()
    for i in words:
        if chr(773) in i:
            i = strip_accents(i)
            i = clean_text(i)
            if i not in shorts:
                shorts.append(i)
    return shorts

In [6]:
import unicodedata
import string
# Removes diacritical marks from the Greek text of a manuscript
def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn').upper()

In [7]:
# Normalizes text for comparison
def clean_text(text):
    s = text.translate(str.maketrans('', '', string.punctuation))
    s = s.translate({ord(i): None for i in '·'})
    s = s.translate({ord(i): None for i in '’'})
    s = s.translate({ord(i): None for i in '˙'})
    s = s.translate({ord(i): None for i in 'ʼ'})
    s = s.translate({ord(i): None for i in '⸆'})
    s = s.translate({ord(i): None for i in '❏'})
    s = s.translate({ord(i): None for i in '›'})
    s = s.translate({ord(i): None for i in 'ϗ'})
    s = s.translate({ord(i): None for i in ';'})
    s = s.translate({ord(i): None for i in ';'})
    s = s.translate({ord(i): None for i in '»'})
    s = s.translate({ord(i): None for i in '«'})
    s = s.translate({ord(i): None for i in ';'})
    s = s.translate({ord(i): None for i in 'A'})
    s = s.translate({ord(i): None for i in 'B'})
    s = s.translate({ord(i): None for i in '᾽'})
    s = s.translate({ord(i): None for i in '·'})
    found = False
    left = ""
    right = ""
    for i in range(len(s)):
        if s[i] == '«':
            found = True
            left = s[:i]
        if found:
            if s[i] == '»':
                right = s[i+1:]
                break
    if len(left) != 0 and len(right) != 0:
        print(s)
        s = left + right
    for c in s:
        if ord(c) < 913 or ord(c) > 937:
            s.replace(c, '')
    return s

In [8]:
def clean_text1(text):
    s = text.translate(str.maketrans('', '', string.punctuation))
    for c in s:
        if ord(c) < 913 or ord(c) > 937:
            print(s)
            print(s)
    return s

In [9]:
def separate_vnum(s):
    digit = False
    for i in range(len(s)):
        if s[i].isdigit():
            digit = True
        elif digit:
            digit = False
            s = s[:i] + " " + s[i:]
    return s

In [10]:
# Finds all scribal abbreviations in P66 as a baseline
file = open("P66(2).txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
text = separate_vnum(text)
find(text)

['ΘΝ',
 'ΘΣ',
 'ΑΝΘΡΩΠΩ',
 '',
 'ΘΥ',
 'ΕΡΧΟΜΕΝΟ',
 'ΗΜΕΙ',
 'ΠΡΣ',
 'ΙΥ',
 'ΧΥ',
 'ΙΕΡΟΣΟΛΥΜΩ',
 'ΕΡΩΤΗΣΩΣΙ',
 'ΧΣ',
 'ΚΥ',
 'ΙΝ',
 'ΠΝΑ',
 'ΠΝΙ',
 'ΑΓΙ',
 'ΥΣ',
 'ΠΕΡΙΠΑΤΟΥ',
 'ΙΣ',
 'ΑΚΟΛΟΥΘΗΣΑ',
 'ΠΡΩΤΟ',
 'ΗΘΕΛΗΣΕ',
 'ΒΗΘΣΑΙΔΑ',
 'ΥΝ',
 'ΦΙΛΙΠΠΟ',
 'ΑΝΟΥ']

In [11]:
# Dictionary that maps abbreviations to full words
ab_dict = {'ΘΝ' : 'ΘΕΟΝ',
 'ΘΣ' : 'ΘΕΟΣ',
 'ΑΝΘΡΩΠΩ' : 'ΑΝΘΡΩΠΩΝ',
 '' : '',
 'ΘΥ' : 'ΘΕΟΥ',
 'ΕΡΧΟΜΕΝΟ' : 'ΕΡΧΟΜΕΝΟΝ',
 'ΗΜΕΙ' : 'ΗΜΕΙΣ',
 'ΠΡΣ' : 'ΠΑΤΡΟΣ',
 'ΙΥ' : 'ΙΗΣΟΥ',
 'ΧΥ' : 'ΧΡΙΣΤΟΥ',
 'ΙΕΡΟΣΟΛΥΜΩ' : 'ΙΕΡΟΣΟΛΥΜΩΝ',
 'ΕΡΩΤΗΣΩΣΙ' : 'ΕΡΩΤΗΣΩΣΙΝ',
 'ΧΣ' : 'ΧΡΙΣΤΟΣ',
 'ΚΥ' : 'ΚΥΡΙΟΥ',
 'ΙΝ' : 'ΙΗΣΟΥΝ',
 'ΠΝΑ' : 'ΠΝΕΥΜΑ',
 'ΠΝΙ' : 'ΠΝΕΥΜΑΤΙ',
 'ΑΓΙ' : 'ΑΓΙΩ',
 'ΥΣ' : 'ΥΙΟΣ',
 'ΠΕΡΙΠΑΤΟΥ' : 'ΠΕΡΙΠΑΤΟΥΝΤΙ',
 'ΙΣ' : 'ΙΗΣΟΥΣ',
 'ΑΚΟΛΟΥΘΗΣΑ' : 'ΑΚΟΛΟΥΘΗΣΑΝ',
 'ΠΡΩΤΟ' : 'ΠΡΩΤΟΝ',
 'ΗΘΕΛΗΣΕ' : 'ΗΘΕΛΗΣΕΝ',
 'ΒΗΘΣΑΙΔΑ' : 'ΒΗΘΣΑΙΔΑ',
 'ΥΝ' : 'ΥΙΟΝ',
 'ΦΙΛΙΠΠΟ' : 'ΦΙΛΙΠΠΟΣ',
 'ΑΝΟΥ' : 'ΑΝΘΡΩΠΟΥ',
 'ΑΝΟΣ' : 'ΑΝΘΡΩΠΟΣ',
 'ΓΕ' : 'ΓΕΝΝΗΘΗ',
 'ΤΗ' : 'ΤΗΝ',
 'ΑΜΗ' : 'ΑΜΗΝ',
 'ΠΝΣ' : 'ΠΝΕΥΜΑΤΟΣ',
 'ΠΝΕΥΑ' : 'ΠΝΕΥΜΑ',
 'ΕΣΤΙ' : 'ΕΣΤΙΝ',
 'ΠΝΣ' : 'ΠΝΕΥΜΑΤΟΣ',
 'ΥΜΕΙ' : 'ΥΜΕΙΣ',
 'ΟΥΡΑΝΟ' : 'ΟΥΡΑΝΟΥ',
 'ΤΟ' : 'ΤΟΝ',
 'ΤΗ' : 'ΤΗΝ',
 'ΘΩ' : 'ΘΕΩ',
 'ΕΡΧΟ' : 'ΕΡΧΟΝΤΑΙ',
 'ΕΙΠΟ' : 'ΕΙΠΟΝ',
 'Ω' : 'ΩΝ',
 'ΠΗΡ' : 'ΠΑΤΗΡ',
 'ΥΩ' : 'ΥΙΩ', 
 'ΥΜΙ' : 'ΥΜΙΝ', 
 'ΚΟΛΛΥΒΙΣΤΩ' : 'ΚΟΛΛΥΒΙΣΤΩΝ', 
 'ΕΙΠΕ' : 'ΕΙΠΕΝ', 
 'Ε' : 'ΕΝ',
 'ΑΥΤΟ' : 'ΑΥΤΟΝ',
 'Ο' : 'ΟΝ', 
 'ΕΙΧΕ' : 'ΕΙΧΕΝ', 
 'Β' : 'ΔΥΟ', 
 'Γ' : 'ΤΡΕΙΣ', 
 'ΠΩΛΟΥ' : 'ΠΩΛΟΥΝΤΑΣ', 
 'ΣΧΟΙΝΙΩ' : 'ΣΧΟΙΝΙΩΝ',  
 'ΠΡΟΣ' : 'ΠΑΤΡΟΣ', 
 'Μ' : 'ΤΕΣΣΕΡΑΚΟΝΤΑ', 
 'ΟΥ' : 'ΟΥΝ', 
 'ΤΩ' : 'ΤΩΝ',
 'ΠΝΕΙ' : 'ΠΝΕΙ',
 'ΙΗΛ' : 'ΙΣΡΑΗΛ',
 'ΥΥ' : 'ΥΙΟΥ', 
 'ΑΝΟΙ' : 'ΑΝΘΡΩΠΟΙ', 
 'ΑΚΟΥΩ' : 'ΑΚΟΥΩΝ', 
 'ΟΥΝΟΥ' : 'ΟΥΡΑΝΟΥ',
 'ΧΡΣ' : 'ΧΡΙΣΤΟΣ',
 'ΑΝΩΝ' : 'ΑΝΘΡΩΠΩΝ',
 'ΙΗΣ' : 'ΙΗΣΟΥΣ',
 'ΠΡΑ' : 'ΠΑΤΗΡΑ', 
 'ΠΑΛΙ' : 'ΠΑΛΙΝ',
 'ΕΙΠΕ' : 'ΕΙΠΕΝ', 
 'ΠΡΙ' : 'ΠΑΤΡΙ',  
 'ΕΙΠΑ' : 'ΕΙΠΑΝ',
 'ΑΝΟΝ' : 'ΑΝΘΡΩΠΟΝ',
 'ΥΜΩ' : 'ΥΜΩΝ',
 'ΕΙΔΕ' : 'ΕΙΔΕΝ', 
 'ΑΝΘΡΩΠΟ' : 'ΑΝΘΡΩΠΟΝ', 
 'ΗΜΩ' : 'ΗΜΩΝ',
 'ΛΟΓΟ' : 'ΛΟΓΟΝ',
 'ΚΕ' : 'ΚΥΡΙΕ'}

In [12]:
# Replaces scribal abbreviations with full words
def replace(dict, text):
    words = text.split()
    newtext = ""
    for word in words:
        if chr(773) in word:
            word = strip_accents(word)
            newtext += dict[word] + " "
        else:
            newtext += word + " "
    return newtext

In [13]:
# Looks for any new scribal abbreviations in the given text using the already constructed dictionary
def find_new(text):
    text = separate_vnum(text)
    words = text.split()
    new = []
    for word in words:
        if chr(773) in word:
            word = clean_text(word)
            word = strip_accents(word)
            if word not in ab_dict.keys() and word not in new:
                new.append(word)
    return new

In [14]:
def gen_dict(text):
    words = set()
    wlist = text.split()
    for w in wlist:
        if not w.isnumeric():
            words.add(w)
    return words

In [15]:
def addMan(data, text, date, category, msid):
    text = separate_vnum(text)
    text = clean_text(text)
    text = replace(ab_dict, text)
    text = strip_accents(text)
    data[0] = date
    verse = ""
    vnum = 0
    sp = text.split()
    for i in sp:
        try:
            temp = vnum
            vnum = int(i)
            if verse != "":
                data[temp] = verse
                verse = ""
                while(temp != vnum + 1 and temp != vnum):
                    temp += 1
                    data[temp] = verse
        except ValueError:
            verse += i
    data[vnum] = verse
    data.append(category)
    data.append(msid)
    mss.append(data)

In [16]:
# NA28 reading of John 1
file = open("NA28.txt", encoding='utf-8')
s = file.read().replace('\n', ' ')

In [17]:
from copy import copy
s = clean_text(s)
s = strip_accents(s)
NA = copy(s)

In [18]:
# The base text is from the Nestle-Aland Greek New Testament Version 28
test = [None]*100
test.append("NA28")
# The first element of the array will usually be the date of the manuscript, but for the base text it is 0
test[0] = 0;
verse = ""
vnum = 0
sp = s.split()
for i in sp:
    try:
        temp = vnum
        vnum = int(i)
        if verse != "":
            test[temp] = verse
            verse = ""
    except ValueError:
        verse = verse+i
test[vnum] = verse
print(test[2])
mss = [test]

ΟΥΤΟΣΗΝΕΝΑΡΧΗΠΡΟΣΤΟΝΘΕΟΝ


In [19]:
# Loads Codex Sinaiticus (01) into the database
training = [None]*100
file = open("01.txt", encoding='utf-8')
name = "Sinaiticus(01)"
category = 1
msid = 1001
# Codex Sinaiticus was written around 330-350 A.D, so the date is estimated here at 340
date = 340
text = file.read().replace('\n', ' ')

In [20]:
find_new(text)

[]

In [21]:
addMan(training, text, date, category, msid)

In [22]:
# Loads manuscript P5 into the database
training = [None]*100
file = open("P5.txt", encoding='utf-8')
name = "P5"
category = 1
msid = 5
# Manuscript P5 was written around the third century A.D, so the date is estimated here at 250
date = 250
text = file.read().replace('\n', ' ')

In [23]:
find_new(text)

[]

In [24]:
addMan(training, text, date, category, msid)

In [25]:
# Loads Codex Bezae (05) into the database
training = [None]*100
file = open("05.txt", encoding='utf-8')
name = "05"
category = 4
msid = 1005
# Codex Bezae was written around 400 A.D.
date = 400
text = file.read().replace('\n', ' ')

In [26]:
find_new(text)

[]

In [27]:
addMan(training, text, date, category, msid)

In [28]:
# Loads manuscript P66 into the database
training = [None]*100
file = open("P66.txt", encoding='utf-8')
name = "P66"
category = 1
msid = 66
# Manuscript P66 was likely written around 150 A.D.
date = 150
text = file.read().replace('\n', ' ')

In [29]:
find_new(text)

[]

In [30]:
addMan(training, text, date, category, msid)

In [31]:
# Loads manuscript P75 into the database
training = [None]*100
file = open("P75.txt", encoding='utf-8')
name = "P75"
category = 1
msid = 75
# Manuscript P75 was likely written around 200 A.D.
date = 150
text = file.read().replace('\n', ' ')

In [32]:
find_new(text)

[]

In [33]:
addMan(training, text, date, category, msid)

In [34]:
# Loads Codex Vaticanus (03) into the database
training = [None]*100
file = open("03.txt", encoding='utf-8')
name = "03"
category = 4
msid = 1003
# Codex Vaticanus was written around 350 A.D.
date = 350
text = file.read().replace('\n', ' ')

In [35]:
find_new(text)

[]

In [36]:
addMan(training, text, date, category, msid)

In [37]:
# Computes the mode length of each verse and stores it in an array
def find_modes(mss, vnum):
    from statistics import mode
    modes = [0]
    for i in range(1, vnum+1):
        lengths = []
        for j in range(len(mss)):
            if mss[j][i] != None:
                lengths.append(len(mss[j][i]))
        try:
            modes.append(mode(lengths))
        except:
            try:
                modes.append(max(lengths, key = lengths.count))
            except:
                modes.append(0)
    return modes

In [38]:
# Finds the number of manuscripts that agree with the current reading
def findMatches(mss, text, vnum):
    matches = 0
    for i in mss:
        if i[vnum] == text:
#             print("Made it!")
            matches += 1
    return matches

In [39]:
# Converts the text of a verse into an integer
def textToNum(text):
    total = 0;
    for i in range(len(text)):
        val = (ord(text[i])-912)*24**(i+1)
        total += val
    return total

In [40]:
def meanDate(mss):
    sumdates = 0
    numdates = 0
    for i in mss:
        if i != mss[0]:
            sumdates += i[0]
            numdates += 1
    return sumdates/numdates

In [41]:
# Formats and generates training data
def get_data(mss, vnum):
    avgdate = meanDate(mss)
    data = []
    targets = []
    for i in mss:
        if i != mss[0]:
            for j in range(1, vnum+1):
                if i[j] != None:
                    # Each verse is stored in the format [date, length-mode, number of manuscripts with the same reading, textual category, id]
                    verse = []
                    verse.append(i[0]-avgdate)
                    verse.append(len(i[j])-modes[j])
                    verse.append(findMatches(mss[1:], i[j], j))
                    verse.append(i[100])
                    verse.append(i[101])
                    data.append(verse)
                    if i[j] == mss[0][j]:
                        targets.append(1)
                    else:
                        targets.append(0)
    return data, targets

In [42]:
def get_variants(mss, length):
    verses = []
    for i in range(1,length):
        variants = {}
        for j in mss:
            if j[i] not in variants and j[i] != None:
                variants[j[i]] = findMatches(mss,j[i],i)
        verses.append(variants)
    return verses

In [43]:
import csv
from copy import deepcopy
def export_data(filename, data):
    data = deepcopy(data)
    rows = [["Date", "Length-Mode", "# of MSS in Agreement", "Category", "Correct"]]
    for i in range(len(data[0])):
        data[0][i].append(data[1][i])
        rows.append(data[0][i])
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(rows)

In [44]:
from sklearn.svm import SVC
modes = find_modes(mss, 51)
training = get_data(mss, 51)
data = training[0]
targets = training[1]
export_data("data.csv", training)

In [45]:
print("Version 1: πάντα δι’ αὐτοῦ ἐγένετο, καὶ χωρὶς αὐτοῦ ἐγένετο οὐδ' ἕν. ὃ γέγονεν")
print("Version 2: πάντα δι’ αὐτοῦ ἐγένετο, καὶ χωρὶς αὐτοῦ ἐγένετο οὐδὲ ἕν. ὃ γέγονεν")
print("Version 3: πάντα δι’ αὐτοῦ ἐνέγετο, καὶ χωρὶς αὐτοῦ ἐγένετο οὐδ' ἕν. ὃ γέγονεν\n")
variants = get_variants(mss, 51)
vnum = 3
for i in variants[vnum-1]:
    print(i + " - # number of manuscripts: " + str(variants[vnum-1][i]))
print("\nJohn 1:3 - \"All things through him were made, and without him was not anything made that was made.\"")
print("Version 1: original text, copied perfectly")
print("Version 2: unabreviated ουδέ (nothing/not anything) as opposed to ουδ' in the original")
print("Version 3: misspelled word due to scribal error")

Version 1: πάντα δι’ αὐτοῦ ἐγένετο, καὶ χωρὶς αὐτοῦ ἐγένετο οὐδ' ἕν. ὃ γέγονεν
Version 2: πάντα δι’ αὐτοῦ ἐγένετο, καὶ χωρὶς αὐτοῦ ἐγένετο οὐδὲ ἕν. ὃ γέγονεν
Version 3: πάντα δι’ αὐτοῦ ἐνέγετο, καὶ χωρὶς αὐτοῦ ἐγένετο οὐδ' ἕν. ὃ γέγονεν

ΠΑΝΤΑΔΙΑΥΤΟΥΕΓΕΝΕΤΟΚΑΙΧΩΡΙΣΑΥΤΟΥΕΓΕΝΕΤΟΟΥΔΕΕΝΟΓΕΓΟΝΕΝ - # number of manuscripts: 3
ΠΑΝΤΑΔΙΑΥΤΟΥΕΓΕΝΕΤΟΚΑΙΧΩΡΙΣΑΥΤΟΥΕΓΕΝΕΤΟΟΥΔΕΝΟΓΕΓΟΝΕΝ - # number of manuscripts: 2
ΠΑΝΤΑΔΙΑΥΤΟΥΕΝΕΓΕΤΟΚΑΙΧΩΡΙΣΑΥΤΟΥΕΓΕΝΕΤΟΟΥΔΕΝΟΓΕΓΟΝΕΝ - # number of manuscripts: 1

John 1:3 - "All things through him were made, and without him was not anything made that was made."
Version 1: original text, copied perfectly
Version 2: unabreviated ουδέ (nothing/not anything) as opposed to ουδ' in the original
Version 3: misspelled word due to scribal error


In [46]:
variants = get_variants(mss, 51)
vnum = 4
for i in variants[vnum-1]:
    print(i + ": " + str(variants[vnum-1][i]))
print("\nJohn 1:4 - \"In him was life, and the life was the light of men.\"")
print("Version 1: original text, copied perfectly")
print("Version 2: \"In him is life and the life was the light of men.\"")
print("Version 3: \"in\" implied by dative αὐτω rather than explicit")

ΕΝΑΥΤΩΖΩΗΗΝΚΑΙΗΖΩΗΗΝΤΟΦΩΣΤΩΝΑΝΘΡΩΠΩΝ: 3
ΕΝΑΥΤΩΖΩΗΕΣΤΙΝΚΑΙΗΖΩΗΗΝΤΟΦΩΣΤΩΝΑΝΘΡΩΠΩΝ: 2
ΑΥΤΩΖΩΗΗΝΚΑΙΗΖΩΗΗΝΤΟΦΩΣΤΩΝΑΝΘΡΩΠΩΝ: 1

John 1:4 - "In him was life, and the life was the light of men."
Version 1: original text, copied perfectly
Version 2: "In him is life and the life was the light of men."
Version 3: "in" implied by dative αὐτω rather than explicit


In [47]:
# The SVM produces an impressive 97.8% accuracy on the training data
clf = SVC(gamma="auto", C=200000)
clf.fit(data, targets)
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
print(accuracy_score(targets, clf.predict(data)))
print(confusion_matrix(targets, clf.predict(data)))

0.9789915966386554
[[114   4]
 [  1 119]]


In [48]:
variants = get_variants(mss, 51)
for i in variants:
    for j in i:
        print(j + ": " + str(i[j]))
    print()

ΕΝΑΡΧΗΗΝΟΛΟΓΟΣΚΑΙΟΛΟΓΟΣΗΝΠΡΟΣΤΟΝΘΕΟΝΚΑΙΘΕΟΣΗΝΟΛΟΓΟΣ: 6

ΟΥΤΟΣΗΝΕΝΑΡΧΗΠΡΟΣΤΟΝΘΕΟΝ: 6

ΠΑΝΤΑΔΙΑΥΤΟΥΕΓΕΝΕΤΟΚΑΙΧΩΡΙΣΑΥΤΟΥΕΓΕΝΕΤΟΟΥΔΕΕΝΟΓΕΓΟΝΕΝ: 3
ΠΑΝΤΑΔΙΑΥΤΟΥΕΓΕΝΕΤΟΚΑΙΧΩΡΙΣΑΥΤΟΥΕΓΕΝΕΤΟΟΥΔΕΝΟΓΕΓΟΝΕΝ: 2
ΠΑΝΤΑΔΙΑΥΤΟΥΕΝΕΓΕΤΟΚΑΙΧΩΡΙΣΑΥΤΟΥΕΓΕΝΕΤΟΟΥΔΕΝΟΓΕΓΟΝΕΝ: 1

ΕΝΑΥΤΩΖΩΗΗΝΚΑΙΗΖΩΗΗΝΤΟΦΩΣΤΩΝΑΝΘΡΩΠΩΝ: 3
ΕΝΑΥΤΩΖΩΗΕΣΤΙΝΚΑΙΗΖΩΗΗΝΤΟΦΩΣΤΩΝΑΝΘΡΩΠΩΝ: 2
ΑΥΤΩΖΩΗΗΝΚΑΙΗΖΩΗΗΝΤΟΦΩΣΤΩΝΑΝΘΡΩΠΩΝ: 1

ΚΑΙΤΟΦΩΣΕΝΤΗΣΚΟΤΙΑΦΑΙΝΕΙΚΑΙΗΣΚΟΤΙΑΑΥΤΟΟΥΚΑΤΕΛΑΒΕΝ: 5
ΚΑΙΤΟΦΩΣΕΝΤΗΣΚΟΤΕΙΑΦΑΙΝΕΙΚΑΙΗΣΚΟΤΕΙΑΑΥΤΟΟΥΚΑΤΕΛΑΒΕΝ: 1

ΕΓΕΝΕΤΟΑΝΘΡΩΠΟΣΑΠΕΣΤΑΛΜΕΝΟΣΠΑΡΑΘΕΟΥΟΝΟΜΑΑΥΤΩΙΩΑΝΝΗΣ: 3
ΕΓΕΝΕΤΟΑΝΘΡΩΠΟΣΑΠΕΣΤΑΛΜΕΝΟΣΠΑΡΑΘΕΟΥΗΝΟΝΟΜΑΑΥΤΩΙΩΑΝΝΗΣ: 1
ΕΓΕΝΕΤΟΑΝΘΡΩΠΟΣΑΠΕΣΤΑΛΜΕΝΟΣΠΑΡΑΚΥΡΙΟΥΕΝΟΝΟΜΑΑΥΤΩΙΩΑΝΝΗΝ: 1
ΕΓΕΝΕΤΟΑΝΘΡΩΠΟΣΑΠΕΣΤΑΛΜΕΝΟΣΠΑΡΑΘΕΟΥΟΝΟΜΑΑΥΤΩΙΩΑΝΗΣ: 1

ΟΥΤΟΣΗΛΘΕΝΕΙΣΜΑΡΤΥΡΙΑΝΙΝΑΜΑΡΤΥΡΗΣΗΠΕΡΙΤΟΥΦΩΤΟΣΙΝΑΠΑΝΤΕΣΠΙΣΤΕΥΣΩΣΙΝΔΙΑΥΤΟΥ: 4
ΟΥΤΟΣΗΛΘΕΝΕΙΣΜΑΡΤΥΡΙΑΝΙΝΑΜΑΡΤΥΡΗΣΗΠΕΡΙΤΟΥΦΩΤΟΣΙΝΑΠΑΝΤΕΣΠΙΣΤΕΥΣΟΥΣΙΝΔΙΑΥΤΟΥ: 1
ΟΥΤΟΣΗΛΘΕΝΕΙΣΜΑΡΤΥΡΙΟΝΙΝΑΜΑΡΤΥΡΗΣΗΠΕΡΙΤΟΥΦΩΤΟΣΙΝΑΠΑΝΤΕΣΠΙΣΤΕΥΣΩΣΙΝΔΙΑΥΤΟΥ: 1

ΟΥΚΗΝΕΚΕΙΝΟΣΤΟΦΩΣΑΛΛΙΝΑΜΑΡΤΥΡΗΣΗΠΕΡΙΤΟΥΦΩΤΟΣ: 4
ΟΥΚΗΝΕΚ

In [49]:
# Add John chapter 2 to training data
file = open("02NA28.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
NA += strip_accents(clean_text(text))
print(NA)
mss = []
msid = 0
testing = [None]*100
addMan(testing, text, 0, 0, 0)

1 ΕΝ ΑΡΧΗ ΗΝ Ο ΛΟΓΟΣ ΚΑΙ Ο ΛΟΓΟΣ ΗΝ ΠΡΟΣ ΤΟΝ ΘΕΟΝ ΚΑΙ ΘΕΟΣ ΗΝ Ο ΛΟΓΟΣ 2 ΟΥΤΟΣ ΗΝ ΕΝ ΑΡΧΗ ΠΡΟΣ ΤΟΝ ΘΕΟΝ 3 ΠΑΝΤΑ ΔΙ ΑΥΤΟΥ ΕΓΕΝΕΤΟ ΚΑΙ ΧΩΡΙΣ ΑΥΤΟΥ ΕΓΕΝΕΤΟ ΟΥΔΕ ΕΝ Ο ΓΕΓΟΝΕΝ 4 ΕΝ ΑΥΤΩ ΖΩΗ ΗΝ ΚΑΙ Η ΖΩΗ ΗΝ ΤΟ ΦΩΣ ΤΩΝ ΑΝΘΡΩΠΩΝ 5 ΚΑΙ ΤΟ ΦΩΣ ΕΝ ΤΗ ΣΚΟΤΙΑ ΦΑΙΝΕΙ ΚΑΙ Η ΣΚΟΤΙΑ ΑΥΤΟ ΟΥ ΚΑΤΕΛΑΒΕΝ 6 ΕΓΕΝΕΤΟ ΑΝΘΡΩΠΟΣ ΑΠΕΣΤΑΛΜΕΝΟΣ ΠΑΡΑ ΘΕΟΥ ΟΝΟΜΑ ΑΥΤΩ ΙΩΑΝΝΗΣ 7 ΟΥΤΟΣ ΗΛΘΕΝ ΕΙΣ ΜΑΡΤΥΡΙΑΝ ΙΝΑ ΜΑΡΤΥΡΗΣΗ ΠΕΡΙ ΤΟΥ ΦΩΤΟΣ ΙΝΑ ΠΑΝΤΕΣ ΠΙΣΤΕΥΣΩΣΙΝ ΔΙ ΑΥΤΟΥ 8 ΟΥΚ ΗΝ ΕΚΕΙΝΟΣ ΤΟ ΦΩΣ ΑΛΛ ΙΝΑ ΜΑΡΤΥΡΗΣΗ ΠΕΡΙ ΤΟΥ ΦΩΤΟΣ 9 ΗΝ ΤΟ ΦΩΣ ΤΟ ΑΛΗΘΙΝΟΝ Ο ΦΩΤΙΖΕΙ ΠΑΝΤΑ ΑΝΘΡΩΠΟΝ ΕΡΧΟΜΕΝΟΝ ΕΙΣ ΤΟΝ ΚΟΣΜΟΝ 10 ΕΝ ΤΩ ΚΟΣΜΩ ΗΝ ΚΑΙ Ο ΚΟΣΜΟΣ ΔΙ ΑΥΤΟΥ ΕΓΕΝΕΤΟ ΚΑΙ Ο ΚΟΣΜΟΣ ΑΥΤΟΝ ΟΥΚ ΕΓΝΩ 11 ΕΙΣ ΤΑ ΙΔΙΑ ΗΛΘΕΝ ΚΑΙ ΟΙ ΙΔΙΟΙ ΑΥΤΟΝ ΟΥ ΠΑΡΕΛΑΒΟΝ 12 ΟΣΟΙ ΔΕ ΕΛΑΒΟΝ ΑΥΤΟΝ ΕΔΩΚΕΝ ΑΥΤΟΙΣ ΕΞΟΥΣΙΑΝ ΤΕΚΝΑ ΘΕΟΥ ΓΕΝΕΣΘΑΙ ΤΟΙΣ ΠΙΣΤΕΥΟΥΣΙΝ ΕΙΣ ΤΟ ΟΝΟΜΑ ΑΥΤΟΥ 13 ΟΙ ΟΥΚ ΕΞ ΑΙΜΑΤΩΝ ΟΥΔΕ ΕΚ ΘΕΛΗΜΑΤΟΣ ΣΑΡΚΟΣ ΟΥΔΕ ΕΚ ΘΕΛΗΜΑΤΟΣ ΑΝΔΡΟΣ ΑΛΛ ΕΚ ΘΕΟΥ ΕΓΕΝΝΗΘΗΣΑΝ 14 ΚΑΙ Ο ΛΟΓΟΣ ΣΑΡΞ ΕΓΕΝΕΤΟ ΚΑΙ ΕΣΚΗΝΩΣΕΝ ΕΝ ΗΜΙΝ ΚΑΙ ΕΘΕΑΣΑΜΕΘΑ ΤΗΝ ΔΟΞΑΝ ΑΥΤΟΥ ΔΟΞΑΝ ΩΣ ΜΟΝΟΓΕΝΟΥΣ ΠΑΡΑ ΠΑ

In [50]:
find_new(text)

[]

In [51]:
# Loads John 2 from P66
file = open("02P66.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
testing = [None]*100
date = 150
category = 1
msid = 66
addMan(testing, text, date, category, msid)

In [52]:
find_new(text)

[]

In [53]:
# Loads John 2 from P75
file = open("02P75.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
testing = [None]*100
date = 150
category = 1
msid = 75
addMan(testing, text, date, category, msid)

In [54]:
find_new(text)

[]

In [55]:
# Loads John 2 from Codex Sinaiticus
file = open("0201.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
testing = [None]*100
date = 340
category = 1
msid = 1001
addMan(testing, text, date, category, msid)

In [56]:
find_new(text)

[]

In [57]:
# Loads John 2 from Codex Vaticanus
file = open("0203.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
testing = [None]*100
date = 350
category = 4
msid = 1003
addMan(testing, text, date, category, msid)

In [58]:
find_new(text)

[]

In [59]:
find_new(text)

[]

In [60]:
# Adds John 2 to training data sample
modes = find_modes(mss, 25)
testing = get_data(mss, 25)
data += testing[0]
targets += testing[1]
export_data("data.csv", training)

In [61]:
# The SVM is even more accurate with another chapter, producing 95.2% accuracy
clf = SVC(gamma="auto", C=200000)
clf.fit(data, targets)
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
print(accuracy_score(targets, clf.predict(data)))
print(confusion_matrix(targets, clf.predict(data)))
svm = clf.predict(data)

0.985207100591716
[[168   4]
 [  1 165]]


In [62]:
# A decision tree produces even higher accuracy than an SVM, although it remains to be seen whether this holds when we increase the number 
# of features and the amount of training data.
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(data, targets)
print(accuracy_score(targets, clf.predict(data)))
print(confusion_matrix(targets, clf.predict(data)))
dtc = clf.predict(data)

0.985207100591716
[[168   4]
 [  1 165]]


In [63]:
# A neural network approach is almost equal to an SVM or decision tree
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(max_iter=200000)
clf.fit(data, targets)
print(accuracy_score(targets, clf.predict(data)))
print(confusion_matrix(targets, clf.predict(data)))
nn = clf.predict(data)

0.9704142011834319
[[168   4]
 [  6 160]]


In [64]:
# Ensemble learner
predictions = []
for i in range(len(targets)):
    if svm[i] + dtc[i] + nn[i] >= 2:
        predictions.append(1)
    else:
        predictions.append(0)
print(accuracy_score(targets, predictions))
print(confusion_matrix(targets, predictions))

0.985207100591716
[[168   4]
 [  1 165]]


In [65]:
# Add John chapter 3 to training data
file = open("03NA28.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
NA += strip_accents(clean_text(text))
mss = []
testing = [None]*100
addMan(testing, text, 0, 0, 0)

In [66]:
NA = separate_vnum(NA)
words = gen_dict(NA)

In [67]:
# Loads John 3 from P66
file = open("03P66.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
testing = [None]*100
date = 150
category = 1
msid = 66
addMan(testing, text, date, category, msid)

In [68]:
find_new(text)

[]

In [69]:
# Loads John 3 from P75
file = open("03P75.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
testing = [None]*100
date = 150
category = 1
msid = 75
addMan(testing, text, date, category, msid)

In [70]:
find_new(text)

[]

In [71]:
# Loads John 3 from Sinaiticus
file = open("0301.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
testing = [None]*100
date = 340
category = 1
msid = 1001
addMan(testing, text, date, category, msid)

In [72]:
find_new(text)

[]

In [73]:
# Loads John 3 from Codex Bezae
file = open("0305.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
testing = [None]*100
date = 400
category = 1
msid = 1005
addMan(testing, text, date, category, msid)

In [74]:
find_new(text)

[]

In [75]:
# Loads John 3 from Codex Vaticanus
file = open("0303.txt", encoding='utf-8')
text = file.read().replace('\n', ' ')
testing = [None]*100
date = 350
category = 1
msid = 1003
addMan(testing, text, date, category, msid)

In [76]:
find_new(text)

[]

In [77]:
# Adds John 3 to training data sample
modes = find_modes(mss, 36)
testing = get_data(mss, 36)
data += testing[0]
targets += testing[1]
export_data("data.csv", training)

In [116]:
# Chapter 3 throws in a bit more confusion, reducing accuracy to just below 95%
from sklearn.metrics import precision_recall_fscore_support as prf
clf = SVC(gamma="auto", C=200000)
clf.fit(data, targets)
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
print(accuracy_score(targets, clf.predict(data)))
print(confusion_matrix(targets, clf.predict(data)))
print(prf(targets, clf.predict(data), average = 'binary'))
svm = clf.predict(data)

0.9878296146044625
[[249   5]
 [  1 238]]
(0.9794238683127572, 0.99581589958159, 0.9875518672199171, None)


In [117]:
# The decision tree actually improves its accuracy with the addition of chapter 3, topping 96%
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion = 'gini', class_weight = 'balanced')
clf.fit(data, targets)
print(accuracy_score(targets, clf.predict(data)))
print(confusion_matrix(targets, clf.predict(data)))
print(prf(targets, clf.predict(data), average = 'binary'))
dtc = clf.predict(data)

0.9878296146044625
[[249   5]
 [  1 238]]
(0.9794238683127572, 0.99581589958159, 0.9875518672199171, None)


In [154]:
# A neural network approach doesn't achieve the same level of performance as an SVM or decision tree, but it is pretty close
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(max_iter=200000)
clf.fit(data, targets)
print(accuracy_score(targets, clf.predict(data)))
print(confusion_matrix(targets, clf.predict(data)))
print(prf(targets, clf.predict(data), average = 'binary'))
nn = clf.predict(data)

0.8803245436105477
[[214  40]
 [ 19 220]]
(0.8461538461538461, 0.9205020920502092, 0.8817635270541082, None)


In [143]:
from sklearn.neighbors import KNeighborsClassifier as knn
clf = knn()
clf.fit(data, targets)
print(accuracy_score(targets, clf.predict(data)))
print(confusion_matrix(targets, clf.predict(data)))
print(prf(targets, clf.predict(data)))
kn = clf.predict(data)

0.9756592292089249
[[246   8]
 [  4 235]]
(array([0.984     , 0.96707819]), array([0.96850394, 0.9832636 ]), array([0.97619048, 0.97510373]), array([254, 239], dtype=int64))


In [157]:
from sklearn.ensemble import RandomForestClassifier as RFC
clf = RFC()
clf.fit(data, targets)
print(accuracy_score(targets, clf.predict(data)))
print(confusion_matrix(targets, clf.predict(data)))
print(prf(targets, clf.predict(data)))
rfc = clf.predict(data)

0.9878296146044625
[[249   5]
 [  1 238]]
(array([0.996     , 0.97942387]), array([0.98031496, 0.9958159 ]), array([0.98809524, 0.98755187]), array([254, 239], dtype=int64))


In [158]:
# Ensemble learner
predictions = []
for i in range(len(targets)):
    if svm[i] + dtc[i] + nn[i] + kn[i] + rfc[i] >= 2:
        predictions.append(1)
    else:
        predictions.append(0)
print(accuracy_score(targets, predictions))
print(confusion_matrix(targets, predictions))
print(prf(targets, predictions, average = 'binary'))

0.9837728194726166
[[247   7]
 [  1 238]]
(0.9714285714285714, 0.99581589958159, 0.9834710743801652, None)
