## Sophocles' 'Oedipus Tyrannus' Line 625a Conjecture Analyser

#### Install nltk for bigrams

In [15]:
import nltk

### Metric #1: Vocabulary (forms)
#### Count frequency of all forms in Sophocles' works

In [16]:
with open("sophraw.txt", "r", encoding='utf8') as inputfile:
    text = inputfile.read()

with open("sophraw.txt", "r", encoding='utf8') as inputfile:
    lines = inputfile.readlines()

words = text.split()
stripped_words = []
for word in words:
    stripped_word = word.strip(',;.ʼ·:!')
    stripped_words.append(stripped_word)
word_dict = {}
for word in stripped_words:
    if word not in word_dict:
        word_dict[word] = 1
    else:
        word_dict[word] += 1

#### Determine maximum-scoring and minimum-scoring lines (for normalisation)

In [17]:
vocab_scores = {}
for line in lines:
    vocab_scores[line] = 0
    words = line.split()
    num = len(words)
    for word in words:
        new_word = word.strip(',;.ʼ·:!')
        vocab_scores[line] += round(word_dict[new_word] / num, 2)

# Most 'Sophoclean' (hexametric) line (by vocab, scaled): Antigone 667: "καὶ σμικρὰ καὶ δίκαια καὶ τἀναντία." - (spoken by Creon) score of 513
vocab_max = 513

# Least 'Sophoclean'(hexametric) line (by vocab, scaled): Ajax 820: "σιδηροβρῶτι θηγάνῃ νεηκονής·." (spoken by Ajax) - score of 1 (among others)
vocab_min = 1

### Metric #2: Vocabulary (lemmas)
#### Count frequency of lemmas

In [18]:
with open("sophlemmas.txt", "r", encoding='utf8') as lemmafile:
    lemmas_raw = lemmafile.read()

lemmas = lemmas_raw.split()
lemma_dict = {}
prev = ""
for lemma in lemmas:
    cur = lemma
    if cur not in lemma_dict:
        lemma_dict[cur] = 1
    else:
        if cur != prev:
            lemma_dict[cur] += 1
    prev = cur

#### Due to the need to manually lemmatise each line, determining the exact maximum-scoring and minimum-scoring lines was infeasible for this metric. Instead, a hypothetical normalisation of factor of 200 (based on the logic that a line containing only the most common non-stopwords would score just over 200, and a line containing only unique words would score 3 or 4, leading to a difference between maximum and minimum of approximately 200) was used.

In [19]:
lemma_norm = 200

### Metric #3: Syntax/style (bigrams)
#### Count frequency of bigrams in Sophocles' works

In [20]:
all_bigrams = []
bigrams_lines = []
for line in lines:
    words = line.split()
    stripped_words = []
    for word in words:
        stripped_word = word.strip(',;.ʼ·:!')
        stripped_words.append(stripped_word)
    stripped_words.insert(0, 'start')
    stripped_words.append('end')
    bigrams = list(nltk.bigrams(stripped_words))
    all_bigrams = all_bigrams + bigrams
    bigrams_lines.append(bigrams)
    
bigram_dict = {}
for bigram in all_bigrams:
    if bigram not in bigram_dict:
        bigram_dict[bigram] = 1
    else:
        bigram_dict[bigram] += 1

#### Determine maximum-scoring and minimum-scoring lines (for normalisation)

In [21]:
bigram_scores = {}
for line in bigrams_lines:
    bigram_scores[tuple(line)] = 0
    for bigram in line:
        bigram_scores[tuple(line)] += bigram_dict[bigram]

# Most 'Sophoclean' (hexametric) line (by syntax/style): Philoctetes 1020: "ἀλλʼ οὐ γὰρ οὐδὲν θεοὶ νέμουσιν ἡδύ μοι," (spoken by Philoctetes) - score of 551
bigram_max = 551

# Least 'Sophoclean'(hexametric) line (by syntax/style): Ajax 820: "σιδηροβρῶτι θηγάνῃ νεηκονής·." (spoken by Ajax) - score of 4 (among others)
bigram_min = 4

### Metric #4: Amount of information (stop words, support words, and significant words)
#### Use list of stop words from Perseus and remove diacritics

In [22]:
stopwords_raw = ['μή', 'ἑαυτοῦ', 'ἄν', 'ἀλλʼ', 'ἀλλά', 'ἄλλος', 'ἀπό', 'ἄρα',
             'αὐτός', 'δʼ', 'δέ', 'δή', 'διά', 'δαί', 'δαίς', 'ἔτι', 'ἐγώ',
             'ἐκ', 'ἐμός', 'ἐν', 'ἐπί', 'εἰ', 'εἰμί', 'εἴμι', 'εἰς', 'γάρ', 'γε', 'γʼ',
             'γα', 'ἡ', 'ἤ', 'καί', 'κατά', 'μʼ', 'μέν', 'μετά', 'μή', 'ὁ', 'ὅδε',
             'ὅς', 'ὅστις', 'ὅτι', 'οὕτως', 'οὗτος', 'οὔτε', 'οὖν', 'οὐδείς',
             'οἱ', 'οὐ', 'οὐδέ', 'οὐκ', 'περί', 'πρός', 'σύ', 'σʼ', 'σύν', 'τά', 'τʼ', 'τε',
             'τήν', 'τῆς', 'τῇ', 'τι', 'τί', 'τις', 'τίς', 'τό', 'τοί',
             'τοιοῦτος', 'τόν', 'τούς', 'τοῦ', 'τῶν', 'τῷ', 'ὑμός', 'ὑπέρ',
             'ὑπό', 'ὡς', 'ὦ', 'ὥστε', 'ἐάν', 'παρά', 'σός']

stopwords = []
for stopword in stopwords_raw:
    stopword = stopword.strip("',;.ʼ·:!")
    bare_word = ""
    for letter in stopword:
        if letter in 'ἈἉἊἋἌἍἎἏᾺἀἁἂἃἄἅἆἇὰᾶάᾼᾈᾉᾊᾋᾌᾍᾎᾏᾳᾀᾁᾂᾃᾄᾅᾆᾇᾲᾷ':
            bare_word = bare_word + 'α'
        elif letter in 'ἘἙἚἛἜἝῈἐἑἒἓἔἕὲέ':
            bare_word = bare_word + 'ε'
        elif letter in 'ἨἩἪἫἬἭἮἯῊἠἡἢἣἤἥἦἧὴῆήῌᾘᾙᾚᾛᾝᾞᾟῃᾐᾑᾒᾓᾔᾕᾖᾗῂῇῄ':
            bare_word = bare_word + 'η'
        elif letter in 'ἸἹἺἻἼἽἾἿῚἰἱἲἳἴἵἶἷὶῖί':
            bare_word = bare_word + 'ι'
        elif letter in 'ὈὉὊὋὌὍῸὀὁὂὃὄὅὸό':
           bare_word = bare_word + 'ο'
        elif letter in 'ὙὛὝὟῪὐὑὒὓὔὕὖὗὺῦῧῢΰύ':
           bare_word = bare_word + 'υ'
        elif letter in 'ὨὩὪὫὬὭὮὯῺὠὡὢὣὤὥὦὧὼῶώῼᾨᾩᾪᾫᾬᾭᾮᾯῳᾠᾡᾢᾣᾤᾥᾦᾧῲῷῴ':
           bare_word = bare_word + 'ω'
        else:
          bare_word = bare_word + letter
    stopwords.append(bare_word)

#### Use list of support words from Github user and remove diacritics

In [23]:
with open("stopwords-el.txt", "r", encoding='utf8') as midwordfile:
    midwords_raw = midwordfile.readlines()         

midwords = []
for midword in midwords_raw:
    midword = midword.strip()
    midword = midword.strip("',;.ʼ·:!")
    bare_word = ""
    for letter in midword:
        if letter in 'ἈἉἊἋἌἍἎἏᾺἀἁἂἃἄἅἆἇὰᾶάᾼᾈᾉᾊᾋᾌᾍᾎᾏᾳᾀᾁᾂᾃᾄᾅᾆᾇᾲᾷ':
            bare_word = bare_word + 'α'
        elif letter in 'ἘἙἚἛἜἝῈἐἑἒἓἔἕὲέ':
            bare_word = bare_word + 'ε'
        elif letter in 'ἨἩἪἫἬἭἮἯῊἠἡἢἣἤἥἦἧὴῆήῌᾘᾙᾚᾛᾝᾞᾟῃᾐᾑᾒᾓᾔᾕᾖᾗῂῇῄ':
            bare_word = bare_word + 'η'
        elif letter in 'ἸἹἺἻἼἽἾἿῚἰἱἲἳἴἵἶἷὶῖί':
            bare_word = bare_word + 'ι'
        elif letter in 'ὈὉὊὋὌὍῸὀὁὂὃὄὅὸό':
           bare_word = bare_word + 'ο'
        elif letter in 'ὙὛὝὟῪὐὑὒὓὔὕὖὗὺῦῧῢΰύ':
           bare_word = bare_word + 'υ'
        elif letter in 'ὨὩὪὫὬὭὮὯῺὠὡὢὣὤὥὦὧὼῶώῼᾨᾩᾪᾫᾬᾭᾮᾯῳᾠᾡᾢᾣᾤᾥᾦᾧῲῷῴ':
           bare_word = bare_word + 'ω'
        else:
          bare_word = bare_word + letter
    midwords.append(bare_word)


#### Remove diacritics from the text, and determine maximum-scoring and minimum-scoring lines (for normalisation) by scoring stop words as 1, support words as 2, and other words as 3

In [24]:
cur_max = 0
cur_min = 99
max_line = ""
min_line = ""
for line in lines:
    line_score = 0
    words = line.split()
    bare_words = []
    for word in words:
        bare_word = ""
        stripped_word = word.strip(',;.ʼ·:!')
        for letter in stripped_word:
            if letter in 'ἈἉἊἋἌἍἎἏᾺἀἁἂἃἄἅἆἇὰᾶάᾼᾈᾉᾊᾋᾌᾍᾎᾏᾳᾀᾁᾂᾃᾄᾅᾆᾇᾲᾷ':
                bare_word = bare_word + 'α'
            elif letter in 'ἘἙἚἛἜἝῈἐἑἒἓἔἕὲέ':
                bare_word = bare_word + 'ε'
            elif letter in 'ἨἩἪἫἬἭἮἯῊἠἡἢἣἤἥἦἧὴῆήῌᾘᾙᾚᾛᾝᾞᾟῃᾐᾑᾒᾓᾔᾕᾖᾗῂῇῄ':
                bare_word = bare_word + 'η'
            elif letter in 'ἸἹἺἻἼἽἾἿῚἰἱἲἳἴἵἶἷὶῖί':
                bare_word = bare_word + 'ι'
            elif letter in 'ὈὉὊὋὌὍῸὀὁὂὃὄὅὸό':
                bare_word = bare_word + 'ο'
            elif letter in 'ὙὛὝὟῪὐὑὒὓὔὕὖὗὺῦῧῢΰύ':
                bare_word = bare_word + 'υ'
            elif letter in 'ὨὩὪὫὬὭὮὯῺὠὡὢὣὤὥὦὧὼῶώῼᾨᾩᾪᾫᾬᾭᾮᾯῳᾠᾡᾢᾣᾤᾥᾦᾧῲῷῴ':
                bare_word = bare_word + 'ω'
            else:
                bare_word = bare_word + letter
        bare_words.append(bare_word)

    for bare_word in bare_words:
        if bare_word in stopwords:
            line_score += 1
        elif bare_word in midwords:
            line_score += 2
        else:
            line_score += 3
    if line_score > cur_max:
        cur_max = line_score
        max_line = line

    if line_score < cur_min:
        cur_min = line_score
        min_line = line


# Most 'significant' (hexametric) line: Philoctetes 989: "Ζεύς ἐσθʼ, ἵνʼ εἰδῇς, Ζεύς, ὁ τῆσδε γῆς κρατῶν," (spoken by Odysseus) - score of 24
signif_max = 24

# Least 'significant' (hexametric) line: Oedipus at Colonus 1269: "παρασταθήτω· τῶν γὰρ ἡμαρτημένων" (spoken by Polynices) - score of 8
signif_min = 8

### Determine normalised scores for each conjecture

#### Metric #1: Vocabulary (forms)

In [25]:
with open("conjectures.txt", "r", encoding='utf8') as conjfile:
    conjectures = conjfile.readlines()

print("Metric #1: Vocab (forms) scores")
conj_vocab_scores = {}
for conjecture in conjectures:
    conj_vocab_scores[conjecture] = 0
    words = conjecture.split()
    num = len(words)
    for word in words:
        new_word = word.strip(',;.ʼ·:!')
        if new_word in word_dict:
            conj_vocab_scores[conjecture] += word_dict[new_word] / num
    conj_vocab_scores[conjecture] = round((conj_vocab_scores[conjecture] - vocab_min) / (vocab_max - vocab_min), 4)
    print(conjecture.strip(), conj_vocab_scores[conjecture])
    

Metric #1: Vocab (forms) scores
ὅς γ’ οὐ λογίζῃ, πῶς δε πιστεύοιμι σοι; 0.1582
ὥς στέρεος εῖ! τί οὐχὶ πιστεύεις ἐμοί; 0.1309
θέλεις λέγειν με δεῖν πιθέσθαι σοι, Κρέων; 0.1099
τί ποτε πιθοίμην ἂν γε τοῖςδε σοῖς λόγοις; 0.2432
μηδείς δε πιστεύσει γέ σοι, ὀρθῶς φρονῶν. 0.048


### Metric #2: Vocabulary (lemmas)

In [29]:
with open("conjlemmas.txt", "r", encoding='utf8') as conjlemmafile:
    conj_lemmas_raw = conjlemmafile.readlines()

print("Metric #2: Vocab (lemmas) scores")
conj_lemma_scores = {}
for conjecture in conj_lemmas_raw:
    conj_lemma_scores[conjecture] = 0
    conj_lemmas = conjecture.split()
    num = 0
    for lemma in conj_lemmas:
        if lemma not in stopwords_raw:
            num += 1
            if lemma in lemma_dict:
                conj_lemma_scores[conjecture] += lemma_dict[lemma]
    if num == 0:
        num = 1
    conj_lemma_scores[conjecture] = conj_lemma_scores[conjecture] / (lemma_norm * num)
    print(conjecture.strip(), conj_lemma_scores[conjecture])

Metric #2: Vocab (lemmas) scores
ὅς γε οὐ λογίζομαι πῶς δέ πιστεύω σύ 0.10166666666666667
ὡς στερεός εἰ τίς οὐ πιστεύω ἐμός ἐγώ 0.02
ἐθέλω λέγω ἐγώ δεῖ πείθω σύ Κρέων 0.431
τίς ποτέ πείθω ἄν γε ὅδε σός λόγος 0.62875
μηδείς δέ πιστεύω γε σύ ὀρθός φρονέω 0.16125


### Metric #3: Syntax/style (bigrams)

In [30]:
print("Metric #3: Syntax/style (bigrams) scores")

bigrams_conjectures = []
conj_bigram_scores = {}
for conjecture in conjectures:
    words = conjecture.split()
    stripped_words = []
    for word in words:
        stripped_word = word.strip(',;.ʼ·:!')
        stripped_words.append(stripped_word)
    stripped_words.insert(0, 'start')
    stripped_words.append('end')
    bigrams = list(nltk.bigrams(stripped_words))

    conj_bigram_scores[conjecture] = 0
    for bigram in bigrams:
        if bigram in bigram_dict:
            conj_bigram_scores[conjecture] += bigram_dict[bigram]
    conj_bigram_scores[conjecture] = round((conj_bigram_scores[conjecture] - bigram_min) / (bigram_max - bigram_min), 4)
    print(conjecture.strip(), conj_bigram_scores[conjecture])

Metric #3: Syntax/style (bigrams) scores
ὅς γ’ οὐ λογίζῃ, πῶς δε πιστεύοιμι σοι; 0.0878
ὥς στέρεος εῖ! τί οὐχὶ πιστεύεις ἐμοί; 0.0786
θέλεις λέγειν με δεῖν πιθέσθαι σοι, Κρέων; 0.0091
τί ποτε πιθοίμην ἂν γε τοῖςδε σοῖς λόγοις; 0.3583
μηδείς δε πιστεύσει γέ σοι, ὀρθῶς φρονῶν. 0.0018


### Metric #4: Amount of information (stop words, support words, and significant words)


In [31]:
print("Metric #4: Amount of information (stop words, support words, and significant words) scores")
conj_signif_scores = {}
for conjecture in conjectures:
    conj_signif_scores[conjecture] = 0
    words = conjecture.split()
    bare_words = []
    for word in words:
        bare_word = ""
        stripped_word = word.strip(',;.ʼ·:!')
        for letter in stripped_word:
            if letter in 'ἈἉἊἋἌἍἎἏᾺἀἁἂἃἄἅἆἇὰᾶάᾼᾈᾉᾊᾋᾌᾍᾎᾏᾳᾀᾁᾂᾃᾄᾅᾆᾇᾲᾷ':
                bare_word = bare_word + 'α'
            elif letter in 'ἘἙἚἛἜἝῈἐἑἒἓἔἕὲέ':
                bare_word = bare_word + 'ε'
            elif letter in 'ἨἩἪἫἬἭἮἯῊἠἡἢἣἤἥἦἧὴῆήῌᾘᾙᾚᾛᾝᾞᾟῃᾐᾑᾒᾓᾔᾕᾖᾗῂῇῄ':
                bare_word = bare_word + 'η'
            elif letter in 'ἸἹἺἻἼἽἾἿῚἰἱἲἳἴἵἶἷὶῖί':
                bare_word = bare_word + 'ι'
            elif letter in 'ὈὉὊὋὌὍῸὀὁὂὃὄὅὸό':
                bare_word = bare_word + 'ο'
            elif letter in 'ὙὛὝὟῪὐὑὒὓὔὕὖὗὺῦῧῢΰύ':
                bare_word = bare_word + 'υ'
            elif letter in 'ὨὩὪὫὬὭὮὯῺὠὡὢὣὤὥὦὧὼῶώῼᾨᾩᾪᾫᾬᾭᾮᾯῳᾠᾡᾢᾣᾤᾥᾦᾧῲῷῴ':
                bare_word = bare_word + 'ω'
            else:
                bare_word = bare_word + letter
        bare_words.append(bare_word)
    for bare_word in bare_words:
        if bare_word in stopwords:
            conj_signif_scores[conjecture] += 1
        elif bare_word in midwords:
            conj_signif_scores[conjecture] += 2
        else:
            conj_signif_scores[conjecture] += 3
    conj_signif_scores[conjecture] = round((conj_signif_scores[conjecture] - signif_min) / (signif_max - signif_min), 4)
    print(conjecture.strip(), conj_signif_scores[conjecture])

Metric #4: Amount of information (stop words, support words, and significant words) scores
ὅς γ’ οὐ λογίζῃ, πῶς δε πιστεύοιμι σοι; 0.625
ὥς στέρεος εῖ! τί οὐχὶ πιστεύεις ἐμοί; 0.375
θέλεις λέγειν με δεῖν πιθέσθαι σοι, Κρέων; 0.75
τί ποτε πιθοίμην ἂν γε τοῖςδε σοῖς λόγοις; 0.5625
μηδείς δε πιστεύσει γέ σοι, ὀρθῶς φρονῶν. 0.625
