In [14]:
# First of all, we import REGEX: 
import re

# We open a file containing an Old Souletin text and assign a variable to it:
IP = open('IP.txt').read()

"""One can search for both upper and lower case within the REGEX, 
but it is much more to command a case sensitive replacement of the pattern.
As a starting point, I create 
IP_1 = re.sub('[G|g]e','je',IP)
IP_2 = re.sub('[G|g]i','ji',IP_1)
IP_3 = re.sub('[G|g]ue','ge',IP_2)
IP_4 = re.sub('[G|g]ui','gi',IP_3)
IP_5 = re.sub('[Q|q]ue','ke',IP_4)
IP_6 = re.sub('[Q|q]ui','ki',IP_5)
IP_7 = re.sub('[Ç|ç]','z',IP_6)

Thus, we definitively need a case sensitive replacement function.
"""

"One can search for both upper and lower case within the REGEX, \nbut it is much more to command a case sensitive replacement of the pattern.\nAs a starting point, I create \nIP_1 = re.sub('[G|g]e','je',IP)\nIP_2 = re.sub('[G|g]i','ji',IP_1)\nIP_3 = re.sub('[G|g]ue','ge',IP_2)\nIP_4 = re.sub('[G|g]ui','gi',IP_3)\nIP_5 = re.sub('[Q|q]ue','ke',IP_4)\nIP_6 = re.sub('[Q|q]ui','ki',IP_5)\nIP_7 = re.sub('[Ç|ç]','z',IP_6)\n\nThus, we definitively need a case sensitive replacement function.\n"

In [15]:
# We create a function for case sensitive replacing:
def case_sensitive_substitution(string, old, new):
    """ This function replaces occurrences of old with new, within string
        replacements will match the case of the text it replaces"""
    def repl(match):
        current = match.group()
        result = ''
        all_upper = True
        for i,c in enumerate(current):
            if i >= len(new):
                break
            if c.isupper():
                result += new[i].upper()
            else:
                result += new[i].lower()
                all_upper = False
        # Now we append any remaining characters from new
        if all_upper:
            result += new[i+1:].upper()
        else:
            result += new[i+1:].lower()
        return result
    regex = re.compile(re.escape(old), re.I)
    return regex.sub(repl, string)

# We text the new function:
print(case_sensitive_substitution("This is a STRING whith UPPER and LOWER caseS",'s','б'))

Thiб iб a БTRING whith UPPER and LOWER caбeБ


In [16]:
# Once created the function for case sensitive replacement, we can use it for command a number of substitutions 
# that perform changes from the spelling system of Old Souletin Basque into the spelling of present-day standard Basque.
# It is crucial to bear in mind the order of the rules that we want to introduce in the code.

# The first pair of rules must be executed before the second pair; otherwise one pair invalidates the other pair:
IP_rule1 = (case_sensitive_substitution(IP,'ge','je'))
IP_rule2 = (case_sensitive_substitution(IP_rule1,'gi','ji'))
IP_rule3 = (case_sensitive_substitution(IP_rule2,'gue','ge'))
IP_rule4 = (case_sensitive_substitution(IP_rule3,'gui','gi'))

# The following 3 rules do not pause such problem:
IP_rule5 = (case_sensitive_substitution(IP_rule4,'que','ke'))
IP_rule6 = (case_sensitive_substitution(IP_rule5,'qui','ki'))
IP_rule7 = (case_sensitive_substitution(IP_rule6,'ç','z'))

# Rules 8 & 9 must be executed in this very order, otherwise 9 cancels 8.
# Moreover, rules 8 & 9 must be previous to 10:
IP_rule8 = (case_sensitive_substitution(IP_rule7,'x','ts'))
IP_rule9 = (case_sensitive_substitution(IP_rule8,'tch','tx'))
IP_rule10 = (case_sensitive_substitution(IP_rule9,'ch','x'))

# Similarly, rules 11 & 12 must be permormed before rule 13:
IP_rule11 = (case_sensitive_substitution(IP_rule10,'ce','ze'))
IP_rule12 = (case_sensitive_substitution(IP_rule11,'ci','zi'))
IP_rule13 = (case_sensitive_substitution(IP_rule12,'c','k'))

# We add more rules updating spelling:
IP_rule14 = (case_sensitive_substitution(IP_rule13,'gn','ñ'))
IP_rule15 = (case_sensitive_substitution(IP_rule14,'y','i'))
IP_rule16 = (case_sensitive_substitution(IP_rule15,'v','b'))
IP_rule17 = (case_sensitive_substitution(IP_rule16,'mb','nb'))
IP_rule18 = (case_sensitive_substitution(IP_rule17,'mp','np'))
IP_rule19 = (case_sensitive_substitution(IP_rule18,'ss','s'))
IP_rule20 = (case_sensitive_substitution(IP_rule19,'é','e'))

# Still we need to establish two rules which are crucial to reflect the phonological appearence of the Souletin dialect:
# u > ü, BUT ou > u. 
# After having explored many different and more complex ways, I propose this one, which is quite simple:

# First we establish the change from U to ü:
IP_rule21 = (case_sensitive_substitution(IP_rule20,'u','ü'))
# Then we change the previously non-existent OÜ group to U:
IP_rule22 = (case_sensitive_substitution(IP_rule21,'oü','u'))

# Now we can test how is the general spelling update for Old Souletin texts:
print(IP_rule22)

﻿IGANTEZTAKO PRONUA, ETA HILEN PRONUA.
PAÜBEN, G. DÜGÜE ETA J. DESBARATZ, Beithan Muldezko Leteretan ezarria.

< 1 > IGANTEZTAKO PRONUA.

Aitaren eta Semiaren eta Espiritü Saintiaren izenian. Hala biz.

Popülü fidela, zunbat ere gure menteko egün oroz Jinko gure kreazaliaren zerbütxatzera obligatü beikira, halere igantia Jaünaren egüna deithü izan da zeren eta egün saintü huntan lan thenporalak oro eitzi behar beitütügü, amurekatik gure Jaün ezinago handi denaren laidatzen, 
eta uhuratzen enplega dezagün, eta haren leialki zerbütxatzeko moianen ikhasten.

Hartakoz lekhü saintü huntara bildü izan gira, nun present gaüdialarik mezako sakrifizio saintian, zuñtan Jesü Kristek bere büria guregatik aphezen eskietzaz bera aitari oferitzen beitü ogiaren eta arduaren üdüriaren pian, lehenik Jinkua adoratzen dügü, < 2 > eta hari zor zaion ororen gañetiko uhuria emaiten: bigerrenki haren huntarzün dibinuari eskerrak deitzogü, egün orok largoki hareganik ükheiten dütügün hunkietzaz: herenki arraki

In [17]:
# For evaluation purposes, we can also count the number replacements.
# To that end we can use the function len() and REGEX, in a syntax as follows:
# count_changes = len(re.findall(pattern,string))

# To obtain the number of replacements operated by the each rule, 
# we can subtract the number of occurrences of the OLD PATTERN in the text issue from the application of this rule (IP_ruleX)
# to the number of occurrences of the old pattern in previous version of the text (IP_rule(X-1):
# count_ruleX = len(re.findall(pattern,IP)) - len(re.findall(pattern,IP_ruleX))

# We test the number of matches with function print()
count_rule1 = len(re.findall('ge',IP)) - len(re.findall('ge',IP_rule1))
print('Rule 1 operated ',count_rule1,' changes.')

Rule 1 operated  9  changes.


In [13]:
# Now we do the same for each replacing rule:

count_rule2 = len(re.findall('gi',IP_rule1)) - len(re.findall('gi',IP_rule2))
count_rule3 = len(re.findall('gue',IP_rule2)) - len(re.findall('gue',IP_rule3))
count_rule4 = len(re.findall('gui',IP_rule3)) - len(re.findall('gui',IP_rule4))
count_rule5 = len(re.findall('que',IP_rule4)) - len(re.findall('que',IP_rule5))
count_rule6 = len(re.findall('qui',IP_rule5)) - len(re.findall('qui',IP_rule6))
count_rule7 = len(re.findall('ç',IP_rule6)) - len(re.findall('ç',IP_rule7))
count_rule8 = len(re.findall('x',IP_rule7)) - len(re.findall('x',IP_rule8))
count_rule9 = len(re.findall('tch',IP_rule8)) - len(re.findall('tch',IP_rule9))
count_rule10 = len(re.findall('ch',IP_rule9)) - len(re.findall('ch',IP_rule10))
count_rule11 = len(re.findall('ce',IP_rule10)) - len(re.findall('ce',IP_rule11))
count_rule12 = len(re.findall('ci',IP_rule11)) - len(re.findall('ci',IP_rule12))
count_rule13 = len(re.findall('c',IP_rule12)) - len(re.findall('c',IP_rule13))
count_rule14 = len(re.findall('gn',IP_rule13)) - len(re.findall('gn',IP_rule14))
count_rule15 = len(re.findall('y',IP_rule14)) - len(re.findall('y',IP_rule15))
count_rule16 = len(re.findall('v',IP_rule15)) - len(re.findall('v',IP_rule16))
count_rule17 = len(re.findall('mb',IP_rule16)) - len(re.findall('mb',IP_rule17))
count_rule18 = len(re.findall('mp',IP_rule17)) - len(re.findall('mp',IP_rule18))
count_rule19 = len(re.findall('ss',IP_rule18)) - len(re.findall('ss',IP_rule19))
count_rule20 = len(re.findall('é',IP_rule19)) - len(re.findall('é',IP_rule20))
count_rule21 = len(re.findall('u',IP_rule20)) - len(re.findall('u',IP_rule21))
count_rule22 = len(re.findall('oü',IP_rule21)) - len(re.findall('oü',IP_rule22))

print('Rule 1 operated ',count_rule1,' changes.')
print('Rule 2 operated ',count_rule2,' changes.')
print('Rule 3 operated ',count_rule3,' changes.')
print('Rule 4 operated ',count_rule4,' changes.')
print('Rule 5 operated ',count_rule5,' changes.')
print('Rule 6 operated ',count_rule6,' changes.')
print('Rule 7 operated ',count_rule7,' changes.')
print('Rule 8 operated ',count_rule8,' changes.')
print('Rule 9 operated ',count_rule9,' changes.')
print('Rule 10 operated ',count_rule10,' changes.')
print('Rule 11 operated ',count_rule11,' changes.')
print('Rule 12 operated ',count_rule12,' changes.')
print('Rule 13 operated ',count_rule13,' changes.')
print('Rule 14 operated ',count_rule14,' changes.')
print('Rule 15 operated ',count_rule15,' changes.')
print('Rule 16 operated ',count_rule16,' changes.')
print('Rule 17 operated ',count_rule17,' changes.')
print('Rule 18 operated ',count_rule18,' changes.')
print('Rule 19 operated ',count_rule19,' changes.')
print('Rule 20 operated ',count_rule20,' changes.')
print('Rule 21 operated ',count_rule21,' changes.')
print('Rule 22 operated ',count_rule22,' changes.')

Rule 1 operated  9  changes.
Rule 2 operated  13  changes.
Rule 3 operated  17  changes.
Rule 4 operated  72  changes.
Rule 5 operated  0  changes.
Rule 6 operated  1  changes.
Rule 7 operated  395  changes.
Rule 8 operated  22  changes.
Rule 9 operated  19  changes.
Rule 10 operated  16  changes.
Rule 11 operated  117  changes.
Rule 12 operated  125  changes.
Rule 13 operated  480  changes.
Rule 14 operated  32  changes.
Rule 15 operated  30  changes.
Rule 16 operated  3  changes.
Rule 17 operated  6  changes.
Rule 18 operated  11  changes.
Rule 19 operated  12  changes.
Rule 20 operated  4  changes.
Rule 21 operated  782  changes.
Rule 22 operated  267  changes.


In [None]:
# REMARKS:

# Although the result is highly satisfactory, it is needed a philological revision of the output.
# For example, the [u > ü] rule may change family names that are not Basque (Dugué > Dügüé).
# More importantly, some diphtongs that are no subject to this rule have been changed too (PAUBEN > PAÜBEN).
# Another problem is that of ethymological spelling: the syllable <chan> in the word "Archangel" is pronounced /kan/, 
# so its spelling must NOT be changed into <Arxanjel> (as rule 10 establishes), but into <Arkanjel>.
# As a final remark, this conversor of spelling is not "universal" for Old Souletin Basque, since variation in spelling is important in old texts. 
# However, the main guidelines of this graphical updating are supposed to work with most of Old Souletin texts:
# we always can adapt this code by introducing new rules or change existent ones.