```
A CONCISE DICTIONARY of MIDDLE ENGLISH 
FROM A.D. 1150 TO 1580 
By the REV. A. L. MAYHEW, M.A. 
of Wadham College, Oxford 
and the REV. WALTER W. SKEAT LITT.D.; LL.D. EDIN.; M.A. OXON. 
Elrington and Bosworth Professor of Anglo-Saxon in the University of Cambridge 
“These our Ancient Words here set down, I trust will for this time satisfie the Reader.” R. VERSTEGAN, _Restitution of Decayed Intelligence_, ch. vii (at the end) 
“Authentic words be given, or none!” WORDSWORTH, _Lines on Macpherson’s Ossian_ +Oxford+ 
AT THE CLARENDON PRESS M DCCC LXXXVIII
```

In [1]:
import os
import re
from collections import Counter

In [2]:
os.system('wget http://www.gutenberg.org/files/10625/10625-0.txt')

0

In [3]:
with open('10625-0.txt') as f: 
    med = f.read()

In [4]:
paras = med.split('\n\n')

In [5]:
entries = [para for para in paras if para.startswith('+')]

In [6]:
entries[1180]

'+Bi-sprengen+, _v._ to besprinkle, MD; +bi-spreynde+, _pt. s._ W;\n+besprent+, _pp._ bedewed, S3; +bysprent+, S3; +besprint+, S3;\n+bispreynt+, W2.--AS. _be-sprengan_.'

In [18]:
class Word: 
    def __init__(self, raw, variantOf=None): 
        self.raw = raw
        self.variantOf = variantOf
        self.string = self.raw.replace('\n', ' ')
        self.etym = self.getEtym(self.string)
        self.etymLang = self.getEtymLang(self.etym)
        self.etymLangStandard = self.standardizeEtymLang(self.etymLang)
        self.parent = self.getParent(self.etym)
        self.referenceTo = self.getReference(self.string)
        self.name = self.getName(self.string)
        self.pos = self.getPOS(self.string)
        self.variants = self.getVariants(self.string)

    def __str__(self): 
        self.__repr__()
    
    def __repr__(self): 
        out = self.name + '\n'
        out += ', \n'.join("  %s: %s" % item for item in vars(self).items())
        return out
    
    def __eq__(self, other): 
        return self.name == other
        
    def getName(self, word): 
        if word is not None: 
            match = re.search('\+(.*?)\+', word) 
            if match is not None: 
                return match.group(1) 
        if self.variantOf is not None: 
            return self.variantOf
        return None
    
    def getReference(self, word): 
        # First see if it's an error. 
        errorMatch = re.search('\[\[(error\sfor\s|headword\sspelled\s)\+(.*?)\+', word)
        if errorMatch is not None: 
            self.string = self.string.replace(errorMatch.group(0), '')
            return errorMatch.group(2)
        # Don't try to follow a reference if it already has an etymology, 
        # since this is now extranneous information. 
        if self.etym is not None: 
            return None
        match = re.search('see \+(.*?)\+', word)
        if match is not None: 
            # Remove the reference from our working string. 
            self.string = self.string.replace(match.group(0), '')
            return match.group(1)
        else:
            return None
        
    def getPOS(self, word): 
        match = re.search('_(.*?)_', word)
        if match is not None: 
            self.string = self.string.replace(match.group(0), '')
            return match.group(1).replace('.', '')
        else: 
            return None
    
    def getEtym(self, word): 
        if '--' in word: 
            dashSplit = word.split('--')
            etymChunk = dashSplit[1]
            # Remove the part after the dash. 
            self.string = self.string.replace(etymChunk, "").replace('--', '')
            return etymChunk
        else: 
            return None

    def getEtymLang(self, word): 
        if word is not None and '.' in word: 
            matches = list(re.finditer('(.*?)\.', word))
            if len(matches) > 1 and matches[0].group(1) == "Cp": 
                return matches[1].group(1).strip()
            else: 
                return matches[0].group(1).strip()
        else: 
            return None
    
    def getParent(self, word): 
        if word is not None and '.' in word: 
            match = re.search('_(.*?)_', word)
            if match is not None: 
                return match.group(1)
        else: 
            return None
        
    def standardizeEtymLang(self, lang): 
        if lang is None: 
            return None
        langDict = { 'AS': 'ang', 'OS': 'ang', 'ONorth': 'ang', 'OMerc': 'ang',
                'Probably from AS': 'ang', "From AS": 'ang',
                "Church Lat": 'lat', 'Lat': 'lat', "Late Lat": 'lat', "Low Lat": 'lat',
                'OF': 'fro', 'AF': 'xno', 'From OF': 'fro', 
                'Of French origin, from Lat': 'fra',
                "Icel": 'isl', "Swed": 'swe', "Du": 'nld', 
                "G": 'deu', "OHG": 'goh',
                "Goth": 'got'}
        if lang in langDict: 
            return langDict[lang]
        else: 
            return None
    
    def getVariants(self, word): 
        matches = re.findall('\+(.*?)\+', word)
#         print(matches)
        return [match for match in matches]

In [19]:
words = [Word(entry) for entry in entries]

In [20]:
len(words), len(entries)

(11349, 11349)

In [21]:
wordDict = {}
allVariants = []
for word in words: 
    wordDict[word.name] = word

In [22]:
# Follow "see OtherWord" references
for word in words: 
    if word.referenceTo is not None: 
        if word.referenceTo in wordDict: 
            wordDict[word.name] = wordDict[word.referenceTo]


In [23]:
for word in words: 
    if word.variants is not None: 
        for variant in word.variants: 
            wordDict[variant] = word

In [24]:
len(wordDict)

28335

In [30]:
words[5020]

Hony-socle
  raw: +Hony-socle+, _sb._ honeysuckle, Prompt.; _locusta_, Prompt., 
  variantOf: None, 
  string: +Hony-socle+,  honeysuckle, Prompt.; _locusta_, Prompt., 
  etym: None, 
  etymLang: None, 
  etymLangStandard: None, 
  parent: None, 
  referenceTo: None, 
  name: Hony-socle, 
  pos: sb, 
  variants: ['Hony-socle']

In [49]:
etyms = {name: obj for name, obj in wordDict.items() if obj.parent is not None}

In [41]:
wordsWithEtyms = [w for w in words if w.etym is not None]
wordsWithParents = [w for w in words if w.parent is not None]

In [16]:
Counter(etyms.values()).most_common(50)

[('ang', 11644),
 ('fro', 3270),
 ('xno', 2133),
 ('isl', 1053),
 ('lat', 448),
 ('nld', 76),
 ('swe', 50),
 ('deu', 31),
 ('goh', 27),
 ('got', 20),
 ('fra', 5)]

In [61]:
etymWN = []
for name, etym in etyms.items(): 
    etymWN.append("enm:%s\t%s:%s" % (name, etym.etymLangStandard, etym.parent))

In [62]:
len(etymWN)

19135

In [63]:
etymWNOut = '\n'.join(etymWN)

In [64]:
with open('omed-etymwn.tsv', 'w') as f: 
    f.write(etymWNOut)