In [40]:
import re
import random

In [17]:
dictLines = open('oed.dict').readlines()

In [227]:
class DictWord: 
    def __init__(self, pair):
        if len(pair) != 2: 
            raise Error
        
        self.etymString = None    
        self.parentLang = None
        self.parent = None
        self.etymString = None
        
        self.rawString = pair
        self.rawWord = pair[0]
        self.rawDef = pair[1]
        
        self.word = self.parseWord(self.rawWord)
        
        self.etymString = self.parseDefinition(self.rawDef)
        self.parseEtym(self.etymString)

    def __repr__(self): 
        varDict = vars(self)
        return "\n".join(["%s: %s" % (key, val) 
                          for key, val in varDict.items()
                          if key is not 'rawString'])
    
    def parseWord(self, rawWord): 
        # Get the first word.
        match = re.search('([A-Za-z].*?)[,.;\s]', rawWord)
        if match is not None: 
            return match.group(1)
    
        
    def parseDefinition(self, rawDef): 
        """ Get bracketed part of definition."""
        if rawDef is not None and '[' in rawDef: 
            match = re.search('\[(.*?)]', rawDef)
            if match is not None: 
                return match.group(1)
        return None
    
    langDict = {"med.L.": 'lat', 
                "L.": 'lat', 
                "La.": 'lat', 
                "AngloFr.": 'xno', 
                "F.": 'fra', 
                "Fr.": 'fra', 
                "OF.": 'fro', 
                "mod.Fr.": 'fra',   
                "G.": 'deu', 
                "Ger.": 'deu', 
                "mod.F.": 'fra', 
                'OHG.': 'goh', 
                'MHG.': 'gmh', 
                'OE.': 'ang', 
                'ON.': 'non', 
                'Sp.': 'spa', 
                'It.': 'ita', 
                'OS.': 'ang', 
                'Gr.': 'grc', 
                "Gre.": 'grc', 
                "Heb.": 'heb', 
                "Jap.": 'jpn',
                'OCelt.': 'cel', 
                "Oir.": 'sga', 
                "mod.Ir.": 'gle', 
                "ME.": 'enm', 
                "Arab.": 'ara'}

    def parseEtym(self, etym): 
        if etym == None: 
            return None
        languages = list(self.langDict.keys())
        matches = [re.search('\s'+re.escape(lang), etym) for lang in languages]
        # Sort by where they appear in the list, so that we can pick 
        # the leftmost language. 
        sortedMatches = sorted(matches, key=lambda x: x.span()[0] if x is not None else 99999)
        if len(sortedMatches) > 0 and sortedMatches[0] is not None: 
            # Search after the name of the language
            langEnd = sortedMatches[0].span()[1] 
            etym = etym[langEnd:]
            # Get the first word after the language name. 
            match = re.search('([A-Za-z].*?)[,.;\s]', etym)
            if match is not None: 
                parent = match.group(1)
            else: 
                parent = None
            self.parentLang = sortedMatches[0].group(0).strip()
            self.parent = parent
        else: 
            self.parentLang = None
            self.parent = None

In [228]:
def parseDict(dictLines): 
    matchLines = []
    for i, line in enumerate(dictLines): 
        match = re.search('^\s\(', line)
        if match is not None: 
            matchLines.append(i)
    definitions = []
    for i in matchLines: 
        definitions.append(dictLines[i-1:i+1])
    definitions = [[d.strip() for d in defns] for defns in definitions]
    return definitions

In [229]:
definitions = parseDict(dictLines)

In [245]:
wordsWithEtyms = []
for word in definitions: 
    w = DictWord(word)
    if w.word is not None and w.parentLang is not None and w.parent is not None: 
        wordsWithEtyms.append(w)
            

In [246]:
out = []
for w in wordsWithEtyms:
    if w.parentLang in DictWord.langDict: 
        translated = DictWord.langDict[w.parentLang]
        out.append("eng:%s\t%s:%s" % (w.word, translated, w.parent))
    else: 
        print('---')
        print("Can't find lang for: ", w.word)
        print('Etymstring: ', w.etymString)
        print('Parentlang: ', w.parentLang)

In [247]:
len(out)

38074

In [248]:
etymWN = "\n".join(out)

In [249]:
with open('oed-etymwn.tsv', 'w') as f: 
    f.write(etymWN)