Before we could start working with our data, we had to take one extra pre-processing step.

In the news articles from the 90s, the German umlauts (ö, ä, ü) and the eszett (ß) were often replaced with 'oe', 'ae', 'ue' and 'ss', respectively. As a result, the word 'Nürnberg' is stored in the corpus as either u'Nuernberg' or u'N\xfcrnberg', depending on the publication date. We use the spellchecker PyHunSpell to ensure that all (or almost all) words have a unique representation.

In [1]:
# import PyHunSpell
import hunspell

In [2]:
# read the corpus (semicolon-separated, UTF-8 encoded CSV) into a DataFrame
import pandas as pd
data_umlaut = pd.read_csv(
    'hb_umlauts_fix.csv',
    sep=';',
    encoding='utf-8',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Correct text 57565: some words and numbers are merged into a single token
import re

import nltk
from sacremoses import MosesTokenizer, MosesDetokenizer # detokenizing package

md = MosesDetokenizer()

# Row label of the text that needs the manual repair.
TEXT_ID = 57565

# A run of letters immediately followed by a run of digits/commas at the START
# of a token, e.g. "OEsterreich1624,70" -> ("OEsterreich", "1624,70").
# NOTE(review): [a-z] with re.I is aimed at ASCII letters; a token that begins
# with a literal umlaut will not be split -- confirm this is acceptable for
# this text.
WORD_NUMBER_RE = re.compile(r"([a-z]+)([0-9,]+)", re.I)

tokens = nltk.word_tokenize(data_umlaut['texts'][TEXT_ID])
tokens_new = []
# Split tokens containing text and numbers into two parts
for t in tokens:
    match = WORD_NUMBER_RE.match(t)
    if match:
        word, number = match.groups()
        # re.match only anchors at the beginning of the token, so the token may
        # continue after the number (e.g. "abc12def"). Re-attach that tail
        # instead of silently dropping it.
        tokens_new.append(word + ' ' + number + t[match.end():])
    else:
        tokens_new.append(t)
# Write the repaired text back into the same row of the DataFrame.
data_umlaut.loc[TEXT_ID,'texts'] = md.detokenize(tokens_new)

  "You should really be using Python3!!! "


In [4]:
# make sure that you have downloaded nltk_data
# Fetches the 'punkt' package into the local nltk_data directory (no-op if it is
# already up to date). NOTE(review): nltk.word_tokenize, used earlier in this
# notebook, presumably needs this package -- on a fresh machine run this cell first.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mokuneva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# use a spellchecker (umlautf.py function) to make umlaut replacements where necessary

from datetime import datetime
# The timer starts before the remaining imports, so the reported duration
# includes the time needed to load umlautf / worker_umlaut.
startTime = datetime.now()
import umlautf

import multiprocessing as mp
NUM_CORE = mp.cpu_count() # set to the number of cores you want to use

import worker_umlaut

if __name__ == "__main__":
    list_of_texts = data_umlaut.texts
    # One umlautf.spell object per text (presumably a wrapper that the worker
    # function knows how to process -- see umlautf.py / worker_umlaut.py).
    list_of_objects = [umlautf.spell(i) for i in list_of_texts]

    pool = mp.Pool(NUM_CORE)
    try:
        # pool.map blocks until every item is processed and returns the results
        # in input order, so corrected_umlauts lines up with data_umlaut.texts.
        corrected_umlauts = pool.map(worker_umlaut.worker_umlaut, list_of_objects)
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()

# Parenthesised form prints the same value under Python 2 and also parses
# under Python 3.
print(datetime.now() - startTime)

1:00:41.123000


In [6]:
# overwrite the 'texts' column with the spell-checked texts returned by pool.map
data_umlaut['texts'] = corrected_umlauts 

In [7]:
# save the corrected corpus as a semicolon-separated CSV;
# the 'utf-8-sig' codec writes a UTF-8 byte-order mark at the start of the file
data_umlaut.to_csv(
    'hb_umlauts_fixed.csv',
    sep=';',
    encoding='utf-8-sig',
)