In [1]:
# import required libraries
import sys
import re
import pandas as pd
import spacy
from pymongo import MongoClient

In [2]:
# Parameters and variable initialisation for the database operations.
uri = 'mongodb://localhost:27017/'   # MongoDB connection string
database = 'zs_database'             # database that holds both collections
collection_fetch = 'stories'         # collection the raw stories are read from
collection_push = 'autotags'         # collection the pre-processed results are written to

# Placeholders that the data-loading cell overwrites.
df = pd.DataFrame()
# None (rather than the builtin `object` type) makes "not connected yet" explicit
# and lets later code test `db is None`.
db = None

In [3]:
# Get the unprocessed stories into a pandas DataFrame.
try:
    print("Connecting to database")
    # init mongodb client
    client = MongoClient(uri)
    db = client[database]

    # retrieving only story id and plain text for now (Mongo's own _id is excluded)
    cursor = db[collection_fetch].find({}, {"_id": 0, "id": 1, "plain_text": 1})
    df = pd.DataFrame(list(cursor))

except Exception as err:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt/SystemExit
    # are not trapped; the message is printed together with the exception type.
    print('Unexpected error:', type(err).__name__, '-', err)
    print('Exiting system ...')
    # Re-raise so the failure and its traceback stay visible, instead of ending
    # the session through exit().
    raise

df

Connecting to database


Unnamed: 0,id,plain_text
0,19,Radio Über-All sendet 6 mal Zukunftsmusik! Am ...
1,21,Unser Dorf pflanzt SauerstoffMathilda schaute ...
2,24,"Es war einmal vor gar nicht so langer Zeit, da..."
3,25,"Das Beste an Partys ist das Buffet, findet Ber..."
4,27,"Endlose Laubwälder, saftige Weiden, Wildblumen..."
...,...,...
195,489,"Familie Maus verläßt ihr gutes, altes HausErst..."
196,490,Das Wolkenland Glück war einst ein Merkmal des...
197,491,"Eines Tages wacht Miro auf, und findet eine ko..."
198,492,Goldene Sonnenstrahlen fielen durch die klare ...


In [4]:
# Clean given text.
# The patterns are compiled once because clean_text is called for every story.

# html tags such as <p> or </div>
_HTML_TAG_RE = re.compile(r'<[^<]+?>')
# real character references only (&name; / &#123; / &#x1F;), so a plain "&" followed
# later by a ";" no longer swallows the text in between
_HTML_ENTITY_RE = re.compile(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')
# runs of newline, carriage return, tab, bell, form feed, backspace, vertical tab
_CONTROL_WHITESPACE_RE = re.compile(r'[\n\r\t\a\f\b\v]+')
# punctuation, brackets, quotation marks, backslash and digits
# (union of the characters the three former passes removed)
_UNWANTED_CHARS_RE = re.compile(r'[\'"\-.?!,:;\\()\[\]0-9“„”‘»«…–]')


def clean_text(content):
    """
    Remove unwanted characters from a story text.

    Steps, in order:
      1. strip html tags,
      2. drop html character references (they are removed, not decoded),
      3. replace each run of control/whitespace escape characters with one space,
         so that words separated only by a line break are not glued together,
      4. remove punctuation, quotes, brackets, backslashes and digits.

    Parameters:
        content: the text to clean. Anything that is not a ``str`` (for example
                 ``None`` or a NaN placeholder) is treated as "no text".

    Returns:
        The cleaned string; an empty string when ``content`` is not a ``str``.
    """
    if not isinstance(content, str):
        return ''

    # removing html tags
    content = _HTML_TAG_RE.sub('', content)

    # removing entity names
    content = _HTML_ENTITY_RE.sub('', content)

    # escape characters become a single space to keep word boundaries intact
    content = _CONTROL_WHITESPACE_RE.sub(' ', content)

    # remove unwanted characters (eg ",',.,?,!). We might want to keep these later though.
    content = _UNWANTED_CHARS_RE.sub('', content)

    # Encoding: Python 3 strings are Unicode, so nothing else is done here.
    # Extra code would be needed to process data delivered in other formats.
    return content


In [5]:
# Generate tokens for given text
def generate_tokens(content):
    """
    Run the text through the module-level spaCy pipeline ``nlp`` and collect
    several views of it.

    Stop words and whitespace-only tokens are ignored for the token, lemma and
    part-of-speech lists. Entities and noun chunks are taken from the whole
    parsed document.

    Parameters:
        content: text to process.

    Returns:
        A 6-tuple ``(token_list, lemma_list, pos_list, entity_list, noun_list,
        lemma_list_without_verbs)`` where
          - token_list: stripped token texts,
          - lemma_list: stripped lemmas,
          - pos_list: stripped part-of-speech tags,
          - entity_list: ``[text, label]`` pairs of the document's entities,
          - noun_list: texts of the document's noun chunks,
          - lemma_list_without_verbs: lemmas whose tag is neither VERB nor ADV.
    """
    # load plain text into spacy processor
    parsed = nlp(content)

    words, lemmas, tags = [], [], []
    lemmas_no_verbs = []

    for tok in parsed:
        # skip stop words for German like 'eine', 'könnte'... and blank tokens
        if tok.is_stop or tok.text == " " or tok.text.strip() == "":
            continue

        # additional trimming needed for some cases
        word = tok.text.strip()
        lemma = tok.lemma_.strip()
        pos = tok.pos_.strip()

        # each value is only kept when it is still non-empty after trimming
        if word:
            words.append(word)
        if lemma:
            lemmas.append(lemma)
            if pos not in ("VERB", "ADV"):
                lemmas_no_verbs.append(lemma)
        if pos:
            tags.append(pos)

    # entities and noun chunks come from the full document
    entities = [[ent.text, ent.label_] for ent in parsed.ents]
    noun_chunks = [chunk.text for chunk in parsed.noun_chunks]

    return (words, lemmas, tags, entities, noun_chunks, lemmas_no_verbs)

In [6]:
# Preprocess every story text and insert the result into the push collection.

# load spacy core for German language
nlp = spacy.load('de_core_news_md')

# check if push collection (autotags) already exists and holds documents;
# if so, remove (drop) the collection for now
# TODO: handle exception
if collection_push in db.list_collection_names():
    collection = db[collection_push]
    if collection.estimated_document_count() != 0:
        print('Dropping the old collection (' + collection_push + ') ...')
        collection.drop()

collection = db[collection_push]

# story ids that were skipped (no usable text) or could not be inserted
failed_ids = []

print("Pre-processing all text. This might take some time...")
print("Story id(s) processed: ", end=" ")
for index, item in df.iterrows():
    # fetching id and content for each item in data-frame
    story_id = item["id"]
    content = item["plain_text"]

    # a story whose plain_text is not a string cannot be cleaned or tokenised;
    # skip it instead of aborting the whole run
    if not isinstance(content, str):
        failed_ids.append(story_id)
        continue

    # clean text for each document
    content = clean_text(content)

    # Word tokenization and other preprocessing for each document:
    # tokens without stop words, plus lemma and part-of-speech lists in case
    # they are needed later.
    token_list, lemma_list, pos_list, entity_list, noun_list, lemma_list_without_verbs = generate_tokens(content)

    # TODO
    # if we are going to use un-cased data sets, we need to change the tokens to lower case
    # use spaCy sentencizer component if sentence tokenizing is needed

    # insert into db; a single document that fails to insert is recorded and
    # the loop continues with the remaining stories
    try:
        collection.insert_one(
            {
                "story_id": story_id,
                "tokens": token_list,
                "lemmas": lemma_list,
                "pos": pos_list,
                "nouns": noun_list,
                "entities": entity_list,
                "lemma_list_without_verbs": lemma_list_without_verbs
            }
        )
    except Exception as err:
        failed_ids.append(story_id)
        print('\nCould not insert story ' + str(story_id) + ': ' + str(err))
        continue

    print(str(story_id), end=" ")

print('Done !!!')
print('Pre-processed data entered into (' + collection_push + ') collection')
if failed_ids:
    print('Story id(s) skipped: ' + ' '.join(str(failed) for failed in failed_ids))

Pre-processing all text. This might take some time...
Story id(s) processed:  19 21 24 25 27 28 64 68 69 72 74 94 100 105 107 110 114 118 119 121 124 125 131 132 138 139 140 142 177 180 185 187 188 189 194 198 199 201 202 203 207 214 215 216 217 219 224 225 226 228 233 237 248 251 252 255 256 258 260 262 266 269 272 273 274 281 286 288 289 291 292 293 294 296 297 299 300 302 305 306 307 308 310 315 316 319 320 321 323 325 326 329 330 331 332 333 334 336 340 341 344 347 348 350 353 358 359 360 361 362 364 365 366 367 369 370 371 374 375 378 379 380 381 382 383 384 387 390 391 392 393 394 396 398 399 400 401 403 406 407 408 409 410 411 412 414 416 417 418 419 421 423 424 426 427 428 429 432 433 436 438 439 440 442 443 444 445 446 448 450 451 452 453 454 456 457 458 459 460 462 463 464 469 471 472 473 474 475 478 479 480 485 486 487 488 489 490 491 492 494 Done !!!
Pre-processed data entered into (autotags) collection
