In [1]:
import os
import multiprocessing as mp # use multiprocessing module for parallel computing
import pandas as pd # load pandas: python data analysis library
import sz_load # import a function that loads data from one folder (see sz_load.py file for details)
import codecs
import numpy as np
import collections
# import the module replacing incorrectly encoded German umlaut characters with their correct versions
import umlauts_correct_sz 
import correct_url_sz
import clean_sz_articles
import itertools
import clean_tables_sz
import count_names

from itertools import repeat
from datetime import datetime

# We import some functions from the Handelsblatt folder
import sys
sys.path.insert(1, os.getcwd().replace('SZ', 'Handelsblatt'))
import count_words_mp # import the function calculating the number of words in a text
import split_number_word
import chained_articles # import a function that outputs two parts of a chained article and duplicated articles
import find_umlaut

# We import some functions from the 'dpa' folder
sys.path.insert(1, os.getcwd().replace('SZ', 'dpa'))
import identify_ger 
import fuzzy_duplicates_test_all_dpa
import fuzzy_duplicates_dpa # import a function that outputs the indices of fuzzy duplicates 
import numeric_articles

In [2]:
# Keep RESERVED_CORES cores unused, but never drop below one worker: on a
# machine with 8 or fewer cores the plain subtraction is <= 0, and mp.Pool()
# raises ValueError for a non-positive number of processes.
RESERVED_CORES = 8
NUM_CORE = max(1, mp.cpu_count() - RESERVED_CORES) # set the number of cores to use

print("The number of cores that will be used: {}".format(NUM_CORE))

The number of cores that will be used: 56


# Süddeutsche Zeitung (SZ) data

*Süddeutsche Zeitung* is a renowned German daily newspaper with a strong focus on both politics and economy in its national edition. In the first quarter of 2021, the newspaper had a circulation of 304,769 daily copies, as reported by the IVW (Informationsgemeinschaft zur Feststellung der Verbreitung von Werbeträgern), a German organization responsible for assessing media circulation. Due to its comprehensive coverage of economic topics, the Süddeutsche Zeitung is particularly valuable for economic forecasting.

We acquired the SZ dataset from Genios, a reputable German provider of business information. The dataset comprises **2,034,968** articles published between January 1994 and November 2018. The data acquisition took place in February 2019.

The dataset is organized into three primary folders: 'Sueddeutsche', 'Sueddeutsche_historisch', and 'Sueddeutsche_regional'. For our research project, which aims to forecast economic variables using newspaper data, we decided to focus on the first two folders, as regional data is considered less relevant to our objectives. The 'Sueddeutsche_historisch' folder holds 11 subfolders for the years 1994 to 2004, such as SZH_1994, while the 'Sueddeutsche' folder contains 14 subfolders for each year between 2005 and 2018, like SZ_2005. In total, there are 25 subfolders, one for each year. Each subfolder contains several XML files, from which we extract relevant information for our project. Regrettably, due to copyright restrictions, we are unable to publish the data.

## Load the data

First, we need to read in the data. We create the list including the names of the 25 subfolders (`folder_list`) and apply the function `sz_load` to them in parallel by exploiting Python's `multiprocessing` library.

We extract the following XML elements:

* datum - publication date
* ressort - section/subsection of the newspaper
* quelle/name - description of the source, useful for filtering out regional articles
* titel-liste/titel - article's title
* titel-liste/dachzeile - article's kicker
* titel-liste/untertitel - article's subheading
* inhalt/vorspann - annotation
* inhalt/text - text of the article

In [3]:
# 'SZ' is the main data folder. It holds three top-level folders, of which only
# two (the national edition and its historical archive) are used.
#path = 'G:\\Userhome Mariia\\SZ' # your path here
path = os.getcwd().replace('\\newspaper_data_processing\\SZ', '') + '\\SZ'

# os.listdir() returns entries in arbitrary order, so sort before slicing;
# otherwise "[:2]" is not guaranteed to skip the regional folder.
# NOTE(review): assumes the regional folder sorts last alphabetically, as it
# does with the folder names given in the notebook text -- confirm on disk.
top_level_folders = sorted(os.listdir(path))[:2]

# Create the list of all subfolders within the SZ main folder
# (within each top-level folder: one folder per year).
folder_list = [
    os.path.join(path, fol, f)
    for fol in top_level_folders
    for f in sorted(os.listdir(os.path.join(path, fol)))
]

In [4]:
startTime = datetime.now() # track time

if __name__ == "__main__":
    # The context manager shuts the worker pool down even when a worker raises
    # (e.g. a folder that cannot be parsed), so a failed run does not leave
    # NUM_CORE worker processes behind in the kernel.
    with mp.Pool(NUM_CORE) as pool:
        data_intermediate = pool.map(sz_load.sz_load, folder_list) # load data from each folder in parallel
    data = pd.concat(data_intermediate) # concatenate DataFrames from different folders

print(datetime.now()-startTime)

0:02:34.441543


In [5]:
# Put the articles in chronological order and renumber the rows 0..n-1,
# discarding the old index instead of keeping it as a column.
data = data.sort_values(['year', 'month', 'day']).reset_index(drop=True)

In [6]:
data[10:15]

Unnamed: 0,year,month,day,newspaper,newspaper_2,texts,rubrics,title,quelle_texts,page
10,1994,1,3,- Süddeutsche Zeitung (1994 - 2004),SZ,Kein Automatismus bei den Miet-Zusatzkosten Wa...,IMMOBILIEN,Kein Automatismus bei den Miet-Zusatzkosten Wa...,SZ NR. 001 VOM 03.01.1994 SEITE 271,
11,1994,1,3,- Süddeutsche Zeitung (1994 - 2004),SZ,STUETZE DER KONJUNKTUR: Waehrend viele Wirtsch...,IMMOBILIEN,STUETZE DER KONJUNKTUR:,SZ NR. 001 VOM 03.01.1994 SEITE 271,
12,1994,1,3,- Süddeutsche Zeitung (1994 - 2004),SZ,In Berlin/Brandenburg: Bau wartet auf den Boom...,IMMOBILIEN,In Berlin/Brandenburg: Bau wartet auf den Boom.,SZ NR. 001 VOM 03.01.1994 SEITE 271,
13,1994,1,3,- Süddeutsche Zeitung (1994 - 2004),SZ,Bueromarkt Berlin Die Preise kommen den Mieter...,IMMOBILIEN,Bueromarkt Berlin Die Preise kommen den Mieter...,SZ NR. 001 VOM 03.01.1994 SEITE 271,
14,1994,1,3,- Süddeutsche Zeitung (1994 - 2004),SZ,"Maureen; irische Geschichten. 36 Pubbesitzer, ...",Roman,Maureen; irische Geschichten.,SZ NR. 001 VOM 03.01.1994 SEITE 400,


In [7]:
# the number of articles before pre-processing
len(data)

2034968

## Light pre-processing

### Remove short articles (<100 words)

The `count_words_mp` function is used to count the number of words in a text. This function only counts words and excludes numbers from the analysis. It takes into consideration Latin letters with any diacritics, such as umlauts. The function removes specific punctuation marks and non-alphabetic characters before counting the words, ensuring that the final count is accurate and relevant for sentiment analysis and topic modeling.

In [8]:
startTime = datetime.now() # track time

if __name__ == "__main__":
    # Count the words of every article in parallel. The context manager shuts
    # the pool down even if a worker raises, so no worker processes are leaked.
    with mp.Pool(NUM_CORE) as pool:
        count_results = pool.map(count_words_mp.count_words_mp, data['texts'].tolist())

print(datetime.now()-startTime)

0:00:41.796058


In [9]:
# Save the result as a new column "word_count"
data['word_count'] = count_results

Shorter texts often lack sufficient semantic information, making it challenging for topic models and bag-of-words-based sentiment analysis tools to perform effectively. As a result, we keep only texts with at least 100 words to ensure better performance.

In [10]:
# Keep only articles with at least 100 words, restore chronological order
# and renumber the rows.
data = (
    data.loc[data['word_count'] >= 100]
        .sort_values(['year', 'month', 'day'])
        .reset_index(drop=True)
)

In [11]:
# the number of articles after removing short articles
len(data)

1516777

### Remove exact duplicates

Removing duplicates is crucial because they do not contribute any new information and only introduce noise into the analysis.

In [12]:
# Collect every article whose text occurs more than once (all copies are
# kept here, keep=False) in 'sz_duplicates' for further exploration.
is_duplicated_text = data.duplicated(subset='texts', keep=False)
sz_duplicates = data.loc[is_duplicated_text]

In our corpus, there are two main types of duplicates:

1. The same article appears twice, published on different pages, because SZ publishes regional editions and an identical article can be printed on different pages in different editions, such as page 27 in Munich and page 28 in Ebersberg.

In [13]:
sz_duplicates.loc[[51,96]]

Unnamed: 0,year,month,day,newspaper,newspaper_2,texts,rubrics,title,quelle_texts,page,word_count
51,1994,1,3,- Süddeutsche Zeitung (1994 - 2004),SZ,Bittere Klage des OB an Finanzminister Waigel ...,Muenchen,Bittere Klage des OB an Finanzminister Waigel ...,SZ NR. 001 VOM 03.01.1994 SEITE f28,,236
96,1994,1,3,- Süddeutsche Zeitung (1994 - 2004),SZ,Bittere Klage des OB an Finanzminister Waigel ...,Muenchen,Bittere Klage des OB an Finanzminister Waigel ...,SZ NR. 001 VOM 03.01.1994 SEITE b33,,236


2. The same article is included in the corpus with different publication dates (e.g., 12.10.2018 and 13.10.2018). In such cases, a sensible approach is to retain the first entry.

In [14]:
sz_duplicates.loc[[1510995,1511135]]

Unnamed: 0,year,month,day,newspaper,newspaper_2,texts,rubrics,title,quelle_texts,page,word_count
1510995,2018,10,12,Süddeutsche Zeitung,SZ,Impressum. HERAUSGEGEBEN VOM SÜDDEUTSCHEN VERL...,Ressort: Meinungsseite Rubrik: Impressum,Impressum.,"Süddeutsche Zeitung, 12.10.2018, Ausgabe Münch...",4,246
1511135,2018,10,13,Süddeutsche Zeitung,SZ,Impressum. HERAUSGEGEBEN VOM SÜDDEUTSCHEN VERL...,Ressort: Meinungsseite Rubrik: Impressum,Impressum.,"Süddeutsche Zeitung, 13.10.2018, Ausgabe Münch...",4,246


In [15]:
# Drop exact duplicates by text. The frame is in chronological order, so
# keep='first' retains the copy with the earlier publication date.
data = data.drop_duplicates(subset='texts', keep='first').reset_index(drop=True)

In [16]:
# the number of articles after removing exact duplicates
len(data)

1499007

## Filtering

### Section

As SZ is a general newspaper, it contains numerous sections that may not be relevant for economic forecasting. To address this, we have manually examined each section of the newspaper by reviewing a sample of articles from every section. Using our professional expertise, we have identified and chosen the sections that are most likely to be relevant for economic forecasting, focusing on recent events rather than historical topics.

The sections we retain, which we refer to as 'priority 1' due to their high relevance for economic forecasting, include 'Politik' (Politics), 'Börse und Finanzen' (Stock Market and Finance), 'Geld' (Money), 'Meinungsseite' (Opinion), 'Nachrichten' (News) - which served as a substitute for the 'Politik' section between 1999 and 2003, 'Seite vier' (Page Four), and 'Wirtschaft' (Economy). Among these sections, 'Politik', 'Meinungsseite', 'Nachrichten', and 'Seite vier' focus on politics, while 'Börse und Finanzen', 'Geld', and 'Wirtschaft' cover economic topics.

Other sections which might be potentially interesting include 'Dokument', 'Themen', 'Themen des Tages', 'Themen aus Deutschland', 'Themen aus dem Ausland' (all related to politics), and 'Immobilien' (real estate news connected to economic development). However, we leave out these 'priority 2' articles and only concentrate on 'priority 1' articles mentioned earlier, as 'priority 2' articles often have a different format (they are frequently longer) and are less focused on current events.

We exclude numerous other sections, such as local news ('München'), advertisements ('Anzeiger'), entertainment news ('Freizeit', 'Hobby', 'Reise', etc.), and letters from readers ('Briefe an die SZ'/'Leserbriefe').

In our analysis, we further narrow down the selection by excluding specific subsections from the preserved sections. Examples of these excluded subsections include 'Inhaltsverzeichnis' (Contents), 'Akt. Lexikon' (explanation of important terms), 'Anzeige' (advertisements), and 'Impressum' (contact information, such as names of journalists, their telephone numbers, addresses of editorial offices in different cities, etc.).

In [17]:
# this function loads dictionaries with subsections of the newspaper we are going to analyze
def dictionary_open(name):
    """Read a subsection dictionary file into a set of strings.

    The file ``name`` is resolved against the current working directory and
    decoded as UTF-8. Its first and last lines are discarded; every line in
    between becomes one element of the returned set.
    """
    file_path = os.path.join(os.getcwd(), name)
    with codecs.open(file_path, 'r', 'utf-8') as handle:
        lines = handle.read().splitlines()
    # NOTE(review): the first and last lines are skipped -- presumably a
    # header/footer line in the dictionary files; confirm against the .txt files.
    return set(lines[1:-1])

In [18]:
# Load one set of subsection names per retained section: 'Politics',
# 'Stock market and Finance', 'Money', 'Opinion', 'News', 'Page four', 'Economy'.
(politics_s, boerse_s, geld_s, meinung_s,
 nachrichten_s, vier_s, wirtschaft_s) = [
    dictionary_open(file_name)
    for file_name in ('politics.txt', 'boerse.txt', 'geld.txt', 'meinung.txt',
                      'nachrichten.txt', 'vier.txt', 'wirtschaft.txt')
]

In [19]:
# Create a column with the corresponding section based on subsections from the XML files.
# The order matters: a rubric present in several dictionaries ends up with
# the label of the last matching entry in this list.
section_dictionaries = [
    (politics_s, u'Politik'),
    (boerse_s, u'Börse und Finanzen'),
    (geld_s, u'Geld'),
    (meinung_s, u'Meinungsseite'),
    (nachrichten_s, u'Nachrichten'),
    (vier_s, u'Seite vier'),
    (wirtschaft_s, u'Wirtschaft'),
]

data['section'] = u'Other'
rubric_values = data['rubrics'].values
for subsections, section_label in section_dictionaries:
    data.loc[np.isin(rubric_values, list(subsections)), 'section'] = section_label

In [20]:
# Create a column "priority1" that groups the retained sections into the two
# global sections Economy and Politics; everything else stays 'Other'.
Economy = [u'Börse und Finanzen', u'Geld', u'Wirtschaft']
Politics = [u'Politik', u'Nachrichten', u'Seite vier', u'Meinungsseite']

data['priority1'] = u'Other'
section_values = data['section'].values
for group_label, group_sections in ((u'Politics', Politics), (u'Economy', Economy)):
    data.loc[np.isin(section_values, group_sections), 'priority1'] = group_label

In [21]:
# Only keep 'priority 1' articles (global sections Politics and Economy).
data = data[data['priority1'].isin([u'Politics', u'Economy'])]

In [22]:
# renumber the rows after the section filter
data = data.reset_index(drop=True)
# the number of articles after removing irrelevant sections
len(data)

561294

### Text

Exclude articles that contain the following strings:

* Hultschiner Straße 8, 81677 München: contact details of SZ;
* Bayerische Warenbörse/Bayerische Warenboerse: Bavarian commodity exchange (prices for different commodities, no narrative);
* Südbayerischer Schweinemarkt/Suedbayerischer Schweinemarkt: pig market, prices;
* Münchner Schlachtviehmarkt/Muenchner Schlachtviehmarkt: slaughter cattle market, prices;
* Notierungen der Butter- und Käsebörse/Notierungen der Butter- und Kaeseboerse/Butter- und Kaeseboerse/Butter- und Käsebörse: market of butter and cheese, prices;
* Abonnement: : details about the subscription cost;
* MARKTDATEN VOM TAGE: quantitative information on stock market prices, interest rates;
* bat Studiotheater/Theater Arena/Theater Akademie/Theater Deutsches Theater/Theater Ballhaus Naunynstrasse: a detailed list of various cultural events;
* Herausgegeben vom Sueddeutschen Verlag/Herausgegeben vom Süddeutschen Verlag: a list of staff members;
* Das Publikum im Kleinen Haus: theater news;
* Konkurse\, Vergleich und Gesamtvollstreckungsverfahren: list of companies that have filed for bankruptcy or insolvency;
* AUS DEM INHALT/S Z AM WO C H E N E N D E/SZ AM WOCHENENDE/SIE LESEN HEUTE/Aus dem Inhalt Franjo Kiseljak: contents;
* ^Was diese Woche bringt: lists of upcoming events, meetings, and announcements related to various topics. The articles starting with "Was diese Woche bringt" cover a wide range of subjects, making it difficult to categorize or analyze them effectively.;
* Die größten Unternehmen. Rang/Die globalen Top 50: list of the largest German companies.

In [23]:
# Regular-expression alternatives that mark non-narrative articles (price
# lists, contact details, tables of contents, ...).
# Patterns containing a backslash are raw strings: in a normal literal '\,',
# '\:' and '\.' are invalid escape sequences (SyntaxWarning on Python 3.12+).
# The resulting pattern text is unchanged.
text_strings = [r'Hultschiner Straße 8\, 81677 München', 'Bayerische Warenbörse', 
                'Bayerische Warenboerse', 'Südbayerischer Schweinemarkt', 'Suedbayerischer Schweinemarkt',
               'Münchner Schlachtviehmarkt', 'Muenchner Schlachtviehmarkt', 'Notierungen der Butter- und Käsebörse',
               'Notierungen der Butter- und Kaeseboerse', 'Butter- und Kaeseboerse', 'Butter- und Käsebörse', 
                r'Abonnement\:', 'MARKTDATEN VOM TAGE', 'bat Studiotheater', 'Theater Arena', 
                'Herausgegeben vom Sueddeutschen Verlag', 'Herausgegeben vom Süddeutschen Verlag', 'Theater Akademie',
               'Theater Deutsches Theater', 'Theater Ballhaus Naunynstrasse', 'Das Publikum im Kleinen Haus',
               r'Konkurse\, Vergleich und Gesamtvollstreckungsverfahren', 'AUS DEM INHALT', 'S Z AM WO C H E N E N D E',
               'SZ AM WOCHENENDE', 'SIE LESEN HEUTE', 'Aus dem Inhalt Franjo Kiseljak', '^Was diese Woche bringt',
               r'Die größten Unternehmen\. Rang', 'Die globalen Top 50']
# Drop every article whose text matches at least one alternative. The '^'
# anchors only its own alternative, i.e. texts that start with that phrase.
data = data[~(data.texts.str.contains('|'.join(text_strings)))]
data.reset_index(inplace=True, drop=True) # reset the index of the DataFrame

In [24]:
# the number of articles after excluding articles based on the text patterns
len(data)

559488

### Title

Exclude articles with the following titles:

* HEUTE IN DER SZ./HEUTE IN DER SZ /.: contents;
* Konkurse, Vergleiche und Gesamtvollstreckungsverfahren./Konkurse, Vergleiche und Gesamtvollstreckungen./Konkurse, Vergleiche und Zwangsvollstreckungsverfahren./Konkurse, Vergleiche und Zwangsvollstreckungen.: list of companies that have filed for bankruptcy or insolvency;
* INHALT.: contents;
* Gewinnzahlen./Gewinnquoten./GEWINNZAHLEN./GEWINNQUOTEN./FINANZEN. Gewinnquoten./UNTERNEHMEN. Gewinnquoten./KURZ GEMELDET. Gewinnquoten./POLITIK UND MARKT. Gewinnquoten./POLITIK UND MARKT. Gewinnzahlen./KURZ GEMELDET. Gewinnzahlen./UNTERNEHMEN. Gewinnzahlen.: winning numbers and prizes for various German lotteries;
* Außerdem in dieser Ausgabe.: contents;
* Kinoprogramm.: movie schedule;
* Terminkalender.: a schedule of various local events;
* Notdienste.: a list of emergency and helpline contacts;
* SIE LESEN HEUTE IN DER SZ./SI E LE S E N H EUTE.: contents;
* 'Gipfelstürmer.': a description of a competition called "Gipfelstürmer", every article titled 'Gipfelstürmer.' contains identical text.

In [25]:
# Exact titles (compared as whole strings, not patterns) of articles to drop.
titles_to_exclude = [
    'HEUTE IN DER SZ.',
    'Konkurse, Vergleiche und Gesamtvollstreckungsverfahren.',
    'INHALT.',
    'Gewinnzahlen.',
    'Gewinnquoten.',
    'GEWINNZAHLEN.',
    'Außerdem in dieser Ausgabe.',
    'Kinoprogramm.',
    'Terminkalender.',
    'GEWINNQUOTEN.',
    'Notdienste.',
    'FINANZEN. Gewinnquoten.',
    'Konkurse, Vergleiche und Gesamtvollstreckungen.',
    'SIE LESEN HEUTE IN DER SZ.',
    'UNTERNEHMEN. Gewinnquoten.',
    'KURZ GEMELDET. Gewinnquoten.',
    'POLITIK UND MARKT. Gewinnquoten.',
    'SI E LE S E N H EUTE.',
    'POLITIK UND MARKT. Gewinnzahlen.',
    'KURZ GEMELDET. Gewinnzahlen.',
    'UNTERNEHMEN. Gewinnzahlen.',
    'HEUTE IN DER SZ /.',
    'Konkurse, Vergleiche und Zwangsvollstreckungsverfahren.',
    'Konkurse, Vergleiche und Zwangsvollstreckungen.',
    'Gipfelstürmer.',
]

# Keep the rows whose title is not in the list, then renumber the rows.
has_excluded_title = data['title'].isin(titles_to_exclude)
data = data.loc[~has_excluded_title].reset_index(drop=True)

In [26]:
# Restore chronological order and renumber the rows.
data = data.sort_values(['year', 'month', 'day']).reset_index(drop=True)
# the number of articles after excluding articles based on the title
len(data)

557097

In [27]:
# number of remaining articles per publication year
# (a Counter prints its entries from most to least frequent)
counter=collections.Counter(data['year'])
print(counter)

Counter({1999: 27928, 1994: 25990, 2000: 25904, 1995: 25712, 1996: 24801, 1997: 24766, 2001: 24733, 1998: 24690, 2008: 24457, 2010: 23498, 2009: 22863, 2006: 22702, 2011: 22673, 2005: 22456, 2007: 22063, 2003: 21940, 2004: 21911, 2012: 21220, 2002: 20874, 2013: 19314, 2014: 18242, 2016: 17922, 2017: 17550, 2015: 17473, 2018: 15415})


## Umlauts

In the given dataset, some texts contain incorrect umlaut encodings, such as '&auml;', '&uuml;', '&ouml;', '&Auml;', '&Uuml;', and '&Ouml;'. These encodings are typically used in HTML documents to represent umlaut characters like 'ä', 'ü', 'ö', 'Ä', 'Ü', and 'Ö'. However, in this case, the encodings are not being interpreted correctly, leading to broken umlauts in the text.

To fix this issue, we use the function `umlauts_correct_sz` that replaces these incorrect umlaut encodings with the correct umlaut characters.

In [28]:
# HTML character entities for umlauts that were left undecoded in the texts
incorrect_umlauts = ["&auml;", "&uuml;", "&ouml;", "&Auml;", "&Uuml;", "&Ouml;"]

# One alternation pattern matching any of the entities
pattern = "|".join(incorrect_umlauts)
has_entity = data['texts'].str.contains(pattern)
sz_uml_encoding = data.loc[has_entity]

# Show the second affected article as an example
sz_uml_encoding['texts'].iloc[1]

'Mühl Product & Service kauft Aktien zur&uuml;ck. Kranichfeld (Reuters) - Der am Neuen Markt gelistete Baudienstleister Mühl Product & Service beginnt mit dem R&uuml;ckkauf eigener Aktien. Ab sofort sollen bis zu drei Prozent der eigenen Anteilsscheine &uuml;ber die B&ouml;rse erworben werden, wie die Th&uuml;ringer Gesellschaft am Freitag mitteilte. Die Erm&auml;chtigung der Hauptversammlung sehe vor, dass bis zum 15. Dezember 2001 bis zu zehn Prozent des derzeitigen Grundkapitals der Gesellschaft &uuml;ber die B&ouml;rse oder im Rahmen eines &ouml;ffentlichen Kaufangebots erworben werden k&ouml;nnten. Der Aktienr&uuml;ckkauf sei auch in Teilen m&ouml;glich. Der Kaufpreis je Aktie darf den Angaben zufolge den durchschnittlichen Schlusskurs an den f&uuml;nf B&ouml;rsentagen vor Erwerb der Anteilsscheine um nicht mehr als zehn Prozent unter- oder &uuml;berschreiten. Im Falle eines &ouml;ffentlichen Kaufangebotes d&uuml;rfe der Kaufpreis je Aktie den durchschnittlichen Schlusskurs des ac

In [29]:
startTime = datetime.now() # track time

if __name__ == "__main__":
    # Apply the umlaut correction to every article in parallel. The context
    # manager shuts the pool down even if a worker raises, so no worker
    # processes are leaked.
    with mp.Pool(NUM_CORE) as pool:
        texts_corrected = pool.map(umlauts_correct_sz.umlauts_correct_sz, data['texts'].tolist())

print(datetime.now()-startTime)

0:00:11.447588


In [30]:
data['texts'] = texts_corrected

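The second problem with umlauts in the early texts (published before 1995) is that they are replaced with digraphs like 'ae' for 'ä', 'oe' for 'ö', 'ue' for 'ü', 'ss' for 'ß', 'AE' for 'Ä', 'OE' for 'Ö', and 'UE' for 'Ü'. This discrepancy can lead to an increased vocabulary size and potential ambiguity in word representation. To fix the issue with umlauts, we use the notebook `Umlauts_fix` written in Python 2. The affected articles are exported to `sz_umlauts_fix.csv` below; the corrected texts are then read back from `sz_umlauts_fixed.csv`, which is expected to be the output of that notebook and must exist before the subsequent cells are run.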

In [31]:
umlauts = ['ä', 'ö', 'ü', 'ß', 'Ä', 'Ö', 'Ü']
umlauts_replace = ['ae', 'oe', 'ue', 'ss', 'AE', 'OE', 'UE']

In [32]:
# Candidates for the umlaut fix: articles from before 1995 that contain at
# least one digraph but not a single real umlaut character.
has_digraph = data['texts'].str.contains('|'.join(umlauts_replace))
has_umlaut = data['texts'].str.contains('|'.join(umlauts))
is_early = data['year'] < 1995
sz_umlauts_fix = data[has_digraph & ~has_umlaut & is_early]

In [33]:
# example of the text that we try to fix with a spellchecker
sz_umlauts_fix.texts[0]

'Trotz aller Friedensappelle in Suedafrika Terrorwelle zur Jahreswende. Mord an Pub-Besuchern in Kapstadt gibt Raetsel auf. Kapstadt (AP/Reuter/AFP) - Trotz aller Friedensappelle hat das neue Jahr in Suedafrika mit blutigen Gewalttaten begonnen. Fuenf maskierte Schwarze toeteten in der Silvesternacht in einem Pub in Kapstadt vier Weisse. In Johannesburg wurden zwei Polizisten erschossen. In den Schwarzensiedlungen Tokoza und Katlehong oestlich von Johannesburg entdeckte die Polizei die Leichen von fuenf Maennern und einer Frau. Die meisten der Opfer seien erschossen worden, erklaerte die Polizei. Bei einem weiteren Zwischenfall in der Naehe der Hafenstadt Durban seien bei der von ethnischen Indern bewohnten Vorstadt Chatsworth drei Menschen getoetet und zwei weitere verletzt worden. Ebenfalls in Durban wurden nach einem Fernsehbericht bei einem Streit an dem vielbesuchten Strand Brighton Beach ein Mann erschossen und ein weiterer durch einen Messerstich verletzt. Die Hintergruende des 

In [34]:
sz_umlauts_fix.to_csv('sz_umlauts_fix.csv', encoding='utf-8-sig', sep = ';')

In [35]:
sz_umlauts_fixed = pd.read_csv('sz_umlauts_fixed.csv', encoding = 'utf-8', sep=';')

In [36]:
# Write the corrected texts back into `data`, addressing rows by label.
# NOTE(review): 'Unnamed: 0.1' presumably holds the row labels that `data` had
# when 'sz_umlauts_fix.csv' was exported -- confirm. This only lines up if the
# index of `data` has not changed between that export and this cell.
data.loc[sz_umlauts_fixed['Unnamed: 0.1'], 'texts'] = sz_umlauts_fixed.texts.values

In [37]:
# fixed version
data.texts[0]

'Trotz aller Friedensappelle in Südafrika Terrorwelle zur Jahreswende. Mord an Pub - Besuchern in Kapstadt gibt Rätsel auf. Kapstadt (AP / Reuter / AFP) - Trotz aller Friedensappelle hat das neue Jahr in Südafrika mit blutigen Gewalttaten begonnen. Fünf maskierte Schwarze töteten in der Silvesternacht in einem Pub in Kapstadt vier Weiße. In Johannesburg wurden zwei Polizisten erschossen. In den Schwarzensiedlungen Tokoza und Katlehong östlich von Johannesburg entdeckte die Polizei die Leichen von fünf Männern und einer Frau. Die meisten der Opfer seien erschossen worden, erklärte die Polizei. Bei einem weiteren Zwischenfall in der Nähe der Hafenstadt Durban seien bei der von ethnischen Indern bewohnten Vorstadt Chatsworth drei Menschen getötet und zwei weitere verletzt worden. Ebenfalls in Durban wurden nach einem Fernsehbericht bei einem Streit an dem vielbesuchten Strand Brighton Beach ein Mann erschossen und ein weiterer durch einen Messerstich verletzt. Die Hintergründe des Überfal

## Identify and Delete non-German Articles

Next, we filter out articles written in any language other than German using the **langdetect** library.

We remove the following types of articles:

1) texts predominantly comprising English names of individuals, states, and organizations

In [38]:
data.iloc[122092]['texts']

"Gewählte Gouverneure in 36 Bundesstaaten Alabama (AL): Donald Siegelman (Dem) Alaska (AK): Tony Knowles (Dem) Arizona (AZ): Jane Dee Hull (Rep) Arkansas (AR): Mike Huckabee (Rep) Colorado (CO): Bill Owens (Rep) Connecticut (CT): John Rowland (Rep) Florida (FL): John Ellis 'Jeb' Bush (Rep) Georgia (GA): Roy Barne (Dem) Hawaii(HI): Ben Cayetano (Dem) Idaho (ID): Dirk Kempthorne (Rep) Illinois (IL): George Ryan (Rep) Iowa (IA): Tom Vilsack (Dem) Kalifornien (CA): Gray Davis (Dem) Kansas (KS): Bill Graves (Rep) Maine (ME): Angus King (unabh.) Maryland (MD): Parris Glendening (Dem) Massachusetts (MA): Paul Cellucci (Rep) Michigan (MI): John Engler (Rep) Minnesota (MN): Jesse Ventura (unabh.) Nebraska (NE): Mike Johanns (Rep) Nevada (NV): Kenny Guinn (Rep) New Hampshire (NH): Jeanne Shaheen (Dem) New Mexiko (NM): Gary Johnson (Rep) New York (NY): George Pataki (Rep) Ohio (OH): Bob Taft (Rep) Oklahoma (OK): Frank Keating (Rep) Oregon (OR): John Kitzhaber (Dem) Pennsylvania (PA): Thomas Ridge

2) texts including informal language, slang, and interjections

In [39]:
data.iloc[167241]['texts']

'Das Streiflicht. (SZ) Ja. Ja! Jaaaaaaa!!! Ja, Franz. Jaaaaaaaa. Ja. Ja. Ja. Jajaja-ja, jajaja-ja! Ja! Jaaaaaaaaaaaaaaaaaaaaa - jahaha. O ja, Franz. Jaaaa! Jaaaa! Jaaaa! Jaaaa! Ja!! Ja, ja, ja, ja, ja. Ehhh-ja!! Ja, logo. Jaaaaaaa! Jaaa. Jaaa. Hahaaaaaa! Jawollja. Jaaaaaaaa!! Mensch, Franz, oide Fischhaut! Jaaaaaaaa. Ja! Ja! Ja! Jaaaaaa! Jajajajajaja. Jaaaa. Hahaaaa: Zürich. Züri brännt, jaha. Jaaaa. End se winner is? Franz! Se Kaiser! Kaiser Fränz se Bayer or se först. Senks! Senks, Franzl, old Giesing Gwachs! Ja, ja, jaaaaa, ja. Jaaaaaaaaaaaaa! Jaaaaaaaaaaaaaaa!! Ja. Ja. Jaaaaaa! Mhhhja. "Ja! Ja! Ja!" (Bild) bzw. "Jaaa!" (tz). Jaaaaaaaaa. Jaaaaaaaaa. Jaja. Ja. Ja. Ja. Tjaja. Yes. Si. Da. Oui. Wuiwui, Franz. Jaaa. Jaaaaaaaa. Ja. Ja? Jaaaaaaa??? Hm. Tja. Naja. Blatter. Netzer. Völler. Becker. Schiffer. Wieso Becker? Hm. Tja. Und wieso Schiffer? Ahhhhhhh ja. Bravo, Claudia! Jaaajaja! Jaaa. Jaaa. Jaaa. Schaumermal. Nnnnnnnnn-ja! Jaja. Jo, doch. Jaaaaa-ha! Oja, o là là! Jajajajajajajajaja

3) tables without any supporting text

In [40]:
data.iloc[179911]['texts']

'29. 12. 00 2)Veränd. gegen Emissionspr.Veränd. geg. ErstnotizUpdate. com 11.4.NM 3)23,0024,0040,001,851,51-93,43%-93,71%Intraware Software12.5.NM28,0029,0034,902,002,50-91,07%-91,38%Mediascape Comm.22.5.NM47,0045,0047,503,804,40-90,64%-90,22%Comtelco7.2.GM 4)18,0023,0028,901,201,89-89,50%-91,78%Ad Pepper Media9.10.NM17,0017,5018,102,102,22-86,94%-87,31%Travel24.com13.3.NM29,0032,0032,003,054,30-85,17%-86,56%Openshop Holding21.3.NM54,00106,00130,007,808,30-84,63%-92,17%Arbo Media.net9.5.NM43,0053,0056,006,607,10-83,49%-86,60%Lycos Europe22.3.NM24,0024,0024,003,404,10-82,92%-82,92%Matchnet27.6.NM7,506,506,801,401,30-82,67%-80,00%Valor Computerized Syst.15.5.NM12,0014,3015,701,962,13-82,25%-85,10%d + s online23.5.NM25,0025,0025,404,104,50-82,00%-82,00%Alphaform28.6.NM17,0016,0016,702,253,40-80,00%-78,75%Advanced Vision Techn.28.2.NM14,0026,0029,002,002,85-79,64%-89,04%Adlink Internet Media11.5.NM17,0016,2020,953,253,50-79,41%-78,40%Softing16.5.NM14,5016,0031,502,853,07-78,83%-80,81%Carri

In [41]:
# Run the language check on every article in parallel; the actual filtering
# on the result happens afterwards.
startTime = datetime.now()

if __name__ == "__main__":
    # The context manager shuts the pool down even if a worker raises, so a
    # failure in this long-running step does not leak worker processes.
    with mp.Pool(NUM_CORE) as pool:
        ger_results = pool.map(identify_ger.identify_ger, data['texts'].tolist())

print(datetime.now()-startTime)

0:15:47.300177


In [42]:
# Store the language flag of each article (compared against 1 and 0 below)
data['language'] = ger_results
# the number of non-German articles
print((data['language'] == 0).sum())
# keep articles written in German and renumber the rows
data = data.loc[data['language'] == 1].reset_index(drop=True)
# the number of articles after excluding non-German articles
print(len(data))

7
557090


In [43]:
data.to_csv('non_german_delete.csv', encoding = 'utf-8-sig', sep = ';')

In [44]:
# Reload the checkpoint. The listed columns are forced to strings, and their
# missing values (read back as NaN) are turned into empty strings.
str_columns = ['newspaper', 'newspaper_2', 'quelle_texts', 'page', 'rubrics']
data = pd.read_csv(
    'non_german_delete.csv',
    encoding='utf-8-sig',
    sep=';',
    index_col=0,
    dtype={column: 'str' for column in str_columns},
)
for column in str_columns:
    data[column] = data[column].fillna('')

## Remove URLs

A function called `correct_url_sz` takes a text input and processes it to remove URLs, HTML file references, and email addresses. A set called `exceptions` contains specific URLs that are not removed from the text. Preserving specific URLs, such as 'amazon.de', is crucial for maintaining the context and sentence structure, as removing them could result in loss of important information or topic identification, especially when certain internet companies are known primarily by their website names.

In [45]:
startTime = datetime.now()

if __name__ == "__main__":
    # Clean every article in parallel with correct_url_sz. The context manager
    # shuts the pool down even if a worker raises, so no worker processes are
    # leaked.
    with mp.Pool(NUM_CORE) as pool:
        url_corrected = pool.map(correct_url_sz.correct_url_sz, data['texts'].tolist())

print(datetime.now()-startTime)

0:00:40.701737


In [46]:
data['texts'] = url_corrected

## O instead of 0 problem

In some cases, Optical Character Recognition (OCR) cannot distinguish between '0' and 'O' ('o'). As a result, there are tokens like '1OO'. Using regular expressions, we identify problematic tokens and replace 'O' ('o') with '0'.

In [47]:
startTime = datetime.now()

# NOTE(review): this import sits outside the import cell at the top of the
# notebook; consider moving it there so all dependencies are visible up front.
import ocr_replace

if __name__ == "__main__":
    # Apply the O/0 correction to every article in parallel. The context
    # manager shuts the pool down even if a worker raises, so no worker
    # processes are leaked.
    with mp.Pool(NUM_CORE) as pool:
        ocr_corrected = pool.map(ocr_replace.ocr_replace, data['texts'].tolist())

print(datetime.now()-startTime)

0:00:12.808099


In [48]:
# Overwrite the article texts with the output of the 'O'-vs-'0' correction step.
data['texts'] = ocr_corrected

## Fixing tokens containing a number and a word

We have identified several cases where numbers and words are erroneously merged into a single token. To address this problem of merged tokens, the following code has been implemented.

In [49]:
startTime = datetime.now()  # track the run time of this cell

if __name__ == "__main__":
    # Apply 'split_number_word' to every article text in parallel.
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        split_corrected = pool.map(split_number_word.split_number_word, data['texts'].tolist())

print(datetime.now()-startTime)

0:00:17.302124


In [50]:
# Overwrite the article texts with the output of the number/word splitting step.
data['texts'] = split_corrected

## Concatenate chained articles

By chained articles we mean news reports that start on one page and continue on another page of the newspaper. Since the news database is prepared using OCR technology, each part of a chained news article represents a separate entry in the database, i.e. a separate news report.

Ideally, all the parts of a chained article must be combined into one news report. However, as nothing in the metadata helps us identify chained articles, the merging process is not straightforward. We do our best to merge at least some chained articles based on the text alone.

The second part of a chained article normally contains one of the following strings: 'Fortsetzung von Seite' (continuation from page) or 'FORTSETZUNG VON SEITE'. We select the articles that meet this criterion and try to merge them with their beginnings.

However, we also exclude the articles that contain strings from both lists, `continued_strings_1` and `continued_strings_2`. This is because these chained articles have already been merged by the SZ team.

In [51]:
# Regex fragments for "continued on page ..." markers.
continued_strings_1 = [
    'Fortsetzung S\\.', 'Fortsetzung auf Seite', 'Fortsetzung Seite',
    'Fortsetzung nächste Seite', 'Fortsetzung Seiten',
    'FORTSETZUNG S\\.', 'FORTSETZUNG AUF SEITE', 'FORTSETZUNG SEITE',
    'FORTSETZUNG NÄCHSTE SEITE', 'FORTSETZUNG SEITEN',
]
# Regex fragments for "continued from page ..." markers.
continued_strings_2 = ['Fortsetzung von Seite', 'FORTSETZUNG VON SEITE']

# A text containing markers from both lists is a chained article that has already
# been merged, so only texts with a "continued from" marker and no "continued on"
# marker are kept.
has_continued_from = data.texts.str.contains('|'.join(continued_strings_2))
has_continued_on = data.texts.str.contains('|'.join(continued_strings_1))
continued_articles = data[has_continued_from & ~has_continued_on].reset_index()

In the SZ data, there are only 17 chained articles that have not been merged. For the sake of completeness and to demonstrate that it is not an issue for this dataset, we try to merge them using the existing function `chained_articles` written for Handelsblatt data. 

In [52]:
# Number of articles that could potentially be the second part of a chained
# article (17 in the recorded run).
len(continued_articles)

17

An input of the function `chained_articles` is a tuple, where the first element is a row of the dataframe corresponding to the second part of the chained article. The second element of the tuple is a dataframe containing all articles published on the same day as the second part of the chained article, i.e. all articles that could potentially be the first part of the chained article.

In [53]:
# Build the inputs for 'chained_articles': one tuple per candidate, consisting of
# (a one-row slice of 'continued_articles', all rows of 'data' published on the
# same day as that candidate).

# Merging these candidates with their first parts requires more complicated rules
# than those used in the function 'chained_articles', so they are skipped here.
chain_exceptions = [15,16]

# One-row dataframes with the potentially chained articles
cont_input = []
# Dataframes with the articles published on the same day as each candidate
data_input = []

for ind in continued_articles.index:
    if ind in chain_exceptions:
        continue
    same_day = (
        (data.day == continued_articles['day'][ind])
        & (data.month == continued_articles['month'][ind])
        & (data.year == continued_articles['year'][ind])
    )
    cont_input.append(continued_articles.iloc[[ind]])
    data_input.append(data[same_day])

inputs_cont = list(zip(cont_input, data_input))

The function `chained_articles` returns a list with two tuples. The first tuple contains two indices corresponding to the first and second part of a chained article. These two parts must be merged into one article.
    
The second tuple contains two indices as well. The first index corresponds to a part of the chained article that is a duplicate because the merged version of this chained article already exists in the database. The article with this index must be deleted. The second index corresponds to the merged version of the chained article. This article must be kept in the database.

All the rules we use to find the first part of a chained article are described in detail in the function `chained_articles`.

In [54]:
startTime = datetime.now()  # track the run time of this cell

chained_pair = []

if __name__ == "__main__":
    # Run 'chained_articles' on every (candidate, same-day articles) tuple in parallel.
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        chained_intermediate = pool.map(chained_articles.chained_articles, inputs_cont)
    chained_pair = chained_pair + chained_intermediate

print(datetime.now()-startTime)

0:00:05.895131


In [55]:
# Unique (duplicate, merged article) tuples; the placeholder (-1, -1) is skipped.
duplicates = list({result[1] for result in chained_pair if result[1] != (-1, -1)})
# Indices of all duplicated and merged articles: first all duplicates, then all merged ones.
dup_merged = [tup[position] for position in (0, 1) for tup in duplicates]
# Chained pairs to merge: non-empty results whose two indices are both absent
# from the duplicated/merged articles.
chained_pair_list = [
    result[0]
    for result in chained_pair
    if result[0] != ()
    and result[0][0] not in dup_merged
    and result[0][1] not in dup_merged
]
# Number of chained articles that the rules allow us to merge (12 in the recorded run)
len(chained_pair_list)

12

In [56]:
# An example of the first part: the text at the first index of the first chained pair
data['texts'][chained_pair_list[0][0]]

'Die Rede des neuen Staatsoberhaupts Herzog will Präsident aller Deutschen sein. Die Ostdeutschen müssten begreifen, dass Sie für uns keine Last, sondern dass Sie für uns ein Gewinn sind. Berlin (dpa) - Nach seiner Wahl zum Bundespräsidenten sagte Roman Herzog vor der Bundesversammlung in Berlin: Frau Präsidentin, meine Damen und Herren, das ist eine bewegende Stunde für mich. Es ist immer wieder in den letzten Tagen daran erinnert worden, wie die letzte Bundespräsidentenwahl, die hier in Berlin im Reichstag stattgefunden hat, sich im März 1969 abgespielt hat. Vielleicht wissen es viele von Ihnen nicht. In dieser Zeit waren meine Frau und ich Bürger von West - Berlin, und wir haben es miterlebt, wie damals die Regierung der DDR - für einige Stunden wenigstens, aber niemand wusste ja, wie lang das dauern würde -, die Zufahrtswege zu Lande abgesperrt hat. Und wir haben es nie für möglich gehalten, dass eine Stunde wie diese noch einmal zu unseren Lebzeiten möglich sein würde. Es ist ein 

In [57]:
# An example of the second part: the text at the second index of the first chained pair
data['texts'][chained_pair_list[0][1]]

'Herzog will Präsident aller.... Fortsetzung von Seite 1 einige Jahrzehnte in der Vergangenheit war. Aber meine Damen und Herren, wir Deutschen haben die Kraft, wir Europäer haben die Kraft, diesen Weg zu finden und diesen Weg dann auch zu gehen. Daran sollten wir nie zweifeln und das sollten wir auch nie verschweigen. Es gehört zu unserem Leben dazu. Hätten wir diese Hoffnungen nicht mehr, dann könnten wir den Weg, der vor uns liegt, auch nicht gehen. Und das sage ich insbesondere in Irritationen hinein, die sich heute aus den Schwierigkeiten im Gefolge der deutschen Wiedervereinigung ergeben. Ich sage es an die Bürger der früheren Bundesrepublik, die jetzt viele Opfer bringen müssen. Meine Damen und Herren, ich kann es Ihnen nicht ersparen, das immer wieder zu sagen, das ist der Ausgleich für eine Ungerechtigkeit der Weltgeschichte, die durch Zufall an der Elbgrenze gelegt worden ist. Diesseits und jenseits der alten Mauer haben Deutsche gelebt, diesseits und jenseits der alten Mauer

In [58]:
# The index labels of the first parts of the chained articles.
first_part_ind = [pair[0] for pair in chained_pair_list]
# The index labels of the second parts of the chained articles.
second_part_ind = [pair[1] for pair in chained_pair_list]
# Merge: append each second part to its first part, separated by a single space.
data.loc[first_part_ind, 'texts'] = data.loc[first_part_ind, 'texts'].values + \
" " + data.loc[second_part_ind, 'texts'].values
# The index labels of the duplicates.
dup_indices = [tup[0] for tup in duplicates]
# Drop the second parts and the duplicates by label in ONE call.
# Two consecutive positional drops (data.index[...]) would be wrong: after the
# first drop the row positions shift, so the second positional lookup could
# remove articles other than the intended duplicates.
data.drop(index=list(set(second_part_ind + dup_indices)), inplace = True)

In [59]:
# Calculate the word count of the merged articles.

startTime = datetime.now()  # track the run time of this cell

if __name__ == "__main__":
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        count_results = pool.map(count_words_mp.count_words_mp,
                                 data.loc[first_part_ind, 'texts'].tolist())

print(datetime.now()-startTime)

0:00:02.831637


In [60]:
# Store the recomputed word counts for the merged articles.
data.loc[first_part_ind, 'word_count'] = count_results
# Renumber the rows from 0 and discard the old index.
data = data.reset_index(drop=True)

In [61]:
# The number of articles after concatenating chained articles
# (557078 in the recorded run).
len(data)

557078

## Remove fuzzy duplicates

By fuzzy duplicates we understand nearly duplicated articles. These are:

* drafts/minor revisions of the articles saved in the database;
* slightly changed advertisements which are published several times during a month.

We identify 'fuzzy' duplicates using cosine similarity and choose a threshold of 93% based on some visual exploration. Here is the article by Ryan Basques we used as a reference: [Link](https://towardsdatascience.com/a-laymans-guide-to-fuzzy-document-deduplication-a3b3cf9a05a7). 

In [62]:
# Create a 'date' column in the data DataFrame by assembling the
# 'year', 'month' and 'day' columns into datetimes.
data['date'] = pd.to_datetime(data[['year', 'month', 'day']])

### Duplicates exploration

We must first identify and visually inspect the fuzzy duplicates using the `fuzzy_duplicates_test` function before proceeding with their removal. This step ensures the algorithm's performance is as expected, and it accurately identifies articles as fuzzy duplicates based on their cosine similarity.

In [63]:
# Build the inputs for the fuzzy-duplicate functions: one
# (year, month, dataframe) tuple per year-month combination.
years = list(set(data['year']))
months = list(set(data['month']))
month_year_cols = ["month", "year", "texts", "word_count", "date"]

# Years, months and the matching sub-dataframes (restricted to 'month_year_cols')
inputs_year = []
inputs_month = []
inputs_month_year = []
for year, month in itertools.product(years, months):
    # Exclude December 2018
    if year == 2018 and month == 12:
        continue
    in_month = (data['year'] == year) & (data['month'] == month)
    inputs_year.append(year)
    inputs_month.append(month)
    inputs_month_year.append(data[in_month][month_year_cols])

inputs = list(zip(inputs_year, inputs_month, inputs_month_year))

In [64]:
startTime = datetime.now() # track time

# 'fuzzy_duplicates_test_all_dpa' is already imported in the first cell of the
# notebook, so it is not re-imported here.

if __name__ == "__main__":
    # Run 'fuzzy_duplicates_test' on every (year, month, dataframe) tuple in parallel.
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        dup_intermediate = pool.map(fuzzy_duplicates_test_all_dpa.fuzzy_duplicates_test, inputs)
    # NOTE: this rebinds 'duplicates', which held the list of tuples built in the
    # chained-articles section, to a DataFrame.
    duplicates = pd.concat(dup_intermediate)

print(datetime.now()-startTime)

# Save the concatenated results for visual inspection.
duplicates.to_csv('duplicates.csv', encoding = 'utf-8-sig', sep = ';')

0:05:34.361982


### Drop the duplicates

In [65]:
startTime = datetime.now() # track time

delete_indices = []

if __name__ == "__main__":
    # Apply the function to all month-year combinations in parallel.
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        delete_intermediate = pool.map(fuzzy_duplicates_dpa.fuzzy_duplicates, inputs)
    delete_indices = delete_indices + delete_intermediate # one list with the per-month results

print(datetime.now()-startTime)

0:05:35.367368


In [66]:
# Release the per-month input dataframes.
inputs = None
# Flatten the per-month results into one list of unique indices.
delete_indices = list(set(itertools.chain.from_iterable(delete_indices)))
# Drop the fuzzy duplicates (positions are translated to labels via data.index)
# and renumber the remaining rows from 0.
data = data.drop(data.index[delete_indices]).reset_index(drop=True)

In [67]:
# The number of articles after removing fuzzy duplicates
# (552092 in the recorded run).
len(data)

552092

In [68]:
# Checkpoint: write the current state of 'data' to a semicolon-separated CSV file.
data.to_csv('fuzzy_duplicates_delete.csv', encoding = 'utf-8-sig', sep = ';')

In [69]:
# Columns that are read as strings; missing values in them are replaced by ''.
str_cols = ['newspaper', 'newspaper_2', 'quelle_texts', 'page', 'rubrics']

# Reload the checkpoint stored in 'fuzzy_duplicates_delete.csv'.
data = pd.read_csv(
    'fuzzy_duplicates_delete.csv',
    encoding='utf-8-sig',
    sep=';',
    index_col=0,
    dtype={col: 'str' for col in str_cols},
)
for col in str_cols:
    data[col] = data[col].fillna('')

## SZ-specific problem

We encountered a minor issue specific to the SZ dataset while addressing other problems. This issue involves texts containing broken umlauts. To resolve this, we identify such texts and correct their spelling accordingly.

In [70]:
startTime = datetime.now()  # track the run time of this cell

# Find the words like '}ber' and '|ffentlichen' in the articles' texts.
if __name__ == "__main__":
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        problem_umlaut = pool.map(find_umlaut.find_umlaut, data['texts'].tolist())

print(datetime.now()-startTime)

0:00:15.640010


In [71]:
# Positions of the articles for which 'find_umlaut' returned a non-empty result.
problem_umlaut_list = [position for position, found in enumerate(problem_umlaut) if found != []]

In [72]:
# The second to fourth flagged articles are corrected by hand
# (89229, 89461 and 184123 in the recorded run).
index1 = problem_umlaut_list[1]
print(index1)
index2 = problem_umlaut_list[2]
print(index2)
index3 = problem_umlaut_list[3]
print(index3)

# In the first two of these articles, '|' stands for 'ö' and '}' for 'ü'.
for umlaut_index in (index1, index2):
    repaired = data.loc[umlaut_index, 'texts'].replace('|', 'ö').replace('}', 'ü')
    data.loc[umlaut_index, 'texts'] = repaired

# In the third article only a stray '|' inside 'Biotechnologieindustrie' is removed.
repaired = data.loc[index3, 'texts'].replace('Biotechnolog|ieindustrie', 'Biotechnologieindustrie')
data.loc[index3, 'texts'] = repaired

89229
89461
184123


## Clean articles

SZ articles include some text passages that are unlikely to be relevant for either topic modeling or sentiment analysis. We decided to clean the affected articles from these text passages to make the analysis easier for our models.

We remove the following information from the texts:

   * 1) addresses and Internet addresses
   * 2) websites
   * 3) copyright information
   * 4) references to photo sources
   * 5) references to additional information
   * 6) telephone and fax numbers
   * 7) references to page numbers
   * 8) a sentence that is repeated 96 times in one text
   * 9) references to extended versions of interviews
   * 10) references to contact information
   * 11) references to podcasts

In [73]:
startTime = datetime.now()  # track the run time of this cell

if __name__ == "__main__":
    # Apply 'clean_sz_articles' to every article text in parallel.
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        cleaned_articles = pool.map(clean_sz_articles.clean_sz_articles, data['texts'].tolist())

print(datetime.now()-startTime)

0:00:45.024807


In [74]:
# Overwrite the article texts with their cleaned versions.
data['texts'] = cleaned_articles

In [75]:
# Calculate an updated word count of the cleaned articles.

startTime = datetime.now()  # track the run time of this cell

if __name__ == "__main__":
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        count_results = pool.map(count_words_mp.count_words_mp, data['texts'].tolist())

print(datetime.now()-startTime)

0:00:15.376690


In [76]:
# Store the recomputed word counts.
data['word_count'] = count_results
# Keep only articles with at least 100 words and renumber the rows from 0.
data = data.loc[data['word_count'] >= 100].reset_index(drop=True)

In [77]:
# The number of articles after excluding articles shorter than 100 words
# (551817 in the recorded run).
len(data)

551817

## Exclude tables

In this section, we preprocess the news articles by removing tables to minimize noise and emphasize relevant content. We begin by calculating a numerical density metric for each text, which is computed as the ratio of the count of numbers to the total word count (excluding numbers). Texts with a numerical density of at least 20% are considered as candidates for containing tables. We manually examine these texts to identify recurring strings that typically precede tables. Using regular expressions, we exclude the tables based on these strings. Moreover, we delete some text segments predominantly comprising numbers.

In [78]:
# Use the 'numeric_articles' function to identify articles with a high share of
# numbers in them; each call receives (text, word count, threshold 0.20).
inputs = zip(data['texts'], data['word_count'], itertools.repeat(0.20))

startTime = datetime.now()  # track the run time of this cell

if __name__ == "__main__":
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        tables = pool.starmap(numeric_articles.numeric_articles, inputs)

print(datetime.now()-startTime)
# Store the per-article flag, then rebind 'tables' to the flagged articles only.
data['tables'] = tables
tables = data[data.tables == True]

0:00:09.421605


Below is an example of an article containing a table with emission data for newly listed companies. To remove this table, we apply the following regular expression:  `r'1\) prozentuale Veränderung gegenüber.{0,}'`.

In [79]:
# Look up the first flagged article containing the phrase 'Mehr als die Hälfte',
# print its index and display its text.
contains_phrase = tables.texts.str.contains('Mehr als die Hälfte')
correct_ind = tables[contains_phrase].index[0]
print(correct_ind)
tables.loc[correct_ind,'texts']

145683


'Neuemissionen `99. Mehr als die Hälfte liegt im Minus Debütanten an deutschen Börsen seit Anfang dieses Jahres (Reihenfolge nach Kursentwicklung1). Aktien von Film-Firmen verzeichnen die höchsten Gewinne. München, 30. September - Wer bei den Neuemissionen dieses Jahres zu den Erstzeichnern gehörte, machte zumeist Verluste. Gut die Hälfte aller Titel notiert derzeit unter ihrem Ausgabepreis. Die Zahl von Neuemissionen hat schon jetzt einen Höchstwert in der bundesdeutschen Geschichte erreicht. Während im bisherigen Rekordjahr 1998 insgesamt 77 Unternehmen Deutschland als Ort ihrer Börsenpremiere wählten, waren es seit Anfang 1999 bereits 143. Dabei gingen 99 Gesellschaften an den Neuen Markt, 20 in den amtlichen Handel, 12 in den geregelten Markt und 12 weitere an den Freiverkehr oder ähnliche Segmente der Regionalbörsen. Insgesamt 75 Titel notieren derzeit unter jenem Preis, zu dem sie an die Börse kamen. Dabei schneiden die Debütanten am Neuen Markt etwas besser ab als ihre Kollegen 

Clean tables using regular expressions.

In [80]:
startTime = datetime.now()  # track the run time of this cell
if __name__ == "__main__":
    # Apply 'clean_tables_sz' to every article text in parallel.
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        remove_tables = pool.map(clean_tables_sz.clean_tables_sz, data['texts'].tolist())

print(datetime.now()-startTime)

# Overwrite the article texts with the versions returned by 'clean_tables_sz'.
data['texts'] = remove_tables

0:00:13.471488


In [81]:
startTime = datetime.now()  # track the run time of this cell

if __name__ == "__main__":
    # Recompute the word count of every article text in parallel.
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        count_results = pool.map(count_words_mp.count_words_mp, data['texts'].tolist())

print(datetime.now()-startTime)

0:00:24.099785


In [82]:
# Update the content of the column "word_count" with the counts computed
# after the tables were removed from the texts.
data['word_count'] = count_results

In [83]:
# Keep articles with at least 100 words, plus any article that contains the
# string 'Konjunktur-Kompass' regardless of its length.
long_enough = data['word_count'] >= 100
is_kompass = data.texts.str.contains('Konjunktur-Kompass')
# Sort chronologically and renumber the rows from 0.
data = (
    data[long_enough | is_kompass]
    .sort_values(['year', 'month', 'day'], ascending=[True, True, True])
    .reset_index(drop=True)
)
# the number of articles after excluding articles shorter than 100 words
print(len(data))
# The helper column 'tables' is no longer needed.
data = data.drop(columns='tables')

551604


## Identify articles that predominantly consist of numbers

Articles that predominantly consist of numerical data present challenges for sentiment or topic analysis as, once numbers are removed, they tend to contain limited information. Therefore, we remove articles with a numerical density of 50% or greater.

In [84]:
# Use the 'numeric_articles' function to identify articles with a high share of
# numbers in them; each call receives (text, word count, threshold 0.50).
inputs = zip(data['texts'], data['word_count'], itertools.repeat(0.50))

startTime = datetime.now()  # track the run time of this cell

if __name__ == "__main__":
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        numeric_list = pool.starmap(numeric_articles.numeric_articles, inputs)

print(datetime.now()-startTime)
# Store the per-article flag in a helper column.
data['numeric'] = numeric_list

0:00:11.328068


In [85]:
# Inspect an example article with a high share of numbers:
# the first article flagged in the 'numeric' column.
data[data.numeric == True]['texts'].iloc[0]

'Die Ergebnisse in den Wahlkreisen. Bei der Landtagswahl in Niedersachsen hat jeder Wähler wie bei der Bundestagswahl zwei Stimmen. Mit der Erststimme wird der Wahlkreiskandidat und mit der Zweitstimme die Partei gewählt. Der Anteil der Zweitstimmen legt gemäß Verhältniswahlrecht die Zahl der Sitze einer jeden Partei im Landtag fest. Die folgenden Werte geben den prozentualen Anteil der Erststimmen wieder. Die Ergebnisse von 1990 stehen in Klammern. 1 Braunschweig - Nordost SPD 41,0 (43,4), CDU 39,7 (44,3), FDP 3,8 (5,3), Grüne 10,5 (7,0), REP 2,7 (0,0), STATT Partei 2,3; gew. Isolde Saalmann (SPD), bisher Wolfgang Sehrt (CDU). 2 Braunschweig - Südost SPD 41,9 (43,7), CDU 41,0 (46,2), FDP 3,5 (4,8), Grüne 7,7 (5,3), REP 3,6 (0,0), STATT Partei 2,2; gew. Klaus - Peter Bachmann (SPD), bisher Heiner Herbst (CDU). 3 Braunschweig - Südwest SPD 49,0 (49,7), CDU 32,5 (40,1), FDP 2,8 (4,2), Grüne 9,4 (6,0), REP 4,1 (0,0), STATT Partei 2,2; gew. Jürgen Buchheister (SPD), bisher Friedhelm Schuri

In [86]:
# Second article flagged in the 'numeric' column.
data[data.numeric == True]['texts'].iloc[1]

'Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994 Samstag / Sonntag, 16. / 17. April 1994

In [87]:
# Number of articles flagged in the 'numeric' column (92 in the recorded run).
len(data[data.numeric == True])

92

In [88]:
# Drop the articles flagged as predominantly numeric, remove the helper column
# and renumber the rows from 0.
data = (
    data.loc[data.numeric == False]
    .drop(columns='numeric')
    .reset_index(drop=True)
)
# the number of articles after removing articles that predominantly consist of numbers
print(len(data))

551512


## Identify articles that predominantly consist of names

We eliminate texts with a name density of at least 15% (relative to the total word count, excluding numbers) as part of our pre-processing pipeline. This exclusion is important to guarantee that the remaining articles contain sufficient content for effective topic analysis, as the removal of common German names is a standard pre-processing step in LDA model estimation.

In [89]:
# Load the dictionary containing common German first and last names:
# 'names.txt' is read in full and split into a list with one entry per line.
# 'utf-8-sig' drops a leading byte-order mark if the file has one.
with open("names.txt", "r", encoding="utf-8-sig") as f:
    names_list = f.read().splitlines()

In [90]:
# Each call to 'count_names' receives (text, word count, list of names).
inputs = zip(data['texts'], data['word_count'], itertools.repeat(names_list))

startTime = datetime.now()  # track the run time of this cell

if __name__ == "__main__":
    # The context manager shuts the pool down even if a worker raises,
    # so no worker processes are left behind on failure.
    with mp.Pool(NUM_CORE) as pool:
        names_result = pool.starmap(count_names.count_names, inputs)

print(datetime.now()-startTime)

# Store the per-article result in a helper column.
data['names'] = names_result

0:01:10.934713


In [91]:
# Example of an article with a high proportion of names:
# the first article whose 'names' value is at least 0.15.
data[data.names>=0.15].iloc[0]['texts']

'Abraham B. Jehoschua Die Manis Roman Aus dem Hebräischen von Ruth Achlama C R. Piper GmbH & Co. KG München, Zürich > Abraham B. Jehoschua Die Manis Roman Aus dem Hebräischen von Ruth Achlama C R. Piper GmbH & Co. KG München, Zürich > Abraham B. Jehoschua Die Manis Roman Aus dem Hebräischen von Ruth Achlama C R. Piper GmbH & Co. KG München, Zürich > Abraham B. Jehoschua Die Manis Roman Aus dem Hebräischen von Ruth Achlama C R. Piper GmbH & Co. KG München, Zürich > Abraham B. Jehoschua Die Manis Roman Aus dem Hebräischen von Ruth Achlama C R. Piper GmbH & Co. KG München, Zürich > Abraham B. Jehoschua Die Manis Roman Aus dem Hebräischen von Ruth Achlama C R. Piper GmbH & Co. KG München, Zürich > Abraham B. Jehoschua Die Manis Roman Aus dem Hebräischen von Ruth Achlama C R. Piper GmbH & Co. KG München, Zürich > Abraham B. Jehoschua Die Manis Roman Aus dem Hebräischen von Ruth Achlama C R. Piper GmbH & Co. KG München, Zürich >.'

In [92]:
# Keep only texts with a name density below 15%, remove the helper column
# and renumber the rows from 0.
data = (
    data.loc[data['names'] < 0.15]
    .drop(columns='names')
    .reset_index(drop=True)
)
# the number of articles after removing articles that predominantly consist of names
print(len(data))

551453


In [93]:
# Count the remaining articles per year and print the counts.
counter=collections.Counter(data['year'])
print(counter)

Counter({1999: 26729, 2000: 25808, 1994: 25640, 1995: 25603, 1996: 24676, 2001: 24626, 1997: 24553, 1998: 24504, 2008: 24146, 2010: 22744, 2009: 22521, 2011: 22491, 2005: 22395, 2006: 22082, 2007: 21959, 2004: 21823, 2003: 21781, 2012: 21047, 2002: 20760, 2013: 19232, 2014: 18188, 2016: 17866, 2017: 17497, 2015: 17421, 2018: 15361})


In [94]:
# Write the current state of 'data' to 'sz_prepro_final.csv' (semicolon-separated).
data.to_csv('sz_prepro_final.csv', encoding = 'utf-8-sig', sep = ';')