In [215]:
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import numpy as np
import itertools
import dpa_load
import multiprocessing as mp
from datetime import datetime
from itertools import repeat
from ast import literal_eval
import split_articles
import numeric_articles
import numeric_par
import continue_articles
import identify_ger
import clean_tables
# we import some functions from the Handelsblatt folder
import sys
sys.path.insert(1, os.getcwd().replace('dpa code', 'Handelsblatt'))
import count_words_mp
import correct_url
import clean_dpa_articles
import clean_dpa_references

In [2]:
# Set the number of cores to use
NUM_CORE = mp.cpu_count()-4

# DPA Data (1991 - 2018)

Deutsche PresseAgentur (DPA) is the Germany's biggest news agency which sells its news reports to the leading German newspapers. We believe that the data set has a high chance to be useful for economic forecasting because DPA produces information that is timely and has a large reach.

We purchased DPA data in November 2019. The corpus consists of **7,539,874** articles from January 1991 to December 2018.

The data set includes news from both dpa-Basisdienstes and dpa-afx Wirtschaftsnachrichten. The former one is the basic news service covering such topics as Economy, Politics, and  Finance. The second one was created in 1999. It specializes in financial news.

## Load the data

First, we read in the data by extracting the following XML elements:

* title - article's title
* text - text of the article
* date - publication date
* ressort - section (Politics vs Economy)
* source/credit - source (dpa vs afx)
* city - which city the news article refers to
* genre - journalistic genre, e.g., chronology, story, table
* wortanzahl - word count
* keywords - keywords associated with an article

In [3]:
# Folder with unpacked articles
#path = r'E:\\Userhome\\jbaer\\dpa_unpacked'

#path = r'G:\\Test\\Results\\dpa Raw Data\\dpa_unpacked'
path = os.getcwd().replace('\\newspaper_data_processing\\dpa code', '') + '\\dpa_unpacked'

folder_list = []

# 2 folders for dpa and dpa-afx 
for fol in [fol for fol in os.listdir(path)]:

    # Within each folder: folders for different years
    for f in [f for f in os.listdir(path + '\\' + fol)]:
        folder_list.append(path + '\\' + fol + '\\' + f)

In [4]:
# Select a path to the folder for storing results
#PATH = r'G:\\Test\\Results'
#os.chdir(PATH)

In [5]:
# Use the 'dpa_load' function to load articles
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    df_list = pool.map(dpa_load.dpa_load, folder_list)
    data = pd.concat(df_list)
    data.reset_index(inplace=True, drop=True)
    pool.close()
    pool.join()

print(datetime.now()-startTime)

0:39:34.263517


In [6]:
print(len(data))

7539874


In [7]:
data = data.sort_values(['year', 'month', 'day'], ascending=[True, True, True]) # sort the data in chronological order
data.reset_index(inplace=True, drop=True) # reset the index of the DataFrame

In [8]:
data.head()

Unnamed: 0,texts,file,day,month,year,rubrics,source,keywords,title,city,genre,wordcount,topic,paragraphs
0,Schalck: Milliardenkredit sicherte Zahlungsfäh...,5739189.xml,1,1,1991,pl,dpa,Schalck-Golodkowski,Schalck: Milliardenkredit sicherte Zahlungsfäh...,Berlin,,218,WiPo,[Berlin (dpa) - Nach Darstellung des früheren ...
1,Tschads Regierung: Bevölkerung soll Waffen abl...,5739191.xml,1,1,1991,pl,dpa,Tschad,Tschads Regierung: Bevölkerung soll Waffen abl...,N'Djamena,,75,WiPo,[N'Djamena (dpa) - Die tschadische Regierung h...
2,Welajati: Iran bleibt bei einem Krieg am Golf ...,5739193.xml,1,1,1991,pl,dpa,Golfkrise Iran,Welajati: Iran bleibt bei einem Krieg am Golf ...,Teheran,,90,WiPo,[Teheran (dpa) - Iran wird im Falle eines Krie...
3,Bush will offenbar seinen Außenminister erneut...,5739195.xml,1,1,1991,pl,dpa,Golfkrise USA,Bush will offenbar seinen Außenminister erneut...,Washington,,181,WiPo,[Washington (dpa) - US-Präsident George Bush w...
4,Morgenzusammenfassung Neue Runde diplomatische...,5739199.xml,1,1,1991,pl,dpa,Golfkrise,Morgenzusammenfassung Neue Runde diplomatische...,Washington/Luxemburg,,504,WiPo,[Washington/Luxemburg (dpa) - Zwei Wochen vor ...


In [9]:
#data.to_csv('dpa_raw.csv')

In [10]:
data = pd.read_csv('dpa_raw.csv', encoding = 'utf-8', index_col = 0,  keep_default_na=False,
                   dtype = {'rubrics': 'str', 
                            'source': 'str',
                            'keywords': 'str',
                            'title': 'str',
                            'city': 'str',
                            'genre': 'str',
                            'wordcount': 'str'},
                  converters = {'paragraphs': literal_eval})

In [11]:
data = data.sort_values(['year', 'month', 'day'], ascending=[True, True, True]) # sort the data in chronological order
data.reset_index(inplace=True, drop=True) # reset the index of the DataFrame

In [12]:
data.head()

Unnamed: 0,texts,file,day,month,year,rubrics,source,keywords,title,city,genre,wordcount,topic,paragraphs
0,Schalck: Milliardenkredit sicherte Zahlungsfäh...,5739189.xml,1,1,1991,pl,dpa,Schalck-Golodkowski,Schalck: Milliardenkredit sicherte Zahlungsfäh...,Berlin,,218,WiPo,[Berlin (dpa) - Nach Darstellung des früheren ...
1,Tschads Regierung: Bevölkerung soll Waffen abl...,5739191.xml,1,1,1991,pl,dpa,Tschad,Tschads Regierung: Bevölkerung soll Waffen abl...,N'Djamena,,75,WiPo,[N'Djamena (dpa) - Die tschadische Regierung h...
2,Welajati: Iran bleibt bei einem Krieg am Golf ...,5739193.xml,1,1,1991,pl,dpa,Golfkrise Iran,Welajati: Iran bleibt bei einem Krieg am Golf ...,Teheran,,90,WiPo,[Teheran (dpa) - Iran wird im Falle eines Krie...
3,Bush will offenbar seinen Außenminister erneut...,5739195.xml,1,1,1991,pl,dpa,Golfkrise USA,Bush will offenbar seinen Außenminister erneut...,Washington,,181,WiPo,[Washington (dpa) - US-Präsident George Bush w...
4,Morgenzusammenfassung Neue Runde diplomatische...,5739199.xml,1,1,1991,pl,dpa,Golfkrise,Morgenzusammenfassung Neue Runde diplomatische...,Washington/Luxemburg,,504,WiPo,[Washington/Luxemburg (dpa) - Zwei Wochen vor ...


# Pre-processing

## Light pre-processing

### Remove short articles (<100 words)

Short articles are often incoherent or contain only insiginicant news. For this reason we decided to filter out articles that consist of less than 100 words. 

In [13]:
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    count_results = pool.map(count_words_mp.count_words_mp, [text.replace('PARAGRAPH', ' ') for text in data['texts']]) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

0:03:42.137776


In [14]:
# Save the result as a new column "word_count"
data['word_count'] = count_results

In [15]:
# remove articles with less than 100 words
data = data[data['word_count']>=100]
data.reset_index(inplace=True, drop=True)
print(len(data))

5364634


In [16]:
#data.to_csv('dpa_prepro_step1.csv')

### Remove exact duplicates

A few examples of duplicates in our corpus:
* The same article enters the corpus twice with different publication dates (e.g., 10.1.1991 and 11.1.1991). In this case, a natural solution is to keep the first entry.
* The same article appears twice with a slight variation in the metadata (e.g., the word count is a little different even though the articles are identical, or the keywords differ).
* The same article enters the corpus twice with the same publication date and metadata.
* The same article is published by dpa and dpa-afx (in this case, we keep an article published by dpa).

In [17]:
data = data.sort_values(['year', 'month', 'day', 'topic'], ascending=[True, True, True, True]) # dpa articles come before afx articles
data.reset_index(inplace=True, drop=True) # reset the index of the DataFrame

In [18]:
# All the duplicated articles are saved as 'dpa_duplicates' for further exploration.
dpa_duplicates = data[data['texts'].duplicated(keep = False)]
len(dpa_duplicates)

1595777

In [19]:
dpa_duplicates.head()

Unnamed: 0,texts,file,day,month,year,rubrics,source,keywords,title,city,genre,wordcount,topic,paragraphs,word_count
13,Neues Schema für Investmentkurse Achtung: Vom ...,5739254.xml,1,1,1991,wi,dpa,KURSE DREI A),Neues Schema für Investmentkurse Achtung: Vom ...,,,544,WiPo,[Die folgenden Positionen der bisherigen Tabel...,283
15,NEUES INVESTMENTSCHEMA. 125) FF RESERVE FONDS ...,5739268.xml,1,1,1991,wi,dpa,KURSE DREI B,NEUES INVESTMENTSCHEMA.,,,489,WiPo,[125) FF RESERVE FONDS 126) FMM-FONDS...,258
16,NEUES INVESTMENTSCHEMA. 247) RE-INRENTA 248) R...,5739273.xml,1,1,1991,wi,dpa,KURSE DREI C),NEUES INVESTMENTSCHEMA.,,,430,WiPo,[247) RE-INRENTA 248) RENDITDEK...,262
209,Neues Schema für Investmentkurse Achtung: Vom ...,5740026.xml,2,1,1991,wi,dpa,KURSE DREI A),Neues Schema für Investmentkurse Achtung: Vom ...,,,544,WiPo,[Die folgenden Positionen der bisherigen Tabel...,283
210,NEUES INVESTMENTSCHEMA. 125) FF RESERVE FONDS ...,5740028.xml,2,1,1991,wi,dpa,KURSE DREI B,NEUES INVESTMENTSCHEMA.,,,489,WiPo,[125) FF RESERVE FONDS 126) FMM-FONDS...,258


Some articles are more than once in the corpus. We filter out all duplicates and only keep the articles with the oldest date or the articles published by dpa.

In [20]:
data.drop_duplicates(['texts'], keep = 'first', inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

4542316


In [21]:
#data.to_csv('dpa_prepro_step2.csv')

### Corrections and updates of text news

There are two types of articles that we consider separately: text news corrections and updates.

* In text news corrections (Berichtigung or Berichtigte Neufassung), journalists usually change only a few facts. More rarely, they add or rewrite several paragraphs. In most of the cases, corrected and original news texts are pubished on the same day.

* In the updated texts news (Aktualisierung), journalists add a small amount of text to reflect the latest status. These articles can be published a few days later than the original article. 

#### Corrected news texts

The corrected news texts do not contain much new information, so we treat them as duplicates. However, before deleting these articles, we check if we can always find the original articles in the dataset.

Often the titles of corrected news articles include the titles of the original articles. For example, the title of the corrected article 'Berichtigung: Zahl der betroffenen US-Staaten Clinton will 23,5 Millionen Hektar Nationalforst schützen.' includes information on what was changed (Berichtigung: Zahl der betroffenen US-Staaten) and the title of the original article 'Clinton will 23,5 Millionen Hektar Nationalforst schützen.'.

We use this fact and try to find pairs of original and corrected news reports with the same title. While the titles of corrected news reports may have a different format, being able to find pairs of articles sharing the same title in each time period means that there are no temporal changes in the data.  

Corrected news reports contain 'Berichtigung' (correction) or 'Berichtigte Neufassung' (corrected version) in the title. However, we exclude the articles on rectified shares that include 'Berichtigungsaktien' in the title.

In [22]:
# Corrected news reports in dpa data.
Berichtigung = data[(data.title.str.contains('Berichtigung|Berichtigte Neufassung')) & (~data.title.str.contains('Berichtigungsaktien')) & (data.topic == 'WiPo')].drop(columns =['paragraphs'])

In [23]:
# track time
startTime = datetime.now()
# List with a publication year of the corrected articles
inputs_year = []
# List with a publication month of the corrected articles
inputs_month = []
# List with a publication day of the corrected articles
inputs_day = []
# List with the titles of the corrected articles
inputs_Berichtigung_title = []
# List with the titles of all the articles published on the same day as the corrected article
inputs_titles = []
# In this project we will concentrate on dpa rather than dpa-afx data. Please see explanation below.
dpa_data = data[data.topic=='WiPo']

for ind in Berichtigung.index:
    inputs_year.append(Berichtigung['year'][ind])
    inputs_month.append(Berichtigung['month'][ind])
    inputs_day.append(Berichtigung['day'][ind])
    inputs_Berichtigung_title.append(Berichtigung['title'][ind])
    inputs_titles.append(list(dpa_data[(dpa_data['year'] == Berichtigung['year'][ind]) & (dpa_data['month'] == Berichtigung['month'][ind]) & (dpa_data['day'] == Berichtigung['day'][ind])]["title"]))        
inputs = list(zip(inputs_year, inputs_month, inputs_day, inputs_Berichtigung_title, inputs_titles))
print(datetime.now()-startTime)

0:18:41.092515


We use the function `Berichtigung_pairs` to output the dataframe that contains the titles of the corrected and original articles along with their publication date. This dataframe will help us understand if there are some temporal changes in the data.

In [24]:
startTime = datetime.now()

import Berichtigung_pairs
import pandas as pd

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    Berichtigung_intermediate = pool.map(Berichtigung_pairs.Berichtigung_pairs, inputs)
    Berichtigung_df = pd.concat(Berichtigung_intermediate) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

0:00:18.315837


We can see that this dataframe includes 16982 titles of the corrected and original articles, while the total number of corrected articles is equal to 34221.

In [25]:
Berichtigung_df = Berichtigung_df.reset_index()
print("The number of pairs found = {}".format(len(Berichtigung_df)))
print("The number of corrected articles = {}".format(len(Berichtigung)))
Berichtigung_df['day'] = list(map(int, Berichtigung_df.day))
Berichtigung_df['month'] = list(map(int, Berichtigung_df.month))
Berichtigung_df['year'] = list(map(int, Berichtigung_df.year))
Berichtigung_df.head()

The number of pairs found = 16892
The number of corrected articles = 34221


Unnamed: 0,index,day,month,year,title_Berichtigung,title_main
0,0,7,1,1991,Berichtigte Neufassung dpa 159 - DLW - Bietigh...,DLW legte 1990 zu - besonders Möbelbereich erf...
1,0,22,1,1991,Berichtigte Neufassung - Wiesbaden/1257 Preise...,Preise in den neuen Ländern bei starken Bewegu...
2,0,7,2,1991,Berichtigte Neufassung amnesty - London/0107 )...,amnesty international setzt sich für US-Wehrdi...
3,0,7,2,1991,Berichtigte Neufassung - Bundeshaushalt - Bonn...,Bundeshaushalt 1991 mit 400 Milliarden Mark fa...
4,0,8,4,1991,Berichtigte Neufassung - München/1321 Patentam...,Patentamt erwartet Schub durch ostdeutsche Erf...


To understand why this is the case, we create a dataframe with the number of corrected articles (column 'texts') and identified pairs (column 'title_Berichtigung') per year.

We can see that before 1999, the number of corrected articles is low, which explains why the number of identified original articles is also low. Between 1999 and 2011, the number of corrected articles ranges between between 900 and 2000, and we successfully identify most of the original articles that have the same title as their corrections.

However, from 2012 onwards, the number of original articles that we are able to find decreases significantly. After further exploration, we realised that over this period of time most of the original articles had been removed. Therefore, we decided to delete only those corrected articles that were published before 2012 to avoid the risk of getting rid of unique articles.

In [26]:
df1 = Berichtigung.groupby('year').nunique()['texts'].to_frame().reset_index()
df2 = Berichtigung_df.groupby('year').nunique()['title_Berichtigung'].to_frame().reset_index()
merged_df = df1.merge(df2, how = 'left')
merged_df['title_Berichtigung'] = merged_df['title_Berichtigung'].astype('Int64')
merged_df

Unnamed: 0,year,texts,title_Berichtigung
0,1991,103,16
1,1992,138,22
2,1993,148,20
3,1994,136,26
4,1995,123,36
5,1996,164,45
6,1997,177,56
7,1998,428,182
8,1999,1323,678
9,2000,1245,673


Here we delete the corrected articles published before 2012.

In [27]:
data.drop(data[(data['title'].str.contains('Berichtigung|Berichtigte Neufassung', na = False)) & (~data['title'].str.contains('Berichtigungsaktien', na=False)) & \
               (data.topic == 'WiPo') & (data.year < 2012)].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

4519997


#### Updated news texts

The updated news texts might contain relatively large chunks of new information. Therefore, we do not want to delete them. Instead, we have done the following:

1. If we manage to find an updated news text and its original version and both are published on the same day, we delete the original article as it does not contain the latest information.

2. If the update and the original article are published on different days, we keep both news reports. The original article is important because it was published on the day market participants received the news. At the same time, the updated artilce might contain important new information. If there are several updates, we only keep the latest one as containing the full information. In case the latest update is very close to the original article, it will be removed later by the `fuzzy_duplicates` function.

In [28]:
from aktualisierung import delete_aktualisierung_index
# In this project we will concentrate on dpa rather than dpa-afx data. Please see explanation below.
# We do not consider articles that contain 'Nachrichtenüberblick' in the keywords because these articles 
# contain multiple articles.
dpa_data = data[(data.topic=='WiPo') & (~data.keywords.str.contains('Nachrichtenüberblick'))]
dates = dpa_data.groupby(['year', 'month', 'day'])
dates = list(dates.groups)
dates = np.array_split(dates, NUM_CORE)
chunks = [pd.concat([dpa_data[(dpa_data['year'] == t[0]) & (dpa_data['month'] == t[1]) & (dpa_data['day'] == t[2])] for t in tup]) for tup in dates]
# Use the 'delete_aktualisierung_index' function to get a list with indices of all indermediate updates and 
# original articles published on the same day as the respective updates.
startTime = datetime.now()
if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    aktualisierung_index = pool.map(delete_aktualisierung_index, chunks)
    pool.close()
    pool.join()
print(datetime.now()-startTime)
aktualisierung_index = [idx for l in aktualisierung_index for idx in l]

0:07:27.091103


In [29]:
# Delete indermediate updates and original articles published on the same day as the respective updates based on indices.
data.drop(aktualisierung_index, inplace = True)
data = data.sort_values(['year', 'month', 'day'], ascending=[True, True, True]) # sort the data in chronological order
data.reset_index(inplace=True, drop=True)
print(len(data))

4518993


## Duplicates specific to article types

After exploring corrected and updated news texts, we realised that there are other types of articles that suffer from the problem of duplication. These are summaries (Zusammenfassung, Gesamtzusammenfassung, Morgenfassung), overviews (Überblick), repeated articles (Wiederholung), and advance notifications (Vorausmeldung).

* __Summaries (Zusammenfassung, Gesamtzusammenfassung, Morgenfassung)__

Often we may see two versions of the same (or almost the same) article on the same day: the first is the original article and the second is a summary of that article. Many summaries are actually similar to updated articles because they consist of the original article and a new piece of text. However, in some cases, the summary is a condensed version of the original article including only the most important information. In any case, the content of the summary and the original article are very similar or identical. Therefore, if we can identify an original article and a summary published on the same day and with the same title, we consider the shorter article as a duplicate. In our corpus, we have 49,773 duplicates of this type.

* __Overviews (Überblick)__

In most cases, overviews are stand-alone articles containing all the information about a particular event available on a particular day. Sometimes, however, journalists write an article about a certain event in the morning, and then publish an overview in the evening, which is an extended version of the original article describing the latest developments. These overivews are very similar to updated news texts. We find 4,675 overviews with the same title as the original articles and published on the same day. We remove the short article from the overview and the original article.

* __Repeated articles (Wiederholung)__

We found 1,482 repeated articles with the same title as the original articles and published on the same day. These articles are either identical to the original articles or contain a small correction ('Berichtigte Wiederholung'). We remove the short article from the repeated article and the original article.

* __Advance notifications (Vorausmeldung)__

Advance notifications are news articles about events that will take place in the future. We found 606 advance notifications that have the same title as other articles published on the same day. In these cases, the advance notifications are a shorter version of the original articles. If we can identify an original article and an advance notification published on the same day and with the same title, we consider the shorter article as a duplicate.

First, we use the function `duplicates_pairs` to output the data frame with the titles of duplicates and original articles along with their publication day.

In [30]:
# Select article types that potentially suffer from the duplication problem. These are summaries (Zusammenfassung, 
# Gesamtzusammenfassung, Morgenfassung), overviews (Überblick), repeated articles (Wiederholung), and advance notifications
# (Vorausmeldung).

# In this project we will concentrate on dpa rather than dpa-afx data. Please see explanation below.

# We do not consider articles that contain 'dpa-Vorausmeldungen kompakt' in the title or 'Nachrichtenüberblick' in the 
# keywords because these articles contain multiple articles.

dup_types = data[(data.title.str.contains('Zusammenfassung|zusammenfassung|Überblick|Wiederholung|Vorausmeldung')) & (data.topic == 'WiPo') & \
                 (~data.title.str.contains('dpa-Vorausmeldungen kompakt')) & (~data.keywords.str.contains('Nachrichtenüberblick'))]
len(dup_types)

# Dates on which at least one summary, overview, repeated article, or advance notification is published.
dates = dup_types.groupby(['year', 'month', 'day'])
dates = list(dates.groups)
# Exclude dpa-afx data and articles that contain 'Nachrichtenüberblick' in the keywords.
dpa_data = data[(data.topic=='WiPo') & (~data.keywords.str.contains('Nachrichtenüberblick'))]
# Create a list of data frames, where each data frame contains all the articles published on the same day as one of 
# the summaries, overviews, repeated articles, or advance notifications.
day_df = [dpa_data[(dpa_data['year'] == t[0]) & (dpa_data['month'] == t[1]) & (dpa_data['day'] == t[2])] for t in dates]

In [31]:
startTime = datetime.now()
# Use the function duplicate_pairs to return the dataframe with the titles of duplicates and original articles 
# along with their publication date.
import duplicates_pairs

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    duplicates_intermediate = pool.map(duplicates_pairs.duplicates_pairs, day_df)
    duplicates_df = pd.concat(duplicates_intermediate) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

0:03:49.153319


Out of 705,975 summaries, overviews, repeated articles, and advance notifications, we find 56,535 articles that we consider to be duplicates.

In [32]:
duplicates_df = duplicates_df.reset_index()
print("The number of pairs found = {}".format(len(duplicates_df)))
print("The number of summaries, overviews, repeated articles, or advance notifications = {}".format(len(dup_types)))
duplicates_df['day'] = list(map(int, duplicates_df.day))
duplicates_df['month'] = list(map(int, duplicates_df.month))
duplicates_df['year'] = list(map(int, duplicates_df.year))
duplicates_df.head()

The number of pairs found = 56536
The number of summaries, overviews, repeated articles, or advance notifications = 705975


Unnamed: 0,index,day,month,year,title_duplicate,title_original
0,0,2,1,1991,Zusammenfassung Ernst von Siemens gestorben.,Ernst von Siemens gestorben.
1,0,5,1,1991,Zusammenfassung Drei Polizisten in Bologna ers...,Drei Polizisten in Bologna erschossen.
2,0,9,1,1991,Morgenzusammenfassung Baker und Asis in Genf -...,Bush fordert vom Kongreß Zustimmung zu Militär...
3,0,10,1,1991,Vorausmeldung Gesamtberliner Parlament konnsti...,Gesamtberliner Parlament konnstituiert sich.
4,0,14,1,1991,Zweite Zusammenfassung Gorbatschow rechtfertig...,Gorbatschow rechtfertigt Armee-Einsatz in Lita...


We use the function `delete_duplicates` to output indices of the duplicate articles. We then delete the articles with these indices.

In [33]:
startTime = datetime.now()
# Use the function delete_duplicates to output indices of the duplicate articles.
import delete_duplicates

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    duplicates_index = pool.map(delete_duplicates.delete_duplicates, day_df)
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)
duplicates_index = [idx for l in duplicates_index for idx in l]

0:03:48.450534


In [34]:
# Delete duplicate articles.
data.drop(duplicates_index, inplace = True)
data = data.sort_values(['year', 'month', 'day'], ascending=[True, True, True]) # sort the data in chronological order
data.reset_index(inplace=True, drop=True)
print(len(data))

4461990


In [35]:
#data.to_csv('dpa_prepro_step3.csv')

## Filtering

Unfortunately, dpa articles are not as consistently sorted into sections and subsections as articles from other 
news media. Instead, we investigate the most commonly used titles and keywords and remove irrelevant articles based on them.

We exclude irrelevant articles based on the following titles

* 1) Londoner Edelmetallpreise: Precious metal prices (without text)
* 2) Tageskalender: List of upcoming events
* 3) (Tabelle), TABELLE: : Tables in text form
* 4) SPORT, Sport (except for the titles that contain TRANSPORT, INTERSPORT, PASSPORT, Sportartikel, news about sportswear manufacturers, sporting goods industry, SPORTARTIKLER, Sportmodelle, or Sportswear): news related to Sports. Beware that news about sports marketing agencies (e.g., Sportfive), sports media websites (e.g., Sportal), and new sports models of car manufacturers might be removed as well. 
* 5) KORREKTUR: Article corrections
* 6) Impressum: Dpa contact data
* 7) Testmeldung: Test-articles from dpa
* 8) Kurse A, Kurse B, Kurse C, Kurse D, KURSE A, KURSE B, KURSE C, KURSE DREI, KURSE Drei, Kurse drei: Stock charts without text
* 9) DGAP-DD: DGAP reports
* 10) New Yorker Aktien-Schlußkurse: Stock closing prices at the New York stock exchange (articles only occur from 1997 to 2002)
* 11) VERMISCHTES: Miscellaneous with no relation to economics
* 12) Angekündigte US-Quartalszahlen auf einen Blick: Quarterly US figures
* 13) Terminvorschau: Appointment preview
* 14) Aus der Landespolitik: News reports on regional politics. We think they are unlikely to be important for our research question. 
* 15) Die Woche in Berlin, Die Woche in Bonn, Die politische Woche: Announcements of political (and economic) news for next week. The format is very different from other news reports and difficult for the models we use.
* 16) Notizen aus der Politik, NOTIZEN AUS DER POLITIK: Collection of varios short political articles which contentwise  do not seem to be relevant for our research question and are covered only within the limited time period, 2002-2011.
* 17) Presseschau (except for the titles that contain 'zu'): Press reviews that include only headlines of important news reports from various media outlets. We can not analyze the full texts and headlines together because different types of data require different models. Articles with headlines containing both 'Presseschau' and 'zu' are actual press reviews (not just headlines) of a single newspaper on a single topic.
* 18) ÜBERBLICK: Analysten-Umstufungen: Changes in stock ratings (different article format).
* 19) Chronologie, CHRONOLOGIE: Chronology - important dates and events. We remove these articles because for our research question backward-looking articles are arguably not that important.
* 20) dpa-Landesdienst: Regional news unlikely to be important for our research question.
* 21) Die Top-Themen am Aktienmarkt: Short news articles about stock market. They do contain relevant information, but they also have a very different format and they were first issued in 2011. 
* 22) Aktien Asien Schluss.: Quantitative information about the stock market.
* 23) Börsentag auf einen Blick: Articles about the stock market, different format, a lot of quantitative information.
* 24) US-Quartalszahlen vom Vortag: Quarterly stock market figures, tables.
* 25) Achtung - Sonderdisposition: A schedule of the upcoming news articles on a particular topic.
* 26) dpa-Vorausmeldungen kompakt: Multiple short news articles on the upcoming events. The length of each of the articles usually does not exceed 100 words.
* 27) Aktien-Schlußkurse: Stock market prices, quantitative information.
* 28) Tabelle Kurse: Stock market figures, tables.
* 29) Tabelle Die Gewinner und Verlierer: Stock market figures, tables.
* 30) Auf und ab im EWS: changes in exchange rates due to introduction of European Monerary System, quantitative information.
* 31) Der Verfassungsschutzbericht in Zahlen: crime statistics, quantitative information.
* 32) Die Toten des Jahres: list of celebrity deaths this year.
* 33) Spendenkonten für Jugoslawien-Hilfe: list of donation accounts for Yugoslavia aid.
* 34) Kontonummern für die Ruanda-Hilfe: list of donation accounts for Rwanda aid.
* 35) Extra Zahlreiche Hilfsorganisationen: list of donation accounts for Kosovo victims.
* 36) dpa-Schwerpunkt Spendenkonten: list of donation accounts for East Africa aid.
* 37) Tabelle Weltbörsen und Finanzmärkte auf einen Blick: quantitative information on stock market.
* 38) Interbanken-Kurse und Metallnotierungen: quantitative news articles on interbank exchange rates and metal prices.
* 39) Hintergrund: background articles are not news in themselves, but rather they provide background information on the topic. These articles are often outdated or irrelevant for short-term economic analysis. They might also contain a high share of quantitative information (e.g., detailed labour market statistics, election results in numbers).
* 40) Chronik eins/zwei/drei/vier/fünf/sechs/sieben/acht/neun/zehn/elf/zwölf: backward-looking articles.
* 41) Liste eins/zwei/drei/vier/fünf, Subventionsliste eins/zwei: long lists with a lot of background information on the topic, e.g. lists of members of the 16th Bundestag.

In [36]:
# Filter out non-economic articles based on titles.
fil_titles = '''Londoner Edelmetallpreise|Tageskalender|\(Tabelle\)|SPORT|Sport|KORREKTUR|Impressum|Testmeldung|Kurse A[^a-z]|Kurse B[^a-z]|Kurse C[^a-z]|Kurse D[^a-z]|KURSE A|KURSE B|KURSE C|KURSE DREI|Kurse drei|KURSE Drei|Kurse/drei|DGAP-DD|New Yorker Aktien-Schlusskurse|VERMISCHTES|Angekündigte US-Quartalszahlen|Terminvorschau|Aus der Landespolitik|Die Woche in Berlin|Die Woche in Bonn|Die politische Woche|Notizen aus der Politik|NOTIZEN AUS DER POLITIK|ÜBERBLICK: Analysten-Umstufungen|TABELLE:|Chronologie|CHRONOLOGIE|dpa-Landesdienst|Die Top-Themen am Aktienmarkt|Aktien Asien Schluss\.|Börsentag auf einen Blick|US-Quartalszahlen vom Vortag|Achtung - Sonderdisposition|dpa-Vorausmeldungen kompakt|Aktien-Schlußkurse|Tabelle Kurse|Tabelle Die Gewinner und Verlierer|Auf und ab im EWS|Der Verfassungsschutzbericht in Zahlen|Die Toten des Jahres|Spendenkonten für Jugoslawien-Hilfe|Kontonummern für die Ruanda-Hilfe|Extra Zahlreiche Hilfsorganisationen|dpa-Schwerpunkt Spendenkonten|Tabelle Weltbörsen und Finanzmärkte auf einen Blick|Interbanken-Kurse und Metallnotierungen|Hintergrund|Chronik eins|Chronik zwei|Chronik drei|Chronik vier|Chronik fünf|Chronik sechs|Chronik sieben|Chronik acht|Chronik neun|Chronik zehn|Chronik elf|Chronik zwölf|Liste eins|Liste zwei|Liste drei|Liste vier|Liste fünf|Subventionsliste eins|Subventionsliste zwei'''
titles_exc = '''TRANSPORT|INTERSPORT|PASSPORT|Sportartikel|SPORTARTIKLER|Sportmodelle|Sportswear'''
data.drop(data[(data['title'].str.contains(fil_titles, na = False)) & (~data['title'].str.contains(titles_exc, na=False))].index, inplace=True)
# Filter out press reviews that consist of headlines only
data.drop(data[(data['title'].str.contains('Presseschau', na = False)) & (~data['title'].str.contains('zu', na=False))].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

4028344


We exclude articles based on a title and a text:

* 1) A title contains 'Überblick: ANALYSTEN-EINSTUFUNGEN', and a text contains 'Folgende Investmentbanken haben sich': Changes in stock ratings (different article format).

In [37]:
# Filter out articles based on a title and a text.
data.drop(data[(data.title.str.contains('Überblick: ANALYSTEN-EINSTUFUNGEN', na = False)) & (data.texts.str.contains('Folgende Investmentbanken haben sich', na=False))].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

4025351


* 2) A title is 'Achtung.', and a text contains 'Zusammenfassung': A schedule of the upcoming news articles on a particular topic.

In [38]:
data.drop(data[(data.title =='Achtung.') & (data.texts.str.contains('Zusammenfassung', na=False))].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

4025325


We exclude articles based on a title and keywords:

* 1) A title contains 'Dispositionen', and keywords contain 'Wahl' or 'wahl': quantitative information about elections (different article format).

In [39]:
# Filter out articles based on a title and keywords.
data.drop(data[(data.title.str.contains('Dispositionen', na = False)) & (data.keywords.str.contains('Wahl|wahl', na=False))].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

4024900


* 2) A title contains 'Achtung', and keywords consist of one word 'Kurse': quantitative information about prices (different article format).

In [40]:
# Filter out articles based on a title and keywords.
data.drop(data[(data.title.str.contains('Achtung', na = False)) & (data.keywords == 'Kurse')].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

4024889


* 3) A title consists of one word 'Achtung.', and keywords contain 'Börsen': internal information about changes in reporting (different article format).

In [41]:
# Filter out articles based on a title and keywords.
data.drop(data[(data.title == 'Achtung.') & (data.keywords.str.contains('Börsen', na = False))].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

4024865


We exclude irrelevant articles based on the following sections.

* 1) Tabelle: Tables in text form (some articles are still left after the previos step)
* 2) Historisches: News about historical events
* 3) Achtung: Announcemt of upcoming news

In [42]:
# Filter out non-economic articles based on sections.
fil_genres = '''Tabelle|Historisches|Achtung'''
data.drop(data[data['genre'].str.contains(fil_genres, na = False)].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

4004947


We exclude non-economic articles based on the following keywords.
* 1) Redaktionshinweis: Editor's notes for Dpa journalists
* 2) DGAP: DGAP reports
* 3) Sport, SPORT, SPO (except for Sportartikel, this section contains articles on sports companies): Sport news (some sports articles are still left after the previos steps)
* 4) Kurse A, Kurse B, Kurse C, Kurse D, KURSE A, KURSE B, KURSE C, Kurse D,
     KURSE DREI, Kurse drei, KURSE Drei, KURSE drei, Kurse Drei, Kurse/drei: Stock charts without text (some articles are  still left after the previos steps)
* 5) Tagesvorschau, Vorschau, VORSCHAU, vorschau: List of titles of upcoming news
* 6) Bilderdienst: Dpa Picture Service
* 7) Geschichte: News related to historical events
* 8) Landespolitik: Regional news irrelevant for economic forecasting
* 9) dpa-Morgenlage: News about what happened last night. These articles have a special format: quick and very short reports.
* 10) Börsen Aktienkurse Währung: Quantitative information about stock market
* 11) Rechtschreibung: News about German Orthography

In [43]:
# Filter out non-economic articles based on keywords.
fil_keywords = '''Redaktionshinweis|DGAP|Sport|SPORT|SPO|Kurse A|Kurse B|Kurse C|Kurse D|KURSE A|KURSE B|KURSE C|KURSE DREI|Kurse drei|KURSE Drei|KURSE drei|Kurse Drei|Kurse/drei|Kurse D|Tagesvorschau|Vorschau|vorschau|VORSCHAU|Bilderdienst|Geschichte|Landespolitik|dpa-Morgenlage|Börsen Aktienkurse Währung|Rechtschreibung'''
keywords_exc = '''Sportartikel'''
data.drop(data[(data['keywords'].str.contains(fil_keywords, na = False)) & (~data['keywords'].str.contains(keywords_exc, na=False))].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

3764930


We exclude articles based on keywords and text:

* 1) Keywords contain 'Wahlen', and a text contains 'Abkürzungen:': quantitative information about elections (different article format).

In [44]:
# Filter out articles based on keywords and text.
data.drop(data[(data.keywords.str.contains('Wahlen', na = False)) & (data.texts.str.contains('Abkürzungen\:', na = False))].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

3764737


We exclude articles based on the following bits of text.
* 1) Schalterverkaufskurse: Precios metal prices
* 2) dpa-news.de: News regarding the Dpa website
* 3) Wirtschafts- und Finanztermine, Wirtschafts- und Finanz-Termine, Konjunktur- und Wirtschaftstermine: List of dates when economic data will be published/economic events will take place
* 4) DGAP (except for articles that contain DGAP standing for Deutsche Gesellschaft für Auswärtige Politik): DGAP reports
* 5) Bitte verwenden Sie diese Meldung nicht: Retracted articles
* 6) [§] 26 Abs., § 15a WpHG 1, § 15 WpHG, Artikel 19 MAR, article 19 Market Abuse Regulation (MAR): Regulatory news
* 7) Die Pivotpunkte für den Dax-Future: Pivot points for the Dax-Future
* 8) An der Frankfurter Wertpapierbörse wurden, Die Aktien im Dow Jones EuroStoxx 50, Die Aktien im Dow Jones Euro Stoxx 50: Stock charts
* 9) Ihr Ansprechpartner: Redaktion Politik International: List of current political news headlines
* 10) (Achtung - Sonderdisposition): A schedule of the upcoming news articles on a particular topic
* 11) Redaktionstechnik Augsburger Allgemeine: most of the article contains contact information for Augsburger Allgemeine

In [45]:
fill_text = '''Schalterverkaufskurse:|dpa-news\.de|Wirtschafts- und Finanztermine|Wirtschafts- und Finanz-Termine|DGAP|Bitte verwenden Sie diese Meldung nicht|Konjunktur- und Wirtschaftstermine|[§] 26 Abs\. 1|§ 15a WpHG|§ 15 WpHG|Artikel 19 MAR|article 19 Market Abuse Regulation \(MAR\)|Die Pivotpunkte für den Dax-Future|An der Frankfurter Wertpapierbörse wurden|Die Aktien im Dow Jones EuroStoxx 50|Die Aktien im Dow Jones Euro Stoxx 50|\(Achtung - Sonderdisposition\)|Redaktionstechnik Augsburger Allgemeine'''
text_exc = '''Auswärtige Politik'''
data.drop(data[(data['texts'].str.contains(fill_text, na = False)) & (~data['texts'].str.contains(text_exc, na=False))].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

3633513


In addition, we exclude articles containing both 'Ihr Ansprechpartner\:' and '\(.+[0-9] Zl\)' in the text. They include a schedule of the upcoming news articles on a particular topic.

In [46]:
data.drop(data[(data['texts'].str.contains('Ihr Ansprechpartner\:', na = False)) & (data['texts'].str.contains('\(.+[0-9] Zl\)', na=False))].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

3633285


We want to exclude irrelevant articles based on the following two sources.
* 1) dpa-frei: Article corrections
* 2) dpa-wahl: Articles about federal election results

In [47]:
data.drop(data[data['source'].str.contains('dpa-frei|dpa-wahl', na = False)].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

3633261


We exclude articles regarding dpa itself.

In [48]:
data.drop(data[data['city'] == 'Die Deutsche Presse-Agentur'].index, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

3633258


We keep articles on two subjects: Politics ('wi') and Economy ('wi'). Articles with the 'rs' value of the column 'rubrics' are removed, because 'rs' stands for editorial management. 

In [49]:
data = data[(data['rubrics']==u'wi') | (data['rubrics']==u'pl')]
data.reset_index(inplace=True, drop=True)
print(len(data))

3632165


Remove four articles published in 2019. 

In [50]:
data = data[data['year']<2019]
data.reset_index(inplace=True, drop=True)
print(len(data))

3632161


In [51]:
#data.to_csv('dpa_prepro_step4.csv')

### Split up articles

Sometimes multiple articles are collected and merged into one entry. For example, articles with the title, keyword, or genre 
'Nachrichtenüberblick' are a collection of the most important articles of the day. Because these smaller articles
can have different sentiments and topics, we separate articles that consist of multiple smaller articles. Articles consisting of multiple smaller articles can be identified with the following words which can appear in titles, keywords, or genres.

- dpa-Nachrichtenüberblick, Nachrichtenüberblick: An overview of news for the upcomming days or news which are a few days old
- Kurznachrichten Wirtschaft: Collection of short economic news
- Analysten-Einstufungen, ANALYSTEN-EINSTUFUNGEN: Analyst stock ratings

We delete one of the articles containing 'Nachrichtenüberblick' in the title because it has too many errors that make it difficult to split the article into paragraphs.

In [52]:
# The article to delete
file_drop = ['8180362.xml']
ind = data[data.file == '8180362.xml']['texts'].index[0]
data[data.file == '8180362.xml']['texts'][ind]

' PARAGRAPH     Rentenreform verabschiedet\n  PARAGRAPH     BERLIN - Der Weg für die Förderung der privaten Altersvorsorge\nmit knapp 21 Mrd. DM ist frei.  e K\n B h e i r t de. Auch das rot-rot-regierte Mecklenburg-Vorpommern\nstimmte zu. Die SPD-CDU-Koalition von Bremen enthielt sich. In\nMecklenburg-Vorpommern droht nach der überraschenden Zustimmung von\nMinisterpräsident Ringstorff eine Koalitionskrise. Nach PDS-Ansicht\nhat die SPD den Koalitionsvertrag gebrochen und damit den\nRegierungspartner herausgefordert.\n  PARAGRAPH    Mieter erhalten mehr Rechte\n  PARAGRAPH    BERLIN - Die mehr als 20 Mio. Mieter in Deutschland erhalten von\nSeptember an mehr Rechte. Die Grenze für Mieterhöhungen wird gesenkt\nund die Kündigungsfristen werden zu Gunsten der Mieter geändert. Das\nsieht die Mietrechtsreform der Bundesregierung vor, die am Freitag im\nBundesrat die letzte parlamentarische Hürde nahm. Bisher betrugen die\nKündigungsfristen für Mieter un d Vermieter gleichermaßen maximal\nz

In [53]:
data.drop(data[data['file'].isin(file_drop)].index, inplace=True)

While testing the code, we have decided to delete a quantitative part of the following two articles:

In [54]:
ind1 = data[data.file == '8833807.xml']['texts'].index[0]
data[data.file == '8833807.xml']['texts'][ind1]



In [55]:
ind2 = data[data.file == '8833809.xml']['texts'].index[0]
data[data.file == '8833809.xml']['texts'][ind2]



In [56]:
data.at[ind1, 'texts'] = data['texts'][ind1].split(' Hier die Eckwerte')[0]
data.at[ind2, 'texts'] = data['texts'][ind2].split('\n\nDie Übersicht')[0]

In [57]:
s_mult_art = '''dpa-Nachrichtenüberblick|Nachrichtenüberblick|Kurznachrichten Wirtschaft|Analysten-Einstufungen|ANALYSTEN-EINSTUFUNGEN'''
mult_art = data[data['title'].str.contains(s_mult_art, na = False)]
mult_art = mult_art.append(data[data['keywords'].str.contains(s_mult_art, na = False)])
mult_art = mult_art.append(data[data['genre'].str.contains(s_mult_art, na = False)])
mult_art.drop_duplicates(['texts'], keep = 'first', inplace=True)
mult_art.reset_index(inplace=True, drop=True)

  mult_art = mult_art.append(data[data['keywords'].str.contains(s_mult_art, na = False)])
  mult_art = mult_art.append(data[data['genre'].str.contains(s_mult_art, na = False)])


In [58]:
# delete 'mult_art' from the original data
data.drop(data[data['title'].str.contains(s_mult_art, na = False)].index, inplace=True)
data.drop(data[data['keywords'].str.contains(s_mult_art, na = False)].index, inplace=True)
data.drop(data[data['genre'].str.contains(s_mult_art, na = False)].index, inplace=True)
data.reset_index(inplace=True, drop=True)

In [59]:
# calculate chunck size 
chunk_size = int(mult_art.shape[0]/NUM_CORE)

# split data into chunks 
chunks = [mult_art.iloc[mult_art.index[i:i + chunk_size]] for 
          i in range(0, mult_art.shape[0], chunk_size)]

In [60]:
# split up articles into smaller articles and append the resulting new articles 
# to the corpus
from datetime import datetime
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    results = pool.map(split_articles.split_articles, chunks) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

results = pd.concat(results)
print(len(results))
results.reset_index(inplace=True, drop=True) # reset the index of the DataFrame

0:01:45.814573
899420


The separated articles consist of fewer words than the articles from which they originally stemmed. Therefore, we count the number of words of the new articles with the count_words_mp function from before and filter out articles with less than 100 words.

In [61]:
# count the number of words for the separated articles and filter out articles with less
# than 100 words
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    count_results = pool.map(count_words_mp.count_words_mp, [text for text in results['texts']]) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

0:00:08.202487


In [62]:
results['word_count'] = count_results
results = results[results['word_count']>=100]

In [63]:
# append separated articles to corpus
data = data.append(results)
data = data.sort_values(['year', 'month', 'day'], ascending=[True, True, True]) # sort the data in chronological order
data.reset_index(inplace=True, drop=True) # reset the index of the DataFrame
print(len(data))

  data = data.append(results)


3615409


In [64]:
#data.to_csv('dpa_prepro_step5.csv')

### dpa-Basisdienstes and dpa-afx Wirtschaftsnachrichten

Up to this point, we have been cleaning data from dpa-Basisdienstes (hereafter dpa) and dpa-afx Wirtschaftsnachrichten (hereafter afx) together. This made us realise that afx data has a completely different format and, while useful for forecasting, in our view should be analyzed separately from dpa data. Here we explain why.

The dpa news articles are very similar to the news we read in other newspapers, and they cover similar topics, namely major political and economic events.

The afx news, on the other hand, is aimed at investors and traders rather than the general public, and closely follows stock market developments, companies' financial results, and the state of key industries.

To make it clear what we mean, here are a few articles on topics that are frequently discussed in afx data.

1. The first two articles discuss stock market developments.
2. Articles three and four represent Ad hoc messages (information reported by companies that might influence the price of their securitites).
3. Articles five and six give recommendations to traders whether they should sell, buy, or hold shares.
4. The last two articles talk about financial results and plans of companies.

In [65]:
Stock_market = data[(data.texts.str.contains('^Aktien')) & (data.topic == 'afx')]
Ad_hoc = data[(data.texts.str.contains('Ad hoc-Service')) & (data.topic == 'afx')]
Analysis = data[(data.texts.str.contains('^AKTIE IM FOKUS')) & (data.topic == 'afx')]
Companies = data[(data.texts.str.contains('ROUNDUP')) & (data.topic == 'afx')]

print('---- Articles on the stock market ----')
print('\n')
print(Stock_market.reset_index()['texts'][0][:1000])
print('\n')
print(Stock_market.reset_index()['texts'][100][:1000])
print('\n')
print('---- Ad hoc messages ----')
print('\n')
print(Ad_hoc.reset_index()['texts'][0][:1000])
print('\n')
print(Ad_hoc.reset_index()['texts'][1][:1000])
print('\n')
print('---- Stock market analysis, recommendations for traders ----')
print('\n')
print(Analysis.reset_index()['texts'][0][:1000])
print('\n')
print(Analysis.reset_index()['texts'][1][:1000])
print('\n')
print('---- News on companies ----')
print('\n')
print(Companies.reset_index()['texts'][0][:1000])
print('\n')
print(Companies.reset_index()['texts'][10][:1000])
print('\n')

---- Articles on the stock market ----


Aktien Neuer Markt: Kurse abgebröckelt - Etwas Erleichterung durch US -Daten. FRANKFURT (dpa-AFX) - Die deutschen Wachstumsaktien am Neuen Markt sind nach einer freundlichen Eröffnung stetig abgebröckelt. Im Vorfeld der mit Spannung erwarteten US-Konjunkturzahlen am Nachmittag (ab 14:30) wurden neue Tagestiefs markiert, bevor die Daten «in line» für etwas Erleichterung sorgten. Der Nemax-All-Share-Index <NMDK.ETR> lag zuletzt um 1,08% unter dem Vortagesniveau bei 3.231,70/-35,13 Punkten, während der Nemax50-Index <NMKX.ETR> 1,23% auf 4.068,60/-50,94 Punkte verlor. Vor den Zahlen saßen «alle wie das Kaninchen vor der Schlange und warteten», erklärte ein Händler aus Düsseldorf. Offenbar rechneten einige Marktteilnehmer mit negativen Reaktionen der Märkte und gaben Stücke bei sehr geringer Handelstätigkeit aus der Hand. Dies habe den Gesamtmarkt im frühen Verlauf unter Druck gesetzt. Gleich nach den ersten Zahlen um 14:30 Uhr, bei denen die US-Verb

Overall, we can conclude that these types of articles are of interest to financial market participants, but may be too detailed for the general public.

A few exceptions, which we also consider, are afx articles on business cycle (Konjunktur) and Politics.

* Articles on business cycle might be very important for economic forecasting. We therefore considered including news reports with the keyword 'Konjunktur' (business cycle) published by afx. However, we encountered two problems.

* First, articles with the keyword 'Konjunktur' aimed at the general public often enter our database twice because they are published by both dpa and afx. Articles published exclusively by afx are often more quantitative and seem to target financial market participants. 

* Second, while the proportion of dpa articles with the keyword 'Konjunktur' is relatively stable and ranges from 0.5% to 4% (see 'dpa' column in the dataframe below), the proportion of 'Konjunktur' articles in afx data increases significantly from 3% in 2000 to 17% in 2018 (see 'afx' column). We believe this is due to the fact that in the afx data, the keyword 'Konjunktur' was initially used only in the articles with a lot of quantitative information, and then began to be used in a much wider range of articles. In any case, if we included all dpa articles and afx articles with the keyword 'Konjunktur', we would see that the share of articles with the keyword 'Konjunktur' increases over the years (see 'All' column), most likely due to structural changes in the database rather than economic developments. Therefore, we decided to exclude afx articles with the keyword 'Konjunktur' from further analysis.

* We also exclude afx articles with the keyword 'Politik', because in most cases dpa publishes the same articles. Thus, these articles can be considered as duplicates.

In [66]:
# The proportion of articles with the keyword 'Konjunktur' in the whole dataset.
Konjunktur_all_series = data[(data.keywords.str.contains('Konjunktur'))].drop(columns =['paragraphs']).groupby('year').nunique()['texts']/data[(data.topic == 'WiPo') | ((data.topic == 'afx') & (data.keywords.str.contains('Konjunktur')))].drop(columns =['paragraphs']).groupby('year').nunique()['texts']
Konjunktur_all_df = Konjunktur_all_series.to_frame().reset_index()
Konjunktur_all_df = Konjunktur_all_df.rename(columns = {"texts": "All"})
# The proportion of articles with the keyword 'Konjunktur' in dpa dataset.
Konjunktur_dpa_series = data[(data.keywords.str.contains('Konjunktur')) & (data.topic == 'WiPo')].drop(columns =['paragraphs']).groupby('year').nunique()['texts']/data[(data.topic == 'WiPo')].drop(columns =['paragraphs']).groupby('year').nunique()['texts']
Konjunktur_dpa_df = Konjunktur_dpa_series.to_frame().reset_index()
Konjunktur_dpa_df = Konjunktur_dpa_df.rename(columns = {"texts": "dpa"})
# The proportion of articles with the keyword 'Konjunktur' in afx dataset.
Konjunktur_afx_series = data[(data.keywords.str.contains('Konjunktur')) & (data.topic == 'afx')].drop(columns =['paragraphs']).groupby('year').nunique()['texts']/data[(data.topic == 'afx')].drop(columns =['paragraphs']).groupby('year').nunique()['texts']
Konjunktur_afx_df = Konjunktur_afx_series.to_frame().reset_index()
Konjunktur_afx_df = Konjunktur_afx_df.rename(columns = {"texts": "afx"})
# The dataframe with the above mentioned datafames merged together.
merged_df = Konjunktur_all_df.merge(Konjunktur_dpa_df, how = 'left', on = 'year')
merged_df = merged_df.merge(Konjunktur_afx_df, how = 'left', on = 'year')
merged_df

Unnamed: 0,year,All,dpa,afx
0,1991,0.005356,0.005356,
1,1992,0.006792,0.006792,
2,1993,0.008649,0.008649,
3,1994,0.007132,0.007132,
4,1995,0.006451,0.006451,
5,1996,0.007627,0.007627,
6,1997,0.007536,0.007536,
7,1998,0.010032,0.010032,
8,1999,0.00764,0.00764,
9,2000,0.019581,0.008571,0.026148


All in all, we only keep articles published by dpa.

In [67]:
data = data[data.topic == 'WiPo']
data = data.sort_values(['year', 'month', 'day'], ascending=[True, True, True]) # sort the data in chronological order
data.reset_index(inplace=True, drop=True) # reset the index of the DataFrame
print(len(data))

2211719


In [68]:
data.to_csv('dpa_prepro_step6.csv')

In [69]:
data = pd.read_csv('dpa_prepro_step6.csv', encoding = 'utf-8', index_col = 0,  keep_default_na=False,
                   dtype = {'rubrics': 'str', 
                            'source': 'str',
                            'keywords': 'str',
                            'title': 'str',
                            'city': 'str',
                            'genre': 'str',
                            'wordcount': 'str'},
                  converters = {'paragraphs': literal_eval})

### Identify and Delete non-German Articles

Next, we filter out articles written in any language other than German using a **langdetect** library.

We remove the following types of articles:

1) articles about American elections

In [70]:
data[data.file == '6385957.xml'].iloc[0]['texts']

'Vorläufige Verteilung der 538 Wahlmänner und -frauen: Washington (dpa) - Die vorläufige Verteilung der 538 Wahlmänner und -frauen: Staat Clinton Bush 1988: Alabama 9 Bush Alaska 3 Bush Arizona 8 Bush Arkansas 6 Bush California 54 Bush Colorado 8 Bush Connecticut 8 Bush Delaware 3 Bush Distric of Columbia 3 Dukakis Florida 25 Bush Georgia noch nicht entschieden (13) Bush Hawaii 4 Dukakis Idaho 4 Bush Illinois 22 Bush Indiana 12 Bush Iowa 7 Dukakis Kansas 6 Bush Kentucky 8 Bush Louisiana 9 Bush Maine 4 Bush Maryland 10 Bush Massachusetts 12 Dukakis Michigan 18 Bush Minnesota 10 Dukakis Mississippi 7 Bush Missouri 11 Bush Montana 3 Bush Nebraska 5 Bush Nevada 4 Bush New Hampshire 4 Bush New Jersey 15 Bush New Mexico 5 Bush New York 33 Dukakis North Carolina 14 Bush North Dakota 3 Bush Ohio 21 Bush Oklahoma 8 Bush Oregon 7 Dukakis Pennsylvania 23 Bush Rhode Island 4 Dukakis South Carolina 8 Bush South Dakota 3 Bush Tennessee 11 Bush Texas 32 Bush Utah 5 Bush Vermont 3 Bush Virginia 13 Bus

2) articles written in Low German (Plattdeutsch)

In [71]:
data[data.file == '6808045.xml'].iloc[0]['texts']

"«Westfalen-Blatt» (Bielefeld) zu Plattdeutsch im Bundestag. do koennsse duessen fridag in'n bunnes-dage chanz unchewuehnlicke toene heuern: de damens un herrens awjeordneten haet ssick de wohrheit ens mohl platt vo'n kopp sseggt. dobui gueng et oemme de niederduetske sproke (wick dat uise platt «amtlick» neumt wei- hert...). et, dat niederduetske, schall met olle annern regionol- sproken «gliek up gliek» stellt weihern, unjefaehr met denn, watter dat «gaelische» or dat «baskische» es, un (nich to vagieden) met denn «sorbischen», watter bit tae-u inner duetsken sproken-kultur, oll laengest, osser ne eigen sproke gellen doett. woroemme datse do-oarber kueart? weil et iut de raude kassen in bruesel fo dat wahr'n un bluiben van de ae-ulen, «wossenen» sproken, ne masse cheld giw. ober schiube wui dat mohl uppe ssuit: platt beduett jo nich platzt, weil't, in'n giegendeil to'n haeuggen, ne «un-kultur» weuher. ssoennern platt, dat es de sproke, de uppen flacken, platten lanne to huis es. un d

3) articles written in German and translated into English or French

In [72]:
data[data.file == '3959856.xml'].iloc[0]['texts']

'David Levy: Treffen mit Arafat innerhalb von zwei Wochen. Jerusalem (dpa) - Der israelische Außenminister David Levy will sich innerhalb von zwei Wochen mit dem Präsident der palästinensischen Selbstverwaltungsbehörde Jassir Arafat treffen. Levy teilte am Donnerstag im israelischen Rundfunk außerdem mit, daß ein Treffen zwischen Regierungschef Benjamin Netanjahu und Arafat möglicherweise noch eher zustande komme. Levy sagte, es würden große Anstrengungen unternommen, um das Treffen zu ermöglichen. Seit der Wahl Netanjahus Ende Mai hat es keine hochrangigen Kontakte zwischen Israel und dem Palästinensischen Autonomiekabinett gegeben. Der israelische Außenminister sagte außerdem, daß eine Lösung für den Rückzug der israelischen Armee aus Hebron möglicherweise bevorstehe. In Jerusalem hieß es weiter, daß der amerikanische Präsident Bill Clinton Netanjahu während seiner Gespräche in Washington gedrängt habe, sich mit Arafat zu treffen und eine Lösung für den vereinbarten Rückzug aus Hebro

4) articles written in English

In [73]:
data[data.file == '5429862.xml'].iloc[0]['texts']

'Davos: Israeli/Palestinian business council suggested (414) GUARDIAN NEWS SERVICE (DAVOSMIDEAST) By Jane Martinson in Davos. A new council uniting Israeli and Palestinian businesses is to be formed under proposals to stimulate investment in the Middle East put forward at the World Economic Forum (WEF) yesterday. The meeting saw Tzipi Livni, the Israeli foreign minister, and Mahmoud Abbas, the Palestinian president, pledge to work harder towards a settlement. The session ended with the WEF\'s founder, Klaus Schwab, appearing to promise to move the forum\'s money-spinning annual meeting from Davos to Jerusalem next year. The understanding is that the move could take place as long as there had been "progress" on peace, which is likely to take significantly more than a year. The forum has only once moved the annual meeting in its 36-year history, when it moved to New York after 9/11. "Next year in Jerusalem. This is our goal," said Ms Livni yesterday. Mr Schwab had earlier confirmed that 

In [74]:
# Delete all non-German articles from the data  
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    ger_results = pool.map(identify_ger.identify_ger, [text for text in data['texts']]) 
    pool.close()
    pool.join()

print(datetime.now()-startTime)

0:46:38.942938


In [75]:
data['language'] = ger_results
# the number of non-German articles
print(len(data[data.language == 0]))
# keep articles written in German
data = data[data.language==1]
data.reset_index(inplace=True, drop=True)
print(len(data))

24
2211695


In [76]:
data.to_csv('dpa_prepro_step7.csv')

In [77]:
data = pd.read_csv('dpa_prepro_step7.csv', encoding = 'utf-8', index_col = 0,  keep_default_na=False,
                   dtype = {'rubrics': 'str', 
                            'source': 'str',
                            'keywords': 'str',
                            'title': 'str',
                            'city': 'str',
                            'genre': 'str',
                            'wordcount': 'str'},
                  converters = {'paragraphs': literal_eval})

### Identify articles that predominantly consist of numbers

Articles that predominantly consist of numerical data present challenges for sentiment or topic analysis as, once numbers are removed, they tend to contain limited information. Moreover, the topics covered in these articles are often not relevant to our research question. Examples of such topics include budget distribution plans, statistics on car registrations, tax reforms in numbers and examples, statistics on oil and gas imports, history of key interest rate changes, and distribution of seats in the parliament.

While exploring news articles with a high proportion of numbers, we noticed a text in which all the "i"s were replaced with "1"s. We decided to delete this text.

In [78]:
# The article to delete
file_drop = ['3874051.xml']
ind = data[data.file == '3874051.xml']['texts'].index[0]
data[data.file == '3874051.xml']['texts'][ind]

'Zusammenfassung Südkorean1sche Reg1erungsparte1 ohne Mehrhe1t 1m neuen Parlament. Seoul (dpa) - In Südkorea hat d1e reg1erende Neue Korea Parte1 (NKP) von Staatschef K1m Young Sam 1hre parlamentar1sche Mehrhe1t 1n der neugewählten Nat1onalversammlung verloren, ble1bt aber we1ter an der Reg1erung. Präs1dent K1m sagte am Fre1tag 1n e1ner ersten Reakt1on zum Wahlergebn1s, er werde se1ne Reformpol1t1k und Kampagne gegen d1e Korrupt1on entsch1eden fortsetzen. Er sehe d1e Entsche1dung der Wähler als Auftrag für «Reformen und zur Schaffung e1ner sauberen pol1tschen Trad1ton». K1m darf laut Verfassung nach Ablauf se1ner fünfjähr1gen Amtsze1t 1998 ke1n zwe1tes Mal kand1d1eren. D1e NKP w1rd 1m neuen, 299 S1tze zählenden Parlament statt mit bisher 150 Mandaten nur noch m1t 139 vertreten se1n. M1t 79 S1tzen als zwe1tgrößte Parte1 etabl1erte s1ch der von K1ms angeschlagenem R1valen K1m Dae Jung angeführte oppos1t1onelle Nat1onalkongreß für neue Pol1t1k (NCNP). Zwe1 andere oppos1t1onelle Gruppen - 

In [79]:
data.drop(data[data['file'].isin(file_drop)].index, inplace=True)

In [80]:
# use the 'numeric_articles' function to identify economic articles with a high share of numbers in them
inputs = zip(data['texts'], data['word_count'], itertools.repeat(0.50))

startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    numeric_list = pool.starmap(numeric_articles.numeric_articles, inputs)
    pool.close()
    pool.join()

print(datetime.now()-startTime)
data['numeric'] = numeric_list

0:01:48.135737


In [81]:
# inspect example article with high share of numbers
data[data.numeric == True]['texts'].iloc[0]

'Höchste Arbeitslosenquote im Ostteil Berlins. Nürnberg (dpa) - Von den neuen Bundesländern hat Sachsen mit 6,2 Prozent im Dezember die niedrigste Arbeitslosenquote. Überdurchschnittlich ist die Erwerbslosigkeit im Ostteil Berlins (Quote 9,3 Prozent) und in Mecklenburg-Vorpommern (8,7). Da sich die Berechnung anders als in Westdeutschland auf alle zivilen Erwerbspersonen bezieht, können die Quoten der alten und neuen Bundesländer nicht miteinander verglichen werden. Die Arbeitslosigkeit stieg im Dezember gegenüber dem Vormonat in Schleswig-Holstein um 5 173 auf 88 718 (Quote 8,2 Prozent), in Hamburg um 1 429 auf 68 246 (9,5), in Niedersachsen um 13 708 auf 259 466 (8,8), in Bremen um 112 auf 34 492 (12,0), in Nordrhein-Westfalen um 14 338 auf 583 880 (8,4), in Hessen um 7 058 auf 127 740 (5,4), in Rheinland-Pfalz um 9 019 auf 89 757 (6,0), im Saarland um 615 auf 36 403 (8,7), in Baden-Württemberg um 9 823 auf 167 776 (4,0), in Nordbayern um 18 065 auf 116 774 (5,6), in Südbayern um 17 

In [82]:
data[data.numeric == True]['texts'].iloc[1]

'Öleinfuhren größer und teurer. Wiesbaden (dpa/vwd) - Mehr Öl zu höheren Preisen haben die alten Bundesländern im vergangenen Jahr eingeführt. In den ersten elf Monaten von 1990 erhöhten sich die Rohölimporte im Jahresvergleich um 9,0 Prozent auf 65,5 Millionen Tonnen. Die Ölrechnung für diesen Zeitraum fiel mit 17,8 Milliarden DM um 2,4 Milliarden DM höher aus als ein Jahr zuvor. Von Januar bis November lag der Durchschnittspreis für die Tonne Importrohöl mit 272,39 DM um 6,3 Prozent höher als im Vorjahr (256,28 DM). Allein im November registrierte das Statistische Bundesamt nach Mitteilung vom Donnerstag in Wiesbaden einen um 48,0 Prozent über dem Vorjahresniveau liegenden Durchschnittspreis. Einfuhr nach Ursprungsländern Jan.-Nov. 90 Jan.-Nov. 89 in 1 000 Tonnen 1. Großbritannien 13 860 12 721 2. Libyen 10 546 10 025 3. Saudi-Arabien 5 539 5 082 4. Norwegen 5 414 4 969 5. Nigeria 5 413 4 020 6. Sowjetunion 5 057 5 418 7. Venezuela 4 019 4 352 8. Syrien 3 226 2 136 9. Algerien 3 146 

In [83]:
len(data[data.numeric == True])

274

In [84]:
# drop articles predominantly consisting of numbers
data = data[data.numeric == False]
del data['numeric']
data.reset_index(inplace=True, drop=True)
print(len(data))

2211420


### Exclude tables

In this section, we exclude the tables from the news articles in order to reduce noise and focus on important information. To do that, we identify the articles that contain at least one paragraph that is predominantly comprised of numbers and that is at least 10 words long. Then we manually examine these articles to identify common strings that often precede tables. Finally, we use regular expressions to delete the tables based on these strings.

In [85]:
inputs = zip(data['paragraphs'], itertools.repeat(0.70), itertools.repeat(10))

startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    numeric_par = pool.starmap(numeric_par.numeric_par, inputs)
    pool.close()
    pool.join()

print(datetime.now()-startTime)
data['numeric_par'] = numeric_par

0:01:50.842181


In [86]:
quant_par = data[data.numeric_par == True]

In [183]:
quant_par.loc[:,'file'].to_csv('quant_par.csv')

Examples of articles containing at least one paragraph that is predominantly comprised of numbers and that is at least 10 words long.

In [88]:
quant_par.iloc[0]['texts']

'Auftragseingänge im November rückläufig - deutlich weniger Auslandsbestellungen. Bonn (dpa/vwd) - Die Auftragseingänge beim Verarbeitenden Gewerbe in den alten Bundesländern sind im November gegenüber dem Vormonat preis- und saisonbereinigt um 3,5 Prozent zurückgegangen. Dies geht aus vorläufigen Berechnungen des Statistischen Bundesamtes hervor, die am Dienstag vom Bundeswirtschaftsministerium veröffentlicht wurden. Dabei gingen den Angaben zufolge die Bestellungen des Auslands deutlich um 7,5 Prozent zurück. Ausschlaggebend dafür dürfte nach Angaben des Ministeriums die spürbar abgekühlte Konjunktur in einer Reihe wichtiger Partnerstaaten sein. Die Inlandsbestellungen gingen im Berichtsmonat um 1,5 Prozent zurück. Auch der Zweimonatsvergleich (Oktober/November gegenüber August/September), in dem kurzfristige Schwankungen ausgeglichen werden, zeige, daß sich die inländischen Auftragseingänge auf dem hohen Niveau der Vorperiode hielten, während die Auslandsbestellungen um vier Prozent

In [89]:
quant_par.iloc[1]['texts']

'Frankfurt Lebenshaltung zwei und Schluß Lebenshaltung zwei und Schluß Autofahrer zur Kasse gebeten. Weit über der allgemeinen Teuerungsrate war der Preisauftrieb «rund ums Auto». Der sogenannte Kraftfahrerpreisindex lag im Dezember um 4,5 Prozent über dem Stand des Vorjahres. In diesem Teilindex erfassen die Wiesbadener Statistiker die Preise für Neuwagen, Reparaturen sowie Versicherungsprämien und Spritkosten. Dabei hat es von November bis Dezember sogar noch einen Rückgang des Kraftfahrerindex von 1,1 Prozent parallel zu den sinkenden Benzinpreisen gegeben. Lebenshaltungspreise Veränderung zu Neuer Dezember Vorjahr Vormonat Indexstand (1985: 100) Gesamtindex + 2,8 + 0,1 108,1 Ohne Saisonwaren + 2,7 - 0,1 Teilindex für Nahrungsmittel + 2,1 + 0,5 Teilindex für Dienstleistungen und Reparaturen + 2,3 + 0,2 Teilindex Mieten und Garagen + 3,8 + 0,3 4-Personen-Arbeitnehmer-Haushalt mit mittlerem Einkommen + 2,8 0,0 107,8 4-Personen-Arbeitnehmer-Haushalt mit höherem Einkommen + 2,5 0,0 108,

In [175]:
#for index, row in quant_par[:20].iterrows():
#    print(index, row['texts'], '\n')

In [208]:
re_table = data[data.texts.str.contains(r'Im einzelnen kam es zu folgenden Veränderungen.{0,}')]

In [212]:
#for index, row in re_table.iterrows():
#    print(index, row['texts'], '\n')

In [213]:
#for index, row in re_table.iterrows():
#    print(index, re.findall(r'Im einzelnen ergaben sich folgende Veränderungen.{0,}', row['texts']), '\n')

In [214]:
#for index, row in re_table.iterrows():
#    if index in quant_par.index:
#        print(index, re.findall(r'Im einzelnen kam es zu folgenden Veränderungen.{0,}', row['texts']), '\n')

In [185]:
#re.sub(r'Weltautomobilproduktion \(in Millionen Wagen\).{0,}', '', quant_par.iloc[6]['texts'])

Clean tables using regular expressions.

In [None]:
startTime = datetime.now()
if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    remove_tables = pool.map(clean_tables.clean_tables, [text for text in data['texts']]) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

data['texts'] = remove_tables

In [None]:
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    count_results = pool.map(count_words_mp.count_words_mp, [text for text in data['texts']]) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

In [None]:
# Save the result as a new column "word_count"
data['word_count'] = count_results

In [None]:
# remove articles with less than 100 words
data = data[data['word_count']>=100]
data = data.sort_values(['year', 'month', 'day'], ascending=[True, True, True]) # sort the data in chronological order
data.reset_index(inplace=True, drop=True)
print(len(data))
del data['numeric_par']

In [None]:
#data.to_csv('dpa_prepro_step8.csv')

### Merge continuations of articles

Some dpa-afx articles are split into multiple entries marked by the word 'Fortsetzung' at the beginning of the texts of following entries.

In [38]:
data['texts'].iloc[1]

'FRANKFURT (dpa-AFX) - Der deutsche Aktienmarkt hat am Montag für wenige Augenblicke seinen Rekordschluss vom Juli 1998 überschritten. Mit einem Höchststand von 6.188,68 lag der Dax über dem Tageschluss vom 21. Juli 1998, als der Index 6.186 Punkte erreichte. Der Dax <DAX.ETR> schloss den Handel am ersten Wochentag bei 6.142,19 Zählern und damit um 0,38% oder 23,02 Punkte fester ab. Der Nebenwerteindex M-Dax <MDAX.ETR> gab dagegen auf 3.990,00 Punkte oder um 1,43% nach, und der Neue Markt-Index Nemax 50 <NMKX.ETR> schloss bei 4.610,66 Zählern (-0,58%).      Spätestens auf dem Niveau des All-Time-High bei ungefähr 6.200 Punkten werde der Index auf einen massiven Widerstand stoßen, heißt es am Montag in einem Research-Report von der Nürnberger Schmidt Bank. Michael Schubert, Analyst der Bankgesellschaft Berlin, sagte in einem Gespräch mit dpa-AFX, derzeit sei der einzig negative Faktor, dass die Börse nicht von der Breite getragen werde, sondern nur von wenigen Titeln. Wer jetzt noch nic

In [39]:
data['texts'].iloc[3]

'(Fortsetzung) - Der Tagesgewinner SAP <SAP.ETR> übernahm bereits am Vormittag die Führung bei den Gewinnern ein und schloss den Handel mit einem Kurszuwachs von 6,27% auf 475,20 Euro ab. Händler waren sich einig darüber, dass SAP weiteres Wachstumpotenzial haben. Ende vergangener Woche habe es Kaufempfehlungen für das Papier gegeben, sagte ein Händler. Die ABN Amro Bank hatte ihr Kursziel mit 600 Euro festgesetzt.      Den Wert umgaben zudem Gerüchte über eine bevorstehende Allianz mit einem Weltkonzern, hieß es. Ein anderer Händler hielt den aktuellen SAP-Kurs noch immer für zu billig. Wachstumswerte stünden weiter in der Gunst der Anleger. Bei SAP hätten Image und Aktienkurs unter Problemen in den USA gelitten. Das werde nach Erwartung des Händlers aber spätestens mit den Geschäftszahlen des ersten Quartals 2000 überwunden sein. Mit dem Ende der Angst vor dem Jahr-2000-Problem würde der Kurs weiter steigen.     Ihren Kurshöhenflug fortsetzen konnten zudem die Aktien von Siemens <SIE

In [40]:
data['texts'].iloc[7]

'(Fortsetzung) - Überraschend fest tendierten am Montag die Autowerte. Volkswagen <VOW.ETR> gewannen 1,6% auf 49,35 Euro. Vielleicht sehen wir jetzt tatsächlich eine Branchen-Rotation zur Autobranche hin, sagte ein Händler der DG-Bank. Die nahe Zukunft des Wertes stehe oder falle mit der 50 Euro Marke. Auch BMW <BMW.ETR> (28,10 Euro/+1,04%) und DaimlerChrysler <DCX.ETR> (67,25 Euro/+0,67%)  verbuchten Kurszuwächse. Alle drei deutschen Hersteller verbuchten einem Pressebericht zufolge im Gesamtjahr 1999 steigende Absatzzahlen in den USA.       Die Verliererliste führten am Montag Lufthansa <LHA.ETR> (22,30 Euro/-3,88%) und Veba <VEB.ETR> (45,47 Euro/-3,87%) an. Unter Gewinnmitnahmen litt laut Händlern der Kurs der T-Aktie <DTE.ETR>: Der Titel verlor im Montagshandel 1,31% auf 59,30 Euro. Wäre die Telekom mit dem Indextrend im Wert gestiegen, hätte der Dax den Rekordstand vom Juli 1998 eingestellt, rechnete ein Händler am Nachmittag vor./mr/fs'

We identify which entries belong together and merge them to one article

In [46]:
# Divide data into roughly equal sized chunks where articles from one day only fall under the same chunk
data_cont = data[data['topic'] == 'afx']
dates = data_cont.groupby(['year', 'month', 'day'])
dates = list(dates.groups)
dates = np.array_split(dates, NUM_CORE)

meta_data = data_cont.loc[:, data_cont.columns != 'texts']

chunks = [pd.concat([data_cont[(data_cont['year'] == t[0]) & (data_cont['month'] == t[1]) & (data_cont['day'] == t[2])] for t in tup]) for tup in dates]
chunks = [[chunk, meta_data] for chunk in chunks]

In [48]:
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    continue_results = pool.map(continue_articles.continue_articles, chunks) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

0:14:56.636717


In [49]:
data.drop(list(itertools.chain(*[tup[0] for tup in continue_results])), inplace=True)

In [None]:
continue_articles = pd.concat([tup[1] for tup in continue_results])
print(len(continue_articles))

In [50]:
data = data.append(continue_articles)
data.reset_index(inplace=True, drop=True)
print(len(data))

1974426


In [51]:
#data.to_csv('dpa_prepro_step9.csv')

## Remove URLs

In [52]:
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    url_corrected = pool.map(correct_url.correct_url, [text for text in data['texts']]) 
    pool.close()
    pool.join()

print(datetime.now()-startTime)

0:06:33.551105


In [53]:
data['texts'] = url_corrected

In [None]:
#data.to_csv('dpa_prepro_step10.csv')

## Remove dpa references

We remove dpa references (e.g. NEW YORK (dpa) - ...) from each article.

In [54]:
if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    dpa_ref_removed = pool.map(clean_dpa_references.clean_dpa_references, [text for text in data['texts']]) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

0:15:15.393896


In [55]:
data['texts'] = dpa_ref_removed

In [66]:
#data.to_csv('dpa_prepro_step11.csv')

# Correct spelling

### Umlauts

Older articles (1991 - 2000) from the section 'Kommentar' (Commentary) are often missing umlauts and correct capitalization. To
fix these two issues, we use the notebook Umlauts_fix written in Python 2 and the notebook Truecasing written in Python 3.

In [6]:
umlauts = ['ä', 'ö', 'ü', 'ß', 'Ä', 'Ö', 'Ü']
umlauts_replace = ['ae', 'oe', 'ue', 'ss', 'AE', 'OE', 'UE']

In [7]:
dpa_umlauts_fix = data[(data.texts.str.contains('|'.join(umlauts_replace))) & (~data.texts.str.contains('|'.join(umlauts))) & (data.year<2001)]

In [9]:
dpa_umlauts_fix['texts'].iloc[0]

'Ad-hoc announcement sent by DGAP. The sender is solely responsible for the contents of this announcement.  Ad-hoc Mitteilung Nach @ 15 WpHG  WizCom Technologies Ltd. (WizCom) <WZM.FSE> (Neuer Markt:WZM,WKN:915 856) veroeffentlicht das Ergebnis fuer das am 31. Dezember 1999 endende Geschaeftsjahr 20. Maerz 2000, Jerusalem, Israel - Der Umsatz belief sich im Jahr 1999 auf US$ Mio. 11,613. Der Vorjahreswert lag bei US$ Mio. 15,799. Der Umsatz im ersten bzw. zweiten Halbjahr 1999 betrug US$ Mio. 4,046 bzw. US$ Mio. 7,567. Das Unternehmen sieht den Umsatzrueckgang als voruebergehend an und vor allem im zweiten Halbjahr begruendet, weil das neue Produkt, der QuickLink-Pen, erst mit einigen Monaten Verzoegerung auf den Markt gebracht werden konnte. Im 4. Quartal 1999 konnte WizCom den QuickLink Pen in den USA erfolgreich auf den Markt bringen. Im 1. Quartal 2000 vermarktet WizCom den QuickLink Pen in weiteren Laendern, unter anderem Australien, Grossbritannien, Deutschland und Frankreich. De

In [None]:
dpa_umlauts_fix.to_csv('dpa_umlauts_fix.csv', encoding='utf-8-sig', sep = ';')

In [None]:
dpa_umlauts_fixed = pd.read_csv('dpa_umlauts_fixed.csv', encoding = 'utf-8', sep=';')

In [None]:
#data.to_csv('dpa_prepro_step12.csv')

### Truecasing

In [56]:
dpa_cases_fix = data[data.texts.str.contains('^(?!.*[A-Z])')]

In [57]:
dpa_cases_fix['texts'].iloc[0]

'new york (vwd) - enttaeuschend verlief das geschaeft am mittwoch, dem ersten handelstag im neuen jahr, an der new yorker aktienboerse. die zunaechst gesehenen leichten gewinne konnten nur bis in das fruehe nachmittagsgeschaeft behauptet werden. in den letzten 2-1/2 geschaeftststunden gerieten die kurse in die minuszone und wall street schloss auf breiter front schwaecher. der dow-jones-index fuer 30 industriewerte gab um 23,02 auf 2.610,64 punkte nach. auch die uebrigen marktbestimmenden indizes gerieten in die minuszone. bei einem umsatz von 126,28 (114,13) millionen aktien standen die kursverlierer den -gewinnern im verhaeltnis von rund neun zu sieben gegenueber. verantwortlich fuer die schwaeche waren wiederauflebende befuerchtungen ueber eine anhaltende rezessionsphase. nachdem sogar das weisse haus jetzt von einer rezessionaeren entwicklung spricht, hielten sich die meisten anleger mit ihren engagements zurueck, wodurch der vorherrschende abgabedruck ausreichte, um die kurse in d

In [62]:
dpa_cases_fix.to_csv('dpa_case_fix.csv', encoding='utf-8-sig', sep = ';')

In [63]:
dpa_cases_fixed = pd.read_csv('dpa_cases_fixed.csv', encoding = 'utf-8', sep=';')

In [64]:
data.loc[dpa_cases_fixed.index, 'texts'] = dpa_cases_fixed

In [65]:
# fixed version
data['texts'].iloc[0]

'New York( Vwd)- Enttaeuschend verlief das Geschaeft am Mittwoch, dem ersten Handelstag im neuen Jahr, an der New Yorker Aktienboerse. Die Zunaechst Gesehenen leichten Gewinne konnten nur bis in das Fruehe Nachmittagsgeschaeft behauptet werden. In den letzten 2-1/2 Geschaeftststunden gerieten die Kurse in die Minuszone und Wall Street schloss auf breiter Front Schwaecher. Der Dow-Jones-Index Fuer 30 Industriewerte gab um 23,02 auf 2.610,64 Punkte nach. Auch die Uebrigen Marktbestimmenden Indizes gerieten in die Minuszone. Bei einem Umsatz von 126,28( 114,13) Millionen Aktien standen die Kursverlierer den -Gewinnern im Verhaeltnis von rund neun zu sieben Gegenueber. Verantwortlich Fuer die Schwaeche waren Wiederauflebende Befuerchtungen Ueber eine anhaltende Rezessionsphase. Nachdem sogar das Weisse Haus jetzt von einer Rezessionaeren Entwicklung spricht, hielten sich die meisten Anleger mit ihren Engagements Zurueck, wodurch der vorherrschende Abgabedruck ausreichte, um die Kurse in di

In [None]:
#data.to_csv('dpa_prepro_step13.csv')

## Fixing tokens containing a number and a word

In quite a few cases, a number and a word are erroneously merged into a single token. Splitting these tokens into two tokens helps us to deal with the following problems:

(see Handelsblatt notebook)

In [58]:
startTime = datetime.now()

import split_number_word

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    split_corrected = pool.map(split_number_word.split_number_word, [text for text in data['texts']]) 
    pool.close()
    pool.join()

print(datetime.now()-startTime)

0:02:13.158789


In [59]:
data['texts'] = split_corrected

In [60]:
#data.to_csv('dpa_prepro_step14.csv')

# Delete Fuzzy Duplicates

In [61]:
types_keep = ['russische Aktienmarkt', 'europäischen Börsen', 'Deutsche Börse' ,'Europäische Zentralbank',
'Moskauer Aktienmarkt', 'deutsche Aktienmarkt', 'deutschen Aktienmarkt', 'Folgende Investmentbanken',
'deutsche Rentenmarkt', 'amerikanischen Treasury Bonds', 'IRW-PRESS', 'Deutsche Bank', 'Ausgewählte Analysten-Einstufungen',
'Deutsche Staatsanleihen', 'Der japanische Aktienmarkt']

exceptions = ['NO_EXCEPTIONS']

In [63]:
# import a function that outputs the indices of duplicates 
import fuzzy_duplicates
delete_indices = []
startTime = datetime.now() 
for year in list(set(data['year'])):
    data_input = data[(data['year'] == year)]
    for month in list(set(data_input[data_input['year'] == year]['month'])): # old: list(set(data))
            # Prepare inputs
            inputs_year = []
            inputs_month = []
            inputs_month_year = []
            inputs_year.append(year)
            inputs_month.append(month)
            inputs_month_year.append(data_input[(data_input['year'] == year) & (data_input['month'] == month)][["month", "year", "texts"]])

            #from itertools import repeat
            inputs = list(zip(inputs_year, inputs_month, inputs_month_year))
            from datetime import datetime
            if __name__ == "__main__":
                pool = mp.Pool(NUM_CORE)
                # apply function to all combinations of month-year in parallel
                delete_intermediate = pool.starmap(fuzzy_duplicates.fuzzy_duplicates, zip(inputs, repeat(types_keep), repeat(exceptions)))
                delete_indices = delete_indices + delete_intermediate # create one list of indices
                pool.close()
                pool.join()  
    print(year)
print(datetime.now()-startTime)

1991.0
1992.0
1993.0
1994.0
1995.0
1996.0
1997.0
1998.0
1999.0
2000.0
2001.0
2002.0
2003.0
2004.0
2005.0
2006.0
2007.0
2008.0
2009.0
2010.0
2011.0
2012.0
2013.0
2014.0
2015.0
2016.0
2017.0
2018.0
6:58:11.847691


In [64]:
delete_indices = list(set([item for sublist in delete_indices for item in sublist]))

In [65]:
data.drop(delete_indices, inplace=True)
data.reset_index(inplace=True, drop=True)
print(len(data))

1580324


In [66]:
#data.to_csv('dpa_prepro_step15.csv')

# Clean articles

Dpa articles include some unnecessary text passages like inquiry notes or references to webpages. We decided to clean the 
affected articles from these text passages to make the sentiment classification easier for our model(s).

We remove the following terms and sections from the texts:
* 1) stock symbols
* 2) additional metadata in the text meant for the author
* 3) references to previos articles
* 4) references to dpa and dpa-AFX webpage
* 5) uncorrected original article on which a correction is based on
* 6) inquiry notes
* 7) reference to english article on which some articles are based on
* 8) date of the article
* 9) references for aditional information (phone numbers, webpages etc.)
* 10) references to sender
* 11) references to Debitos
* 12) references to issuer
* 13) reference to authors
* 14) references to summary of article 

In [67]:
startTime = datetime.now()
if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    cleaned_articles = pool.map(clean_dpa_articles.clean_dpa_articles, [text for text in data['texts']]) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

0:58:52.745164


In [68]:
data['texts'] = cleaned_articles

In [69]:
#data.to_csv('dpa_prepro_final.csv')