In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import os
from datetime import datetime
import multiprocessing as mp
from langdetect import detect
import re
import numpy as np
import worker_xml
from itertools import repeat

In [3]:
# Set the number of cores to use: leave two cores free for the rest of the
# system, but never go below one worker (mp.Pool raises ValueError for a
# process count < 1, which is what cpu_count()-2 gives on small machines)
NUM_CORE = max(1, mp.cpu_count() - 2)

# Dpa Data (1991 - 2019)

In [4]:
# Folder with unpacked articles. Every backslash is escaped explicitly; the
# previous literal only worked because the invalid escape sequence between
# 'PhD' and 'Media' happened to be passed through unchanged.
path = 'D:\\Studium\\PhD\\Media Tenor\\dpa\\dpa_unpacked'
folder_list = []

# 2 folders
for fol in os.listdir(path):

    # Within each folder: folders for different years
    for f in os.listdir(os.path.join(path, fol)):
        # Full path of one year folder; these are the work items handed to
        # the loader further down
        folder_list.append(os.path.join(path, fol, f))

In [5]:
# Select a path to the folder for storing results. In a raw string a single
# backslash is already literal, so the separators must not be doubled.
PATH = r'D:\Studium\PhD\Media Tenor'
# Relative file names used later (e.g. the CSV exports) resolve against PATH
os.chdir(PATH)

In [None]:
# Use the 'worker_xml' function to load articles: each entry of folder_list
# is processed in a worker process and the results are stacked into `data`.
# NOTE(review): worker_xml.worker_xml is expected to return one DataFrame per
# folder (pd.concat requires that) - confirm in the worker_xml module.
startTime = datetime.now()

if __name__ == "__main__":
    # The context manager releases the worker processes even if a task raises
    with mp.Pool(NUM_CORE) as pool:
        df_list = pool.map(worker_xml.worker_xml, folder_list)
    # ignore_index=True numbers the rows 0..n-1 across all folders
    data = pd.concat(df_list, ignore_index=True)

# Elapsed wall-clock time of the loading step
print(datetime.now()-startTime)

## Filtering

In [None]:
# Keep only articles of at least 50 words and renumber the rows from 0
is_long_enough = pd.to_numeric(data['wordcount']) >= 50
data = data[is_long_enough].reset_index(drop=True)

In [None]:
# Filter out articles that mainly consist of numbers: an article is dropped
# when it has more than one digit character per two words.

# Number of digit characters in each text (str.isdigit is tested per character)
digit_counts = data['texts'].map(lambda text: sum(ch.isdigit() for ch in text))

# Convert 'wordcount' to numbers before dividing, as the word-count filter does
word_counts = pd.to_numeric(data['wordcount'])

# Index labels of the articles to remove; selecting by mask works on labels
# directly, so it does not depend on the index being 0..n-1
delete_indices = list(data.index[digit_counts / word_counts > 1/2])

data.drop(delete_indices, inplace=True)

In [None]:
# Filter out insignificant articles based on keywords, genres etc. and reset the index
# of the dataframe afterwards.
#
# Each pattern is built by joining its terms with '|'. A triple-quoted literal that is
# wrapped over several lines puts the line break and the indentation *into* the
# alternative that follows the break, so that alternative can never match.
def _drop_matching(frame, column, pattern):
    """Return `frame` without the rows whose `column` matches the regex `pattern`.

    Missing values in `column` are treated as "no match" (na=False) instead of
    producing an invalid boolean mask. The returned frame is indexed 0..n-1.
    """
    matches = frame[column].str.contains(pattern, na=False)
    return frame[~matches].reset_index(drop=True)


# Titles ('SPORT' already covers titles containing 'SPORT:')
fil_titles = '|'.join([
    'Edelmetallpreise', 'Tageskalender', 'Tabelle', 'SPORT', 'Sport',
    'Berichtigung', 'Sortenkurse', 'Devisenkurse', 'Impressum', 'Testmeldung',
    'WOCHENVORSCHAU', 'Kurse A', 'Kurse B', 'Kurse C'])
data = _drop_matching(data, 'title', fil_titles)

# Genres
genre_clean = '|'.join(['Tabelle', 'Historisches', 'Achtung', 'Sport', 'SPORT'])
data = _drop_matching(data, 'genre', genre_clean)

# Keywords
keyword_clean = '|'.join([
    'Tagesvorschau', 'Sport', 'Redaktionshinweis', 'SUM', 'DGAP', 'SPORT',
    'Wochenvorschau', 'SPO'])
data = _drop_matching(data, 'keywords', keyword_clean)

# Rubrics
data = _drop_matching(data, 'rubrics', 'iq')

# Texts (the dot is escaped so that it only matches a literal '.')
text_clean = r'Schalterverkaufskurse:|dpa-news\.de'
data = _drop_matching(data, 'texts', text_clean)

# Source: exact comparison, the value contains one literal backslash
data = data[data['source'] != 'dpa-frei\\dpa-wahl'].reset_index(drop=True)


### Delete Duplicates

In [None]:
# Build one task per (year, month) combination: every distinct year is paired
# with every distinct month, and each task also carries the full data frame
years = list(set(data['year']))
months = list(set(data['month']))

inputs_year = [y for y in years for _ in months]
inputs_month = [m for _ in years for m in months]

inputs = [(y, m, data) for y, m in zip(inputs_year, inputs_month)]


In [None]:
# Import a function that outputs the indices of duplicates
import fuzzy_duplicates
from itertools import chain

startTime = datetime.now()
delete_indices = []

# Apply function to all combinations of month-year in parallel
if __name__ == "__main__":
    # The context manager releases the worker processes even if a task raises
    with mp.Pool(NUM_CORE) as pool:
        duplicates_per_task = pool.map(fuzzy_duplicates.fuzzy_duplicates, inputs)
    # pool.map returns one result per (year, month) task, while DataFrame.drop
    # needs a single flat list of labels, so the per-task results are chained.
    # NOTE(review): assumes every task returns an iterable of index labels -
    # confirm against fuzzy_duplicates.fuzzy_duplicates.
    delete_indices = list(chain.from_iterable(duplicates_per_task))

# Elapsed wall-clock time of the duplicate search
print(datetime.now()-startTime)

data.drop(delete_indices, inplace=True)


### Split up articles

In [None]:
# Store potential multiple articles separately and delete them from the
# original data.
# The pattern is joined from single terms: in a wrapped triple-quoted literal
# the line break and indentation become part of the next alternative.
s_mult_art = '|'.join(['dpa-Nachrichtenüberblick', 'Nachrichtenüberblick',
                       'Vorschau', 'vorschau', 'VORSCHAU', 'Tagesvorschau'])

# One combined mask, so an article that matches in several columns is stored
# only once; missing values count as "no match"
is_mult_art = (data['title'].str.contains(s_mult_art, na=False)
               | data['keywords'].str.contains(s_mult_art, na=False)
               | data['genre'].str.contains(s_mult_art, na=False))

mult_art = data[is_mult_art].copy()
data = data[~is_mult_art]

# Define a function that splits up potential multiple articles into single articles
def split_mult_art(multiple_articles, min_words=50, long_words=100):
    """Split digest-style entries into one row per contained article.

    The 'texts' field of every row is split wherever two or more consecutive
    spaces occur. A piece is kept as an article of its own when it has at
    least `min_words` words AND either names a weekday or has at least
    `long_words` words.

    Parameters
    ----------
    multiple_articles : pandas.DataFrame
        Rows to split; must have a 'texts' column holding strings. The frame
        passed in is not modified.
    min_words : int, default 50
        Lower word limit for every kept piece. It should match the minimum
        article length used when filtering the main data.
    long_words : int, default 100
        Word count from which a piece is kept even without a weekday.

    Returns
    -------
    pandas.DataFrame
        One row per kept piece: a copy of the originating row with 'texts'
        replaced by the piece. All other fields are copied unchanged. The
        index holds the position of the originating row, so it can contain
        repeated labels. An empty frame with the input's columns is returned
        when no piece qualifies.
    """
    weekdays = ('Montag', 'Dienstag', 'Mittwoch', 'Donnerstag',
                'Freitag', 'Samstag', 'Sonntag')

    # Renumber a copy instead of resetting the caller's index in place
    multiple_articles = multiple_articles.reset_index(drop=True)

    split_art = []
    for _, row in multiple_articles.iterrows():

        # Split articles at runs of two or more spaces
        for art in re.split(r'  +', row["texts"]):
            n_words = len(art.split())

            if n_words < min_words:
                continue

            if n_words >= long_words or any(day in art for day in weekdays):
                # Copy the meta data and assign the new text to it
                seperated_article = row.copy()
                seperated_article['texts'] = art
                split_art.append(seperated_article)

    # pd.concat raises on an empty list, so handle "nothing kept" explicitly
    if not split_art:
        return pd.DataFrame(columns=multiple_articles.columns)

    return pd.concat(split_art, axis=1).transpose()

In [None]:
# Append newly split articles to the original data. The combined frame has
# to be assigned back to `data`, and the rows are renumbered because the split
# articles come back with their own labels, which would otherwise collide
# with labels already present in `data` (later steps drop rows by label).
data = pd.concat([data, split_mult_art(mult_art)], ignore_index=True)

### Delete English Articles

In [None]:
# Import a function that outputs the indices of English articles
import identify_eng

import itertools

# Delete all English articles from the data
startTime = datetime.now()

if __name__ == "__main__":
    # One chunk of rows per worker. The row positions are split and then used
    # to slice the frame, so every chunk is a DataFrame that keeps the
    # original index labels (np.array_split applied to the frame itself
    # depends on the deprecated DataFrame.swapaxes).
    row_chunks = np.array_split(np.arange(len(data)), NUM_CORE)
    split_dfs = [data.iloc[rows] for rows in row_chunks]

    # The context manager releases the worker processes even if a task raises
    with mp.Pool(NUM_CORE) as pool:
        index_eng = pool.map(identify_eng.identify_eng, split_dfs)

    # Flatten the per-chunk results into one list of labels
    index_eng = list(itertools.chain.from_iterable(index_eng))

# Elapsed wall-clock time of the language detection
print(datetime.now()-startTime)
data.drop(index_eng, inplace=True)

### Filtering Based on Topics

In [None]:
# Split data into economic and finance articles
afx = data[data['topic'] == 'afx']
WiPo = data[data['topic'] == 'WiPo']

# Filter out articles based on keywords which are specific to the economic news.
# The pattern is joined from single terms: in a wrapped triple-quoted literal
# the line break and indentation become part of the next alternative.
keyword_clean = '|'.join(['Kurse', 'KURSE', 'kurse', 'Börse International',
                          'Terminbörse', 'Finanzmärkte International'])

# Build a new frame instead of drop(..., inplace=True): the in-place call
# returns None, so assigning its result would replace WiPo by None.
# Missing keywords count as "no match".
has_clean_keyword = WiPo['keywords'].str.contains(keyword_clean, na=False)
WiPo = WiPo[~has_clean_keyword].reset_index(drop=True)

In [None]:
# Write both cleaned frames to CSV files in the current working directory
for frame, filename in ((WiPo, 'dpa_WiPo_cleaned.csv'),
                        (afx, 'dpa_afx_cleaned.csv')):
    frame.to_csv(filename)