In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import numpy as np
import itertools
import worker_xml
import multiprocessing as mp
from datetime import datetime
from itertools import repeat
from numbers_delete import numbers_delete
from Split_articles import split_mult_art
from langdetect import detect


In [3]:
# Set the number of cores to use
NUM_CORE = mp.cpu_count()

# Dpa Data (1991 - 2019)

In [4]:
# Folder with unpacked articles
path = r'E:\\Userhome\\jbaer\\dpa_unpacked'
folder_list = []

# 2 folders
for fol in [fol for fol in os.listdir(path)]:

    # Within each folder: folders for different years
    for f in [f for f in os.listdir(path + '\\' + fol)]:
        folder_list.append(path + '\\' + fol + '\\' + f)

In [5]:
# Select a path to the folder for storing results
PATH = r'E:\\Userhome\\jbaer\\Media Tenor Results'
os.chdir(PATH)

In [5]:
# Use the 'worker_xml' function to load articles
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    df_list = pool.map(worker_xml.worker_xml, folder_list)
    data = pd.concat(df_list)
    data.reset_index(inplace=True, drop=True)
    pool.close()
    pool.join()

print(datetime.now()-startTime)

2:03:23.803171


In [6]:
print("Length of data before filtering:", len(data))

Length of data before filtering: 7539874


In [7]:
data.to_csv('dpa_raw.csv')

## Filtering

In [8]:
# Filter out articles with less than 75 words
data = data[pd.to_numeric(data['wordcount']) >= 75]
data.reset_index(inplace=True, drop=True)

In [9]:
# Filter out insignificant articles based on keywords, genres etc.
fil_titles = '''Edelmetallpreise|Tageskalender|Tabelle|SPORT|SPORT\:|
                Sport|Berichtigung|Sortenkurse|Devisenkurse|Impressum|Testmeldung|
                Kurse A|Kurse B|Kurse C|DGAP-DD'''
data.drop(data[data['title'].str.contains(fil_titles)].index, inplace=True)

fil_genres = '''Tabelle|Historisches|Achtung|Sport|SPORT'''
data.drop(data[data['genre'].str.contains(fil_genres)].index, inplace=True)

fil_keywords = '''Sport|Redaktionshinweis|SUM|DGAP|Sport|SPORT|SPO'''
data.drop(data[data['keywords'].str.contains(fil_keywords)].index, inplace=True)

data.drop(data[data['rubrics'].str.contains('iq')].index, inplace=True)

fil_texts = '''Schalterverkaufskurse:|dpa-news.de'''
data.drop(data[data['texts'].str.contains(fil_texts)].index, inplace=True)

data.drop(data[data['source'] == 'dpa-frei\dpa-wahl'].index, inplace=True)
data.reset_index(inplace=True, drop=True)

# Filter out articles based on keywords which are specific to the economic news
keyword_clean = '''Kurse|KURSE|kurse|Börse International|Terminbörse|Finanzmärkte International'''
WiPo = data[data['topic'] == 'WiPo'] 
data.drop(WiPo[WiPo['keywords'].str.contains(keyword_clean)].index, inplace = True)
data.reset_index(inplace=True, drop=True)

In [10]:
# free memory space to avoid memory errors
del WiPo

import gc

gc.collect()

121

In [11]:
# use the 'numbers_delete' function to filter out economic articles with a high share of numbers in them
delete_index = numbers_delete(data)
data.drop(delete_index, inplace = True)
data.reset_index(inplace=True, drop=True)

### Split up articles

In [12]:
# Store entries potentially consisting of multiple articles separately
# as 'mult art' 
s_mult_art = '''dpa-Nachrichtenüberblick|Nachrichtenüberblick|
                Vorschau|vorschau|VORSCHAU|Tagesvorschau|
                dpa-Tagesvorschau'''

mult_art = data[data['title'].str.contains(s_mult_art)]
mult_art = mult_art.append(data[data['keywords'].str.contains(s_mult_art)])
mult_art = mult_art.append(data[data['genre'].str.contains(s_mult_art)])

In [13]:
# delete 'mult_art' from the original data
data.drop(data[data['title'].str.contains(s_mult_art)].index, inplace=True)
data.drop(data[data['keywords'].str.contains(s_mult_art)].index, inplace=True)
data.drop(data[data['genre'].str.contains(s_mult_art)].index, inplace=True)
data.reset_index(inplace=True, drop=True)

In [14]:
# use the 'split_mult_art' function to split up entries in 
# 'mult_art' and append the resulting new articles to the data
data = data.append(split_mult_art(mult_art))

In [15]:
# free memory space to avoid memory errors
del mult_art

gc.collect()

20

In [16]:
print("Length of data after first filtering steps:", len(data))

Length of data after first filtering steps: 5806223


### Filtering Based on Topics

In [17]:
# Split data into economic and finance articles
afx = data[data['topic'] == 'afx']
afx.reset_index(inplace=True, drop=True)

WiPo = data[data['topic'] == 'WiPo'] 
WiPo.reset_index(inplace=True, drop=True)

WiPo = WiPo.drop(columns=['index'])
afx = afx.drop(columns=['index'])

In [18]:
print("Number of WiPo Articles:", len(WiPo), ", Number of afx Articles:", len(afx))

Number of WiPo Articles: 2905412 , Number of afx Articles: 2900811


In [19]:
# Save intermediate results
WiPo.to_csv('dpa_WiPo_pre_filtered.csv')
afx.to_csv('dpa_afx_pre_filtered.csv')

In [20]:
# free memory space to avoid memory errors
del data

gc.collect()

100

In [6]:
WiPo = pd.read_csv('dpa_WiPo_pre_filtered.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Delete Fuzzy Duplicates

In [7]:
# NOTE: I ignored the multiprocessing function to avoid memory issues

import fuzzy_duplicates
delete_indices = []
 
for year, data_yearly in WiPo.groupby('year'):
    
    data_yearly = data_yearly.reset_index()
    # Prepare inputs
    inputs_year = []
    inputs_month = []

    for month in list(set(data_yearly['month'])):
        inputs_year.append(year)
        inputs_month.append(month)

    inputs = list(zip(inputs_year, inputs_month, repeat(data_yearly)))
    
    startTime = datetime.now()
    
    for inp in inputs:

        delete_indices.append(fuzzy_duplicates.fuzzy_duplicates(inp))

        # Apply function to all combinations of month-year in parallel
        # if __name__ == "__main__":
        # pool = mp.Pool(NUM_CORE)
        # delete_indices.append(pool.map(fuzzy_duplicates.fuzzy_duplicates, inputs))
        # pool.close()
        # pool.join()

    print(datetime.now()-startTime)

merged = list(itertools.chain(*delete_indices))
WiPo.drop(merged, inplace=True)



0:30:55.625995
0:31:49.791557
0:32:54.944771
0:30:32.665614
0:31:42.206455
0:32:08.340749
0:30:45.280681
0:37:29.015236
0:47:03.376425
0:48:09.322768
0:46:45.712046
0:43:36.020484
0:42:49.812051
0:38:38.036791
0:40:12.614122
0:42:02.905433
0:44:25.646218
0:56:13.789029
0:53:31.579092
0:42:53.343497
0:44:15.374064
0:43:33.926395
0:42:23.491649
0:39:53.796130
0:39:36.912825
0:40:44.074491
0:33:46.458266
0:27:44.204960
0:00:00.020247


NameError: name 'data' is not defined

### Delete English Articles (optional for afx)

In [50]:
# Import a function that outputs the indices of englisch articles
import identify_eng

# Delete all English articles from the data

for year, data_yearly in afx.groupby('year'):
    
    startTime = datetime.now()

    if __name__ == "__main__":
        pool = mp.Pool(NUM_CORE)
        split_dfs = np.array_split(data_yearly, NUM_CORE)
        index_eng = pool.map(identify_eng.identify_eng, split_dfs)
        index_eng = list(itertools.chain(*index_eng))
        pool.close()
        pool.join()

    print(datetime.now()-startTime)

data.drop(index_eng, inplace=True)

0:00:29.154173


In [10]:
# Save final results
WiPo.to_csv('dpa_WiPo_filtered.csv')

In [None]:
# Save final results
afx.to_csv('dpa_afx_filtered.csv')