In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import numpy as np
import itertools
import worker_xml
import multiprocessing as mp
from datetime import datetime
from itertools import repeat
from numbers_delete import numbers_delete
from Split_articles import split_mult_art
from langdetect import detect

In [2]:
# Set the number of cores to use
NUM_CORE = mp.cpu_count()

# Dpa Data (1991 - 2019)

In [3]:
# Folder with unpacked articles
path = r'E:\\Userhome\\jbaer\\dpa_unpacked'
folder_list = []

# 2 folders
for fol in [fol for fol in os.listdir(path)]:

    # Within each folder: folders for different years
    for f in [f for f in os.listdir(path + '\\' + fol)]:
        folder_list.append(path + '\\' + fol + '\\' + f)

In [3]:
# Select a path to the folder for storing results
PATH = r'E:\\Userhome\\jbaer\\Media Tenor Results'
os.chdir(PATH)

In [5]:
# Use the 'worker_xml' function to load articles
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    df_list = pool.map(worker_xml.worker_xml, folder_list)
    data = pd.concat(df_list)
    data.reset_index(inplace=True, drop=True)
    pool.close()
    pool.join()

print(datetime.now()-startTime)

2:03:54.342357


In [6]:
print("Number of articles before filtering:", len(data))

Number of articles before filtering: 7539874


In [7]:
data.to_csv('dpa_raw.csv')

In [28]:
#data = pd.read_csv('dpa_raw.csv')
#data.fillna('', inplace=True)

## Filtering

In [8]:
# Filter out articles with less than 75 words
data = data[pd.to_numeric(data['wordcount']) >= 75]
data.reset_index(inplace=True, drop=True)

In [9]:
# Filter out insignificant articles based on keywords, genres etc.
fil_titles = '''Edelmetallpreise|Tageskalender|Tabelle|SPORT|SPORT\:|
                Sport|Berichtigung|Sortenkurse|Devisenkurse|Impressum|Testmeldung|
                Kurse A|Kurse B|Kurse C|DGAP-DD'''
data.drop(data[data['title'].str.contains(fil_titles)].index, inplace=True)

fil_genres = '''Tabelle|Historisches|Achtung|Sport|SPORT'''
data.drop(data[data['genre'].str.contains(fil_genres)].index, inplace=True)

fil_keywords = '''Sport|Redaktionshinweis|SUM|DGAP|Sport|SPORT|SPO'''
data.drop(data[data['keywords'].str.contains(fil_keywords)].index, inplace=True)

data.drop(data[data['rubrics'].str.contains('iq')].index, inplace=True)

fil_texts = '''Schalterverkaufskurse:|dpa-news.de'''
data.drop(data[data['texts'].str.contains(fil_texts)].index, inplace=True)

data.drop(data[data['source'] == 'dpa-frei\dpa-wahl'].index, inplace=True)
data.reset_index(inplace=True, drop=True)

# Filter out articles based on keywords which are specific to the economic news
keyword_clean = '''Kurse|KURSE|kurse|Börse International|Terminbörse|Finanzmärkte International'''
WiPo = data[data['topic'] == 'WiPo'] 
data.drop(WiPo[WiPo['keywords'].str.contains(keyword_clean)].index, inplace = True)
data.reset_index(inplace=True, drop=True)

In [10]:
# free memory space to avoid memory errors
del WiPo
del df_list

import gc

gc.collect()

121

In [11]:
# use the 'numbers_delete' function to filter out economic articles with a high share of numbers in them
delete_index = numbers_delete(data)
data.drop(delete_index, inplace = True)
data.reset_index(inplace=True, drop=True)

### Split up articles

In [17]:
# Store entries potentially consisting of multiple articles separately
# as 'mult art' 
s_mult_art = '''dpa-Nachrichtenüberblick|Nachrichtenüberblick|
                Vorschau|vorschau|VORSCHAU|Tagesvorschau|
                dpa-Tagesvorschau'''

mult_art = data[data['title'].str.contains(s_mult_art)]
mult_art = mult_art.append(data[data['keywords'].str.contains(s_mult_art)])
mult_art = mult_art.append(data[data['genre'].str.contains(s_mult_art)])

In [18]:
# delete 'mult_art' from the original data
data.drop(data[data['title'].str.contains(s_mult_art)].index, inplace=True)
data.drop(data[data['keywords'].str.contains(s_mult_art)].index, inplace=True)
data.drop(data[data['genre'].str.contains(s_mult_art)].index, inplace=True)
data.reset_index(inplace=True, drop=True)

In [19]:
# use the 'split_mult_art' function to split up entries in 
# 'mult_art' and append the resulting new articles to the data
data = data.append(split_mult_art(mult_art))

In [20]:
# free memory space to avoid memory errors
del mult_art

gc.collect()

20

In [21]:
print("Number of articles after first filtering steps:", len(data))

Number of articles after first filtering steps: 5804758


In [22]:
data.to_csv('dpa_filter.csv')

In [None]:
#data = pd.read_csv('dpa_filter1_test.csv')
#data.fillna('', inplace=True)

### Filtering Based on Topics

In [23]:
# Split data into economic and finance articles
afx = data[data['topic'] == 'afx']
afx.reset_index(inplace=True, drop=True)

WiPo = data[data['topic'] == 'WiPo'] 
WiPo.reset_index(inplace=True, drop=True)

WiPo = WiPo.drop(columns=['index'])
afx = afx.drop(columns=['index'])

In [24]:
print("Number of WiPo Articles:", len(WiPo), ", Number of afx Articles:", len(afx))

Number of WiPo Articles: 3008508 , Number of afx Articles: 2796250


In [25]:
# Save intermediate results
WiPo.to_csv('dpa_WiPo_filter_test.csv')
afx.to_csv('dpa_afx_filter_test.csv')

In [26]:
# free memory space to avoid memory errors
del data
del afx

gc.collect()

120

In [5]:
WiPo = pd.read_csv('dpa_WiPo_filter1.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Delete Fuzzy Duplicates

In [6]:
# NOTE: I ignored the multiprocessing function to avoid memory issues

import fuzzy_duplicates
delete_indices = []
 
for year, data_yearly in WiPo.groupby('year'):
    
    data_yearly = data_yearly.reset_index()
    # Prepare inputs
    inputs_year = []
    inputs_month = []

    for month in list(set(data_yearly['month'])):
        inputs_year.append(year)
        inputs_month.append(month)

    inputs = list(zip(inputs_year, inputs_month, repeat(data_yearly)))
    
    startTime = datetime.now()
    
    for inp in inputs:

        delete_indices.append(fuzzy_duplicates.fuzzy_duplicates(inp))

        # Apply function to all combinations of month-year in parallel
        # if __name__ == "__main__":
        # pool = mp.Pool(NUM_CORE)
        # delete_indices.append(pool.map(fuzzy_duplicates.fuzzy_duplicates, inputs))
        # pool.close()
        # pool.join()

    print(datetime.now()-startTime)

merged = list(itertools.chain(*delete_indices))
WiPo.drop(merged, inplace=True)

0:27:54.155425
0:28:46.127699
0:29:19.964563
0:28:58.137109
0:31:13.391723
0:30:47.166563
0:29:45.029791
0:36:11.646510
0:45:40.567644
0:47:32.491350
0:46:28.792219
0:43:58.462043
0:42:32.873097
0:38:33.170213
0:39:32.574870
0:41:40.436872
0:45:47.384667
0:58:24.250584
0:56:42.222056
0:44:42.390548
0:45:16.469658
0:43:56.798871
0:42:55.383620
0:41:32.612163
0:47:14.749925
0:48:18.405921
0:40:59.334162
0:34:55.400031
0:00:00.020233


In [7]:
len(WiPo)

2651380

### Delete English Articles (optional for afx)

In [59]:
# Import a function that outputs the indices of englisch articles
import identify_eng

# Delete all English articles from the data

for year, data_yearly in afx.groupby('year'):
    
    startTime = datetime.now()

    if __name__ == "__main__":
        pool = mp.Pool(NUM_CORE)
        split_dfs = np.array_split(data_yearly, NUM_CORE)
        index_eng = pool.map(identify_eng.identify_eng, split_dfs)
        index_eng = list(itertools.chain(*index_eng))
        pool.close()
        pool.join()

    print(datetime.now()-startTime)

afx.drop(index_eng, inplace=True)

0:00:06.466414


In [60]:
len(afx)

1827

In [8]:
# Save final results
WiPo.to_csv('dpa_WiPo_filtered.csv')

In [None]:
# Save final results
afx.to_csv('dpa_afx_filtered.csv')

In [None]:
# Load function for cleaning dpa articles
from clean_dp_articles import clean_dpa

In [None]:
# Load preprocessed data
#data = pd.read_csv('WiPo.csv', encoding = 'utf-8', index_col=False, dtype='unicode')
#data.fillna('', inplace=True)

In [None]:
NUM_CORE = mp.cpu_count()

startTime = datetime.now()

if __name__ == "__main__":

    pool = mp.Pool(NUM_CORE)
    # Split data into smaller dataframes
    data_split = np.array_split(data, NUM_CORE)
    # Reset index for each dataframe
    [df.reset_index(inplace=True, drop=True) for df in data_split]
    # Apply clean dpa function to each dataframe
    data_intermediate = pool.map(clean_dpa, data_split)
    data = pd.concat(data_intermediate)
    pool.close()
    pool.join()

print(datetime.now()-startTime)