This notebook identifies and saves the indices of the articles that belong to the training set. It also evaluates the filtering step, which retains only those sentences that contain at least one term related to business cycle conditions.

---

First, we'll pre-process the entire dataset and then identify the articles from the training set.

In [1]:
import numpy as np
import os
import multiprocessing as mp 
from datetime import datetime
from functools import partial
import keep_economy_related_sentences
import re
import remove_words_without_embeddings
import csv
from string import punctuation
from collections import Counter

# Read the raw corpus from 'articles.txt' as a single string
with open('MediaTenor_data/articles.txt', 'r', encoding = 'utf-8') as f:
    articles = f.read()

# Read the labels from 'labels_binary.txt' as a single string
with open('MediaTenor_data/labels_binary.txt', 'r', encoding = 'utf-8') as f:
    labels = f.read()

NUM_CORE = 60 # number of worker processes handed to mp.Pool

# Locate the 'word_embeddings' directory: strip the '<separator>sentiment' component
# from the working directory and append 'word_embeddings'. os.sep / os.path.join are
# used instead of hard-coded backslashes so the path resolves on Windows (same result
# as before) and on POSIX systems alike.
path = os.path.join(os.getcwd().replace(os.sep + 'sentiment', ''), 'word_embeddings')

# Load words related to 'Wirtschaft' and 'Konjunktur'
konjunktur_words = keep_economy_related_sentences.load_words(os.path.join(path, 'konjunktur_synonyms.txt'))
wirtschaft_words = keep_economy_related_sentences.load_words(os.path.join(path, 'wirtschaft_synonyms.txt'))

# Combine the two lists
economy_related_words = konjunktur_words + wirtschaft_words

startTime = datetime.now()

if __name__ == "__main__":
    # Split the corpus once and pair every article with the shared word list
    inputs = [(article_text, economy_related_words) for article_text in articles.split('\n')]
    pool = mp.Pool(NUM_CORE)
    try:
        economy_related_sentences = pool.starmap(keep_economy_related_sentences.keep_economy_related_sentences, inputs)
    finally:
        # Release the worker processes even when one of the tasks raises
        pool.close()
        pool.join()

print(datetime.now()-startTime)

# Rebuild a single string in which every filtered article is followed by ' \n'.
# str.join runs in linear time; repeated '+' concatenation copies the growing
# string on every iteration.
articles = ''.join(filtered + ' \n' for filtered in economy_related_sentences)
    
def remove_multiple_spaces(text):
    """
    Collapse every run of two or more whitespace characters into one space.
    A whitespace character that stands alone is left exactly as it is.
    """
    whitespace_run = re.compile(r'\s{2,}')
    return whitespace_run.sub(' ', text)

def remove_short_words(text):
    """
    Drop every whitespace-separated token that is only one character long
    and return the remaining tokens joined by single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if len(token) > 1:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

def remove_metadata(text, meta_list):
    """
    Cut trailing metadata off a text.

    For every phrase in ``meta_list`` that occurs in ``text``, everything from
    the first occurrence of that phrase onwards is removed. After all phrases
    have been processed, the text therefore ends right before the earliest
    occurrence of any of them.

    The cut is made at the matched phrase itself, not at the first occurrence
    of the bare word 'dokument', so body text that merely mentions 'dokument'
    is preserved. Empty phrases are ignored.

    Args:
        text: The text to clean.
        meta_list: Iterable of metadata marker phrases.

    Returns:
        The text up to (not including) the earliest marker phrase, or the
        unchanged text if no phrase occurs in it.
    """
    for phrase in meta_list:
        # An empty separator would make str.split raise ValueError
        if phrase and phrase in text:
            text = text.split(phrase, 1)[0]
    return text

# Phrases that mark the start of the metadata to cut off
metadata_phrases = ['dokument bihann', 'dokument bid', 'dokument welt', 'dokument bberbr', 'dokument focus']

# Convert articles to lowercase
articles = articles.lower()

# Remove URLs. The dot after 'www' is escaped so that it only matches a literal '.';
# an unescaped '.' also matches a space and would delete the word following "www ".
articles = re.sub(r'https\S+|http\S+|www\.\S+', '', articles)

# Turn '.', '-' and '/' into spaces so the words they separate are not glued together
articles = articles.replace('.', ' ').replace('-', ' ').replace('/', ' ')

# Keep only letters, spaces and newlines. This single pass also removes every
# punctuation character (including '»' and '«') and all digits, so no separate
# punctuation filter is needed.
articles = ''.join([c for c in articles if (c.isalpha() or c in [' ', '\n'])])

# Split articles by new lines and, for each one, collapse whitespace runs,
# drop one-character words and cut off the metadata (in that order)
articles_split = [
    remove_metadata(remove_short_words(remove_multiple_spaces(text)), metadata_phrases)
    for text in articles.split('\n')
]

# Load the words that do not have embeddings; all CSV rows are flattened into one list
with open('words_without_pretrained_vector.csv', newline='', encoding='utf-8-sig') as f:
    words_without_embeddings = [word for row in csv.reader(f) for word in row]

# Remove words that do not have embeddings
startTime = datetime.now()

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    try:
        articles_split = pool.starmap(
            remove_words_without_embeddings.remove_words_without_embeddings,
            [(text, words_without_embeddings) for text in articles_split],
        )
    finally:
        # Release the worker processes even when one of the tasks raises
        pool.close()
        pool.join()

print(datetime.now()-startTime)

# Concatenate all articles and tokenize on whitespace to get every word in the data
all_text = ' '.join(articles_split)
words = all_text.split()

# Word frequencies
word_counts = Counter(words)

# Vocabulary ordered from the most to the least frequent word
sorted_vocab = sorted(word_counts, key=lambda word: word_counts[word], reverse=True)

# Map each word to a unique integer id starting at 2;
# ids 0 and 1 are reserved for padding and unknown words
vocab_to_int = dict(zip(sorted_vocab, range(2, len(sorted_vocab) + 2)))
vocab_to_int.update({'<pad>': 0, '<unk>': 1})

# Encode each article as the list of its word ids
articles_ints = [list(map(vocab_to_int.__getitem__, text.split())) for text in articles_split]

# Positions of the articles that contain more than 20 tokens
valid_article_indices = [
    position for position, tokens in enumerate(articles_ints) if len(tokens) > 20
]

# The first `split_idx` valid articles form the training set
split_idx = 1920
train_indices = valid_article_indices[:split_idx]

# Save the train indices as a single CSV row
with open('train_indices.csv', 'w', newline='') as f:
    csv.writer(f).writerow(train_indices)

0:00:54.174750
0:00:08.691316


Now, we're ready to assess how well our filtering step works by comparing the original articles with their filtered versions. The filtering retains only sentences that contain at least one term related to business cycle conditions. This allows us to verify that our method is accurately focusing on the relevant parts of the text.

In [2]:
# Reload the raw articles from 'articles.txt', one list entry per line
with open('MediaTenor_data/articles.txt', 'r', encoding = 'utf-8') as f:
    raw_articles_text = f.read()
articles = raw_articles_text.split('\n')

# Reload the labels from 'labels_binary.txt', one list entry per line
with open('MediaTenor_data/labels_binary.txt', 'r', encoding = 'utf-8') as f:
    raw_labels_text = f.read()
labels = raw_labels_text.split('\n')

# Pick the articles and labels at the training-set positions
train_articles = list(map(articles.__getitem__, train_indices))
train_labels = list(map(labels.__getitem__, train_indices))

In [3]:
from IPython.display import display, HTML

def highlight_economy_words(text, words):
    """
    Wrap every occurrence of the given words in a blue HTML ``<span>``.

    A word is highlighted when it appears as a whole word (matched
    case-insensitively), is not directly preceded by a hyphen, and is followed
    by whitespace, by one of ``. , ; ! ? ) :`` or by the end of the text.

    All words are matched in one pass over the original text, so markup that
    was inserted for one word is never scanned again for another word. Empty
    entries in ``words`` are ignored, and when several entries could match at
    the same position the longest one is tried first.

    The text itself is inserted as-is; it is not HTML-escaped.

    Args:
        text: The text to annotate.
        words: Iterable of words or phrases to highlight.

    Returns:
        The text with every match wrapped in ``<span style='color:blue'>``;
        the unchanged text if ``words`` has no non-empty entry.
    """
    def replace(match):
        return f"<span style='color:blue'>{match.group(0)}</span>"

    # Drop empty strings and duplicates; longer entries first so that a phrase
    # takes precedence over a shorter word it starts with
    candidates = sorted({word for word in words if word}, key=lambda word: (-len(word), word))
    if not candidates:
        return text

    alternatives = '|'.join(re.escape(word) for word in candidates)
    # Whole word, not preceded by a hyphen, followed by common punctuation,
    # whitespace or the end of the text
    pattern = r'(?<!-)\b(?:' + alternatives + r')\b(?=[.,;!?\):]|\s|$)'
    return re.sub(pattern, replace, text, flags=re.IGNORECASE)

In [4]:
# Position (within the training subset) of the article to inspect
index_explore = 350

# Show the raw article
article_to_explore = train_articles[index_explore]
print(article_to_explore)

# Fetch its label and compute the filtered version of the same article
label_to_explore = train_labels[index_explore]
economy_related_sentences_example = keep_economy_related_sentences.keep_economy_related_sentences(
    article_to_explore,
    economy_related_words,
)

Berlin - 2017 verspricht ein sonniges Jahr für die deutsche Wirtschaft zu werden.  Volkswirte rechnen mit einer stabilen Aufwärtsentwicklung. Dafür sprechen die Erholung der Weltkonjunktur und die verbesserten Exportchancen für die Industrie. Auch globale Risiken durch den Brexit hätten sich abgeschwächt, betonen Volkswirte deutscher Großbanken.  "Die Konjunktur läuft immer besser, die Unternehmen werden immer optimistischer. Was sich derzeit bei der Konjunktur zeigt, geht über unsere Erwartungen hinaus", sagt Allianz-Volkswirt Rolf Schneider.  Optimistisch zeigt sich auch DZ-Bank-Ökonom Michael Holstein, der für 2017 mit einem Wachstum der deutschen Wirtschaft von 1,7 Prozent rechnet. Für den Commerzbank-Konjunkturexperten Eckart Tuchtfeld kommt hinzu, dass inzwischen in vielen Chefetagen die Sorge vor einer Abschottung des US-Marktes eine untergeordnete Rolle spiele. 


In [5]:
# Render the unfiltered article with the economy-related terms highlighted
highlighted_original = highlight_economy_words(article_to_explore, economy_related_words)
display(HTML(f"<h3>Original Article:</h3><p>{highlighted_original}</p>"))

In [6]:
# Render the label first
display(HTML(f"<h3>Label:</h3><p>{label_to_explore}</p>"))

# Then render the filtered article with the economy-related terms highlighted
highlighted_filtered = highlight_economy_words(economy_related_sentences_example, economy_related_words)
display(HTML(f"<h3>Transformed Article (Economy-related sentences):</h3><p>{highlighted_filtered}</p>"))