In this notebook, I will identify all uncertainty-related terms in our corpus and calculate their frequencies. First, I load the datasets from Handelsblatt, SZ, Welt, and dpa.

In [1]:
import os
import pandas as pd
from ast import literal_eval

# Locate the sibling 'newspaper_data_processing' directory relative to the
# current working directory. Separators come from `os` instead of hard-coded
# backslashes, so the substitution also works outside Windows.
path = os.getcwd().replace(
    os.sep + os.path.join('nowcasting_with_text', 'uncertainty'),
    os.sep + 'newspaper_data_processing',
)

# Only the article texts and their publication dates are needed in this notebook.
TEXT_AND_DATE_COLUMNS = ['texts', 'day', 'month', 'year']


def _load_corpus(folder, filename, **read_csv_kwargs):
    """Load one pre-processed newspaper CSV, keeping only texts and dates.

    Parameters
    ----------
    folder : str
        Sub-directory of `path` that holds the file (e.g. 'dpa').
    filename : str
        Name of the semicolon-separated CSV file inside `folder`.
    **read_csv_kwargs
        Source-specific options forwarded to `pandas.read_csv`
        (e.g. `encoding`, `dtype`, `keep_default_na`).

    Returns
    -------
    pandas.DataFrame
        The columns 'texts', 'day', 'month' and 'year' of the file.
    """
    frame = pd.read_csv(os.path.join(path, folder, filename), sep=';', index_col=0, **read_csv_kwargs)
    return frame[TEXT_AND_DATE_COLUMNS]


# NOTE: all metadata columns are discarded right after loading. Filling their
# NaNs, or parsing 'paragraphs' with `literal_eval`, therefore has no effect on
# the result and is omitted. The dtype hints are kept so that the files are
# parsed the same way as before.

# Load pre-processed 'dpa' dataset from a CSV file.
dpa = _load_corpus('dpa', 'dpa_prepro_final.csv', encoding='utf-8', keep_default_na=False,
                   dtype={'rubrics': 'str',
                          'source': 'str',
                          'keywords': 'str',
                          'title': 'str',
                          'city': 'str',
                          'genre': 'str',
                          'wordcount': 'str'})

# Load pre-processed 'SZ' dataset from a CSV file.
sz = _load_corpus('SZ', 'sz_prepro_final.csv', encoding='utf-8-sig',
                  dtype={'newspaper': 'str',
                         'newspaper_2': 'str',
                         'quelle_texts': 'str',
                         'page': 'str',
                         'rubrics': 'str'})

# Load pre-processed 'Handelsblatt' dataset from a CSV file.
hb = _load_corpus('Handelsblatt', 'hb_prepro_final.csv', encoding='utf-8-sig',
                  dtype={'kicker': 'str',
                         'page': 'str',
                         'series_title': 'str',
                         'rubrics': 'str'})

# Load pre-processed 'Welt' dataset from a CSV file.
welt = _load_corpus('Welt', 'welt_prepro_final.csv', encoding='utf-8-sig',
                    dtype={'newspaper': 'str',
                           'rubrics': 'str',
                           'title': 'str'})

# Concatenate the 'dpa', 'sz', 'hb', and 'welt' DataFrames into a single DataFrame 'data'.
data = pd.concat([dpa, sz, hb, welt])

# The number of articles in the final dataset.
print(len(data))

# Sort the data in chronological order and renumber the rows from 0.
# A new frame is assigned (no `inplace=True`) so the cell can be re-run safely.
data = data.sort_values(['year', 'month', 'day']).reset_index(drop=True)
data.head()

3336299


Unnamed: 0,texts,day,month,year
0,Schalck: Milliardenkredit sicherte Zahlungsfäh...,1,1,1991
1,Welajati: Iran bleibt bei einem Krieg am Golf ...,1,1,1991
2,Bush will offenbar seinen Außenminister erneut...,1,1,1991
3,Sperrfrist 1. Januar 1000 HBV fordert umfassen...,1,1,1991
4,Schamir weist Nahost-Äußerungen des neuen EG-P...,1,1,1991


I utilize a function, `find_unsicher`, to search through all the texts in the corpus for occurrences of words containing the substring "unsicher". I systematically lowercase each text to ensure the search is case-insensitive, then employ a regular expression to identify and extract these words.

In [2]:
# Use multiprocessing module for parallel computing
import multiprocessing as mp
from datetime import datetime

import find_unsicher

# Number of worker processes: at most 30, but never more than the machine
# actually has cores, so the cell does not oversubscribe smaller machines.
NUM_CORE = min(30, mp.cpu_count())

startTime = datetime.now()

# The guard is true when this runs as a notebook cell; it is kept so the code
# stays safe if it is exported to a script on a platform that spawns workers.
if __name__ == "__main__":
    # The context manager releases the worker processes even if `map` raises,
    # instead of leaving an orphaned pool behind.
    with mp.Pool(NUM_CORE) as pool:
        # One result per article text, in the same order as `data.texts`.
        list_of_results = pool.map(find_unsicher.find_unsicher, data.texts.tolist())

# Wall-clock duration of the parallel search.
print(datetime.now()-startTime)

0:02:20.970416


Now I extract unique terms that contain the substring "unsicher" and save them to a text file. 

In [3]:
# Flatten the per-text match lists and keep every distinct term exactly once.
uncertainty_terms = {term for matches in list_of_results for term in matches}
print(len(uncertainty_terms))

417


In [4]:
# Destination of the term list
file_path = 'uncertainty_terms.txt'

# One term per line, in sorted order
uncertainty_terms_data = "\n".join(sorted(uncertainty_terms))

# Persist the list as UTF-8 encoded text
with open(file_path, mode='w', encoding='utf-8') as out_file:
    out_file.write(uncertainty_terms_data)

To understand the frequency of various uncertainty terms across a corpus, I compute the occurrence count of each term from the `uncertainty_terms` set within individual texts, producing dictionaries where the keys are words from `uncertainty_terms` and the values represent their respective counts across the texts.

In [5]:
import count_uncertainty_terms

# Number of worker processes: at most 60, but never more than the machine
# actually has cores, so the cell does not oversubscribe smaller machines.
NUM_CORE = min(60, mp.cpu_count())

startTime = datetime.now()

# Create an iterable of tuples, each containing a text and the uncertainty_terms
args = [(text, uncertainty_terms) for text in data.texts]

# The guard is true when this runs as a notebook cell; it is kept so the code
# stays safe if it is exported to a script on a platform that spawns workers.
if __name__ == "__main__":
    # The context manager releases the worker processes even if `starmap`
    # raises, instead of leaving an orphaned pool behind.
    with mp.Pool(NUM_CORE) as pool:
        # One result per article text, in the same order as `data.texts`;
        # each tuple in `args` is unpacked into the function's two arguments.
        list_of_counts = pool.starmap(count_uncertainty_terms.count_uncertainty_terms, args)

# Wall-clock duration of the parallel count.
print(datetime.now()-startTime)

1:02:11.720714


Next, I aggregate and sort the occurrence counts of uncertainty terms across the entire corpus.

In [6]:
# Sum the per-text counts term by term over the whole corpus
aggregate_counts = {}
for per_text_counts in list_of_counts:
    for word, occurrences in per_text_counts.items():
        # Terms not seen so far start from zero
        aggregate_counts[word] = aggregate_counts.get(word, 0) + occurrences

# Order the (term, total) pairs from most to least frequent
sorted_counts = sorted(aggregate_counts.items(), key=lambda pair: pair[1], reverse=True)

Inspecting the corpus-wide frequencies of the uncertainty-related terms, I identify two distinct categories: general uncertainty terms (e.g., "Unsicherheit", "Verunsicherung", or "unsicher") and German compounds that include the concept of uncertainty (e.g., "Unsicherheitsfaktor", "Rechtsunsicherheit", or "Zinsunsicherheit"). I save the sorted list `sorted_counts` to a CSV file.

In [8]:
import csv

# Target CSV file for the corpus-wide term counts
filename = "sorted_counts.csv"

# newline='' leaves line endings to the csv module, which prevents extra blank lines
with open(filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.writer(csvfile)
    # Header row first ...
    writer.writerow(("Word", "Count"))
    # ... then one (term, count) row per entry, already sorted by count
    writer.writerows(sorted_counts)

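The following cell offers a faster alternative for calculating the frequencies of all uncertainty terms across the corpus, using scikit-learn's `CountVectorizer` with the uncertainty terms as a fixed vocabulary. It yields the same counts as the `count_uncertainty_terms` function (verified in the next cell) but more efficiently, thanks to scikit-learn's optimized implementation: about 23 minutes instead of roughly 62 minutes in the runs recorded in this notebook.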

In [9]:
startTime = datetime.now()
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Convert the set of uncertainty terms to a list to use as vocabulary for the CountVectorizer
vocabulary = list(uncertainty_terms)

# Initialize CountVectorizer with the vocabulary of uncertainty terms
vectorizer = CountVectorizer(vocabulary=vocabulary)

# Transform the texts into a sparse document-term count matrix
# (rows: texts, columns: vocabulary terms)
X = vectorizer.fit_transform(data.texts)

# Sum each column to get the corpus-wide count per term. `ravel()` always
# yields a 1-D array, whereas `np.squeeze` would collapse a one-term
# vocabulary to a 0-d array and break the `zip` below.
term_frequencies = np.asarray(X.sum(axis=0)).ravel().tolist()

# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Map the term frequencies back to the corresponding terms
term_frequency_dict = dict(zip(feature_names, term_frequencies))

# Sort the (term, count) pairs by frequency, most frequent first
sorted_term_frequency = sorted(term_frequency_dict.items(), key=lambda x: x[1], reverse=True)

# Wall-clock duration of the vectorizer-based count.
print(datetime.now()-startTime)

0:23:01.312013


The following check confirms that the two approaches indeed produce identical counts.

In [10]:
# Compare the two term -> count mappings; the ordering of the sorted lists is irrelevant here
are_identical = dict(sorted_counts) == dict(sorted_term_frequency)

print("The dictionaries are identical." if are_identical else "The dictionaries are not identical.")

The dictionaries are identical.
