## 1. Load essential libraries

In [1]:
import os, json
import pandas as pd
import numpy as np
import regex as re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag_sents
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## 2. Upload Data

In [2]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
except:
    pass

Mounted at /content/drive


In [3]:
path = os.getcwd()+'/drive/MyDrive/Fiscal_Policy_Project/Data/Bundesfinanzministerium'  

In [4]:
contents = [x for x in os.listdir(path) if not os.path.isfile(os.path.join(path,x))]

# List of all files in a list of tuples (folder, filename)
json_files = []
for x in contents:
    files = os.listdir(os.path.join(path, x))
    for file in files:
        if file.endswith('.json'):
            json_files.append((x, file))

# 3. Create Dataframe

In [5]:
#Create dataframe for files
df = pd.DataFrame(data=json_files, columns=["Filetype", "Filename"])

#get first and last files
df.head().append(df.tail())

  df.head().append(df.tail())


Unnamed: 0,Filetype,Filename
0,Speeches,Rede_2005-02-21_801.json
1,Speeches,Rede_2005-12-19_753.json
2,Speeches,Rede_2019-05-09_76.json
3,Speeches,Rede_2007-01-25_791.json
4,Speeches,Rede_2006-03-28_772.json
533,Interviews,2022-09-15-47.json
534,Interviews,2021-04-10-132.json
535,Interviews,2021-02-06-121.json
536,Interviews,2021-05-05-176.json
537,Interviews,2022-08-19-52.json


In [6]:
# Pandas dataframe that will have the contents of file DATE and TEXT along with Folder and name

jsons_data = pd.DataFrame(columns=['DATE', 'AUTHOR', 'TEXT', 'FOLDER', 'FILE NAME'])

for i, file in enumerate(json_files):
    with open(os.path.join(path, file[0], file[1]), 'r', encoding='utf-8') as fd:
      json_text = json.load(fd) 
      
      TEXT = json_text.get('text')
      DATE = json_text.get('published_date')
      AUTHOR = json_text.get('author')
        
      # Push a list of data into a pandas DataFrame at row given by 'index'
      jsons_data.loc[i] = [DATE, AUTHOR, TEXT, file[0], file[1]]
        
jsons_data

Unnamed: 0,DATE,AUTHOR,TEXT,FOLDER,FILE NAME
0,2005-02-21,Hans Eichel,Es gilt das gesprochene Wort!\n \n\n Sehr ...,Speeches,Rede_2005-02-21_801.json
1,2005-12-19,Peer Steinbrück,Es gilt das gesprochene Wort!\n \n \nSehr geeh...,Speeches,Rede_2005-12-19_753.json
2,2019-05-09,Olaf Scholz,Am 9. Mai 2019 sprach Bundesfinanzminister Ola...,Speeches,Rede_2019-05-09_76.json
3,2007-01-25,,Es gilt das gesprochene Wort!Sehr geehrter Her...,Speeches,Rede_2007-01-25_791.json
4,2006-03-28,Peer Steinbrück,28. März 2006\nEs gilt das gesprochene Wort! ...,Speeches,Rede_2006-03-28_772.json
...,...,...,...,...,...
533,2022-09-15,Christian Lindner,"Jüdische Allgemeine: Herr Minister, das Luxemb...",Interviews,2022-09-15-47.json
534,2021-04-10,,"taz: Herr Scholz, die Grünen wollen in Baden-W...",Interviews,2021-04-10-132.json
535,2021-02-06,,"RND: Herr Scholz, hat Friedrich Merz Sie schon...",Interviews,2021-02-06-121.json
536,2021-05-05,,"Handelsblatt: Herr Scholz, die Pandemie hat di...",Interviews,2021-05-05-176.json


# 4. Clean Data

In [7]:
# Check for Na
jsons_data['TEXT'].isna().sum()

0

In [8]:
def preprocess(txt):
    txt = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", txt)
    txt = txt.lower()
    txt = re.sub(r"\W"," ",txt)
    txt = re.sub(r"\d"," ",txt)
    txt = re.sub(r"\s+[a-z]\s+"," ",txt)
    txt = re.sub(r"\s+[a-z]$"," ",txt)
    txt = re.sub(r"^[a-z]\s+"," ",txt)
    txt = re.sub(r"\s+"," ",txt) 
    return txt


In [9]:
jsons_data['Processed TEXT'] = jsons_data['TEXT'].apply(lambda txt: preprocess(txt))

In [10]:
for index, row in jsons_data.iterrows():
    if row['Processed TEXT'] == '':
        jsons_data.drop(index, inplace=True)

In [11]:
jsons_data['Processed TEXT']

0      es gilt das gesprochene wort sehr geehrte dame...
1      es gilt das gesprochene wort sehr geehrter her...
2      am mai sprach bundesfinanzminister olaf scholz...
3      es gilt das gesprochene wort sehr geehrter her...
4       märz es gilt das gesprochene wort rededes bun...
                             ...                        
533    jüdische allgemeine herr minister das luxembur...
534    taz herr scholz die grünen wollen in baden wür...
535    rnd herr scholz hat friedrich merz sie schon a...
536    handelsblatt herr scholz die pandemie hat die ...
537    rp herr lindner die gasumlage belastet haushal...
Name: Processed TEXT, Length: 538, dtype: object

# - 5. Remove Stopwords

In [None]:
stopwords.words('german')

['aber',
 'alle',
 'allem',
 'allen',
 'aller',
 'alles',
 'als',
 'also',
 'am',
 'an',
 'ander',
 'andere',
 'anderem',
 'anderen',
 'anderer',
 'anderes',
 'anderm',
 'andern',
 'anderr',
 'anders',
 'auch',
 'auf',
 'aus',
 'bei',
 'bin',
 'bis',
 'bist',
 'da',
 'damit',
 'dann',
 'der',
 'den',
 'des',
 'dem',
 'die',
 'das',
 'dass',
 'daß',
 'derselbe',
 'derselben',
 'denselben',
 'desselben',
 'demselben',
 'dieselbe',
 'dieselben',
 'dasselbe',
 'dazu',
 'dein',
 'deine',
 'deinem',
 'deinen',
 'deiner',
 'deines',
 'denn',
 'derer',
 'dessen',
 'dich',
 'dir',
 'du',
 'dies',
 'diese',
 'diesem',
 'diesen',
 'dieser',
 'dieses',
 'doch',
 'dort',
 'durch',
 'ein',
 'eine',
 'einem',
 'einen',
 'einer',
 'eines',
 'einig',
 'einige',
 'einigem',
 'einigen',
 'einiger',
 'einiges',
 'einmal',
 'er',
 'ihn',
 'ihm',
 'es',
 'etwas',
 'euer',
 'eure',
 'eurem',
 'euren',
 'eurer',
 'eures',
 'für',
 'gegen',
 'gewesen',
 'hab',
 'habe',
 'haben',
 'hat',
 'hatte',
 'hatten',
 '

In [None]:
counter=0
def remove_stopwords(text,language,processed,verbose):
    word_list = text.split() 
    filtered_words = [word for word in word_list if word not in stopwords.words(language)]
    text = ' '.join(filtered_words)
    global counter
    counter+=1
    if verbose==1 and counter % 10 == 0:
        print(f"processed {counter}/{len(processed)}")
    if counter >= len(processed):
        counter=0
    return text

In [None]:
jsons_data['Processed TEXT'] = jsons_data['Processed TEXT'].apply(lambda text:remove_stopwords(text,'german',jsons_data['Processed TEXT'],verbose=1))


processed 10/6997
processed 20/6997
processed 30/6997
processed 40/6997
processed 50/6997
processed 60/6997
processed 70/6997
processed 80/6997
processed 90/6997
processed 100/6997
processed 110/6997
processed 120/6997
processed 130/6997
processed 140/6997
processed 150/6997
processed 160/6997
processed 170/6997
processed 180/6997
processed 190/6997
processed 200/6997
processed 210/6997
processed 220/6997
processed 230/6997
processed 240/6997
processed 250/6997
processed 260/6997
processed 270/6997
processed 280/6997
processed 290/6997
processed 300/6997
processed 310/6997
processed 320/6997
processed 330/6997
processed 340/6997
processed 350/6997
processed 360/6997
processed 370/6997
processed 380/6997
processed 390/6997
processed 400/6997
processed 410/6997
processed 420/6997
processed 430/6997
processed 440/6997
processed 450/6997
processed 460/6997
processed 470/6997
processed 480/6997
processed 490/6997
processed 500/6997
processed 510/6997
processed 520/6997
processed 530/6997
pr

In [12]:
with open(path+'/jsons_data_BMF.pickle','wb') as f:
    pickle.dump(jsons_data,f)