In [2]:
import pandas as pd
import json
from datetime import datetime, date, timedelta
import glob
import re
import spacy
nlp = spacy.load('de_dep_news_trf')

In [3]:
# Directory with the JSON input files; '*.json' is appended to it when globbing.
storage = r'../scraping/data/storage/'

In [20]:
def timestamp_to_date(timestamp):
    """Convert a POSIX timestamp into a calendar date.

    Delegates to ``date.fromtimestamp``, which interprets the timestamp in
    the local time zone of the machine running the notebook.
    """
    local_day = date.fromtimestamp(timestamp)
    return local_day

def format_text(text):
    """Clean a raw post text before it is analysed.

    Steps, in order: drop every character that cannot be encoded as Latin-1,
    remove hashtags, remove links, and collapse runs of whitespace, quotes
    and the punctuation marks ``: + ? !`` into a single space.
    """
    # Round-trip through Latin-1; characters outside that charset are discarded.
    cleaned = text.encode('latin-1', 'ignore').decode('latin-1')

    # (pattern, replacement) pairs, applied one after another.
    substitutions = (
        (r'#\w+', ''),                  # remove hashtags
        (r'http\S+', ''),               # remove links
        (r'[\n\t\ \"\':+?!]+', ' '),    # control symbols / other characters
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def format_words(word):
    """Strip punctuation, digits and spaces from a single word.

    Removes every occurrence of ``[ - . , : ? & ( ) ! @ # | $``, the digits
    0-9 and the space character; all other characters are kept.

    The pattern is a raw string so that ``\\[`` and ``\\-`` reach the regex
    engine unchanged instead of being invalid string escapes (a
    SyntaxWarning on recent Python versions).
    """
    # NOTE(review): '[' is removed but ']' is not — confirm whether that
    # asymmetry is intended before relying on this for bracketed tokens.
    return re.sub(r'[\[\-.,:?&()!@#|$0-9 ]', '', word)

# Fill ignore_words
# Using stop words from https://countwordsfree.com/stopwords/german
# The file is read with the JSON parser instead of slicing the raw text, so
# whitespace, line breaks and escaped characters inside the file are handled.
# Assumes the file holds a single JSON array of strings — the layout the
# previous manual slicing relied on as well.
with open(r'stop_words_german.json', 'r', encoding='utf-8') as words:
    ignore_words = list(json.load(words))

def frame_to_dic(dataframe):
    """Count proper nouns in the ``text`` column of ``dataframe``.

    Every text is run through the module-level spaCy pipeline ``nlp``; tokens
    tagged ``PROPN`` are counted under their upper-cased lemma.

    Returns a dict mapping upper-cased lemma -> number of occurrences.
    """
    counts = {}
    for text in dataframe['text'].to_numpy():
        for token in nlp(text):
            # Anything that is not a proper noun is skipped.
            if token.pos_ != 'PROPN':
                continue
            key = token.lemma_.upper()
            counts[key] = counts.get(key, 0) + 1
    return counts

def get_dic_around(date, dic_frame, day_range):
    """Aggregate the word counts of all days within ``day_range`` days of ``date``.

    Parameters
    ----------
    date : datetime.date, datetime.datetime or pandas.Timestamp
        Centre of the window.
    dic_frame : pandas.DataFrame
        Frame with a ``date`` column and a ``dic`` column holding one
        ``{word: count}`` dict per row.
    day_range : int
        Number of days included before and after ``date`` (both inclusive).

    Returns
    -------
    dict
        ``{word: count}`` where each count is the sum over all rows inside
        the window. Empty when no row falls into the window.
    """
    center = pd.Timestamp(date)
    span = pd.Timedelta(days=day_range)

    # Normalise the column to Timestamps so the comparison works regardless of
    # whether the frame stores datetime.date objects or Timestamps.
    frame_dates = pd.to_datetime(dic_frame['date'])
    in_window = (frame_dates >= center - span) & (frame_dates <= center + span)

    res = {}
    for daily_counts in dic_frame.loc[in_window, 'dic']:
        # Add the counts up; merging with {**res, **daily_counts} would let a
        # later day overwrite the counts of an earlier one.
        for word, count in daily_counts.items():
            res[word] = res.get(word, 0) + count
    return res

def get_top_k(dictionary, k):
    """Return the ``k`` highest-valued entries of ``dictionary`` as relative weights.

    Entries are ranked by value in descending order (ties keep their
    insertion order). Each returned weight is the entry's value divided by
    the sum of the selected top-``k`` values.
    """
    ranked = sorted(dictionary.items(), key=lambda entry: entry[1], reverse=True)
    top_entries = ranked[:k]
    total = sum([value for _, value in top_entries])
    return {key: value / total for key, value in top_entries}

### Initialize empty dataframe

In [5]:
# Empty frame with the columns used for the collected posts.
post_columns = ['date', 'user', 'shortcode', 'platform', 'text']
df = pd.DataFrame(columns=post_columns)

### Daten hinzufügen

In [6]:
files = glob.glob(storage + '*.json')

# Matching posts are collected in a plain list and turned into a DataFrame
# once at the end. Appending row by row copies the whole frame on every
# insert, and DataFrame.append no longer exists in pandas 2.x.
records = []

for path in files:
    # The context manager closes the file even if the JSON cannot be parsed.
    with open(path, mode="r", encoding="utf-8") as f:
        data = json.load(f)

    for post in data:
        # Case-insensitive keyword filter on the raw text.
        text_upper = post['text'].upper()

        if ('CORONA' in text_upper) or ('COVID' in text_upper) or ('SARS-COV-2' in text_upper) or ('SARS COV' in text_upper):
            # The local names deliberately avoid `date`, which would rebind
            # the `date` class imported from datetime for the whole notebook.
            records.append({
                'date': timestamp_to_date(post['date']),
                'user': post['user'],
                'shortcode': post['shortcode'],
                'platform': post['platform'],
                'text': format_text(post['text']),
            })

    print('Completed:', path)

# Built from scratch, so re-running this cell does not duplicate rows.
df = pd.DataFrame(records, columns=['date', 'user', 'shortcode', 'platform', 'text'])
print('Done!')

Completed: ../scraping/data/storage\2020_1.json
Completed: ../scraping/data/storage\2020_10.json
Completed: ../scraping/data/storage\2020_11.json
Completed: ../scraping/data/storage\2020_12.json
Completed: ../scraping/data/storage\2020_2.json
Completed: ../scraping/data/storage\2020_3.json
Completed: ../scraping/data/storage\2020_4.json
Completed: ../scraping/data/storage\2020_5.json
Completed: ../scraping/data/storage\2020_6.json
Completed: ../scraping/data/storage\2020_7.json
Completed: ../scraping/data/storage\2020_8.json
Completed: ../scraping/data/storage\2020_9.json
Completed: ../scraping/data/storage\2021_1.json
Completed: ../scraping/data/storage\2021_2.json
Completed: ../scraping/data/storage\2021_3.json
Completed: ../scraping/data/storage\2021_4.json
Done!


### Taking a look at the dataframe

In [7]:
# Show the collected posts; pandas abbreviates the rendered table.
df

Unnamed: 0,date,user,shortcode,platform,text
0,2020-01-28,BR_Presse,1222203172309151746,twitter,Sondersendung Gesundheit spezial in Bayern Wie...
1,2020-01-31,derspiegel,1223347087716364288,twitter,127 Tests waren negativ und einer positiv Mita...
2,2020-01-31,derspiegel,1223319549614804992,twitter,Auch in manchen deutschen Apotheken werden Sch...
3,2020-01-31,derspiegel,1223225154702102534,twitter,"Und für alle, die noch mehr wissen wollen, hie..."
4,2020-01-31,derspiegel,1223222791278931968,twitter,In Bayern sind mittlerweile fünf -Fälle bekann...
...,...,...,...,...,...
55688,2021-04-01,welt,1377551392387297280,twitter,Frankreich und Italien verschärfen ihre Corona...
55689,2021-04-01,welt,1377535482419089408,twitter,Corona Neue Variante in Brasilien aufgetaucht ...
55690,2021-04-01,welt,1377528682865131522,twitter,Alles nur geklaut So dreist entstand die Anti-...
55691,2021-04-01,welt,1377488458869051393,twitter,Corona Dramatische Prognose vom RKI droht sich...


### Building the dictionary frame

In [23]:
# One row per calendar day: the date and the proper-noun counts of all posts
# published on that day. Rows are gathered in a list and the frame is built
# once, because DataFrame.append inside a loop is quadratic and was removed
# in pandas 2.x.
daily_records = []

# sort=False yields the groups in order of first appearance, so the row
# labels of dic_frame match the order in which the dates occur in df.
for day, frame in df.groupby('date', sort=False):
    print(day)
    daily_records.append({'date': day, 'dic': frame_to_dic(frame)})

dic_frame = pd.DataFrame(daily_records, columns=['date', 'dic'])

# Sorting keeps the original row labels; the index is not reset.
dic_frame = dic_frame.sort_values(by=['date'])
dic_frame

2020-01-28
2020-01-31
2020-01-30
2020-01-29
2020-01-27
2020-01-26
2020-01-25
2020-01-24
2020-01-23
2020-01-22
2020-01-21
2020-01-17
2020-01-20
2020-01-19
2020-01-11
2020-01-18
2020-01-09
2020-10-28
2020-10-27
2020-10-26
2020-10-21
2020-10-12
2020-10-31
2020-10-30
2020-10-29
2020-10-25
2020-10-24
2020-10-23
2020-10-22
2020-10-20
2020-10-19
2020-10-18
2020-10-17
2020-10-16
2020-10-15
2020-10-14
2020-10-13
2020-10-11
2020-10-10
2020-10-09
2020-10-08
2020-10-07
2020-10-06
2020-10-05
2020-10-04
2020-10-03
2020-10-02
2020-10-01
2020-11-09
2020-11-07
2020-11-30
2020-11-29
2020-11-28
2020-11-27
2020-11-26
2020-11-25
2020-11-24
2020-11-23
2020-11-22
2020-11-21
2020-11-20
2020-11-19
2020-11-18
2020-11-17
2020-11-16
2020-11-15
2020-11-14
2020-11-13
2020-11-12
2020-11-11
2020-11-10
2020-11-08
2020-11-06
2020-11-05
2020-11-04
2020-11-03
2020-11-02
2020-11-01
2020-12-12
2020-12-06
2020-12-31
2020-12-30
2020-12-29
2020-12-28
2020-12-27
2020-12-26
2020-12-25
2020-12-24
2020-12-23
2020-12-22
2020-12-21

Unnamed: 0,date,dic
16,2020-01-09,"{'WUHAN': 1, 'WHO': 1, 'SARS': 1, 'MERS': 1}"
14,2020-01-11,"{'CHINA': 2, 'WUHAN': 1, 'CORONA-VIRUS': 1}"
11,2020-01-17,{'CHINA': 1}
15,2020-01-18,{'CHINA': 1}
13,2020-01-19,"{'CHINA': 4, 'CORONA-VIRUS': 1, 'WUHAN': 1}"
...,...,...
446,2021-04-04,"{'SPIEGEL': 1, 'NIEDERLANDE': 6, 'DEUTSCHLAND'..."
445,2021-04-05,"{'ARMIN': 3, 'LAUTERBACH': 3, 'SPAHN': 4, 'MÜN..."
444,2021-04-06,"{'KARLSRUHE': 1, 'COVID-19': 1, 'CORONA': 14, ..."
443,2021-04-07,"{'MICHAEL': 2, 'KÖLCH': 1, 'CORONA': 22, 'SCOO..."


In [33]:
# Inspect the 25 top-weighted proper nouns of a single day.
# NOTE: dic_frame was sorted by date without resetting its index, so [302]
# selects the row labelled 302, not the row at position 302.
get_top_k(dic_frame['dic'][302], 25)

{'DEUTSCHLAND': 0.1111111111111111,
 'CORONA': 0.07407407407407407,
 '@BENDZKO': 0.07407407407407407,
 'LEIPZIG': 0.07407407407407407,
 'BADEN-WÜRTTEMBERG': 0.07407407407407407,
 '@RKI_DE': 0.05555555555555555,
 'ITALIEN': 0.05555555555555555,
 'TIM': 0.05555555555555555,
 '@MSI_BW': 0.05555555555555555,
 'HANAU': 0.037037037037037035,
 'RKI': 0.037037037037037035,
 'USA': 0.037037037037037035,
 'LASCHET': 0.037037037037037035,
 'ROBERT': 0.018518518518518517,
 'KOCH-INSTITUT': 0.018518518518518517,
 'STRIVE': 0.018518518518518517,
 'MASIYIWA': 0.018518518518518517,
 'POPSTAR': 0.018518518518518517,
 '@PETERALTMAIER': 0.018518518518518517,
 '@HBRAUN': 0.018518518518518517,
 'SHANGHAI': 0.018518518518518517,
 'CHINA': 0.018518518518518517,
 'SÜDOSTEUROPA': 0.018518518518518517,
 'KALIFORNIEN': 0.018518518518518517,
 'MÖRFELDEN-WALLDORF': 0.018518518518518517}

In [27]:
def write_json(data, file):
    """Serialise ``data`` as JSON with a 4-space indent into ``file`` (UTF-8).

    The target is opened in write mode, so an existing file is overwritten.
    """
    # Leaving the with-block closes the handle.
    with open(file, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4)

In [34]:
# Every calendar day between the earliest and the latest date in dic_frame.
first_day = min(dic_frame['date'])
last_day = max(dic_frame['date'])
days = pd.date_range(start=first_day, end=last_day).tolist()

result = []

for day in days:
    # Word counts gathered for the days within 5 days either side of `day`,
    # reduced to the 25 highest-weighted entries.
    window_counts = get_dic_around(date=day, dic_frame=dic_frame, day_range=5)
    top_words = get_top_k(window_counts, 25)

    result.append({
        'day': str(day),
        'words': list(top_words),
        'weights': list(top_words.values()),
    })

write_json(result, r'../../corona_data/visual_data.json')