In [1]:
import pandas as pd
import json
from datetime import datetime, date, timedelta
import glob
import re
import spacy
nlp = spacy.load('de_core_news_sm')

In [2]:
storage = r'../scraping/data/storage/'

In [5]:
def timestamp_to_date(timestamp):
    return date.fromtimestamp(timestamp)

def format_text(text):
    # remove Uni-Code
    t = text.encode('latin-1', 'ignore').decode('latin-1')
    
    # Hashtags entfernen
    t = re.sub(r'#\w+', '', t)
    
    # Links entfernen
    t = re.sub(r'http\S+', '', t)
    
    # remove Steuersymbole/andere Zeichen
    t = re.sub(r'[\n\t\ \"\':+?!]+', ' ', t)
    
    return t

def format_words(word):
    return re.sub('[\[\-.,:?&()!@#|$0-9 ]', '', word)

ignore_words = ["SEIN", "EINEN", "WERDEN", "AUCH", "DIES", "HABEN", "SICH", "KÖNNEN", "NICHT", "NOCH", "MEIN"]

def frame_to_dic(dataframe):
    dic = {}
    for i in dataframe['text'].to_numpy():
        for j in nlp.tokenizer(i):
            s = (j.lemma_).upper()
            if len(s) < 4:
                continue
            if s in ignore_words:
                continue
            if (s in dic):
                dic[s] += 1
            else:
                dic[s] = 1
    return dic

def get_dic_around(date, dic_frame, day_range):
    res = {}
    for i in dic_frame[(dic_frame['date'] >= (date - timedelta(days=day_range))) & (dic_frame['date'] <= (date + timedelta(days=day_range)))]['dic']:
        res = {**res, **i}
    return res

def get_top_k(dictionary, k):
    n = sum(sorted(dictionary.values(), reverse=True)[:k])
    sort_dic = sorted(dictionary, key=dictionary.get, reverse=True)
    return {i: dictionary[i]/n for i in list(sort_dic)[:k]}

### Initilize empty dataframe

In [6]:
df = pd.DataFrame(columns=['date', 'user', 'shortcode', 'platform', 'text'])

### Instagram Daten hinzufügen

In [7]:
files = glob.glob(storage + '*.json')

for j in files:
    # Öffnen einer JSON Datei 
    f = open(j, mode="r", encoding="utf-8") 

    # JSON als dictionary 
    data = json.load(f)

    for i in data:
        
        text = i['text'].upper()
        
        if ('CORONA' in text) or ('COVID' in text) or ('SARS-COV-2' in text) or ('SARS COV' in text):
            
            date = timestamp_to_date(i['date'])
            user = i['user']
            shortcode = i['shortcode']
            platform = i['platform']
            text = format_text(i['text'])
            df = df.append({'date': date, 'user': user,
                            'shortcode': shortcode, 'platform': platform, 
                            'text': text}, ignore_index=True)

    # Closing file 
    f.close() 
    print('Completed:', j)
print('Done!')

Completed: ../scraping/data/storage\2020_1.json
Completed: ../scraping/data/storage\2020_10.json
Completed: ../scraping/data/storage\2020_11.json
Completed: ../scraping/data/storage\2020_12.json
Completed: ../scraping/data/storage\2020_2.json
Completed: ../scraping/data/storage\2020_3.json
Completed: ../scraping/data/storage\2020_4.json
Completed: ../scraping/data/storage\2020_5.json
Completed: ../scraping/data/storage\2020_6.json
Completed: ../scraping/data/storage\2020_7.json
Completed: ../scraping/data/storage\2020_8.json
Completed: ../scraping/data/storage\2020_9.json
Completed: ../scraping/data/storage\2021_1.json
Completed: ../scraping/data/storage\2021_2.json
Completed: ../scraping/data/storage\2021_3.json
Completed: ../scraping/data/storage\2021_4.json
Done!


### Build dictionary frame

In [8]:
df

Unnamed: 0,date,user,shortcode,platform,text
0,2020-01-28,BR_Presse,1222203172309151746,twitter,Sondersendung Gesundheit spezial in Bayern Wie...
1,2020-01-31,derspiegel,1223347087716364288,twitter,127 Tests waren negativ und einer positiv Mita...
2,2020-01-31,derspiegel,1223319549614804992,twitter,Auch in manchen deutschen Apotheken werden Sch...
3,2020-01-31,derspiegel,1223225154702102534,twitter,"Und für alle, die noch mehr wissen wollen, hie..."
4,2020-01-31,derspiegel,1223222791278931968,twitter,In Bayern sind mittlerweile fünf -Fälle bekann...
...,...,...,...,...,...
55688,2021-04-01,welt,1377551392387297280,twitter,Frankreich und Italien verschärfen ihre Corona...
55689,2021-04-01,welt,1377535482419089408,twitter,Corona Neue Variante in Brasilien aufgetaucht ...
55690,2021-04-01,welt,1377528682865131522,twitter,Alles nur geklaut So dreist entstand die Anti-...
55691,2021-04-01,welt,1377488458869051393,twitter,Corona Dramatische Prognose vom RKI droht sich...


In [9]:
dic_frame = pd.DataFrame(columns=['date', 'dic'])

for i in df['date'].unique():
    frame = df[df['date'] == i]
    dic_frame = dic_frame.append({'date': i, 'dic': frame_to_dic(frame)}, ignore_index=True)

dic_frame = dic_frame.sort_values(by=['date'])
dic_frame

Unnamed: 0,date,dic
16,2020-01-09,"{'FALL': 1, 'RÄTSELHAFT': 1, 'CHINESISCH': 1, ..."
14,2020-01-11,"{'DUTZEND': 1, 'MENSCH': 3, 'EINER': 4, 'NEUAR..."
11,2020-01-17,"{'MEHR': 1, 'MENSCH': 1, 'LEIDEN': 1, 'CHINA':..."
15,2020-01-18,"{'CHINA': 1, 'WEIT': 2, 'FALL': 1, 'AUFTRETEN'..."
13,2020-01-19,"{'AUSMASS': 1, 'MYSTERIÖS': 1, 'LUNGENKRANKHEI..."
...,...,...
446,2021-04-04,"{'CHORPROBE': 1, 'PFLEGEHEIM': 1, 'FITNESSSTUD..."
445,2021-04-05,"{'ARMIN': 3, 'WOLLEN': 7, 'NACHDENKEN': 2, '-M..."
444,2021-04-06,"{'NACH': 12, 'EINER': 11, 'WEIT': 6, 'CORONAFA..."
443,2021-04-07,"{'ZWEITE': 2, 'JAHR': 6, 'PANDEMIE': 4, 'ENDE'..."


In [26]:
d = get_dic_around(date=datetime(2020,12,12).date(), dic_frame=dic_frame, day_range=3)
d

{'KANZLERIN': 1,
 'DRINGEN': 1,
 'CORONAKRISE': 1,
 'SCHNELLEN': 1,
 'ENTSCHEIDUNG': 2,
 'MINISTERPRÄSIDENTEN': 2,
 'SCHLESWIG-HOLSTEIN': 2,
 'SPRINGEN': 1,
 'NACH': 13,
 'EINER': 12,
 'HARTEN': 2,
 'LOCKDOWN': 9,
 'IMMER': 2,
 'LAUT': 1,
 'QUERDENKER': 1,
 'BADEN-WÜRTTEMBERG': 3,
 'VERFASSUNGSSCHUTZ': 3,
 'BEOBACHTEN': 1,
 'WELCH': 2,
 'FOLGE': 4,
 'CORONA-PROTESTBEWEGUNG': 3,
 'DEUTSCHLAND': 12,
 'ANTWORT': 2,
 'WICHTIG': 3,
 'FRAGE': 1,
 'WIESO': 1,
 'EMOTIONAL': 2,
 'CORONA-REDE': 1,
 'RISKIEREN': 2,
 'ABLEHNUNG': 1,
 'WIDERWILLEN': 1,
 'TROTZ': 1,
 'ERZEUGEN': 1,
 'WESHALB': 1,
 'WELTWIRTSCHAFT': 1,
 'HOFFEN': 1,
 'WARUM': 4,
 'NIEMAND': 1,
 'RASSISMUS': 1,
 'FUSSBALL': 2,
 'WEGSEHEN': 1,
 'DÜRFEN': 4,
 'LAGE': 5,
 'MITTWOCHABEND': 1,
 '@OTRENKAMP': 1,
 'GEBEN': 6,
 'GROSS': 3,
 'REGIONALE': 1,
 'UNTERSCHIED': 1,
 'CORONA-FALLZAHLEN': 1,
 'KOMMEN': 4,
 'ZUSTANDE': 1,
 'EXPERTE': 2,
 'THORSTEN': 1,
 'LEHR': 1,
 'ÜBER': 11,
 'INFEKTIONSKETTEN': 1,
 'FALSCH': 1,
 'SICHERHEIT': 1,
 'V

In [27]:
def write_json(data, file):
    with open(file, mode='w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
        f.close()

In [34]:
days = pd.date_range(start=min(dic_frame['date']),end=max(dic_frame['date'])).tolist()

result = []

for day in days:
    word_dic = get_top_k(get_dic_around(date=day, dic_frame=dic_frame, day_range=5), 25)
    
    day_dic = {'day': str(day), 'words': list(word_dic.keys()), 'weights': list(word_dic.values())}
    
    result.append(day_dic)

write_json(result, r'../../corona_data/visual_data.json')