In [1]:
import pandas as pd
import json
from datetime import datetime, date, timedelta
import glob
import re
import spacy
from tqdm import tqdm
nlp = spacy.load('de_core_news_lg')

In [2]:
storage = r'../scraping/data/storage/'

In [3]:
def timestamp_to_date(timestamp):
    return date.fromtimestamp(timestamp)

def format_text(text):
    # remove Uni-Code
    t = text.encode('latin-1', 'ignore').decode('latin-1')
    
    # Hashtags entfernen
    t = re.sub(r'#', '', t)
    
    # Links entfernen
    t = re.sub(r'http\S+', '', t)
    
    # remove Steuersymbole/andere Zeichen
    t = re.sub(r'[\n\t\ \"\':+?!]+', ' ', t)
    
    # remove \xad
    t = re.sub(r'\xad', '', t)
    
    return t

def format_words(word):
    return re.sub('[\[\-.,:?&()!@#|$0-9 ]', '', word)

# filling ignore_words
# Using stop words form https://countwordsfree.com/stopwords/german
ignore_words = []
with open(r'stop_words_german.json', 'r', encoding='utf-8') as words:
    for word in words.read()[1:-1].split(","):
        ignore_words.append(word[1:-1])
ignore_words = set(ignore_words)

def frame_to_dic(dataframe):
    dic = {}
    for i in dataframe['text'].to_numpy():
        for j in nlp(i):
            # todo nummern aussortieren
            if len(j.lemma_) > 3 and j.lemma_ not in ignore_words and '@' not in j.lemma_ and '-' not in j.lemma_:
                s = (j.lemma_).upper()
            else:
                continue
            if (s in dic):
                dic[s] += 1
            else:
                dic[s] = 1
    return dic

def get_dic_around(date, dic_frame, day_range):
    res = {}
    for i in dic_frame[(dic_frame['date'] >= (date - timedelta(days=day_range))) & (dic_frame['date'] <= (date + timedelta(days=day_range)))]['dic']:
        if type(i) == str:
            i = json.loads(i.replace("'", "\""))
        res = {**res, **i}
    return res

def get_top_k(dictionary, k):
    if type(dictionary) == str:
        dictionary = json.loads(dictionary.replace("'", "\""))
    n = sum(sorted(dictionary.values(), reverse=True)[:k])
    sort_dic = sorted(dictionary, key=dictionary.get, reverse=True)
    return {i: dictionary[i]/n for i in list(sort_dic)[:k]}

### Initilize empty dataframe

In [48]:
df = pd.DataFrame(columns=['date', 'user', 'shortcode', 'platform', 'text'])

### Daten hinzufügen

In [49]:
files = glob.glob(storage + '*.json')

for j in tqdm(files):
    # Öffnen einer JSON Datei 
    f = open(j, mode="r", encoding="utf-8") 

    # JSON als dictionary 
    data = json.load(f)

    for i in data:
        
        text = i['text'].upper()
        
        if ('CORONA' in text) or ('COVID' in text) or ('SARS-COV-2' in text) or ('SARS COV' in text):
            
            date = timestamp_to_date(i['date'])
            user = i['user']
            shortcode = i['shortcode']
            platform = i['platform']
            text = format_text(i['text'])
            df = df.append({'date': date, 'user': user,
                            'shortcode': shortcode, 'platform': platform, 
                            'text': text}, ignore_index=True)

    # Closing file 
    f.close()
df.to_csv("tweet_frame.csv")

100%|██████████████████████████████████████████| 16/16 [06:13<00:00, 23.35s/it]


### Taking a look at the dataframe

In [4]:
df = pd.read_csv("tweet_frame.csv")

### Building the dictionary frame

In [50]:
dic_frame = pd.DataFrame(columns=['date', 'dic'])

for i in tqdm(df['date'].unique()):
    frame = df[df['date'] == i]
    dic_frame = dic_frame.append({'date': i, 'dic': frame_to_dic(frame)}, ignore_index=True)

dic_frame = dic_frame.sort_values(by=['date'])
dic_frame.to_csv("dic_frame.csv", index=False)

100%|████████████████████████████████████████| 450/450 [05:17<00:00,  1.42it/s]


In [5]:
dic_frame = pd.read_csv("dic_frame.csv")
dic_frame['date'] = pd.to_datetime(dic_frame['date'])

In [6]:
def write_json(data, file):
    with open(file, mode='w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
        f.close()

In [21]:
days = pd.date_range(min(dic_frame['date']), max(dic_frame['date']))
all_words = set([])
res = []
word_index = {}
weight_dic = [0] * len(days)
counter = 0

for ind,day in tqdm(enumerate(days)):
    word_dic = get_top_k(get_dic_around(date=day, dic_frame=dic_frame, day_range=7), 10)
    words = list(word_dic.keys())
    
    for word in words:
        if word in word_index:
            res[word_index[word]]["days"].append(ind)
        else:
            word_index[word] = counter
            res.append({"word": word, "days":[ind]})
            counter += 1
        all_words.add(word)
    day_dic = {'words': words, 'weights': list(word_dic.values())}
    # write_json(day_dic, '../../data/corona/days/' + str(ind) + '.json')
    weight_dic[ind] = day_dic

456it [00:02, 202.50it/s]


In [66]:
index_to_be_removed = []

r = res.copy()

for ind, i in enumerate(r):
    day_set = set(i['days'])
    if len(day_set) < 5:
        index_to_be_removed.append(ind)
        continue
    new_days = []
    start_day = 0
    for j in sorted(i['days']):
        if j-1 not in day_set:
            start_day = j
        if j+1 in day_set:
            continue
        else:
            new_days.append([start_day, j + 1])
    i['days'] = new_days

counter = 0
for ind in index_to_be_removed:
    res.pop(ind - counter)
    counter += 1

write_json(res, r'../../corona_data/main.json')

In [22]:
weight_dic

[{'words': ['MENSCH',
   'CHINA',
   'PATIENT',
   'LUNGENKRANKHEIT',
   'EXPERTE',
   'UNBEKANNT',
   'CORONAVIRUS',
   'AUSLÖSER',
   'ERREGER',
   'KRITISCH'],
  'weights': [0.13043478260869565,
   0.13043478260869565,
   0.13043478260869565,
   0.08695652173913043,
   0.08695652173913043,
   0.08695652173913043,
   0.08695652173913043,
   0.08695652173913043,
   0.08695652173913043,
   0.08695652173913043]},
 {'words': ['PATIENT',
   'EXPERTE',
   'UNBEKANNT',
   'AUSLÖSER',
   'ERREGER',
   'KRITISCH',
   'ZUSTAND',
   'STERBEN',
   'FALL',
   'RÄTSELHAFT'],
  'weights': [0.15789473684210525,
   0.10526315789473684,
   0.10526315789473684,
   0.10526315789473684,
   0.10526315789473684,
   0.10526315789473684,
   0.10526315789473684,
   0.10526315789473684,
   0.05263157894736842,
   0.05263157894736842]},
 {'words': ['PATIENT',
   'EXPERTE',
   'UNBEKANNT',
   'AUSLÖSER',
   'ERREGER',
   'KRITISCH',
   'ZUSTAND',
   'STERBEN',
   'GEBEN',
   'FALL'],
  'weights': [0.15, 0.1, 0.1

In [23]:
res

[{'word': 'MENSCH',
  'days': [0,
   4,
   5,
   8,
   20,
   24,
   25,
   34,
   35,
   44,
   46,
   50,
   51,
   52,
   53,
   54,
   58,
   61,
   62,
   63,
   64,
   65,
   66,
   67,
   68,
   70,
   71,
   72,
   73,
   74,
   76,
   77,
   78,
   79,
   80,
   81,
   82,
   84,
   85,
   86,
   87,
   88,
   89,
   92,
   93,
   94,
   96,
   97,
   98,
   100,
   101,
   102,
   103,
   105,
   106,
   108,
   116,
   117,
   119,
   120,
   121,
   123,
   124,
   125,
   126,
   127,
   128,
   129,
   133,
   134,
   135,
   136,
   137,
   138,
   142,
   143,
   145,
   148,
   149,
   153,
   154,
   160,
   161,
   165,
   167,
   169,
   171,
   173,
   174,
   175,
   180,
   181,
   184,
   195,
   196,
   198,
   199,
   201,
   202,
   203,
   205,
   226,
   228,
   232,
   236,
   237,
   238,
   240,
   248,
   249,
   250,
   252,
   256,
   258,
   270,
   271,
   275,
   280,
   288,
   294,
   295,
   296,
   299,
   306,
   307,
   309,
   310,
   313,
 

In [30]:
for i in res:
    w = []
    for day in i['days']:
        ind = weight_dic[day]['words'].index(i['word'])
        w.append(weight_dic[day]['weights'][ind])
    write_json(w, '../../data/corona/words/' + i['word'] + '.json')
    

In [24]:
[1,2,3,4,5,6].index(2)

1