In [None]:
import ast
import glob
import heapq
import json
import re
from datetime import datetime, date, timedelta
from math import log

import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
# Load the spaCy pipeline 'de_core_news_lg' once; it is called below as
# nlp(text) to obtain tokens and their lemmas.
nlp = spacy.load('de_core_news_lg')

In [None]:
# Directory whose *.json files are globbed and loaded as scraped posts.
storage = r'../scraping/data/storage/'
# JSON file containing the keywords used to select relevant posts.
keywords_location = r'./keywords.json'

In [None]:
def timestamp_to_date(timestamp):
    """Convert a POSIX timestamp into a local-time calendar ``date``.

    :param timestamp: seconds since the epoch (int or float)
    :return: the corresponding ``datetime.date`` in the local time zone
    """
    calendar_day = date.fromtimestamp(timestamp)
    return calendar_day

def format_text(text):
    """Clean a raw post text for further processing.

    Steps, in order:
      1. drop every character that cannot be encoded as latin-1,
      2. remove '#' characters (the hashtag word itself is kept),
      3. remove links (anything starting with 'http' up to the next whitespace),
      4. collapse runs of newlines, tabs, spaces, quotes and ``: + ? !``
         into a single space,
      5. remove soft hyphens (``\\xad``).

    :param text: raw text
    :return: cleaned text
    """
    cleaned = text.encode('latin-1', 'ignore').decode('latin-1')

    # (pattern, replacement) pairs, applied strictly in this order
    substitutions = (
        (r'#', ''),
        (r'http\S+', ''),
        (r'[\n\t\ \"\':+?!]+', ' '),
        (r'\xad', ''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def format_words(word):
    """Strip punctuation, digits and spaces from a single word.

    Removed characters: ``[ - . , : ? & ( ) ! @ # | $``, the digits 0-9 and
    the space character. Everything else is kept unchanged.

    :param word: the word to clean
    :return: the word without the characters listed above
    """
    # Raw string: '\[' and '\-' are invalid escape sequences in a normal
    # string literal and trigger a warning on current Python versions.
    return re.sub(r'[\[\-.,:?&()!@#|$0-9 ]', '', word)

# Stop words: German list taken from https://countwordsfree.com/stopwords/german
# TODO: improve the list
# Stored as a set so that membership tests during lemma filtering are cheap.
with open(r'stop_words_german.json', 'r', encoding='utf-8') as words:
    stop_words = set(json.load(words))

# Keywords: a post is kept only if it contains at least one of them.
with open(keywords_location, mode='r', encoding='utf-8') as file:
    keywords = json.load(file)

def frame_to_dic(dataframe):
    """Count lemma occurrences over all texts of a dataframe.

    Every text in ``dataframe['text']`` is run through ``nlp``. A lemma is
    counted (upper-cased) when it is longer than three characters, is not in
    ``stop_words`` and contains neither '@' nor '-'.

    :param dataframe: frame with a 'text' column
    :return: dict mapping upper-cased lemma -> number of occurrences
    """
    counts = {}
    for text in dataframe['text'].to_numpy():
        for token in nlp(text):
            lemma = token.lemma_
            # todo: filter out numbers
            if len(lemma) <= 3:
                continue
            if lemma in stop_words or '@' in lemma or '-' in lemma:
                continue
            key = lemma.upper()
            counts[key] = counts.get(key, 0) + 1
    return counts

def get_dic_around(date, dic_frame, day_range):
    """Aggregate the word counts of all rows within ``day_range`` days of ``date``.

    :param date: centre of the window (must be comparable with ``dic_frame['date']``)
    :param dic_frame: frame with a 'date' column and a 'dic' column holding
        word -> count dicts, either as dict objects or as their string form
    :param day_range: number of days before and after ``date`` to include
        (both ends inclusive)
    :return: dict mapping word -> summed count over the window
    """
    window = timedelta(days=day_range)
    in_window = (dic_frame['date'] >= (date - window)) & (dic_frame['date'] <= (date + window))

    res = {}
    for day_counts in dic_frame.loc[in_window, 'dic']:
        if isinstance(day_counts, str):
            try:
                # str(dict) is a Python literal, not JSON; literal_eval also
                # copes with apostrophes inside keys.
                day_counts = ast.literal_eval(day_counts)
            except (ValueError, SyntaxError):
                # fall back to the previous quote-swapping JSON parse
                day_counts = json.loads(day_counts.replace("'", "\""))
        # Sum the counts: a plain dict merge would keep only the value of the
        # last day for every word that occurs on several days.
        for word, count in day_counts.items():
            res[word] = res.get(word, 0) + count
    return res

def get_top_k(dictionary, k):
    """Return the ``k`` highest-valued entries, normalised to sum to 1.

    :param dictionary: word -> count dict, or the string form of such a dict
    :param k: number of entries to keep
    :return: dict with the top ``k`` words (highest first; ties keep their
        original order) mapped to ``count / sum(top-k counts)``;
        an empty dict if the input is empty
    """
    if isinstance(dictionary, str):
        try:
            # str(dict) is a Python literal, not JSON; literal_eval also
            # copes with apostrophes inside keys.
            dictionary = ast.literal_eval(dictionary)
        except (ValueError, SyntaxError):
            # fall back to the previous quote-swapping JSON parse
            dictionary = json.loads(dictionary.replace("'", "\""))

    # One stable descending sort gives both the winners and the normaliser.
    ranked = sorted(dictionary.items(), key=lambda item: item[1], reverse=True)[:k]
    total = sum(count for _, count in ranked)
    return {word: count / total for word, count in ranked}

In [None]:
def write_json(data, file):
    """Serialise ``data`` as JSON (4-space indent) into the file at path ``file``.

    The file is opened in write mode with UTF-8 encoding, so existing content
    is overwritten.

    :param data: JSON-serialisable object
    :param file: path of the target file
    """
    serialised = json.dumps(data, indent=4)
    # the context manager closes the handle on exit
    with open(file, mode='w', encoding='utf-8') as handle:
        handle.write(serialised)

In [None]:
# Build, for every day, the top-10 words of the surrounding +/- 7 day window.
# NOTE(review): `dic_frame` is not defined anywhere in the visible part of the
# notebook - it has to exist in the kernel before this cell runs.

# Daily range from the earliest to the latest date in dic_frame.
days = pd.date_range(min(dic_frame['date']), max(dic_frame['date']))
all_words = set([])      # every word that made it into any day's top list
res = []                 # one {"word": ..., "days": [day indices]} entry per word
word_index = {}          # word -> position of its entry in `res`
weight_dic = [0] * len(days)  # per day index: {'words': [...], 'weights': [...]}
counter = 0              # next free position in `res`

for ind,day in tqdm(enumerate(days)):
    # normalised weights of the 10 highest-count words around this day
    word_dic = get_top_k(get_dic_around(date=day, dic_frame=dic_frame, day_range=7), 10)
    words = list(word_dic.keys())
    
    for word in words:
        if word in word_index:
            # known word: record that it is also a top word on this day
            res[word_index[word]]["days"].append(ind)
        else:
            # first occurrence: create its entry and remember the position
            word_index[word] = counter
            res.append({"word": word, "days":[ind]})
            counter += 1
        all_words.add(word)
    # words and weights are parallel lists (same order as word_dic)
    day_dic = {'words': words, 'weights': list(word_dic.values())}
    # write_json(day_dic, '../../data/corona/days/' + str(ind) + '.json')
    weight_dic[ind] = day_dic

In [None]:
# Compress every word's list of day indices into runs of consecutive days,
# stored as [start, end] pairs with an exclusive end. Words that were a top
# word on fewer than 5 distinct days are dropped from `res` entirely.
index_to_be_removed = []

for position, entry in enumerate(res):
    distinct_days = set(entry['days'])
    if len(distinct_days) < 5:
        index_to_be_removed.append(position)
        continue

    runs = []
    run_start = 0
    for current in sorted(entry['days']):
        # no predecessor -> a new run begins here
        if current - 1 not in distinct_days:
            run_start = current
        # no successor -> the run ends here (end index is exclusive)
        if current + 1 not in distinct_days:
            runs.append([run_start, current + 1])
    entry['days'] = runs

# Delete back to front so the remaining indices stay valid.
for position in reversed(index_to_be_removed):
    del res[position]

write_json(res, r'../../corona_data/main.json')

In [None]:
# Write one JSON file per word containing its weight on every day on which
# it is a top word (flat list, in day order).
for i in res:
    w = []
    for span in i['days']:
        # The run-length compression cell above turns 'days' into
        # [start, end] pairs (end exclusive); indexing weight_dic with such a
        # pair raises a TypeError. Expand pairs into day indices, and still
        # accept a bare day index in case the list was not compressed.
        if isinstance(span, (list, tuple)):
            span_days = range(span[0], span[1])
        else:
            span_days = [span]
        for day in span_days:
            ind = weight_dic[day]['words'].index(i['word'])
            w.append(weight_dic[day]['weights'][ind])
    write_json(w, '../../data/corona/words/' + i['word'] + '.json')
    

### Initialize empty dataframe

In [None]:
# Empty frame with one column per field that is stored for each collected post.
df = pd.DataFrame(columns=['date', 'user', 'shortcode', 'platform', 'text'])

### Daten hinzufügen

In [None]:
# Collect every scraped post that mentions at least one keyword and store the
# result as tweet_frame.csv.
# Sorted so that the row order is reproducible across runs and machines.
files = sorted(glob.glob(storage + '*.json'))

columns = ['date', 'user', 'shortcode', 'platform', 'text']
records = []

for path in tqdm(files):
    # the context manager closes the file even if parsing fails
    with open(path, mode="r", encoding="utf-8") as f:
        data = json.load(f)

    for post in data:
        lowered = post['text'].lower()

        # skip posts that contain none of the keywords
        if not any(word in lowered for word in keywords):
            continue

        # Deliberately not bound to a variable called `date`: that would
        # shadow the imported datetime.date class for the rest of the notebook.
        records.append({
            'date': timestamp_to_date(post['date']),
            'user': post['user'],
            'shortcode': post['shortcode'],
            'platform': post['platform'],
            'text': format_text(post['text']),
        })

# Build the frame once from all records; DataFrame.append copied the whole
# frame per row and no longer exists in pandas >= 2.0.
df = pd.DataFrame(records, columns=columns)
df.to_csv("tweet_frame.csv")

In [None]:
# Reload the cached frame. CSV stores dates as text, so parse the 'date'
# column back into datetime64 instead of leaving it as strings.
df = pd.read_csv("tweet_frame.csv", parse_dates=['date'])

In [None]:
# Keep only posts from 2020-01-05 onwards.
# The column is normalised to datetime64 first and compared against a
# Timestamp: this works whether 'date' currently holds strings (read from
# CSV), datetime.date objects or datetime64 values, and does not rely on the
# name `date` still referring to the datetime.date class.
df = df.assign(date=pd.to_datetime(df['date']))
df = df[df['date'] >= pd.Timestamp(2020, 1, 5)]

In [None]:
"""
Takes a dataframe, returns an array with the same length as the amount of days
The index of the array can be mapped to sorted(df['date'].unique())

Each element in the array (string) consists of all words associated with that day

Will be used for tfidf calculation
"""
# Daily range spanning the earliest to the latest date present in df.
first_day = min(df['date'].unique())
last_day = max(df['date'].unique())
date_range = pd.date_range(first_day, last_day)

def stem_date_tweets(df):
    """Build one space-joined string of lemmas per day in ``date_range``.

    For every day of the module-level ``date_range`` all texts of that day
    are run through ``nlp``. A lemma is kept unless it is shorter than two
    characters, the lemma or the token text is in ``stop_words``, or the
    lemma contains an ASCII digit.

    :param df: frame with a 'date' and a 'text' column
    :return: list of strings, index-aligned with ``date_range``
        (empty string for days without any remaining lemma)
    """
    # Compare as datetime64 so that rows match the Timestamps of date_range
    # even if the column holds datetime.date objects or strings.
    tweet_dates = pd.to_datetime(df['date'])
    digits = set('0123456789')

    res = []
    for i in tqdm(date_range):
        sentence = []
        for tweet in df.loc[tweet_dates == i, 'text']:
            for word in nlp(tweet):
                lemma = word.lemma_
                if len(lemma) < 2:
                    continue
                # The surface form of a spaCy token is `.text`
                # (`.text_` does not exist and raised an AttributeError).
                if lemma in stop_words or word.text in stop_words:
                    continue
                if digits.intersection(lemma):
                    continue
                sentence.append(lemma)
        res.append(" ".join(sentence))
    return res

# One lemma string per day of date_range; these are the documents fed to the
# TF-IDF vectorizer.
stemed_tweets = stem_date_tweets(df)

In [None]:
# X is a (#days x #unique words) TF-IDF matrix: one row per entry of
# stemed_tweets, one column per vocabulary word.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(stemed_tweets)

# Column index -> word. get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2; fall back to it only on old versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    X_words = vectorizer.get_feature_names_out().tolist()
else:
    X_words = vectorizer.get_feature_names()
X

In [None]:
# Build the smoothed per-day dictionaries: for every day, sum the TF-IDF rows
# of a sliding window (3 days before up to 3 days after) and keep the 15
# highest-scoring words together with their summed weights.

# Rows of X are days, so the window is clamped to the number of rows
# (`num_of_words` was never defined in this notebook).
num_days = X.shape[0]

date_arr = []
for day in tqdm(range(0, len(date_range))):
    # Slice and sum the sparse matrix directly instead of densifying the
    # whole matrix on every iteration.
    window = X[max(0, day - 3):min(num_days, day + 4)]
    arr = np.asarray(window.sum(axis=0)).ravel()

    # Rank column indices by score. Looking values up with list.index()
    # returned the same index for tied scores and so silently dropped words.
    top_k_ind = heapq.nlargest(15, range(len(arr)), key=arr.__getitem__)

    temp = {X_words[word]: arr[word] for word in top_k_ind}
    date_arr.append(temp)

In [None]:
# Spot check: smoothed top-word dictionaries for day indices 350 to 352.
date_arr[350:353]

In [None]:
# Calendar date that day index 350 corresponds to.
date_range[350]