In [None]:
import pandas as pd
import json
from datetime import datetime, date, timedelta
import glob
import re
import spacy
from tqdm import tqdm
import numpy as np
from math import log
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
nlp = spacy.load('de_core_news_lg')

In [None]:
# Directory with the scraped posts; every '*.json' file found in it is loaded.
storage = r'../scraping/data/storage/'
# JSON file holding the keywords used to select the relevant posts.
keywords_location = r'./keywords.json'

In [None]:
def timestamp_to_date(timestamp):
    """Convert a POSIX timestamp into a calendar date (local time).

    Args:
        timestamp: seconds since the epoch, as accepted by
            ``datetime.date.fromtimestamp``.

    Returns:
        datetime.date: the local date of the given timestamp.
    """
    # Imported locally under a private alias so that a notebook-level variable
    # named `date` cannot shadow the class this function relies on.
    from datetime import date as _date
    return _date.fromtimestamp(timestamp)

def format_text(text):
    """Clean a raw post text.

    The steps run in this order: characters that cannot be encoded as Latin-1
    are dropped, '#' signs are removed, links are removed, runs of whitespace
    and selected punctuation are squeezed into a single space, and soft
    hyphens are deleted.

    Args:
        text: the raw text (str).

    Returns:
        str: the cleaned text.
    """
    # Anything outside Latin-1 (e.g. emoji) is silently discarded.
    cleaned = text.encode('latin-1', 'ignore').decode('latin-1')

    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'#', ''),                      # hashtag sign only, the tag word stays
        (r'http\S+', ''),                # links
        (r'[\n\t\ \"\':+?!]+', ' '),     # control symbols / other characters
        (r'\xad', ''),                   # soft hyphen
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def format_words(word):
    """Remove punctuation, digits and spaces from a single word.

    Deleted characters: ``[ - . , : ? & ( ) ! @ # | $``, the digits 0-9 and
    the space character.

    Args:
        word: the word to clean (str).

    Returns:
        str: the word without the characters listed above.
    """
    # Raw string: '\[' and '\-' are invalid escapes in a normal string literal
    # and trigger a warning on current Python versions.
    # NOTE(review): '[' is removed but ']' is not - confirm this is intended.
    return re.sub(r'[\[\-.,:?&()!@#|$0-9 ]', '', word)

# Stop words, taken from https://countwordsfree.com/stopwords/german
# TODO: improve
# Stored as a set because the list is only used for membership checks.
with open(r'stop_words_german.json', 'r', encoding='utf-8') as stop_word_file:
    stop_words = set(json.load(stop_word_file))

# Keywords; a post is kept only if its text contains at least one of them.
with open(keywords_location, mode='r', encoding='utf-8') as keyword_file:
    keywords = json.load(keyword_file)

### Initialize empty dataframe

In [None]:
# Empty frame that fixes the column layout of the collected posts.
df = pd.DataFrame(columns=['date', 'user', 'shortcode', 'platform', 'text'])

### Daten hinzufügen

In [None]:
files = glob.glob(storage + '*.json')

# Matching posts are collected as plain dicts and turned into a frame once at
# the end. Growing a DataFrame row by row is quadratic, and DataFrame.append
# no longer exists in pandas >= 2.0.
records = []

for path in tqdm(files):
    # `with` closes the file even if the JSON cannot be parsed.
    with open(path, mode="r", encoding="utf-8") as f:
        data = json.load(f)

    for post in data:
        # Keyword matching is done on the lower-cased raw text.
        lowered = post['text'].lower()

        # Keep the post only if any of the keywords occurs in it.
        if any(word in lowered for word in keywords):
            # No variable named `date` here: it would shadow the `date` class
            # imported from datetime at the top of the notebook.
            records.append({
                'date': timestamp_to_date(post['date']),
                'user': post['user'],
                'shortcode': post['shortcode'],
                'platform': post['platform'],
                'text': format_text(post['text']),
            })

# Explicit columns keep the layout stable even when no post matched.
df = pd.DataFrame(records, columns=['date', 'user', 'shortcode', 'platform', 'text'])
df.to_csv("tweet_frame.csv", index=False)

In [None]:
# Shortcut: once tweet_frame.csv has been built, the frame can be restored
# from disk here. Dates are parsed and only posts from 2020-01-07 onwards
# are kept.
df = (
    pd.read_csv("tweet_frame.csv", index_col=False)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .loc[lambda frame: frame['date'] >= pd.to_datetime("2020-01-07")]
)

In [None]:
"""
Builds one text document per calendar day.

stem_date_tweets takes a dataframe and returns a list with one entry per day
in date_range (days without posts yield an empty string), so index i of the
list corresponds to date_range[i].

Each element of the list (string) consists of the lemmas of all words associated with that day.

Will be used for the tf-idf calculation.
"""
# Every calendar day from the first to the last post date, including days
# without posts. Series.min()/max() skip missing dates and avoid computing
# the unique values twice.
date_range = pd.date_range(df['date'].min(), df['date'].max())

def stem_date_tweets(df, dates=None):
    """Build one lemmatised document per day.

    Despite the name, the words are lemmatised (spaCy ``lemma_``), not stemmed.

    Args:
        df: frame with a 'date' and a 'text' column.
        dates: iterable of days to build documents for. Defaults to the
            notebook-level ``date_range``.

    Returns:
        list of str: one entry per day in ``dates``, holding the space-joined
        lemmas of all texts of that day. Tokens shorter than three
        characters, stop words and lemmas containing a digit are left out.
        A day without usable words yields ''.
    """
    if dates is None:
        dates = date_range

    digits = set('0123456789')
    res = []
    for day in tqdm(dates):
        lemmas = []
        for tweet in df[df['date'] == day]['text']:
            # A text that is empty after cleaning is read back from CSV as
            # NaN (a float), which the spaCy pipeline cannot process.
            if not isinstance(tweet, str):
                continue
            for token in nlp(tweet):
                if len(token.text) < 3:
                    continue
                if token.lemma_ in stop_words or token.text in stop_words:
                    continue
                # Skip lemmas that contain any digit.
                if not digits.isdisjoint(token.lemma_):
                    continue
                lemmas.append(token.lemma_)
        res.append(" ".join(lemmas))
    return res

stemed_tweets = stem_date_tweets(df)

In [None]:
"""
X is a sparse tf-idf matrix of shape (#days, #unique words); row i belongs to date_range[i]
"""
# Every per-day document becomes one row of the tf-idf matrix X.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(stemed_tweets)
# X_words[j] is the word behind column j of X.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# where available and fall back only on old installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    X_words = vectorizer.get_feature_names_out().tolist()
else:
    X_words = vectorizer.get_feature_names()
X

In [None]:
"""
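Smoothing of the tf-idf scores over neighbouring days, keeping the top-k words per day.

Note: ignoring words that appear on fewer than a 100th of the days is intended,
but not implemented in this cell.

TODO:   - verify that the computation works properly
        - smoothing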
"""
# For every day: sum the tf-idf rows of the surrounding window, pick the k
# highest-scoring words and normalise their scores so that they sum to 1.
date_arr = []
day_smoothing = 3
k = 25
n_days = len(date_range)

for day in tqdm(range(n_days)):
    window_start = max(0, day - day_smoothing)
    # Slice ends are exclusive, hence the +1: the window covers day_smoothing
    # days before AND after the current day (it used to stop one day early).
    window_end = min(n_days, day + day_smoothing + 1)

    # Only the rows of the window are summed; the sparse matrix is never
    # converted to a dense one as a whole.
    scores = np.asarray(X[window_start:window_end].sum(axis=0)).ravel()

    # Select column indices directly. Looking the index up by value returns
    # the first match only, which repeats words whenever scores are tied.
    top_k_ind = heapq.nlargest(k, range(len(scores)), key=scores.__getitem__)
    top_k_w = scores[top_k_ind]

    # A window without any words sums to 0; keep the zeros instead of
    # dividing by zero.
    total = top_k_w.sum()
    if total > 0:
        top_k_w = top_k_w / total

    date_arr.append({
        'day': day,
        'words': [X_words[idx] for idx in top_k_ind],
        'weights': top_k_w.tolist(),
    })

In [None]:
# Reload the local exporter module so that changes made to it since the
# first import take effect in the running kernel.
import importlib
import exporter
importlib.reload(exporter)

# Arguments: the per-day top words, the first day of the range (the first
# ten characters of its string form), the number of days, and the label
# "corona".
first_day = str(min(date_range))[:10]
number_of_days = len(date_range)
exporter.export(date_arr, first_day, number_of_days, "corona")