In [2]:
import pandas as pd
import json
from datetime import datetime, date, timedelta
import glob
import re
import spacy
nlp = spacy.load('de_core_news_sm')

In [2]:
def timestamp_to_date(timestamp):
    return date.fromtimestamp(timestamp)

def format_text(text):
    # remove Uni-Code
    t = text.encode('latin-1', 'ignore').decode('latin-1')
    
    # Hashtags entfernen
    t = re.sub(r'#\w+', '', t)
    
    # Links entfernen
    t = re.sub(r'http\S+', '', t)
    
    # remove Steuersymbole/andere Zeichen
    t = re.sub(r'[\n\t\ \"\':+?!]+', ' ', t)
    
    return t

def format_words(word):
    return re.sub('[\[\-.,:?&()!@#|$0-9 ]', '', word)

ignore_words = ["SEIN", "EINEN", "WERDEN", "AUCH", "DIES", "HABEN", "SICH", "KÖNNEN", "NICHT", "NOCH", "MEIN"]

def frame_to_dic(dataframe):
    dic = {}
    for i in dataframe['text'].to_numpy():
        for j in nlp.tokenizer(i):
            s = (j.lemma_).upper()
            if len(s) < 4:
                continue
            if s in ignore_words:
                continue
            if (s in dic):
                dic[s] += 1
            else:
                dic[s] = 1
    return dic

def get_dic_around(date, dic_frame, day_range):
    res = {}
    for i in dic_frame[(dic_frame['date'] >= (date - timedelta(days=day_range))) & (dic_frame['date'] <= (date + timedelta(days=day_range)))]['dic']:
        res = {**res, **i}
    return res

def get_top_k(dictionary, k):
    n = sum(sorted(dictionary.values(), reverse=True)[:k])
    sort_dic = sorted(dictionary, key=dictionary.get, reverse=True)
    return {i: dictionary[i]/n for i in list(sort_dic)[:k]}

### Initilize empty dataframe

In [4]:
df = pd.DataFrame(columns=['date', 'user', 'shortcode', 'platform', 'likes', 'text'])

### Instagram Daten hinzufügen

In [8]:
files = glob.glob("..\\scraping\\instagram\\instagram_data\\*.json")

for j in files:
    # Öffnen einer JSON Datei 
    f = open(j, mode="r", encoding="utf-8") 

    # JSON als dictionary 
    data = json.load(f)

    for i in data['GraphImages']:
        try:
            text = (i['edge_media_to_caption']['edges'][0]['node']['text']).upper()
        except IndexError:
            # Post ohne Text -> überspringen
            continue
        if ('CORONA' in text) or ('COVID' in text) or ('SARS-COV-2' in text) or ('SARS COV' in text):
            date = timestamp_to_date(i['taken_at_timestamp'])
            user = i['username']
            shortcode = i['shortcode']
            platform = 'instagram'
            likes = i['edge_media_preview_like']['count']
            text = format_text(i['edge_media_to_caption']['edges'][0]['node']['text'])
            df = df.append({'date': date, 'user': user,
                            'shortcode': shortcode, 'platform': platform, 
                            'likes': likes, 'text': text}, ignore_index=True)

    # Closing file 
    f.close() 
    print('Completed:', j)
print('Done!')

Completed: ..\scraping\instagram\instagram_data\bayerischer_rundfunk.json
Completed: ..\scraping\instagram\instagram_data\mdr_san.json
Completed: ..\scraping\instagram\instagram_data\mdr_sn.json
Completed: ..\scraping\instagram\instagram_data\mdr_th.json
Completed: ..\scraping\instagram\instagram_data\ndr2.json
Completed: ..\scraping\instagram\instagram_data\spiegelmagazin.json
Completed: ..\scraping\instagram\instagram_data\tagesschau.json
Completed: ..\scraping\instagram\instagram_data\wdr2.json
Completed: ..\scraping\instagram\instagram_data\welt.json
Done!


### Twitter Daten hinzufügen

In [5]:
files = glob.glob("..\\scraping\\twitter\\twitter_data\\*.txt")

for j in files:

    f = open(j, mode="r", encoding="utf-8") 

    for i in f:
        temp = json.loads(i[:-1])
        text = temp['content'].upper()
        if ('CORONA' in text) or ('COVID' in text) or ('SARS-COV-2' in text) or ('SARS COV' in text):
            s = temp['date'][:10]
            date = datetime(int(s[:4]), int(s[5:7]), int(s[8:10])).date() # String to datetime.date
            user = temp['user']['username']
            shortcode = temp['id']
            platform = 'twitter'
            likes = temp['likeCount']
            text = format_text(temp['content'])
            df = df.append({'date': date, 'user': user,
                            'shortcode': shortcode, 'platform': platform, 
                            'likes': likes, 'text': text}, ignore_index=True)


    # Closing file 
    f.close() 
    print('Completed:', j)
print('Done!')

Completed: ..\scraping\twitter\twitter_data\BR_Presse.txt
Completed: ..\scraping\twitter\twitter_data\derspiegel.txt
Completed: ..\scraping\twitter\twitter_data\faznet.txt
Completed: ..\scraping\twitter\twitter_data\hessenschau.txt
Completed: ..\scraping\twitter\twitter_data\hrPresse.txt
Completed: ..\scraping\twitter\twitter_data\MDRAktuell.txt
Completed: ..\scraping\twitter\twitter_data\mdrde.txt
Completed: ..\scraping\twitter\twitter_data\MDR_SAN.txt
Completed: ..\scraping\twitter\twitter_data\MDR_SN.txt
Completed: ..\scraping\twitter\twitter_data\mdr_th.txt
Completed: ..\scraping\twitter\twitter_data\ndr.txt
Completed: ..\scraping\twitter\twitter_data\NDRinfo.txt
Completed: ..\scraping\twitter\twitter_data\swr3.txt
Completed: ..\scraping\twitter\twitter_data\SWRAktuellBW.txt
Completed: ..\scraping\twitter\twitter_data\SZ.txt
Completed: ..\scraping\twitter\twitter_data\tagesschau.txt
Completed: ..\scraping\twitter\twitter_data\WDR.txt
Completed: ..\scraping\twitter\twitter_data\WDR2

### Build dictionary frame

In [9]:
dic_frame = pd.DataFrame(columns=['date', 'dic'])
for i in df['date'].unique():
    frame = df[df['date'] == i]
    dic_frame = dic_frame.append({'date': i, 'dic': frame_to_dic(frame)}, ignore_index=True)
dic_frame = dic_frame.sort_values(by=['date'])
dic_frame.to_csv("instagram_frame.csv")

In [9]:
df

Unnamed: 0,date,user,shortcode,platform,likes,text
0,2020-12-11,bayerischer_rundfunk,,,341,Ein Tag im Zeichen der Nächstenliebe auch und ...
1,2020-03-20,bayerischer_rundfunk,,,1233,Frühlingsanfang ohne Biergärten. Während das d...
2,2020-03-19,bayerischer_rundfunk,,,1441,Stadtidylle ohne Menschen. Der Ausnahmezustand...
3,2020-12-11,bayerischer_rundfunk,,,341,Ein Tag im Zeichen der Nächstenliebe auch und ...
4,2020-03-20,bayerischer_rundfunk,,,1233,Frühlingsanfang ohne Biergärten. Während das d...
...,...,...,...,...,...,...
7218,2020-02-05,welt,B8MWSY2C70z,instagram,4621,Daniel Göttlich und sein Teamkollege Linus Kla...
7219,2020-01-31,welt,B7-Ycl2CW4d,instagram,5037,Die Weltgesundheitsorganisation (WHO) hat nach...
7220,2020-01-28,welt,B72vRk3iuEd,instagram,7576,Millionen Menschen stehen in China unter Quara...
7221,2020-01-24,welt,B7tPiNzijwh,instagram,5934,China beschließt weitere Schutzvorkehrungen ge...


In [34]:
d = get_dic_around(date=datetime(2020,12,12).date(), dic_frame=dic_frame, day_range=3)

In [38]:
get_top_k(d, 10000)

{'HABEN': 0.004005126561999359,
 'SICH': 0.003524511374559436,
 'KÖNNEN': 0.0033643063120794617,
 'NICHT': 0.0032041012495994873,
 'NOCH': 0.0028836911246395386,
 'MEIN': 0.00256328099967959,
 'MEHR': 0.00256328099967959,
 'WOLLEN': 0.002242870874719641,
 'CORONA': 0.002242870874719641,
 'NACH': 0.0020826658122396666,
 'VIEL': 0.0019224607497596924,
 'EINER': 0.0019224607497596924,
 'DEUTSCHLAND': 0.0019224607497596924,
 'WEIHNACHTEN': 0.0019224607497596924,
 'SAGEN': 0.001762255687279718,
 'ÜBER': 0.001762255687279718,
 'ZULASSUNG': 0.0016020506247997437,
 'LOCKDOWN': 0.0014418455623197693,
 'SPAHN': 0.0014418455623197693,
 'MENSCH': 0.0014418455623197693,
 'GEGEN': 0.0014418455623197693,
 'HEUTE': 0.0014418455623197693,
 'ABER': 0.001281640499839795,
 'LAND': 0.001281640499839795,
 'DEZEMBER': 0.001281640499839795,
 'ERST': 0.001281640499839795,
 'CORONA-IMPFSTOFF': 0.001281640499839795,
 'ZULASSEN': 0.001281640499839795,
 'DASS': 0.0011214354373598205,
 'KEIN': 0.0011214354373598205

In [8]:
df.sort_values(by=['date']).to_csv("raw_frame.csv")

In [160]:
s = "2020-12-02"
datetime(int(s[:4]), int(s[5:7]), int(s[8:10])).date()

datetime.date(2020, 12, 2)

In [3]:
datetime(2020,1,1).timestamp()

1577833200.0