In [None]:
from glob import glob
import os
import json
from datetime import datetime, date, timezone, timedelta
import pandas as pd
from tqdm import tqdm

In [None]:
# Filesystem locations, all relative to the notebook's working directory.
# NOTE(review): none of these names are referenced in the cells visible here;
# presumably they are passed to the helper functions defined below — confirm.
# The two directory paths end with '/' — the helpers below build file names by
# plain string concatenation (e.g. storage + 'main.csv'), so if these are what
# gets passed in, the trailing slash is required.
config_path = r'./scraping/twitter/config.json'
temp_storage = r'./scraping/data/temp_storage/'
storage_path = r'./scraping/data/storage/'

In [None]:
def create_new_storage(storage, categories):
    """Create the storage layout on disk, leaving anything that already exists untouched.

    Layout produced (paths are built by string concatenation, so ``storage``
    must end with a path separator):

    * ``<storage>main.csv``      - empty table with columns id, date, user, text
    * ``<storage>README.txt``    - written together with a freshly created main.csv
    * ``<storage><category>/``   - one directory per category containing
      ``data.csv``, ``keywords.json``, ``stopwords.json`` and ``config.json``

    The function is idempotent: calling it again with a mix of existing and new
    categories only creates what is missing and never overwrites files the user
    may have edited (keywords, stopwords, config, collected data).

    Parameters
    ----------
    storage : str
        Path prefix of the storage directory, including the trailing separator.
    categories : iterable of str
        Names of the category sub-directories to create.
    """
    columns = ['id', 'date', 'user', 'text']

    main_path = storage + 'main.csv'
    # check if a main file exists
    if not os.path.isfile(main_path):
        # Create the storage directory itself on first use.
        parent = os.path.dirname(main_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        pd.DataFrame(columns=columns).to_csv(main_path, index=False)

        readme_path = storage + 'README.txt'
        if not os.path.exists(readme_path):
            with open(readme_path, 'x', encoding='utf-8') as f:
                # todo: add correct link
                f.write('For instructions visit github.com/Jakob-L-M/')

    for category in categories:
        category_dir = storage + category
        # exist_ok: an already initialised category must not abort the whole call.
        os.makedirs(category_dir, exist_ok=True)

        data_path = category_dir + '/data.csv'
        if not os.path.isfile(data_path):
            pd.DataFrame(columns=columns).to_csv(data_path, index=False)

        _write_json_if_missing(category_dir + '/keywords.json',
                               ['Write', 'your', 'keywords', 'here'])
        _write_json_if_missing(category_dir + '/stopwords.json',
                               ['Write', 'your', 'stopwords', 'here'])
        _write_json_if_missing(category_dir + '/config.json',
                               {'start_date': '2000-01-01', 'end_date': '', 'day_smoothing': 3, 'k': 25})


def _write_json_if_missing(path, payload):
    """Serialise ``payload`` as JSON to ``path`` unless a file already exists there."""
    if os.path.exists(path):
        return
    with open(path, 'x', encoding='utf-8') as f:
        json.dump(payload, f)

In [None]:
def update_main(storage, temp_storage):
    """Merge scraped tweets from the temp files into ``<storage>main.csv``.

    Every ``*.txt`` file directly under ``temp_storage`` is read as JSON lines
    (one tweet object per line). Tweets whose ``retweetedTweet`` field is not
    ``None`` are skipped; for the rest the ``id``, ``date``, ``user.username``
    and ``content`` fields are stored as the columns id, date, user, text.
    Exact duplicate rows are removed before the file is written back.

    Parameters
    ----------
    storage : str
        Path prefix of the storage directory (must end with a path separator).
    temp_storage : str
        Path prefix of the directory holding the scraped ``*.txt`` files
        (must end with a path separator).
    """
    main_path = storage + 'main.csv'
    # Parse stored dates so they compare equal to the freshly parsed timestamps;
    # otherwise re-imported tweets would never be recognised as duplicates.
    main = pd.read_csv(main_path, parse_dates=['date'])

    new_rows = []
    for path in tqdm(glob(temp_storage + "*.txt")):
        # 'with' guarantees the handle is released even if a line fails to parse.
        with open(path, mode="r", encoding="utf-8") as f:
            for line in f:
                # strip() instead of slicing off the last character: the final
                # line of a file may not end with a newline.
                line = line.strip()
                if not line:
                    continue

                tweet = json.loads(line)
                if tweet['retweetedTweet'] is not None:
                    continue

                new_rows.append({
                    'id': tweet['id'],
                    # Only the first 19 characters (date and time down to
                    # seconds) are used; any UTC offset suffix is dropped.
                    'date': pd.to_datetime(tweet['date'][:19]),
                    'user': tweet['user']['username'],
                    'text': tweet['content'],
                })

    if new_rows:
        # Single concat for all files (DataFrame.append was removed in pandas 2.0).
        main = pd.concat([main, pd.DataFrame(new_rows)], ignore_index=True)

    main = main.drop_duplicates()

    # Fixed date format keeps the on-disk representation stable between runs.
    main.to_csv(main_path, index=False, date_format='%Y-%m-%d %H:%M:%S')

In [None]:
def update_category(storage, category):
    """Copy new, matching tweets from ``main.csv`` into a category's ``data.csv``.

    Steps:

    1. Rows of ``<storage>main.csv`` are restricted to dates strictly after
       ``start_date`` and strictly before ``end_date`` from the category's
       ``config.json`` (an empty string disables the respective bound).
    2. If the category already holds data, only rows dated strictly after the
       newest stored date are considered. Consequently, tweets older than that
       date are never picked up later, even if the keywords change.
    3. Unless the keyword list is empty or its first entry is ``""``, only rows
       whose text contains at least one keyword (case-sensitive substring
       match) are kept. Rows with missing text never match.
    4. The remaining rows are appended, duplicates dropped and the file rewritten.

    Dates are compared as the strings stored in the CSV files.

    Parameters
    ----------
    storage : str
        Path prefix of the storage directory (must end with a path separator).
    category : str
        Name of the category sub-directory inside ``storage``.
    """
    category_dir = storage + category
    data_path = category_dir + '/data.csv'

    category_data = pd.read_csv(data_path)
    main_data = pd.read_csv(storage + 'main.csv')

    with open(category_dir + '/keywords.json', 'r', encoding='utf-8') as f:
        keywords = json.load(f)
    with open(category_dir + '/config.json', 'r') as f:
        config = json.load(f)

    # An empty list is treated like [""]: no keyword filtering at all.
    use_filter = bool(keywords) and keywords[0] != ""

    if config['start_date'] != '':
        main_data = main_data[main_data['date'] > config['start_date']]

    if config['end_date'] != '':
        main_data = main_data[main_data['date'] < config['end_date']]

    # if there already exists data, slice the main file first to only check for new entries
    if len(category_data['date']) != 0:
        main_data = main_data[main_data['date'] > max(category_data['date'])]

    if use_filter:
        def mentions_keyword(text):
            # Missing text is read back from CSV as NaN (a float), not a string.
            return isinstance(text, str) and any(word in text for word in keywords)

        # astype(bool) keeps the mask boolean even when no rows are left.
        mask = main_data['text'].map(mentions_keyword).astype(bool)
        main_data = main_data[mask]

    # append new data (DataFrame.append was removed in pandas 2.0)
    category_data = pd.concat([category_data, main_data], ignore_index=True)

    # Just to be 100% safe
    category_data = category_data.drop_duplicates()

    category_data.to_csv(data_path, index=False)