In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from collections import deque

import pathlib
import numpy as np
import pandas as pd
import random

# Pin every random number generator used in this notebook to the same seed
# so that repeated runs start from identical RNG state.
random.seed(314)         # Python's built-in `random` module
np.random.seed(314)      # NumPy's legacy global generator
tf.random.set_seed(314)  # TensorFlow's global seed

In [111]:
# Locations of the two input CSV files, given relative to the current
# working directory: the sentiment table and the COVID case table.
SENTIMENT_DATA, COVID_DATA = (
    pathlib.Path('../data/Sent_by_day.csv'),
    pathlib.Path('../data/owid-covid-data.csv'),
)
def data_processing(sentiment_path=None, covid_path=None,
                    start_date='2020-01-24', end_date='2020-12-31', window=3):
    """Load the sentiment and COVID case CSVs and return smoothed daily series.

    Parameters
    ----------
    sentiment_path : path-like, optional
        CSV with ``Year``, ``Month``, ``Day`` and ``Sentiment`` columns plus
        two value columns. Defaults to the module-level ``SENTIMENT_DATA``.
    covid_path : path-like, optional
        CSV with ``date``, ``continent`` and ``new_cases`` columns.
        Defaults to the module-level ``COVID_DATA``.
    start_date, end_date : str or datetime-like
        Inclusive bounds applied to the case series only (the sentiment frame
        is not date-filtered).
    window : int
        Number of rows in the rolling mean applied to both outputs.

    Returns
    -------
    tuple
        ``(sentiment_data, daily_data)``: a DataFrame indexed by ``date`` with
        the columns ``negative_count, negative_avg, positive_count,
        positive_avg, neutral_count, neutral_avg``, and a ``new_cases`` Series
        indexed by ``date``.
    """
    if sentiment_path is None:
        sentiment_path = SENTIMENT_DATA
    if covid_path is None:
        covid_path = COVID_DATA

    sentiment_raw = pd.read_csv(sentiment_path)
    # Assemble the date index explicitly: the nested-list form of
    # `parse_dates` that used to merge Year/Month/Day is deprecated in pandas.
    date_parts = sentiment_raw[['Year', 'Month', 'Day']].rename(columns=str.lower)
    sentiment_raw.index = pd.DatetimeIndex(pd.to_datetime(date_parts), name='date')
    # 'Unnamed: 0' is the leftover index column of the saved CSV, if present.
    sentiment_raw = sentiment_raw.drop(
        columns=['Year', 'Month', 'Day', 'Unnamed: 0'], errors='ignore')

    # One sub-frame per sentiment label, aligned side by side on shared dates.
    per_sentiment = [
        sentiment_raw[sentiment_raw['Sentiment'] == label].drop(columns=['Sentiment'])
        for label in ('negative', 'positive', 'neutral')
    ]
    sentiment_data = pd.concat(per_sentiment, axis=1, join='inner')
    # Columns are renamed by position.
    # NOTE(review): assumes each sub-frame holds exactly (count, avg) in that
    # order - confirm against the CSV header.
    sentiment_data.columns = ['negative_count', 'negative_avg',
                              'positive_count', 'positive_avg',
                              'neutral_count', 'neutral_avg']
    sentiment_data = sentiment_data.groupby('date').sum().rolling(window).mean()
    sentiment_data = sentiment_data.dropna()

    # Index by the column name rather than by its position in the file.
    covid_raw = pd.read_csv(covid_path, parse_dates=['date'], index_col='date')
    in_region = covid_raw['continent'].isin(['Europe', 'North America'])
    daily_data = (
        covid_raw.loc[in_region, 'new_cases']
        .groupby(level=0).sum()
        .rolling(window).mean()
    )
    # The window is applied after smoothing, so the first kept day still
    # averages over the rows that precede it.
    in_window = ((daily_data.index >= pd.to_datetime(start_date))
                 & (daily_data.index <= pd.to_datetime(end_date)))
    daily_data = daily_data[in_window]

    return (sentiment_data, daily_data)

# Place the sentiment frame and the case series side by side (column-wise),
# save the combined table to disk, then print it.
data = pd.concat(objs=data_processing(), axis=1)
data.to_csv(path_or_buf='../data/processed_data.csv')
print(data)


            negative_count  negative_avg  positive_count  positive_avg  \
date                                                                     
2020-01-24    1.157327e+05      0.405452    12932.666667      0.045402   
2020-01-25    2.362353e+05      0.453745    23761.333333      0.043395   
2020-01-26    3.663233e+05      0.431332    31814.000000      0.040433   
2020-01-27    4.390923e+05      0.463175    30329.666667      0.032458   
2020-01-28    4.215193e+05      0.456303    31781.666667      0.035839   
...                    ...           ...             ...           ...   
2020-12-27    9.427477e+05      0.459835   242130.000000      0.126867   
2020-12-28    1.141295e+06      0.477485   241240.000000      0.105030   
2020-12-29    1.289657e+06      0.474366   261259.666667      0.098268   
2020-12-30    1.413140e+06      0.460078   280056.666667      0.092313   
2020-12-31    1.448581e+06      0.454409   319876.666667      0.100642   

            neutral_count  neutral_av