# COVID19 ANALYSIS USING SOCIAL MEDIA DATA

### An analysis of the mention of symptoms on Twitter in Île-de-France

We present in this dashboard the first results of our analysis of Twitter data relating to COVID. We analyze the mentions of COVID-related terms over time, and especially the mentions of COVID symptoms. We note a strong correlation between the number of tweets mentioning symptoms and the number of victims in Île-de-France (Paris region).

We collected tweets specifically from users in Île-de-France. We first used the Streaming API to identify users in the Paris area, and then collected the historical data from these users. This dashboard presents our analyses, based on 30,000 Twitter users, for a total of about 33 million tweets posted from December 2019 onwards. We exclude retweets from this analysis, which is therefore based on 17 million tweets.

The graphs are interactive, i.e., one can select the variables of interest in the legend.

In [1]:
import pandas as pd
import numpy as np
import json
import os
import multiprocessing as mp
from time import time
import socket
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')

In [2]:
from matplotlib import pyplot as plt
import plotly.figure_factory as ff
import plotly.offline as py
import plotly.graph_objs as go
from plotly import tools
py.init_notebook_mode(connected = True)

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [3]:
# Load the pre-computed CSV inputs of the dashboard from the data directory.
path_to_data = "../data/"

# Per-day series of COVID-related terms; the first CSV column is used as the
# index. Its 'day' column is left as read (it is not parsed to datetime here).
tweets_covid_related = pd.read_csv(
    os.path.join(path_to_data, 'tweets_covid_related.csv'),
    index_col=0,
)

# The remaining files are semicolon-separated. Each one has its date column
# converted to datetime immediately after loading.
tweets_symptoms = pd.read_csv(
    os.path.join(path_to_data, 'tweets_mention_symptoms.csv'),
    sep=';',
)
tweets_symptoms['day'] = pd.to_datetime(tweets_symptoms['day'])

urgences = pd.read_csv(
    os.path.join(path_to_data, 'emergencies.csv'),
    sep=';',
)
urgences['date_de_passage'] = pd.to_datetime(urgences['date_de_passage'])

open_covid = pd.read_csv(
    os.path.join(path_to_data, 'open_covid.csv'),
    sep=';',
)
open_covid['date'] = pd.to_datetime(open_covid['date'])

In [4]:
# Disabled reference: French keyword lists per symptom. The strings are kept in
# French on purpose because they are search terms, not documentation.
# symptoms_dict_fr = {'toux' : ['toux', 'tousse'],
#                    'maux de gorge' : ['maux de gorge', 'mal de gorge', 'mal à la gorge'],
#                    'fièvre' : ['fievre', 'fièvre', 'fiévre'],
#                     'mal de tête' : ['mal de tête','mal de crâne','mal à la tête','mal de tete','mal de crane','mal à la tete'],
#                    'perte goût et odorat' : ['perte du goût', "perte de l'odorat", 'perte du gout',"plus de goût","plus de gout","plus d'odeur"],
#                    'engelures' : ['engelures'],
#                    'symptômes' : ['symptome', 'symptôme'],
#                    'difficultés respiratoires' : ['difficultés à respirer', 'difficultés respiratoires', 'difficulté à respirer','mal à respirer']}

# Column names of the symptom series that plot_symptoms_evolution reads from
# the tweets_symptoms frame, in legend order.
symptoms_en = [
    'cough',
    'sore_throat',
    'fever',
    'loss_taste',
    'skin_symptom',
    'breathing_difficulties',
    'symptoms',
]

## Evolution of tweets related to COVID and symptoms

In [5]:
def plot_covid_evolution() :
    """Draw one line per COVID-related term from ``tweets_covid_related``.

    The x axis is the frame's 'day' column. A translucent rectangle spanning
    2020-03-17 to 2020-05-11 is drawn behind the curves and labelled
    "Lockdown (France)". The figure is rendered inline with ``py.iplot``.
    """
    data = tweets_covid_related
    terms = ['covid', 'confinement', 'has_symptom', 'RestezChezVous']
    curves = [
        go.Scatter(x=data['day'],
                   y=data[term].values,
                   mode='lines',
                   name=term)
        for term in terms
    ]

    figure = go.Figure(
        curves,
        go.Layout(title="Evolution of mentions of Covid-related terms in Ile-de-France"),
    )

    # yref='paper' makes y0/y1 fractions of the plot height, so the band
    # covers the full vertical extent whatever the data range is.
    lockdown_band = dict(type="rect",
                         yref='paper',
                         x0='2020-03-17',
                         y0=0,
                         x1='2020-05-11',
                         y1=1,
                         fillcolor="LightSalmon",
                         opacity=0.2,
                         layer='below',
                         line_width=0)
    figure.add_shape(lockdown_band)

    lockdown_label = dict(x='2020-04-15',
                          y=0.9,
                          yref="paper",
                          text="Lockdown (France)",
                          showarrow=False)
    figure.update_layout(annotations=[lockdown_label])

    py.iplot(figure)


plot_covid_evolution()

In [6]:
def plot_symptoms_evolution() :
    """Draw one line per symptom column listed in ``symptoms_en``.

    Values come from the module-level ``tweets_symptoms`` frame, plotted
    against its 'day' column. The 2020-03-17 to 2020-05-11 period is shaded
    and labelled "Lockdown (France)". The figure is rendered with ``py.iplot``.
    """
    data = tweets_symptoms
    days = data['day']

    curves = []
    for column in symptoms_en:
        curve = go.Scatter(x=days,
                           y=data[column].values,
                           mode='lines',
                           name=column)
        curves.append(curve)

    page = go.Layout(title="Evolution of mention of symptoms in Twitter in Ile-de-France")
    figure = go.Figure(curves, page)

    # Full-height translucent band (y is expressed in paper coordinates).
    figure.add_shape(dict(
        type="rect",
        yref='paper',
        x0='2020-03-17', x1='2020-05-11',
        y0=0, y1=1,
        fillcolor="LightSalmon",
        opacity=0.2,
        layer='below',
        line_width=0,
    ))

    label = dict(x='2020-04-15', y=0.9, yref="paper",
                 text="Lockdown (France)", showarrow=False)
    figure.update_layout(annotations=[label])

    py.iplot(figure)


plot_symptoms_evolution()

## Comparison with the evolution of the epidemic in Île-de-France

We used public data from [Santé Publique France](https://www.data.gouv.fr/fr/datasets/donnees-des-urgences-hospitalieres-et-de-sos-medecins-relatives-a-lepidemie-de-covid-19/) about emergencies and SOS Médecins data related to COVID. We use data about passages to emergencies for suspicion of COVID, and hospitalizations for suspicion of COVID. We present the raw (daily) data, as well as the data averaged over 3 days.

We notice that the curves of tweets mentioning symptoms and emergencies seem really similar, the first one preceding the emergencies curves by about 11 days.

In [17]:
### Let's read the actual tweets so that we can join them with the live data stream from the TweetAnnotator

def _daily_symptom_counts(tweets):
    """Turn a tweet-level frame into a gap-free table of daily counts.

    Args:
        tweets: DataFrame with a datetime 'day' column and an 'id_str' column.
            Presumably one row per tweet -- verify against the CSV.

    Returns:
        DataFrame with one row per calendar day between the first and last
        observed day. It has a 'day' column, the per-column non-null counts,
        'has_symptom' (a copy of the 'id_str' count) and 'has_symptom_mean_3'
        (the 7-day time-based rolling mean of that count).
    """
    daily = tweets.groupby(['day']).count()
    # groupby only emits days that have at least one row. Downstream code
    # shifts these series by a number of ROWS and treats it as a number of
    # days, so every calendar day must be present; days without tweets get an
    # explicit count of 0 (which also lets them weigh in the rolling mean).
    daily = daily.resample('D').sum()
    daily['has_symptom'] = daily['id_str']
    # NOTE: the column is named "_mean_3" for historical reasons, but the
    # window is 7 days. The plotting functions rely on this name.
    daily['has_symptom_mean_3'] = daily['id_str'].rolling('7d').mean()
    return daily.reset_index()


tweets_symptoms = pd.read_csv(os.path.join(path_to_data, 'list_covid_symptoms.csv'), sep=';')
tweets_symptoms['day'] = pd.to_datetime(tweets_symptoms['day'])

# let's now get the annotations (downloaded over the network on every run)
tweet_annotation = pd.read_csv('https://test-tweetannotator.herokuapp.com/download-annotations', sep=';')
# now we merge the annotations into one outer-joined df (this keeps all the non-annotated tweets too!)
merged_df = tweets_symptoms.merge(tweet_annotation, left_on='id_str', right_on='tweet_id', how='outer').drop_duplicates()

# let's count how often each (tweet, rating) pair occurs
counted_nos = merged_df.groupby(['id_str', 'symptom']).size().to_frame('size').reset_index()

# now we can filter out all the tweets that have at least a single 'no' rating
_rejected_ids = counted_nos.loc[counted_nos['symptom'] == 'no', 'id_str']
tweets_symptoms_filtered = tweets_symptoms[~tweets_symptoms['id_str'].isin(_rejected_ids)]

# now let's aggregate both frames per day for the data viz: tweet counts plus
# their 7-day moving average
tweets_symptoms = _daily_symptom_counts(tweets_symptoms)
tweets_symptoms_filtered = _daily_symptom_counts(tweets_symptoms_filtered)

In [18]:
def plot_symptoms_urgences_with_ma() :
    """Overlay smoothed symptom-tweet counts with smoothed emergency passages.

    The two tweet curves ('has_symptom_mean_3' of ``tweets_symptoms`` and
    ``tweets_symptoms_filtered``) use the left y axis. The
    'nbre_pass_corona_mean_3' column of ``urgences`` uses a second y axis on
    the right. The 2020-03-17 to 2020-05-11 period is shaded and labelled
    "Lockdown (France)". Hospitalization curves are intentionally not drawn.
    """
    tweet_curves = (
        (tweets_symptoms, 'yellow', 'Tweets symptoms unfiltered (avg 7d)'),
        (tweets_symptoms_filtered, 'blue', 'Tweets symptoms filtered (avg 7d)'),
    )
    curves = [
        go.Scatter(x=frame['day'],
                   y=frame['has_symptom_mean_3'].values,
                   mode='lines',
                   line=dict(color=colour),
                   name=label,
                   yaxis="y1")
        for frame, colour, label in tweet_curves
    ]

    curves.append(go.Scatter(x=urgences.date_de_passage,
                             y=urgences['nbre_pass_corona_mean_3'],
                             mode='lines',
                             name='Passages to emergencies (avg 3d)',
                             line=dict(color='green'),
                             yaxis="y2"))

    # The second axis overlays the first one so both curve families share the
    # same plotting area despite their different scales.
    right_axis = dict(title='Number of emergencies related to COVID',
                      overlaying='y',
                      side='right')
    page = go.Layout(title="Evolution of mentions of symptoms and emergencies related to COVID in Ile-de-France ",
                     legend={"x" : 1.1, "y" : 1},
                     yaxis=dict(title='Number of tweets'),
                     yaxis2=right_axis)

    figure = go.Figure(curves, page)

    lockdown_band = dict(type="rect",
                         yref='paper',
                         x0='2020-03-17',
                         y0=0,
                         x1='2020-05-11',
                         y1=1,
                         fillcolor="LightSalmon",
                         opacity=0.2,
                         layer='below',
                         line_width=0)
    figure.add_shape(lockdown_band)

    lockdown_label = dict(x='2020-04-15',
                          y=0.95,
                          yref="paper",
                          text="Lockdown (France)",
                          showarrow=False)
    figure.update_layout(annotations=[lockdown_label])

    py.iplot(figure)


plot_symptoms_urgences_with_ma()

We shift the tweets curve by 11 days and find that the two curves overlap.

In [19]:
def plot_shifted_emergencies(lag) :
    """Plot symptom-tweet curves delayed by ``lag`` against emergency data.

    Side effect: writes the columns 'symptom_shift' (shifted 'has_symptom')
    and 'symptom_shift_3' (shifted 'has_symptom_mean_3') into the module-level
    ``tweets_symptoms`` and ``tweets_symptoms_filtered`` frames. The
    correlation cell further down reads them from ``tweets_symptoms``.

    Args:
        lag: number of rows by which the tweet series are shifted forward.
            This equals a number of days only if the frames hold exactly one
            row per consecutive calendar day.
    """
    for frame in (tweets_symptoms, tweets_symptoms_filtered):
        frame['symptom_shift_3'] = frame['has_symptom_mean_3'].shift(lag)
        frame['symptom_shift'] = frame['has_symptom'].shift(lag)

    traces = []
    # The two tweet curves get distinct legend entries; they used to share the
    # same name, which made the legend ambiguous.
    tweet_curves = (
        (tweets_symptoms, 'yellow', 'Tweets symptoms unfiltered (avg 7d)'),
        (tweets_symptoms_filtered, 'blue', 'Tweets symptoms filtered (avg 7d)'),
    )
    for frame, colour, label in tweet_curves:
        traces.append(go.Scatter(x=frame['day'],
                                 y=frame['symptom_shift_3'].values,
                                 mode='lines',
                                 line=dict(color=colour),
                                 name=label,
                                 yaxis="y1"))

    emergency_curves = (
        ('nbre_pass_corona_mean_3', 'green', 'Passages to emergencies (avg 3d)'),
        ('nbre_hospit_corona_mean_3', 'grey', 'Hospitalizations (avg 3d)'),
    )
    for column, colour, label in emergency_curves:
        traces.append(go.Scatter(x=urgences.date_de_passage,
                                 y=urgences[column],
                                 mode='lines',
                                 name=label,
                                 line=dict(color=colour),
                                 yaxis="y2"))

    # The title reports the lag actually applied instead of a hard-coded 11.
    title = ("Evolution of mentions of symptoms (shifted {} days) and "
             "emergencies related to COVID in Ile-de-France").format(lag)
    layout = go.Layout(title=title,
                       legend={"x" : 1.08, "y" : 1},
                       yaxis=dict(title='Number of tweets'),
                       yaxis2=dict(title='Number of passages in emergencies',
                                   overlaying='y',
                                   side='right'))

    fig = go.Figure(traces, layout)
    py.iplot(fig)


plot_shifted_emergencies(lag=11)

In [20]:
from math import log
from sklearn.linear_model import LinearRegression

In [21]:
# Build the two frames used by the correlation plot:
#   correlation_matrix   -> raw daily emergencies vs. shifted raw tweet counts
#   correlation_matrix_3 -> smoothed emergencies vs. shifted smoothed tweet counts
# Requires the 'symptom_shift' / 'symptom_shift_3' columns that
# plot_shifted_emergencies() writes into tweets_symptoms.

# Pair the two sources on the calendar date. An inner join keeps only dates
# present in both and guarantees that each emergency value sits next to the
# tweet value of the same day, whatever the row indexes of the inputs are.
# NOTE(review): assumes one row per date in both inputs -- confirm for urgences.
_aligned = urgences[['date_de_passage', 'nbre_pass_corona', 'nbre_pass_corona_mean_3']].merge(
    tweets_symptoms[['day', 'symptom_shift', 'symptom_shift_3']],
    left_on='date_de_passage',
    right_on='day',
    how='inner')

correlation_matrix = pd.DataFrame()
correlation_matrix['urgences'] = _aligned['nbre_pass_corona']
correlation_matrix['symptoms'] = _aligned['symptom_shift']
# The shift leaves NaN in the first rows; drop them before the integer cast.
correlation_matrix = correlation_matrix.dropna()

correlation_matrix_3 = pd.DataFrame()
correlation_matrix_3['urgences'] = _aligned['nbre_pass_corona_mean_3']
correlation_matrix_3['symptoms'] = _aligned['symptom_shift_3']
correlation_matrix_3 = correlation_matrix_3.dropna()

for df in correlation_matrix, correlation_matrix_3:
    for col in ['urgences', 'symptoms']:
        # Kept from the original analysis: values are truncated to integers.
        df[col] = df[col].astype(int)
        # The mask is taken from the frame being processed (it used to always
        # read correlation_matrix). Non-positive values have no logarithm and
        # are left as NaN in the log column.
        df['log_' + col] = np.log(df.loc[df[col] > 0, col])

In [22]:
def plot_symptoms_correlation(log=False):
    """Scatter emergencies against shifted tweet counts with linear fits.

    Reads the module-level ``correlation_matrix`` (raw values) and
    ``correlation_matrix_3`` (smoothed values). For each, the points are
    plotted and an ordinary least-squares line of 'urgences' on 'symptoms'
    is overlaid. The figure is rendered with ``py.iplot``.

    Args:
        log: when truthy, use the 'log_*' columns of both frames (rows
            without a logarithm are dropped) instead of the plain values,
            and mark the axis titles accordingly.
    """
    def select_df(in_df):
        """Return a two-column frame ('urgences', 'symptoms') for the chosen scale."""
        if not log:
            return in_df[['urgences', 'symptoms']]
        out_df = in_df[[col for col in in_df.columns if col.startswith('log')]]
        out_df.columns = out_df.columns.str.replace('log_', '')
        return out_df.dropna()

    df = select_df(correlation_matrix)
    df_3 = select_df(correlation_matrix_3)

    # Regressions. scikit-learn expects a 2-D feature matrix, hence the
    # reshape of the single predictor into a column vector.
    x_raw = df['symptoms'].values.reshape(-1, 1)
    bestfit = LinearRegression().fit(x_raw, df['urgences']).predict(x_raw)

    x_avg = df_3['symptoms'].values.reshape(-1, 1)
    bestfit_3 = LinearRegression().fit(x_avg, df_3['urgences']).predict(x_avg)

    # Plot the graph
    traces = [
        go.Scatter(x=df['symptoms'],
                   y=df['urgences'],
                   mode='markers',
                   opacity=0.3,
                   name='Correlation with nb tweets shift'),
        go.Scatter(x=df_3['symptoms'],
                   y=df_3['urgences'],
                   mode='markers',
                   name='Correlation with nb tweets shift (average 3days)'),
        go.Scatter(x=df['symptoms'],
                   y=bestfit,
                   opacity=0.3,
                   mode='lines',
                   name='Regression line'),
        go.Scatter(x=df_3['symptoms'],
                   y=bestfit_3,
                   mode='lines',
                   name='Regression line on average3d'),
    ]

    # Axis titles are unchanged for the default call and flag the scale when
    # the log columns are plotted.
    scale_suffix = ' (log)' if log else ''
    layout = go.Layout(title="Correlation between mentions of symptoms in Twitter (shifted 11 days) and emergencies related to COVID19",
                       xaxis=dict(title='Number of tweets' + scale_suffix),
                       yaxis=dict(title='Number of passages in emergencies' + scale_suffix))

    fig = go.Figure(traces, layout)
    py.iplot(fig)

    #print("Regression coefficient on raw data : " + str(reg.coef_))
    #print("Regression coefficient on data averaged 3days : " + str(reg_3.coef_))

plot_symptoms_correlation()

## Comparison with the number of deaths in Île-de-France

We also analyzed the evolution of the number of deaths due to COVID, based on the [Data from OpenCOVID19-fr](https://www.data.gouv.fr/en/datasets/chiffres-cles-concernant-lepidemie-de-covid19-en-france/). We also notice a similar trend between the number of tweets mentioning symptoms and the number of deaths in Île-de-France, but with a lag of around 20 days.

In [25]:
def plot_symptoms_deaths_with_ma() :
    """Overlay smoothed symptom-tweet counts with the COVID deaths series.

    The two tweet curves ('has_symptom_mean_3' of ``tweets_symptoms`` and
    ``tweets_symptoms_filtered``) use the left y axis. The 'deaths_week'
    column of ``open_covid`` uses a second y axis on the right. The
    2020-03-17 to 2020-05-11 period is shaded and labelled
    "Lockdown (France)", the same wording as on the other charts.
    """
    traces = []
    tweet_curves = (
        (tweets_symptoms, 'yellow', 'Tweets symptoms unfiltered (avg 7days)'),
        (tweets_symptoms_filtered, 'blue', 'Tweets symptoms filtered (avg 7days)'),
    )
    for frame, colour, label in tweet_curves:
        traces.append(go.Scatter(x=frame['day'],
                                 y=frame['has_symptom_mean_3'].values,
                                 mode='lines',
                                 line=dict(color=colour),
                                 name=label,
                                 yaxis="y1"))

    traces.append(go.Scatter(x=open_covid.date,
                             y=open_covid['deaths_week'],
                             mode='lines',
                             name='Deaths due to COVID (avg 7days)',
                             line=dict(color='green'),
                             yaxis="y2"))

    layout = go.Layout(title="Evolution of mentions of symptoms and deaths due to COVID in Ile-de-France ",
                       legend={"x" : 1.08, "y" : 1},
                       yaxis=dict(title='Number of tweets'),
                       yaxis2=dict(title='Number of deaths due to COVID',
                                   overlaying='y',
                                   side='right'))

    fig = go.Figure(traces, layout)

    # Full-height translucent band (y is expressed in paper coordinates).
    fig.add_shape(dict(type="rect",
                       yref='paper',
                       x0='2020-03-17',
                       y0=0,
                       x1='2020-05-11',
                       y1=1,
                       fillcolor="LightSalmon",
                       opacity=0.2,
                       layer='below',
                       line_width=0))

    # This label used to read "Shutdown (France)", unlike every other chart
    # of the dashboard, which names the same shaded period "Lockdown".
    fig.update_layout(annotations=[dict(
        x='2020-04-15',
        y=0.95,
        yref="paper",
        text="Lockdown (France)", showarrow=False)])

    py.iplot(fig)


plot_symptoms_deaths_with_ma()

In [30]:
def plot_shifted_deaths(lag) :
    """Plot symptom-tweet curves delayed by ``lag`` against the deaths series.

    Side effect: overwrites the 'symptom_shift_3' column of the module-level
    ``tweets_symptoms`` and ``tweets_symptoms_filtered`` frames with the
    smoothed counts shifted by ``lag``. Any cell that reads 'symptom_shift_3'
    after this call sees this lag, not the one set by an earlier function.

    Args:
        lag: number of rows by which the tweet series are shifted forward.
            This equals a number of days only if the frames hold exactly one
            row per consecutive calendar day.
    """
    traces = []
    tweet_curves = (
        (tweets_symptoms, 'yellow', 'Tweets symptoms unfiltered (avg 7d)'),
        (tweets_symptoms_filtered, 'blue', 'Tweets symptoms filtered (avg 7d)'),
    )
    for frame, colour, label in tweet_curves:
        frame['symptom_shift_3'] = frame['has_symptom_mean_3'].shift(lag)
        traces.append(go.Scatter(x=frame['day'],
                                 y=frame['symptom_shift_3'].values,
                                 mode='lines',
                                 line=dict(color=colour),
                                 name=label,
                                 yaxis="y1"))

    traces.append(go.Scatter(x=open_covid.date,
                             y=open_covid['deaths_week'],
                             mode='lines',
                             name='Deaths due to COVID (avg 7d)',
                             line=dict(color='green'),
                             yaxis="y2"))

    # The title reports the lag actually applied instead of a hard-coded 20.
    title = ("Evolution of mentions of symptoms (shifted {} days) and "
             "deaths due to COVID in Ile-de-France").format(lag)
    layout = go.Layout(title=title,
                       legend={"x" : 1.08, "y" : 1},
                       yaxis=dict(title='Number of tweets'),
                       yaxis2=dict(title='Number of deaths due to COVID',
                                   overlaying='y',
                                   side='right'))

    fig = go.Figure(traces, layout)
    py.iplot(fig)


plot_shifted_deaths(lag=20)