In [1]:
import pandas as pd
import numpy as np
import os
from time import time
import socket
from pathlib import Path
import unicodedata
import sys

import warnings
warnings.filterwarnings('ignore')

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from py4j.java_gateway import java_import
from functools import reduce
from pyspark.sql import DataFrame
from pyspark import SparkContext
from pyspark.sql.window import Window

# 0. Init

In [3]:
memory = '10g'
pyspark_submit_args = ' --driver-memory ' + memory  + ' pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [4]:
try:
    spark
except NameError:
    print('Create Local SparkSession')
    spark=SparkSession.builder.config("spark.driver.host", "localhost").appName("clean_tweets").getOrCreate()
    
# IgnoreCorruptFiles
spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
sc = spark.sparkContext

Create Local SparkSession


In [5]:
#spark.sparkContext.getConf().getAll()
#os.environ

In [6]:
# Paths to data
path_to_data = "../data/"
path_to_external_data = os.path.join(path_to_data, "external-data/")
path_to_output = os.path.join(path_to_data,'visualisation_data')
path_to_parquets = os.path.join(path_to_data,'chunks')
parquet_files = sorted([os.path.join(path_to_parquets,'IDF-departments-to-analyze'),
                        os.path.join(path_to_parquets, 'IDF-updates','**')])
#parquet_files = [os.path.join(path_to_parquets, 'IDF-departments-to-analyze','**')]

In [7]:
print('List files to be processed...')

fs=spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
list_status=fs.listStatus(spark._jvm.org.apache.hadoop.fs.Path(path_to_parquets))

paths=[file.getPath().toString() for file in list_status]
np.random.seed(0)
paths=np.random.permutation(sorted(parquet_files))

print('# Files:', len(paths))

List files to be processed...
# Files: 2


In [9]:
tweets=spark.read.option("encoding", "UTF-8").parquet(*parquet_files)
tweets=tweets.drop_duplicates()

In [10]:
print("Number of tweets : %d" % tweets.count())
print("Number of unique users : %d" % tweets.select('user_id').distinct().count())

Number of tweets : 42338679
Number of unique users : 30651


# 1. Data cleaning

In [11]:
# UNIDECODE : remove accents
def make_trans():
    matching_string = ""
    replace_string = ""

    for i in range(ord(" "), sys.maxunicode):
        name = unicodedata.name(chr(i), "")
        if "WITH" in name:
            try:
                base = unicodedata.lookup(name.split(" WITH")[0])
                matching_string += chr(i)
                replace_string += base
            except KeyError:
                pass

    return matching_string, replace_string

def clean_text(c):
    matching_string, replace_string = make_trans()
    return F.translate(
        F.regexp_replace(c, "\p{M}", ""), 
        matching_string, replace_string
    ).alias(c)

In [12]:
# Preprocess tweets
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [13]:
def clean_dataset(df):
    
    df=df.select(
            'id_str',
            'user_id',
            F.date_format(F.col('created_at'),"yyyy-MM-dd").alias('day').cast("date"),
            'full_text',
            'lang'
            )
    
    df = df.repartition(160)
    
    df = df.dropDuplicates(subset=['id_str'])
    
    # only after december 2019
    df = df.filter(df.day>'2019-12-01')
    
    # remove rt
    df = df.filter(~ df.full_text.startswith('RT'))
    
    # anonymization: identify userid, url, etc.
#     pre_process_udf = F.udf(lambda x:text_processor.pre_process_doc(x))
#     df=df.withColumn('clean_text', pre_process_udf('full_text'))
    df = df.withColumn('clean_text', F.regexp_replace('full_text', r'@[A-Za-z0-9-_]+','@mention'))
    df = df.withColumn('clean_text', F.regexp_replace('clean_text', 'https?://[A-Za-z0-9./]+','[url]'))

    df=df.withColumn('anonymized_text', F.col('clean_text'))
    
    # Cleaning: tolower, remove punctuation
    df=df.withColumn('clean_text', F.lower(F.col('clean_text')))
    df=df.withColumn('clean_text', F.ltrim(df.clean_text))
 
    # language : french
    df = df.filter(df.lang=='fr')
    
    return df

In [14]:
tweets = clean_dataset(tweets)
#tweets = tweets.select('user_id','id_str','day', clean_text('full_text'),'lang', 'department')
tweets = tweets.select('user_id','id_str','day','anonymized_text', clean_text('clean_text'),'lang')
tweets = tweets.withColumn('clean_text', F.regexp_replace('clean_text', '[^\sa-zA-Z0-9@]', ' '))

# 2. Symptoms analysis

## 2.1. Contains keyword

In [15]:
tweets = tweets.withColumn('covid', F.col('clean_text').rlike('covid|corona |coronavirus'))
tweets = tweets.withColumn('confinement', F.col('clean_text').rlike('confin|quarantaine'))
tweets = tweets.withColumn('restezchezvous', F.col('clean_text').rlike('je reste chez moi|jerestechezmoi|restezchezvous|restez chez vous'))

In [16]:
print("Number of tweets mentioning COVID : %d" % tweets.filter(tweets.covid==1).count())
print("Number of tweets mentioning lockdown/quarantine : %d" % tweets.filter(tweets.confinement==1).count())

Number of tweets mentioning COVID : 178673
Number of tweets mentioning lockdown/quarantine : 171725


In [17]:
symptoms_dict_fr = {'cough' : ['toux', 'tousse'],
                   'sore_throat' : ['maux de gorge', 'mal de gorge', 'mal a la gorge'],
                   'fever' : ['fievre', 'de la temperature'],
                    #'mal de tête' : ['mal de tête','mal de crâne','mal à la tête','mal de tete','mal de crane','mal à la tete'],
                   'loss_taste' : ['perte du gout', "perte de lodorat","perte de l odorat","perdu l odorat","perdu lodorat",
                                   "perdu le gout","plus de gout","plus dodeur","plus d odeur"],
                   'skin_symptom' : ['engelures'],
                   'symptoms' : ['symptom'],
                   'breathing_difficulties' : ['difficultes a respirer', 'difficultes respiratoires', 'difficulte a respirer',
                                               'mal a respirer']
                   #'hospitalisation' : ['hôpital','hopital','hospital','réanim','reanim']
                   }

for symptom in symptoms_dict_fr.keys():
    tweets = tweets.withColumn(symptom, F.col('clean_text').rlike(('|').join(['^' + x for x in symptoms_dict_fr.get(symptom)] + 
                                                                            [' ' + x for x in symptoms_dict_fr.get(symptom)])))
tweets = tweets.withColumn('nb_symptoms', sum(tweets[c].cast('long') for c in list(symptoms_dict_fr.keys())))
tweets = tweets.withColumn('contains_symptom', F.col('nb_symptoms')>=1)

In [19]:
print("Number of tweets mentionning symptoms : %d" % tweets.filter(tweets.contains_symptom==1).count())

Number of tweets mentionning symptoms : 9326


## 2.2. Contains symptoms + pronums

In [20]:
# NEW FILTERS : if contains a pronum (marker of lived experience), or tweet startswith a symptom, 
# AND tweet does not contain hashtag

# Indicators of feeling
pronouns = ['g','j a','j ai','jai','m a','m ai','je','me', 'mes', 'l a', 'l ai','mon','ma','son','sa','jsui','j sui','j suis','jtousse']

tweets = tweets.withColumn('pronoun', 
                           (F.col('clean_text').rlike(' |'.join(['^'+s for s in pronouns]))) |
                          (F.col('clean_text').rlike((' |').join([' '+s for s in pronouns]))))

# Startswith symptom
tweets = tweets.withColumn('clean_text2', F.regexp_replace('clean_text','@mention', ''))
tweets = tweets.withColumn('clean_text2', F.ltrim(tweets.clean_text2)) \
                .withColumn('start_symptom', (F.col('clean_text2').rlike('|'.join(['^'+s for s in list(symptoms_dict_fr.keys())]))))

# Hashtags
tweets = tweets.withColumn('hashtag', F.col('clean_text').contains('#'))

# Classif
tweets = tweets.withColumn('has_symptom', ((F.col('pronoun')==1) | (F.col('start_symptom')==1)) & 
                           (F.col('hashtag')==0) & (F.col('contains_symptom')==1))

In [21]:
print("Number of tweets of people having symptoms (symptom+pronoun) : %d" % tweets.filter(tweets.has_symptom==1).count())

Number of tweets of people having symptoms (symptom+pronoun) : 4190


## 2.4. Groupby day

In [22]:
tweets_mention_symptoms = tweets.filter(tweets['contains_symptom']==1)
tweets_has_symptoms = tweets.filter(tweets['has_symptom']==1)
tweets_covid_related = tweets.filter((tweets.covid==1)|(tweets.confinement==1)|(tweets.restezchezvous==1))

In [24]:
cols_covid=['covid','confinement','RestezChezVous']

tweets_mention_symptoms = tweets_mention_symptoms.select(['day'] +
                                                         [F.col(c).cast('long') for c in list(symptoms_dict_fr.keys())] +
                                                         [F.col('contains_symptom').cast('long')])\
                                                .groupby('day').sum()\
                                                .orderBy('day')\
                                                .toDF(*['day']+list(symptoms_dict_fr.keys())+['has_symptom'])

tweets_has_symptoms = tweets_has_symptoms.select(['day'] +
                                                         [F.col(c).cast('long') for c in list(symptoms_dict_fr.keys())] +
                                                         [F.col('has_symptom').cast('long')])\
                                                .groupby('day').sum()\
                                                .orderBy('day')\
                                                .toDF(*['day']+list(symptoms_dict_fr.keys())+['has_symptom'])

tweets_covid_related = tweets_covid_related.select(['day'] +
                                                  [F.col(c).cast('long') for c in cols_covid+['contains_symptom']]) \
                                                   .groupby('day').sum()\
                                                   .orderBy('day')\
                                                   .toDF(*['day']+cols_covid+['has_symptom'])

list_covid_symptoms = tweets.filter(tweets['contains_symptom']==1)\
                            .select(['id_str','day','anonymized_text'])

In [26]:
# To Pandas
start=time()
print('Converting to Pandas: number of tweets mentioning symptoms...')
tweets_mention_symptoms = tweets_mention_symptoms.withColumn('day', F.to_timestamp(tweets_mention_symptoms.day, 'yyy-MM-dd')).toPandas()
print('Done in : ' + str(round(time()-start)) + 'sec')

start=time()
print('Converting to Pandas: number of tweets mentioning symptoms+pronouns...')
tweets_has_symptoms = tweets_has_symptoms.withColumn('day', F.to_timestamp(tweets_has_symptoms.day, 'yyy-MM-dd')).toPandas()
print('Done in :' + str(round(time()-start)) + 'sec')

start=time()
print('Converting covid related terms df to Pandas...')
tweets_covid_related = tweets_covid_related.withColumn('day', F.to_timestamp(tweets_covid_related.day, 'yyy-MM-dd')).toPandas()
print('Done in :' + str(round(time()-start)) + 'sec')

start=time()
print('Converting tweets mentioning symptoms to Pandas...')
list_covid_symptoms = list_covid_symptoms.withColumn('day', F.to_timestamp(list_covid_symptoms.day, 'yyy-MM-dd')).toPandas()
print('Done in: ' + str(round(time()-start)) + 'sec')

Converting to Pandas: number of tweets mentioning symptoms...
Done in : 314sec
Converting to Pandas: number of tweets mentioning symptoms+pronouns...
Done in :284sec
Converting covid related terms df to Pandas...
Done in :199sec
Converting tweets mentioning symptoms to Pandas...
Done in: 237sec


# 3. Cleaning data for dashboard

In [30]:
import requests

emergencies_data_path = os.path.join(path_to_external_data, "sante-publique-france", 'emergencies_dataset.csv')
source='https://www.data.gouv.fr/fr/datasets/r/eceb9fb4-3ebc-4da3-828d-f5939712600a'
req = requests.get(source)
url_content = req.content
csv_file = open(emergencies_data_path, 'wb')
csv_file.write(url_content)
csv_file.close()
print('Downloading emergency file')

open_covid_path = os.path.join(path_to_external_data, "sante-publique-france", 'open_covid.csv')
source = 'https://www.data.gouv.fr/en/datasets/r/0b66ca39-1623-4d9c-83ad-5434b7f9e2a4'
req = requests.get(source)
url_content = req.content
csv_file = open(open_covid_path, 'wb')
csv_file.write(url_content)
csv_file.close()
print('Downloading deaths file')

Downloading emergency file
Downloading deaths file


In [31]:
# Loading data from Santé Publique France
emergencies = pd.read_csv(emergencies_data_path, sep=";")
emergencies['date_de_passage'] = pd.to_datetime(emergencies['date_de_passage'])
emergencies['dep']=emergencies['dep'].astype(str)

# Keep only Île-De-France
emergencies = emergencies.loc[emergencies['dep'].isin(['75','77','78','91','93','94','95'])].groupby(['date_de_passage']).agg('sum').reset_index()

In [32]:
# Deaths file
open_covid = pd.read_csv(os.path.join(open_covid_path), sep=',')
open_covid = open_covid[open_covid['maille_nom']=='Île-de-France']
open_covid = open_covid.groupby('date').agg('mean').reset_index()

# deaths in cumulative : change to frequency
z = np.array(open_covid['deces'])
z[1:] -= z[:-1].copy()
open_covid['deaths_freq'] = z

In [33]:
# rolling mean
def rolling_mean(ts, window):
    return ts.rolling(window=window).mean()

def rolling_mean_df(df) :
    df['has_symptom_mean_week'] = rolling_mean(df['has_symptom'], window=7)
    df['has_symptom_mean_3'] = rolling_mean(df['has_symptom'], window=3)
    return df

for df in [tweets_mention_symptoms, tweets_has_symptoms]:
    df=rolling_mean_df(df)
    
    
emergencies_dict = {'nbre_pass_corona' : 'Nb passages emergencies',
                   'nbre_hospit_corona' : 'Nb hospitalizations',
                   'nbre_acte_corona' : 'Nb medical acts'}
for type_urgence in list(emergencies_dict.keys()):
    emergencies[type_urgence+'_mean_week'] = rolling_mean(emergencies[type_urgence], window=7)
    emergencies[type_urgence+'_mean_3'] = rolling_mean(emergencies[type_urgence], window=3)
    
open_covid['deaths_3'] = rolling_mean(open_covid['deaths_freq'], window=3)
open_covid['deaths_week'] = rolling_mean(open_covid['deaths_freq'], window=7)

In [34]:
tweets_mention_symptoms.to_csv(os.path.join(path_to_data,'visualisation_data','tweets_mention_symptoms.csv'), sep=';')
tweets_has_symptoms.to_csv(os.path.join(path_to_data,'visualisation_data','tweets_has_symptoms.csv'), sep=';')
emergencies.to_csv(os.path.join(path_to_data,'visualisation_data','emergencies.csv'), sep=';')
open_covid.to_csv(os.path.join(path_to_data,'visualisation_data','open_covid.csv'), sep=';')
list_covid_symptoms.to_csv(os.path.join(path_to_data,'visualisation_data','list_covid_symptoms.csv'),sep=';')