This notebook generates a csv file that contains tweets about COVID-19, plus other important information such as the number of retweets, likes and replies, and the datetime of each tweet.

In [2]:
import pickle
import pandas as pd
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import os

## Do not truncate long cell contents (e.g. tweet text / raw HTML) when displaying dataframes
pd.set_option('display.max_colwidth', None)


## Load data 

In [None]:
## Number of pickle files (they are named 0.pickle, 1.pickle, ..., {n_pickles-1}.pickle)
n_pickles=27
## The path where the pickles are stored.
path='projects/filiere/data'

## Load every pickle into its own dataframe.
## NOTE(security): pickle.load can execute arbitrary code, only run this on trusted files.
frames=[]
for n_pickle in range(n_pickles):
    with open(os.path.join(path,'{}.pickle'.format(n_pickle)),'rb') as pkl :
        sample=pickle.load(pkl)
    frames.append(pd.DataFrame(sample))

## Concatenate once at the end: calling pd.concat inside the loop re-copies all the
## rows accumulated so far at every iteration (quadratic cost).
df_all=pd.concat(frames,ignore_index=True)

In [None]:
## Preview the first five rows of the combined dataframe
df_all.head(n=5)

## Get information

The idea here is to extract all the available information from the HTML of each tweet. Unfortunately, it does not contain the user's location. So,
to study the French population for instance, we will assume at this point that all tweets written in French concern French people.
Nevertheless, we provide another notebook that explores the Twitter API in order to get users' locations.


In [None]:
def _first_count(pattern, label):
    """Return the first integer captured by `pattern` in `label`, or 0 when the pattern does not match."""
    match = re.search(pattern, label)
    return int(match.group(1)) if match else 0


def get_infos(x):
    """
    Parse the HTML of one tweet with BeautifulSoup and extract its metadata.

    Parameters
    ----------
    x : str
        HTML markup of a single tweet.

    Returns
    -------
    tuple
        (date_time, replies, retweets, likes, is_share) where
        - date_time is the `datetime` attribute of the first <time> tag, or None
          when the markup has no <time> tag;
        - replies, retweets, likes are the integer counts read from the first
          div `aria-label` that mentions replies, Retweets or likes (a count
          missing from that label is 0); all three are None when no div has
          such a label;
        - is_share is True when at least one div `aria-label` contains 'share'.
    """
    soup = BeautifulSoup(x, 'html.parser')

    ## `find` returns None instead of raising when the tag is absent
    time_tag = soup.find('time')
    date_time = time_tag.get('datetime') if time_tag is not None else None

    ## Keep only the divs that actually carry an aria-label
    labels = [div.get('aria-label') for div in soup.find_all('div')]
    labels = [label for label in labels if label is not None]

    replies, retweets, likes = None, None, None
    for label in labels:
        ## 'repl' (not 'reply') so that a label such as "3 replies" is recognised too,
        ## consistently with the regular expression used to read the count
        if 'like' in label or 'repl' in label or 'Retweet' in label:
            replies = _first_count('([0-9]+) repl', label)
            retweets = _first_count('([0-9]+) Retweet', label)
            likes = _first_count('([0-9]+) like', label)
            ## Only the first matching label is used
            break

    ## Look at every label: the flag must not depend on which label comes last,
    ## and it must be defined even when there is no label at all
    is_share = any('share' in label for label in labels)
    return date_time, replies, retweets, likes, is_share

In [None]:
## Register `progress_apply` on pandas objects so the parsing shows a progress bar
tqdm.pandas()
## Parse the HTML of every tweet; each element is the tuple returned by get_infos
tabs_add=df_all['html'].progress_apply(get_infos)

## Turn the parsed tuples into a dataframe with one column per extracted field
df_new=pd.DataFrame(
    tabs_add.tolist(),
    columns=['datetime','replies','retweets','likes','is_share'],
)
## Append the new columns, then discard the raw 'html' and 'timestamp' columns
df_all=pd.concat([df_all,df_new],axis=1).drop(columns=['html','timestamp'])

## Display the first five rows of the result
df_all.head()

In [None]:
## Save the processed dataframe into a pickle
path_to_save='projects/filiere/saved_data/df_processed.pickle'
## Use a context manager so the file is flushed and closed even if the dump fails
with open(path_to_save,'wb') as pkl :
    pickle.dump(df_all,pkl)