In [3]:
from extractors import gov_extract, local_extract, web_scrapping, twitter_extract
import extractors.config as config
import pandas as pd
import numpy as np
from functools import reduce

# Extract

The extraction process is delegated to the components in the `extractors` folder, which use the config files in the `configs` folder to extract data from APIs, local files and web scraping. This keeps the code modular and makes it easier to scale.

The complete process consists of using `gov_extract`, `local_extract`, `web_scrapping` and `twitter_extract` to get data from the different data sources specified in the [README](../README.md), and finally loading the raw data into memory for later use in the transform phase.

In [4]:
# Load the raw inputs through the project extractors. Each load() call returns
# a pair that is unpacked into two names here.
# NOTE(review): presumably gov_extract reads the government sources and
# local_extract the local files listed in the README - confirm in extractors/.
# NOTE(review): the saved output of this cell is one timestamp per day, which
# looks like leftover progress/debug printing inside an extractor - confirm.
daily_cases, daily_test = gov_extract.load()
employe, inflation = local_extract.load()

2020-01-01 00:00:00
2020-01-02 00:00:00
2020-01-03 00:00:00
2020-01-04 00:00:00
2020-01-05 00:00:00
2020-01-06 00:00:00
2020-01-07 00:00:00
2020-01-08 00:00:00
2020-01-09 00:00:00
2020-01-10 00:00:00
2020-01-11 00:00:00
2020-01-12 00:00:00
2020-01-13 00:00:00
2020-01-14 00:00:00
2020-01-15 00:00:00
2020-01-16 00:00:00
2020-01-17 00:00:00
2020-01-18 00:00:00
2020-01-19 00:00:00
2020-01-20 00:00:00
2020-01-21 00:00:00
2020-01-22 00:00:00
2020-01-23 00:00:00
2020-01-24 00:00:00
2020-01-25 00:00:00
2020-01-26 00:00:00
2020-01-27 00:00:00
2020-01-28 00:00:00
2020-01-29 00:00:00
2020-01-30 00:00:00
2020-01-31 00:00:00
2020-02-01 00:00:00
2020-02-02 00:00:00
2020-02-03 00:00:00
2020-02-04 00:00:00
2020-02-05 00:00:00
2020-02-06 00:00:00
2020-02-07 00:00:00
2020-02-08 00:00:00
2020-02-09 00:00:00
2020-02-10 00:00:00
2020-02-11 00:00:00
2020-02-12 00:00:00
2020-02-13 00:00:00
2020-02-14 00:00:00
2020-02-15 00:00:00
2020-02-16 00:00:00
2020-02-17 00:00:00
2020-02-18 00:00:00
2020-02-19 00:00:00


In [5]:
# Scrape the news articles through the project extractor.
# NOTE(review): the saved output records a 404 for one search page, so the
# result may be missing pages - confirm how web_scrapping.load() handles
# failed requests.
news = web_scrapping.load()



404 Client Error: Not Found for url: https://www.eltiempo.com/buscar/88?q=covid&category=salud&publishedAt%5Bfrom%5D=20-02-01&publishedAt%5Buntil%5D=20-12-01&contentTypes%5B0%5D=article


In [6]:
# Fetch the tweets through the project extractor.
# NOTE(review): the saved output is INFO logging from searchtweets (bearer
# token auth, paging); presumably credentials come from the extractor
# config - confirm none are hardcoded or echoed.
tweets = twitter_extract.load()

INFO:searchtweets.result_stream:using bearer token for authentication
INFO:searchtweets.result_stream:paging; total requests read so far: 1
INFO:searchtweets.result_stream:paging; total requests read so far: 2
INFO:searchtweets.result_stream:paging; total requests read so far: 3
INFO:searchtweets.result_stream:paging; total requests read so far: 4
INFO:searchtweets.result_stream:paging; total requests read so far: 5
INFO:searchtweets.result_stream:ending stream at 543 tweets
INFO:searchtweets.result_stream:using bearer token for authentication
INFO:searchtweets.result_stream:paging; total requests read so far: 1
INFO:searchtweets.result_stream:paging; total requests read so far: 2
INFO:searchtweets.result_stream:paging; total requests read so far: 3
INFO:searchtweets.result_stream:paging; total requests read so far: 4
INFO:searchtweets.result_stream:ending stream at 474 tweets
INFO:searchtweets.result_stream:using bearer token for authentication
INFO:searchtweets.result_stream:paging; 

In [7]:
# Flatten the scraped result into one list of articles.
# A comprehension is used instead of reduce(lambda x, y: x + y, ...) because
# reduce without an initial value raises TypeError on an empty result, and
# repeated concatenation re-copies the accumulated list at every step.
# NOTE(review): assumes `news` is a list of per-page lists of article dicts -
# confirm against web_scrapping.load().
# NOTE: this cell rebinds `news`, so run it exactly once after the extraction
# cell; re-running it on already-flattened data would flatten it again.
news = [article for page in news for article in page]

# Transform

In [8]:
import itertools

# Column labels for the employment table: a leading 'Concepto' label followed
# by one 'day/month/year' label (day fixed to 1) for every month of every year
# from 2001 through 2020, ordered year by year.
year_list = list(range(2001, 2021))
month_list = list(range(1, 13))
column_names = ['Concepto'] + [
    f'1/{month}/{year}'
    for year, month in itertools.product(year_list, month_list)
]
# Discard the final two labels (November and December 2020).
column_names = column_names[:-2]

In [9]:
# Map the raw column labels, in order, onto 'Concepto' plus the generated
# monthly date labels. dropna() is called without arguments, so it only
# removes rows and the column labels can be read straight from the raw frame.
column_dict = dict(zip(employe.columns, column_names))

employe_processed = (
    employe
    .dropna()
    .rename(columns=column_dict)
    # Flip the table so that every month becomes a row.
    .transpose()
    # After the transpose 'Concepto' is a row of labels, not a data row.
    .drop(['Concepto'])
    # Name the indicator columns that are kept, keyed by their original row label.
    .rename(
        columns={
            12: "TGP",
            13: "TO",
            14: "TD",
            16: "Ocupados",
            17: "Desocupados",
            18: "Inactivos",
        }
    )
    # Keep the last 8 months, to line up with the daily positive cases and tests.
    .iloc[-8:]
    # Move the month labels out of the index into a 'fecha' column.
    .reset_index()
    .rename(columns={'index': 'fecha'})
)

employe_processed

Unnamed: 0,fecha,TGP,TO,TD,Ocupados,Desocupados,Inactivos
0,1/3/2020,60.1793,52.7533,12.3397,20859.9,2936.39,15746.1
1,1/4/2020,51.4751,41.2136,19.9349,16374.0,4076.86,19278.8
2,1/5/2020,55.3874,43.6973,21.1061,17375.4,4648.35,17739.3
3,1/6/2020,57.2966,45.4439,20.6866,18214.6,4750.77,17116.2
4,1/7/2020,57.2245,45.9524,19.698,18191.5,4462.35,16933.9
5,1/8/2020,59.2239,49.1646,16.9851,19608.7,4012.0,16263.0
6,1/9/2020,60.2381,50.5068,16.1547,20172.7,3886.71,15881.1
7,1/10/2020,60.9711,51.4164,15.6708,20626.3,3832.98,15656.9


In [15]:
# Slice the data rows out of the raw sheet, discard the unused trailing column
# and give the remaining columns readable names.
inflation_processed = (
    inflation[7:341]
    .drop(columns=['Unnamed: 5'])
    .rename(
        columns={
            "Meta de inflación e inflación total al consumidor": "fecha",
            "Unnamed: 1": "Inflación total",
            "Unnamed: 2": "Límite superior",
            "Unnamed: 3": "Meta de inflación",
            "Unnamed: 4": "Límite inferior",
        }
    )
    # Keep 8 months, to line up with the daily positive cases and tests. The
    # saved output shows the rows arrive newest first, so these are the first 8.
    .iloc[:8]
    .reset_index(drop=True)
)

# Rewrite the period (first four characters = year, next two = month) as
# '1/<month>/<year>' so it matches the date labels of the other tables.
inflation_processed['fecha'] = inflation_processed['fecha'].apply(
    lambda period: f'1/{int(str(period)[4:6])}/{str(period)[0:4]}'
)

# Reverse the rows so they run from oldest to newest.
inflation_processed = inflation_processed.iloc[::-1]
inflation_processed

Unnamed: 0,fecha,Inflación total,Límite superior,Meta de inflación,Límite inferior
7,1/3/2020,3.86,4,3,2
6,1/4/2020,3.51,4,3,2
5,1/5/2020,2.85,4,3,2
4,1/6/2020,2.19,4,3,2
3,1/7/2020,1.97,4,3,2
2,1/8/2020,1.88,4,3,2
1,1/9/2020,1.97,4,3,2
0,1/10/2020,1.75,4,3,2


In [11]:
def _to_day_month_year(raw_date):
    """Turn a 'YYYY-MM-DD...' string into 'D/M/YYYY' without zero padding.

    Only the first two characters of the day part are used, so any suffix
    that follows the day is ignored.
    """
    parts = raw_date.split("-")
    return '{}/{}/{}'.format(int(parts[2][:2]), int(parts[1]), parts[0])


# Clean the wide table: drop the row labelled 0 and the cumulative/aggregate
# columns, normalise the date format and treat missing counts as 0.
# NOTE(review): presumably row 0 is a non-data record - confirm against the source.
daily_test_processed = (
    daily_test
    .drop([0], axis=0)
    .drop(['acumuladas', 'positivas_acumuladas', 'negativas_acumuladas',
           'positividad_acumulada', 'indeterminadas'], axis=1)
)
daily_test_processed['fecha'] = daily_test_processed['fecha'].apply(_to_day_month_year)
daily_test_processed = daily_test_processed.fillna(value=0)

# Every column after the first one ('fecha' in the saved output) is a location.
cities = list(daily_test_processed.columns)[1:]

# Reshape wide -> long, one (fecha, cantidad, procedenia) record per date and
# location, keeping the original order (all locations of a date, then the next
# date). The records go straight into the DataFrame: passing through a numpy
# array would coerce every value to str and fail on column slicing when there
# are no rows.
transposed_data = [
    (row['fecha'], row[city], city.upper())
    for _, row in daily_test_processed.iterrows()
    for city in cities
]

# 'procedenia' (sic) is kept as spelled because it is the column name the
# rest of the pipeline sees.
daily_test_transposed = pd.DataFrame(
    transposed_data, columns=['fecha', 'cantidad', 'procedenia']
).astype({'cantidad': 'float64'})
daily_test_transposed

Unnamed: 0,fecha,cantidad,procedenia
0,5/3/2020,0.0,AMAZONAS
1,5/3/2020,0.0,ANTIOQUIA
2,5/3/2020,0.0,ARAUCA
3,5/3/2020,0.0,ATLANTICO
4,5/3/2020,0.0,BOGOTA
...,...,...,...
10331,1/12/2020,12007.0,PROCEDENCIA_DESCONOCIDA
10332,1/12/2020,0.0,BARRANQUILA
10333,1/12/2020,177291.0,CARTAGENA
10334,1/12/2020,77010.0,SANTA_MARTA


In [12]:
# Work on a fresh frame: reset_index() returns a new object whose former index
# lands in an 'index' column, which is dropped together with the columns that
# are not needed downstream.
daily_cases_processed = daily_cases.reset_index().drop(
    columns=[
        'index', 'fecha_de_notificaci_n', 'id_de_caso', 'departamento',
        'pais_viajo_1_cod', 'ciudad_municipio', 'unidad_medida',
        'fecha_reporte_web', 'per_etn_',
    ]
)

column_date_list = ['fecha_diagnostico', 'fecha_inicio_sintomas', 'fecha_recuperado', 'fecha_muerte']

# Keep only the text before the first space of each date string; values that
# are not plain strings (e.g. missing values) pass through unchanged.
for column in column_date_list:
    daily_cases_processed[column] = daily_cases_processed[column].apply(
        lambda value: value.split(' ')[0] if type(value) is str else value
    )

daily_cases_processed

Unnamed: 0,departamento_nom,ciudad_municipio_nom,edad,sexo,fuente_tipo_contagio,ubicacion,estado,pais_viajo_1_nom,recuperado,fecha_inicio_sintomas,fecha_muerte,fecha_diagnostico,fecha_recuperado,tipo_recuperacion,nom_grupo_
0,BOGOTA,BOGOTA,19,F,Importado,Casa,Leve,ITALIA,Recuperado,27/2/2020,,6/3/2020,13/3/2020,PCR,
1,VALLE,BUGA,34,M,Importado,Casa,Leve,ESPAÑA,Recuperado,4/3/2020,,9/3/2020,19/3/2020,PCR,
2,ANTIOQUIA,MEDELLIN,50,F,Importado,Casa,Leve,ESPAÑA,Recuperado,29/2/2020,,9/3/2020,15/3/2020,PCR,
3,ANTIOQUIA,MEDELLIN,55,M,Relacionado,Casa,Leve,,Recuperado,6/3/2020,,11/3/2020,26/3/2020,PCR,
4,ANTIOQUIA,MEDELLIN,25,M,Relacionado,Casa,Leve,,Recuperado,8/3/2020,,11/3/2020,23/3/2020,PCR,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1316714,ANTIOQUIA,MEDELLIN,36,M,En estudio,Casa,Leve,,Activo,21/11/2020,,28/11/2020,,,
1316715,ANTIOQUIA,RIONEGRO,11,M,En estudio,Casa,Leve,,Activo,22/11/2020,,29/11/2020,,,
1316716,ANTIOQUIA,RIONEGRO,41,F,En estudio,Casa,Leve,,Activo,24/11/2020,,29/11/2020,,,
1316717,ANTIOQUIA,RIONEGRO,39,F,En estudio,Casa,Leve,,Activo,23/11/2020,,29/11/2020,,,


In [13]:
def _news_row(article):
    """Flatten one scraped article dict into [titulo, fecha, categoria, resumen].

    The date is formatted as 'D/M/YYYY' without zero padding from the
    day/month/year attributes of article['publised_time']. This avoids the
    '%-d' / '%-m' strftime flags, which are a platform extension and are not
    accepted on every platform (e.g. Windows).
    Newlines are stripped from the title and the summary.
    """
    published = article['publised_time']
    fecha = '{}/{}/{}'.format(published.day, published.month, published.year)
    return [
        article['title'].replace('\n', ''),
        fecha,
        article['category'],
        article['resume'].replace('\n', ''),
    ]


news_array = [_news_row(article) for article in news]

# Build the table, drop articles repeated with the same title and summary,
# reverse the row order and renumber the rows from 0.
news_df = (
    pd.DataFrame(data=news_array, columns=['titulo', 'fecha', 'categoria', 'resumen'])
    .drop_duplicates(subset=["titulo", "resumen"])
    .iloc[::-1]
    .reset_index(drop=True)
)
news_df

Unnamed: 0,titulo,fecha,categoria,resumen
0,Coronavirus tiene nombre oficial: OMS lo bauti...,11/2/2020,Salud,Nace del acrónimo en inglés a partir de la exp...
1,"El Covid-19, una amenaza mundial que deja más ...",11/2/2020,Salud,Según advirtió la Organización Mundial de la S...
2,Así sería la cuarentena en el país para los co...,18/2/2020,Salud,Los 14 connacionales que serán evacuados de Ch...
3,No se ha confirmado ningún caso de coronavirus...,19/2/2020,Salud,Directivas del centro hospitalario indicaron q...
4,Así será la evacuación y llegada de los colomb...,21/2/2020,Salud,Ministerio de Salud sostiene que el viaje part...
...,...,...,...,...
863,Europa decide sobre primera vacuna covid-19 an...,1/12/2020,Salud,Agencia Europea de Medicamentos (EMA) puso pla...
864,Colombia registra 182 muertes más y 8.430 nuev...,1/12/2020,Salud,El Ministerio de Salud reportó también 6.037 r...
865,Colombia registra 168 muertes más y 7.986 nuev...,1/12/2020,Salud,El Ministerio de Salud reportó también 7.158 r...
866,Vacuna a toda costa: reflexiones tras días de ...,1/12/2020,Salud,"Análisis de Alejandro Gaviria, Tatiana Andia, ..."


In [14]:
# One [id, text, fecha] row per extracted item; items without an 'id' key are
# replaced by a row of 'nan' placeholder strings.
new_tweets = [
    [tweet['id'], tweet['text'], tweet['fecha']] if 'id' in tweet
    else ['nan', 'nan', 'nan']
    for tweet in tweets
]

# Build the table, keep the first occurrence of each distinct text, reverse
# the row order and renumber the rows from 0.
tweets_df = (
    pd.DataFrame(new_tweets, columns=['id', 'text', 'fecha'])
    .drop_duplicates(subset=['text'])
    .iloc[::-1]
    .reset_index(drop=True)
)
tweets_df

Unnamed: 0,id,text,fecha
0,1332112416973459457,@DarthVaderBaq @joel_andrus @MinSaludCol @Ivan...,2020-11-27
1,1332112888421625858,#Colombia | Juzgado ordena a Minsalud exigir p...,2020-11-27
2,1332113472851677184,#26Nov #Coronavirus #Colombia \nLas autoridad...,2020-11-27
3,1332114509356560385,Interesante análisis de los datos #Covid19 en ...,2020-11-27
4,1332117366860345345,"En el informe de hoy, #Bogota aumentó sus caso...",2020-11-27
...,...,...,...
956,1333921344996708352,"Somos tan vulnerables, que Venezuela es uno de...",2020-12-01
957,1333921455814438913,RT @NoticiasONU: La pandemia de #COVID19 ha di...,2020-12-01
958,1333923117387943941,"RT @CoronavirusNewv: 🇨🇴 | COLOMBIA \n\n7,986 n...",2020-12-01
959,1333923715466326022,#Entérate || Demandan a ministro de Salud de #...,2020-12-01
