In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the raw Zika extract from the working directory.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on columns.
df_sika_original = pd.read_csv('cdc_zika.csv', low_memory=False)

In [3]:
# Working copy of the raw data: the two period columns are dropped and every
# remaining column is cast to text (missing values become the string 'nan').
df = (
    df_sika_original
    .drop(columns=['time_period', 'time_period_type'])
    .astype('str')
)

# Country-code lookup table, indexed by its 'Alfa-2' column.
df_codes = pd.read_excel('countries_codes.xlsx').set_index(['Alfa-2'])

In [4]:
# Show the dtype of each column after the cast to 'str'.
df.dtypes

report_date        object
location           object
location_type      object
data_field         object
data_field_code    object
value              object
unit               object
dtype: object

In [5]:
# List the columns that remain in the working frame.
df.columns

Index(['report_date', 'location', 'location_type', 'data_field',
       'data_field_code', 'value', 'unit'],
      dtype='object')

In [6]:
# List the lookup table's columns ('Alfa-2' was moved into the index).
df_codes.columns

Index(['Nombre del país', 'ISO 3166-1', 'Alfa-3'], dtype='object')

In [7]:
# Discard rows whose report_date is the string 'nan' (a missing date after the
# cast to text) and renumber the remaining rows from 0.
df = (
    df
    .loc[df['report_date'].ne('nan')]
    .reset_index(drop=True)
)

In [8]:
# Unify date separators to '-' before parsing.
# regex=True is passed explicitly: '[_-]' is a character class, and since
# pandas 2.0 Series.str.replace treats the pattern as a literal string by
# default, which would leave underscore-separated dates unchanged.
df['report_date'] = df['report_date'].str.replace('[_-]', '-', regex=True)

# Parse the normalised strings into datetime64 values.
df['report_date'] = pd.to_datetime(df['report_date'])

In [9]:
# Provisional country name: the part of `location` before the first '-',
# with underscores turned into spaces.
df['country'] = (
    df['location']
    .str.split('-')
    .str[0]
    .apply(lambda head: head.replace('_', ' '))
)

In [10]:
def remove_under_line(x):
    """Return `x` with every underscore replaced by a single space.

    Parameters
    ----------
    x : str
        Text that may contain '_' characters.

    Returns
    -------
    str
        The same text with each '_' swapped for ' '.
    """
    return x.replace('_', ' ')

In [11]:
# Make `location` human-readable: hyphens first, then underscores, each
# become a single space.
df['location'] = df['location'].map(
    lambda loc: remove_under_line(loc.replace('-', ' '))
)

In [12]:
# Replace underscores in the field names with spaces.
df['data_field'] = df['data_field'].map(remove_under_line)

In [13]:
def elimina_asteriscos(x):
    """Return 0.0 for any value that contains an asterisk, else the value itself.

    Parameters
    ----------
    x : str
        Raw cell text from the `value` column.

    Returns
    -------
    float or str
        0.0 when `x` contains '*' anywhere; otherwise `x` unchanged.
    """
    # Membership test instead of `x.find('*') > 0`: find() returns 0 when the
    # asterisk is the first character, so the old comparison let values such
    # as '*5' through untouched.
    if '*' in x:
        return 0.0
    return x

In [14]:
# Zero out every value flagged with an asterisk; other values pass through.
df['value'] = df['value'].map(elimina_asteriscos)

In [15]:
# Convert `value` to a numeric dtype; entries that cannot be parsed become NaN
# because of errors='coerce'.
df['value']=pd.to_numeric(df['value'],errors='coerce')

In [16]:
def define_country(data_code):
    """Look up the country name for a data-field code.

    The first two characters of `data_code` are used as the key into the
    module-level `df_codes` table (indexed by 'Alfa-2').

    Parameters
    ----------
    data_code : str
        A data-field code whose first two characters identify the country.

    Returns
    -------
    The 'Nombre del país' entry of the matching `df_codes` row.

    Raises
    ------
    KeyError
        If the two-letter key is not present in the `df_codes` index.
    """
    prefix = data_code[:2]
    # The prefix 'HA' is looked up under 'HT'
    # (presumably Haiti's ISO code; verify against the data).
    lookup_key = 'HT' if prefix == 'HA' else prefix
    return df_codes.loc[lookup_key, 'Nombre del país']

In [17]:
# Overwrite `country` with the name resolved from each row's data_field_code.
df['country'] = df['data_field_code'].map(define_country)

In [21]:
# Write the cleaned frame to disk without the row index.
df.to_csv('zika_cleaded.csv',index=False)

In [22]:
# Show the column set of the cleaned frame, which now includes 'country'.
df.columns

Index(['report_date', 'location', 'location_type', 'data_field',
       'data_field_code', 'value', 'unit', 'country'],
      dtype='object')

In [56]:
# Keep only rows whose data_field does not match the regex 'not|no', then
# renumber the rows from 0.
# NOTE(review): 'no' matches as a substring anywhere in the field name, so
# this may drop more fields than only the negated ones. Confirm the intent.
df_filtered = (
    df
    .loc[~df['data_field'].str.contains('not|no')]
    .reset_index(drop=True)
)

In [76]:
# Keywords used to exclude data fields; they are OR-ed into a single regex,
# so each one matches as a substring (e.g. 'male' also matches 'female').
substrings = [
    'rash', 'arthritis', 'arthralgia', 'fever', 'conjunctivitis',
    'eyepain', 'headache', 'malaise', 'male', 'local', 'discarded',
    'microcephaly', 'suspected', 'study', 'gbs',
]

# Distinct data_field values that contain none of the keywords above.
(
    df_filtered
    .loc[
        ~df_filtered['data_field'].str.contains('|'.join(substrings)),
        'data_field',
    ]
    .unique()
)

array(['cumulative confirmed imported cases',
       'cumulative probable imported cases', 'zika reported',
       'zika confirmed laboratory', 'zika confirmed clinic',
       'total zika new confirmed pcr', 'zika new confirmed pcr f',
       'zika new confirmed pcr m', 'efe reported',
       'zika confirmed pcr cumulative',
       'zika confirmed pregnant cumulative',
       'total zika confirmed cumulative', 'total zika confirmed imported',
       'total zika confirmed pregnant',
       'total zika confirmed ages 0-11mo F',
       'total zika confirmed ages 0-11mo M',
       'total zika confirmed ages 1-4yrs F',
       'total zika confirmed ages 1-4yrs M',
       'total zika confirmed ages 5-9yrs F',
       'total zika confirmed ages 5-9yrs M',
       'total zika confirmed ages 10-14yrs F',
       'total zika confirmed ages 10-14yrs M',
       'total zika confirmed ages 15-19yrs F',
       'total zika confirmed ages 15-19yrs M',
       'total zika confirmed ages 20-49yrs F',
       '