## LOAD LIBRARIES


In [0]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd 
import multiprocessing
import random

from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder
import timeit
import operator
import itertools
from collections import Counter

## DATA FRAME

Firstly, a dataset will be created, containing the following variables: symptoms, direct contact with someone infected with Covid_19, exposure, protection, transport (public or private), area and interaction of the hands with the face.

In [0]:
data= pd.DataFrame()

Create variables: 'protección', 'transporte', 'contacto_directo'. Each of them will contain binary variables.

In [3]:
data['proteccion']= ['si', 'no']
data['transporte']= ['si', 'no']
data['contacto_directo'] = ['si', 'no']
data

Unnamed: 0,proteccion,transporte,contacto_directo
0,si,si,si
1,no,no,no


Create variable 'exposición'. This variable is related with the type of work condiction over confinement period. Its values are:


*   Alta: Telework.
*   Media: Working from the office.
*   Baja: Public attention.




In [0]:
data1=pd.DataFrame()

In [5]:
data1['exposicion']= ['alta', 'media', 'baja']
data1

Unnamed: 0,exposicion
0,alta
1,media
2,baja


Create variable 'síntoma', which referse to the type of symptom.

In [6]:
data2=pd.DataFrame()
data2['sintoma'] = ['fiebre', 'tos_seca', 'fatiga', 'flemas_esputos', 'dif_resp','dolor_garganta', 'dolor_cabeza', 'dolor_muscular',
                    'escalofrios', 'nausea_vomito', 'cong_nasal','diarrea', 'expect_sangre', 'conjuntivitis', 'dermatitis', 'falta_gusto_olfato']
data2

Unnamed: 0,sintoma
0,fiebre
1,tos_seca
2,fatiga
3,flemas_esputos
4,dif_resp
5,dolor_garganta
6,dolor_cabeza
7,dolor_muscular
8,escalofrios
9,nausea_vomito


data_final is created with the combination of the datasets created above.

In [7]:
lista = [data, data1, data2]
data_final = pd.concat(lista, axis=1)
data_final

Unnamed: 0,proteccion,transporte,contacto_directo,exposicion,sintoma
0,si,si,si,alta,fiebre
1,no,no,no,media,tos_seca
2,,,,baja,fatiga
3,,,,,flemas_esputos
4,,,,,dif_resp
5,,,,,dolor_garganta
6,,,,,dolor_cabeza
7,,,,,dolor_muscular
8,,,,,escalofrios
9,,,,,nausea_vomito


Here we clean de dataset removing the NaNs. Also, total possible combinations is computed.

In [8]:
proteccion = len(data_final.proteccion.dropna().unique())
transporte = len(data_final.transporte.dropna().unique())
contacto_directo = len(data_final.contacto_directo.dropna().unique())
exposicion= len(data_final.exposicion.dropna().unique())
sintoma= len(data_final.sintoma.dropna().unique())

print("Total Combination Possible: ",proteccion * transporte * contacto_directo * exposicion * sintoma)

Total Combination Possible:  384


We define a formula that calculates and returns the power set of the set c.

In [0]:
def potencia(c):
   
    if len(c) == 0:
        return [[]]
    r = potencia(c[:-1])
    return r + [s + [c[-1]] for s in r]

def imprime_ordenado(c):
    
    for e in sorted(c, key=lambda s: (len(s), s)):
        print(e)

All symptoms combinations are created.

In [10]:
comb_sintomas=potencia(['fiebre', 'tos_seca', 'fatiga', 'flemas_esputos', 'dif_resp','dolor_garganta', 'dolor_cabeza', 'dolor_muscular',
                    'escalofrios', 'nausea_vomito', 'cong_nasal','diarrea', 'expect_sangre', 'conjuntivitis', 'dermatitis', 'falta_gusto_olfato'])

comb_sintomas

[[],
 ['fiebre'],
 ['tos_seca'],
 ['fiebre', 'tos_seca'],
 ['fatiga'],
 ['fiebre', 'fatiga'],
 ['tos_seca', 'fatiga'],
 ['fiebre', 'tos_seca', 'fatiga'],
 ['flemas_esputos'],
 ['fiebre', 'flemas_esputos'],
 ['tos_seca', 'flemas_esputos'],
 ['fiebre', 'tos_seca', 'flemas_esputos'],
 ['fatiga', 'flemas_esputos'],
 ['fiebre', 'fatiga', 'flemas_esputos'],
 ['tos_seca', 'fatiga', 'flemas_esputos'],
 ['fiebre', 'tos_seca', 'fatiga', 'flemas_esputos'],
 ['dif_resp'],
 ['fiebre', 'dif_resp'],
 ['tos_seca', 'dif_resp'],
 ['fiebre', 'tos_seca', 'dif_resp'],
 ['fatiga', 'dif_resp'],
 ['fiebre', 'fatiga', 'dif_resp'],
 ['tos_seca', 'fatiga', 'dif_resp'],
 ['fiebre', 'tos_seca', 'fatiga', 'dif_resp'],
 ['flemas_esputos', 'dif_resp'],
 ['fiebre', 'flemas_esputos', 'dif_resp'],
 ['tos_seca', 'flemas_esputos', 'dif_resp'],
 ['fiebre', 'tos_seca', 'flemas_esputos', 'dif_resp'],
 ['fatiga', 'flemas_esputos', 'dif_resp'],
 ['fiebre', 'fatiga', 'flemas_esputos', 'dif_resp'],
 ['tos_seca', 'fatiga', 'flema

We are defining a new DataFrame with the values obtained in the last step. This list contains all possible combinations of 'síntoma'. The one with no value stands for no symptoms.

In [11]:
comb_sint=comb_sintomas
comb_sint=pd.DataFrame({'sintomas': comb_sintomas})
comb_sint

Unnamed: 0,sintomas
0,[]
1,[fiebre]
2,[tos_seca]
3,"[fiebre, tos_seca]"
4,[fatiga]
...,...
65531,"[fiebre, tos_seca, flemas_esputos, dif_resp, d..."
65532,"[fatiga, flemas_esputos, dif_resp, dolor_garga..."
65533,"[fiebre, fatiga, flemas_esputos, dif_resp, dol..."
65534,"[tos_seca, fatiga, flemas_esputos, dif_resp, d..."


We link the combined symptoms with the rest of variables.

In [12]:
final_data=pd.concat([data_final, comb_sint], axis=1, sort=False)
del final_data['sintoma']
final_data

Unnamed: 0,proteccion,transporte,contacto_directo,exposicion,sintomas
0,si,si,si,alta,[]
1,no,no,no,media,[fiebre]
2,,,,baja,[tos_seca]
3,,,,,"[fiebre, tos_seca]"
4,,,,,[fatiga]
...,...,...,...,...,...
65531,,,,,"[fiebre, tos_seca, flemas_esputos, dif_resp, d..."
65532,,,,,"[fatiga, flemas_esputos, dif_resp, dolor_garga..."
65533,,,,,"[fiebre, fatiga, flemas_esputos, dif_resp, dol..."
65534,,,,,"[tos_seca, fatiga, flemas_esputos, dif_resp, d..."


Variable "sintomas" is modified. This variable is adequated to the post application of dummies.

In [0]:
def try_join(l):
    try:
        return ','.join(map(str, l))
    except TypeError:
        return np.nan

final_data['Sintomas'] = [try_join(l) for l in final_data['sintomas']]
del final_data['sintomas']

Here we see the result after applying the formula.

In [14]:
final_data

Unnamed: 0,proteccion,transporte,contacto_directo,exposicion,Sintomas
0,si,si,si,alta,
1,no,no,no,media,fiebre
2,,,,baja,tos_seca
3,,,,,"fiebre,tos_seca"
4,,,,,fatiga
...,...,...,...,...,...
65531,,,,,"fiebre,tos_seca,flemas_esputos,dif_resp,dolor_..."
65532,,,,,"fatiga,flemas_esputos,dif_resp,dolor_garganta,..."
65533,,,,,"fiebre,fatiga,flemas_esputos,dif_resp,dolor_ga..."
65534,,,,,"tos_seca,fatiga,flemas_esputos,dif_resp,dolor_..."


NaNs are removed. Also, total possible combination is computed. As we can see, total combinations are 65.536x2x3x2x2=1.572.864


In [15]:
proteccion = len(final_data.proteccion.dropna().unique())
transporte = len(final_data.transporte.dropna().unique())
contacto_directo = len(final_data.contacto_directo.dropna().unique())
exposicion= len(final_data.exposicion.dropna().unique())
sintomas= len(final_data.Sintomas.dropna().unique())


print("Total Combination Possible: ",proteccion * transporte * contacto_directo * exposicion * sintomas)

Total Combination Possible:  1572864


Now, let's convert the series we created into list.

In [0]:
columns = [final_data.proteccion.dropna().unique().tolist(),
          final_data.transporte.dropna().unique().tolist(),
          final_data.contacto_directo.dropna().unique().tolist(),
          final_data.exposicion.dropna().unique().tolist(),
          final_data.Sintomas.dropna().unique().tolist()]

final_data2 = pd.DataFrame(list(itertools.product(*columns)), columns=final_data.columns)

As we can see, the dataset named 'final_data2' is created including all the possible combinations of the dataset's variables. Thus, the dataset 'final_data' has a total of possible combinations of 1.572.864, which is the number of rows obtained.

In [17]:
final_data2

Unnamed: 0,proteccion,transporte,contacto_directo,exposicion,Sintomas
0,si,si,si,alta,
1,si,si,si,alta,fiebre
2,si,si,si,alta,tos_seca
3,si,si,si,alta,"fiebre,tos_seca"
4,si,si,si,alta,fatiga
...,...,...,...,...,...
1572859,no,no,no,baja,"fiebre,tos_seca,flemas_esputos,dif_resp,dolor_..."
1572860,no,no,no,baja,"fatiga,flemas_esputos,dif_resp,dolor_garganta,..."
1572861,no,no,no,baja,"fiebre,fatiga,flemas_esputos,dif_resp,dolor_ga..."
1572862,no,no,no,baja,"tos_seca,fatiga,flemas_esputos,dif_resp,dolor_..."


We create a new column "zona". "zona" include the different Spanish regions. This new variable is added to the last dataset created, "final_data2".

In [18]:
zona= ['madrid', 'cataluña', 'castilla_la_mancha', 'castilla_y_leon', 'pais_vasco', 'andalucia', 'com_val', 'galicia', 'aragon', 'navarra', 'larioja',
                'extremadura', 'asturias', 'cantabria', 'canarias', 'baleares', 'murcia', 'ceuta', 'melilla']
lista=random.choices(zona, k=1572864)
final_data2['zona']=lista
final_data2

Unnamed: 0,proteccion,transporte,contacto_directo,exposicion,Sintomas,zona
0,si,si,si,alta,,canarias
1,si,si,si,alta,fiebre,andalucia
2,si,si,si,alta,tos_seca,castilla_y_leon
3,si,si,si,alta,"fiebre,tos_seca",navarra
4,si,si,si,alta,fatiga,madrid
...,...,...,...,...,...,...
1572859,no,no,no,baja,"fiebre,tos_seca,flemas_esputos,dif_resp,dolor_...",madrid
1572860,no,no,no,baja,"fatiga,flemas_esputos,dif_resp,dolor_garganta,...",castilla_la_mancha
1572861,no,no,no,baja,"fiebre,fatiga,flemas_esputos,dif_resp,dolor_ga...",cantabria
1572862,no,no,no,baja,"tos_seca,fatiga,flemas_esputos,dif_resp,dolor_...",galicia


Now our DataFrame has 6 columns and a total of 1.572.864 cases.

In [19]:
final_data2.shape

(1572864, 6)

Let's check the first 5 rows of the dataset "final_data2".

In [20]:
final_data2.head()

Unnamed: 0,proteccion,transporte,contacto_directo,exposicion,Sintomas,zona
0,si,si,si,alta,,canarias
1,si,si,si,alta,fiebre,andalucia
2,si,si,si,alta,tos_seca,castilla_y_leon
3,si,si,si,alta,"fiebre,tos_seca",navarra
4,si,si,si,alta,fatiga,madrid


"Sintomas" values are converted into dummies. As you can see, we cannot apply the direct dummies function because it is a list of several diseases. Thus, we use a counter.

In [21]:
sintom_list = final_data2['Sintomas'].str.split(',')

sintom_counter = Counter(([a for b in sintom_list.tolist() for a in b]))

for Sintomas in sintom_counter.keys():
    final_data2[Sintomas] = 0
    final_data2.loc[final_data2['Sintomas'].str.contains(Sintomas), Sintomas] = 1

final_data2.head()

Unnamed: 0,proteccion,transporte,contacto_directo,exposicion,Sintomas,zona,Unnamed: 7,fiebre,tos_seca,fatiga,flemas_esputos,dif_resp,dolor_garganta,dolor_cabeza,dolor_muscular,escalofrios,nausea_vomito,cong_nasal,diarrea,expect_sangre,conjuntivitis,dermatitis,falta_gusto_olfato
0,si,si,si,alta,,canarias,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,si,si,si,alta,fiebre,andalucia,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,si,si,si,alta,tos_seca,castilla_y_leon,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,si,si,si,alta,"fiebre,tos_seca",navarra,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,si,si,si,alta,fatiga,madrid,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


After applying the counter to create the dummie variables, we remove variables "Sintomas" and the default column "" which randomly appears with the creation of the dummies.

In [0]:
del final_data2['']
del final_data2['Sintomas']

Check the dataset after removing the unnecessary variables.

In [23]:
final_data2

Unnamed: 0,proteccion,transporte,contacto_directo,exposicion,zona,fiebre,tos_seca,fatiga,flemas_esputos,dif_resp,dolor_garganta,dolor_cabeza,dolor_muscular,escalofrios,nausea_vomito,cong_nasal,diarrea,expect_sangre,conjuntivitis,dermatitis,falta_gusto_olfato
0,si,si,si,alta,canarias,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,si,si,si,alta,andalucia,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,si,si,si,alta,castilla_y_leon,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,si,si,si,alta,navarra,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,si,si,si,alta,madrid,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1572859,no,no,no,baja,madrid,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1
1572860,no,no,no,baja,castilla_la_mancha,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1572861,no,no,no,baja,cantabria,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1572862,no,no,no,baja,galicia,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


We also convert "contacto directo", "exposicion", "proteccion", "transporte" and "zona" into dummies.

In [24]:
dummies = pd.get_dummies(final_data2)
final_data2 = dummies
final_data2.head()

Unnamed: 0,fiebre,tos_seca,fatiga,flemas_esputos,dif_resp,dolor_garganta,dolor_cabeza,dolor_muscular,escalofrios,nausea_vomito,cong_nasal,diarrea,expect_sangre,conjuntivitis,dermatitis,falta_gusto_olfato,proteccion_no,proteccion_si,transporte_no,transporte_si,contacto_directo_no,contacto_directo_si,exposicion_alta,exposicion_baja,exposicion_media,zona_andalucia,zona_aragon,zona_asturias,zona_baleares,zona_canarias,zona_cantabria,zona_castilla_la_mancha,zona_castilla_y_leon,zona_cataluña,zona_ceuta,zona_com_val,zona_extremadura,zona_galicia,zona_larioja,zona_madrid,zona_melilla,zona_murcia,zona_navarra,zona_pais_vasco
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [25]:
final_data2.shape

(1572864, 44)

Here we have all the variables in our Dataframe

In [26]:
final_data2.columns

Index(['fiebre', 'tos_seca', 'fatiga', 'flemas_esputos', 'dif_resp',
       'dolor_garganta', 'dolor_cabeza', 'dolor_muscular', 'escalofrios',
       'nausea_vomito', 'cong_nasal', 'diarrea', 'expect_sangre',
       'conjuntivitis', 'dermatitis', 'falta_gusto_olfato', 'proteccion_no',
       'proteccion_si', 'transporte_no', 'transporte_si',
       'contacto_directo_no', 'contacto_directo_si', 'exposicion_alta',
       'exposicion_baja', 'exposicion_media', 'zona_andalucia', 'zona_aragon',
       'zona_asturias', 'zona_baleares', 'zona_canarias', 'zona_cantabria',
       'zona_castilla_la_mancha', 'zona_castilla_y_leon', 'zona_cataluña',
       'zona_ceuta', 'zona_com_val', 'zona_extremadura', 'zona_galicia',
       'zona_larioja', 'zona_madrid', 'zona_melilla', 'zona_murcia',
       'zona_navarra', 'zona_pais_vasco'],
      dtype='object')

## APPLY PROBABILITY OF INITIAL CONTAGION FUCTION

Now let's give each of those variables a weight to compute the contagion.

Definition of variables:

*   a1 to a16: symptoms.
https://www.mscbs.gob.es/profesionales/saludPublica/ccayes/alertasActual/nCov-China/documentos/20200417_ITCoronavirus.pdf pag 19

*   b1, c2, e1, e2, e3: life style.
b= proteccion
c= transporte publico
e=exposicion

*   f1 to f19: region.
"https://github.com/datadista/datasets/blob/master/COVID%2019/ccaa_covid19_datos_isciii.csv"
Casos activos= PCR-Fallecidos- Recuperados
Ministerto de Sanidad

*   d2: contacto_directo_si.


We are not including in this formula b2 (proteccion_si), c1 (transporte_no), and d1 (contacto_directo_no) because these are binary variables. Thus, we include the contrary variables.



In [0]:
def infection (a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16,b1,b2,c1,c2,d1,d2,e1,e2,e3,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19): 

    # SYMPTOMATOLOGY = symptom n * weight n
    síntomas = [a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16]
    peso_por_síntoma = [27.95,21.53,12.11,10.62,5.91,4.42,4.32,4.71,3.62,1.59,1.53,1.18,0.29,0.22,0,0]
    peso_total_síntomas = list(map(operator.mul, síntomas, peso_por_síntoma))
    sintomatologia=sum(peso_total_síntomas)

    # LIFE STYLE = Protección no + Transporte sí + exposición baja + exposición media + exposición alta
    variables_estilo_de_vida=[b1,c2,e1,e2,e3]  
    peso_estilo_de_vida=[15,15,17.5,35,70]
    peso_total_estilo_de_vida = list(map(operator.mul, variables_estilo_de_vida, peso_estilo_de_vida))
    estilo_de_vida=sum(peso_total_estilo_de_vida)

    # RISK ZONE = region n * weight n
    CCAA=[f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19]    
    peso_por_CCAA = [26.71,11.60,6.78,2.16,3.70,3.28,47.71,47.30,100,0.06,15.52,4.75,13.75,10.71,81.95,0.11,2.22,14.10,12.96]
    peso_total_CCAA = list(map(operator.mul, CCAA, peso_por_CCAA))
    riesgo_de_zona=sum(peso_total_CCAA)

    # TOTAL CONTACT = Contacto_directo_si * 100 
    contacto_total=d2*100


    # In case of having symptoms
    if (sintomatologia >=30):         
      tasa_contagio=contacto_total*0.25+sintomatologia*0.6+estilo_de_vida*0.1+riesgo_de_zona*0.05  

    # In case of not having symptoms
    else:                             
      tasa_contagio=contacto_total*0.6+sintomatologia*0.25+estilo_de_vida*0.1+riesgo_de_zona*0.05
         
    return (tasa_contagio)


def Prob_infection_initial(): 
      
    final_data2['Prob_infection_initial'] = final_data2.apply(lambda row : infection(row['fiebre'],row['tos_seca'],row['fatiga'],row['flemas_esputos'],row['dif_resp'],
                                                                           row['dolor_garganta'],row['dolor_cabeza'],row['dolor_muscular'],row['escalofrios'],
                                                                           row['nausea_vomito'],row['cong_nasal'],row['diarrea'], row['expect_sangre'], row['conjuntivitis'],
                                                                           row['dermatitis'],row['falta_gusto_olfato'],row['proteccion_no'],
                                                                           row['proteccion_si'],row['transporte_no'],row['transporte_si'],
                                                                           row['contacto_directo_no'],row['contacto_directo_si'],row['exposicion_alta'],
                                                                           row['exposicion_baja'],row['exposicion_media'],row['zona_andalucia'],
                                                                           row['zona_aragon'],row['zona_asturias'],row['zona_baleares'],
                                                                           row['zona_canarias'],row['zona_cantabria'],row['zona_castilla_la_mancha'],
                                                                           row['zona_castilla_y_leon'],row['zona_cataluña'],row['zona_ceuta'],
                                                                           row['zona_com_val'],row['zona_extremadura'],row['zona_galicia'],
                                                                           row['zona_larioja'],row['zona_madrid'],row['zona_melilla'],
                                                                           row['zona_murcia'],row['zona_navarra'],row['zona_pais_vasco']), axis = 1) 
   
    return final_data2

In [28]:
final_data2=Prob_infection_initial()
final_data2

Unnamed: 0,fiebre,tos_seca,fatiga,flemas_esputos,dif_resp,dolor_garganta,dolor_cabeza,dolor_muscular,escalofrios,nausea_vomito,cong_nasal,diarrea,expect_sangre,conjuntivitis,dermatitis,falta_gusto_olfato,proteccion_no,proteccion_si,transporte_no,transporte_si,contacto_directo_no,contacto_directo_si,exposicion_alta,exposicion_baja,exposicion_media,zona_andalucia,zona_aragon,zona_asturias,zona_baleares,zona_canarias,zona_cantabria,zona_castilla_la_mancha,zona_castilla_y_leon,zona_cataluña,zona_ceuta,zona_com_val,zona_extremadura,zona_galicia,zona_larioja,zona_madrid,zona_melilla,zona_murcia,zona_navarra,zona_pais_vasco,Prob_infection_initial
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,63.4350
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,71.5730
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,70.9975
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,58.6430
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,70.3750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1572859,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,61.8315
1572860,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,37.6975
1572861,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,52.2460
1572862,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,48.9175


In [0]:
def sympotm (a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16): 

    # SYMPTOMATOLOGY = symptom n * weight n
    síntomas = [a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16]
    peso_por_síntoma = [27.95,21.53,12.11,10.62,5.91,4.42,4.32,4.71,3.62,1.59,1.53,1.18,0.29,0.22,0,0]
    peso_total_síntomas = list(map(operator.mul, síntomas, peso_por_síntoma))
    sintomatologia=sum(peso_total_síntomas)

    return (sintomatologia)

def Prob_sintomatologia(): 
      
    final_data2['Prob_sintomatologia'] = final_data2.apply(lambda row : sympotm(row['fiebre'],row['tos_seca'],row['fatiga'],row['flemas_esputos'],row['dif_resp'],
                                                                           row['dolor_garganta'],row['dolor_cabeza'],row['dolor_muscular'],row['escalofrios'],
                                                                           row['nausea_vomito'],row['cong_nasal'],row['diarrea'], row['expect_sangre'], row['conjuntivitis'],
                                                                           row['dermatitis'],row['falta_gusto_olfato']), axis = 1) 
    return final_data2

In [0]:
final_data2=Prob_sintomatologia()

We include in the "final_data2" the variable "Prob_infection_initial" containing the probability of infection.

Add an id to all records in the generated dataset.

In [31]:
final_data2['user_id']=range(0,1572864)
final_data2

Unnamed: 0,fiebre,tos_seca,fatiga,flemas_esputos,dif_resp,dolor_garganta,dolor_cabeza,dolor_muscular,escalofrios,nausea_vomito,cong_nasal,diarrea,expect_sangre,conjuntivitis,dermatitis,falta_gusto_olfato,proteccion_no,proteccion_si,transporte_no,transporte_si,contacto_directo_no,contacto_directo_si,exposicion_alta,exposicion_baja,exposicion_media,zona_andalucia,zona_aragon,zona_asturias,zona_baleares,zona_canarias,zona_cantabria,zona_castilla_la_mancha,zona_castilla_y_leon,zona_cataluña,zona_ceuta,zona_com_val,zona_extremadura,zona_galicia,zona_larioja,zona_madrid,zona_melilla,zona_murcia,zona_navarra,zona_pais_vasco,Prob_infection_initial,Prob_sintomatologia,user_id
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,63.4350,0.00,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,71.5730,27.95,1
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,70.9975,21.53,2
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,58.6430,49.48,3
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,70.3750,12.11,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1572859,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,61.8315,87.89,1572859
1572860,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,37.6975,50.52,1572860
1572861,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,52.2460,78.47,1572861
1572862,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,48.9175,72.05,1572862


In [0]:
final_data2.loc[final_data2['Prob_infection_initial'] >30 & (final_data2['Prob_sintomatologia'] >=30), 'Classification'] = 'Sospechoso_con_sintomas' 
final_data2.loc[(final_data2['Prob_infection_initial'] >30) & (final_data2['Prob_sintomatologia'] <30), 'Classification'] = 'Sospechoso_sin_sintomas'  
final_data2.loc[final_data2['Prob_infection_initial'] <= 30, 'Classification'] = 'No sospechoso' 

## Export Dataset

In [34]:
from google.colab import drive
drive.mount('drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at drive


In [0]:
final_data2.to_csv('first_final_infection_dataset.csv', index=False)
!cp first_final_infection_dataset.csv "/content/drive/My Drive"