In [1]:
import pandas as pd

# Load the cleaned COVID-19 dataset from the CSV file.
covid_data = pd.read_csv('covid_19_clean_complete.csv')
# drop_duplicates() returns a new frame instead of modifying in place, so the
# result has to be assigned back for the de-duplication to take effect.
covid_data = covid_data.drop_duplicates()
covid_data

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.000000,65.000000,1/22/20,0,0,0
1,,Albania,41.153300,20.168300,1/22/20,0,0,0
2,,Algeria,28.033900,1.659600,1/22/20,0,0,0
3,,Andorra,42.506300,1.521800,1/22/20,0,0,0
4,,Angola,-11.202700,17.873900,1/22/20,0,0,0
...,...,...,...,...,...,...,...,...
27451,,Western Sahara,24.215500,-12.885800,5/4/20,6,0,5
27452,,Sao Tome and Principe,0.186360,6.613081,5/4/20,23,3,4
27453,,Yemen,15.552727,48.516388,5/4/20,12,2,0
27454,,Comoros,-11.645500,43.333300,5/4/20,3,0,0


In [2]:
# Remove the cruise-ship entries, which appear either as a province/state
# or as a country/region, in a single pass.
ship_provinces = ['Grand Princess', 'Diamond Princess']
ship_countries = ['Diamond Princess', 'MS Zaandam']
is_ship_row = (
    covid_data['Province/State'].isin(ship_provinces)
    | covid_data['Country/Region'].isin(ship_countries)
)
# Drop by row label, as before, so the remaining rows keep their original index values.
covid_data = covid_data.drop(index=covid_data.index[is_ship_row.to_numpy()])
covid_data.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            27446, 27447, 27448, 27449, 27450, 27451, 27452, 27453, 27454,
            27455],
           dtype='int64', length=27040)

In [3]:
# Show the row labels of covid_data again (same expression as the last line of the previous cell).
covid_data.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            27446, 27447, 27448, 27449, 27450, 27451, 27452, 27453, 27454,
            27455],
           dtype='int64', length=27040)

In [4]:
import numpy as np

# Replace missing province/state names with an empty string.
# Only this column is filled: assigning the whole replaced DataFrame to a single
# column relies on old pandas behaviour (first column silently used) and is
# rejected with a ValueError by recent pandas releases.
covid_data['Province/State'] = covid_data['Province/State'].fillna('')

covid_data

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.000000,65.000000,1/22/20,0,0,0
1,,Albania,41.153300,20.168300,1/22/20,0,0,0
2,,Algeria,28.033900,1.659600,1/22/20,0,0,0
3,,Andorra,42.506300,1.521800,1/22/20,0,0,0
4,,Angola,-11.202700,17.873900,1/22/20,0,0,0
...,...,...,...,...,...,...,...,...
27451,,Western Sahara,24.215500,-12.885800,5/4/20,6,0,5
27452,,Sao Tome and Principe,0.186360,6.613081,5/4/20,23,3,4
27453,,Yemen,15.552727,48.516388,5/4/20,12,2,0
27454,,Comoros,-11.645500,43.333300,5/4/20,3,0,0


In [5]:
# Columns whose text is concatenated (province first, then country, no separator)
# to form a single location label.
cols = ['Province/State', 'Country/Region']
# Convert both columns to strings once, then join each row's pair of values.
name_pairs = covid_data[cols].to_numpy().astype(str)
covid_data['Local'] = [''.join(pair) for pair in name_pairs]

In [6]:
# Display covid_data to inspect the newly added 'Local' column.
covid_data


Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Local
0,,Afghanistan,33.000000,65.000000,1/22/20,0,0,0,Afghanistan
1,,Albania,41.153300,20.168300,1/22/20,0,0,0,Albania
2,,Algeria,28.033900,1.659600,1/22/20,0,0,0,Algeria
3,,Andorra,42.506300,1.521800,1/22/20,0,0,0,Andorra
4,,Angola,-11.202700,17.873900,1/22/20,0,0,0,Angola
...,...,...,...,...,...,...,...,...,...
27451,,Western Sahara,24.215500,-12.885800,5/4/20,6,0,5,Western Sahara
27452,,Sao Tome and Principe,0.186360,6.613081,5/4/20,23,3,4,Sao Tome and Principe
27453,,Yemen,15.552727,48.516388,5/4/20,12,2,0,Yemen
27454,,Comoros,-11.645500,43.333300,5/4/20,3,0,0,Comoros


In [7]:
# The two location columns are now merged into 'Local', so remove them together.
covid_data = covid_data.drop(columns=['Province/State', 'Country/Region'])


In [8]:
# Display covid_data after removing the 'Province/State' and 'Country/Region' columns.
covid_data

Unnamed: 0,Lat,Long,Date,Confirmed,Deaths,Recovered,Local
0,33.000000,65.000000,1/22/20,0,0,0,Afghanistan
1,41.153300,20.168300,1/22/20,0,0,0,Albania
2,28.033900,1.659600,1/22/20,0,0,0,Algeria
3,42.506300,1.521800,1/22/20,0,0,0,Andorra
4,-11.202700,17.873900,1/22/20,0,0,0,Angola
...,...,...,...,...,...,...,...
27451,24.215500,-12.885800,5/4/20,6,0,5,Western Sahara
27452,0.186360,6.613081,5/4/20,23,3,4,Sao Tome and Principe
27453,15.552727,48.516388,5/4/20,12,2,0,Yemen
27454,-11.645500,43.333300,5/4/20,3,0,0,Comoros


In [9]:
# Export the processed frame to an Excel workbook; to_excel opens, writes and
# saves the file itself when given a path.
covid_data.to_excel('covid_19_distance.xlsx')

## K Nearest Neighbours
O dataset que processamos é adequado para aplicar KNN sobre as coordenadas geográficas, de modo a identificar os vizinhos do epicentro original da pandemia (cidade de Wuhan, província de Hubei, na China).
Como pretendemos construir um modelo de regressão a partir destes dados, vamos recorrer à biblioteca *scikit-learn* e aos seus algoritmos de regressão, entre os quais se encontra o **KNeighborsRegressor**, que será o escolhido para aplicar o KNN.

### KNeighborsRegressor
#### Parâmetros da pesquisa
* **n_neighbors**: número de vizinhos a usar (por defeito, 5).
* **weights**: função de peso utilizado na previsão. Valores a testar:
    * *uniform*: pesos uniformes, todos os pontos na vizinhança são pesados igualmente
    * *distance*: pesos influenciados pela distância ao ponto de pesquisa, em que os pontos vizinhos mais próximos do ponto de pesquisa têm mais influência do que os mais afastados
* **algorithm**: algoritmo usado para computar os nearest neighbors. Opção a usar:
    * *auto*: tenta decidir qual a melhor escolha, tendo em conta os valores passados para a função de **fit()** dentro das possíveis escolhas, *ball_tree*, *kd_tree* ou *brute*
* **metric**: tipo de distância a utilizar. Métrica a testar:
    * *euclidean*: utilizar a distância euclidiana (mais apropriado)
* **n_jobs**: número de processos a usar para paralelizar o trabalho
    * *None*: não há paralelismo (usar em debug)
    * *-1*: todos os CPUs são usados

In [13]:
# Hold-out date: '5/4/20' in the Date column (4 May 2020; dates are month/day/year).
# Build the training set from every row recorded on any other date.
train = covid_data.loc[covid_data['Date'].ne('5/4/20')]
train

Unnamed: 0,Lat,Long,Date,Confirmed,Deaths,Recovered,Local
0,33.000000,65.000000,1/22/20,0,0,0,Afghanistan
1,41.153300,20.168300,1/22/20,0,0,0,Albania
2,28.033900,1.659600,1/22/20,0,0,0,Algeria
3,42.506300,1.521800,1/22/20,0,0,0,Andorra
4,-11.202700,17.873900,1/22/20,0,0,0,Angola
...,...,...,...,...,...,...,...
27187,24.215500,-12.885800,5/3/20,6,0,5,Western Sahara
27188,0.186360,6.613081,5/3/20,16,1,4,Sao Tome and Principe
27189,15.552727,48.516388,5/3/20,10,2,0,Yemen
27190,-11.645500,43.333300,5/3/20,3,0,0,Comoros


In [14]:
# Build the test set: only the rows recorded on the hold-out date '5/4/20'.
test = covid_data.loc[covid_data['Date'].eq('5/4/20')]
test

Unnamed: 0,Lat,Long,Date,Confirmed,Deaths,Recovered,Local
27192,33.000000,65.000000,5/4/20,2894,90,397,Afghanistan
27193,41.153300,20.168300,5/4/20,803,31,543,Albania
27194,28.033900,1.659600,5/4/20,4648,465,1998,Algeria
27195,42.506300,1.521800,5/4/20,750,45,499,Andorra
27196,-11.202700,17.873900,5/4/20,35,2,11,Angola
...,...,...,...,...,...,...,...
27451,24.215500,-12.885800,5/4/20,6,0,5,Western Sahara
27452,0.186360,6.613081,5/4/20,23,3,4,Sao Tome and Principe
27453,15.552727,48.516388,5/4/20,12,2,0,Yemen
27454,-11.645500,43.333300,5/4/20,3,0,0,Comoros


In [24]:
# Feature columns the predictions are based on.
x_columns = ['Lat','Long']
# Target columns we want the model to estimate.
y_columns = ['Confirmed','Deaths','Recovered']

from sklearn.neighbors import KNeighborsRegressor

# Create the model with the default number of neighbours, 5.
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training set.
# NOTE(review): train holds rows from many earlier dates for each coordinate pair,
# so with only Lat/Long as features the 5 neighbours of a test row may all be
# zero-distance rows of the same location — confirm this is the intended setup.
knn.fit(train[x_columns], train[y_columns])
# Predict the targets for the test set and label each row with its location;
# rows are matched to the test set by position, not by index.
predicted_values = knn.predict(test[x_columns])
predictions = pd.DataFrame(predicted_values, columns=y_columns).assign(
    Local=test['Local'].to_numpy()
)
predictions

Unnamed: 0,Confirmed,Deaths,Recovered,Local
0,24.4,0.8,0.4,Afghanistan
1,237.6,10.8,123.8,Albania
2,31.6,3.0,8.8,Algeria
3,246.8,11.6,74.0,Andorra
4,6.2,0.8,1.2,Angola
...,...,...,...,...
255,0.0,0.0,0.0,Western Sahara
256,0.8,0.0,0.0,Sao Tome and Principe
257,0.4,0.0,0.0,Yemen
258,0.0,0.0,0.0,Comoros
