# Covid 19 - Expansion Prediction

by: Leandro Arruda

## Exploring Distance between Countries and  Confirmed cases

## Importing Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from datetime import datetime
from geopy.distance import great_circle, geodesic
import os

## Reading the dataset

In [2]:
# Input data files are available in the "../data/" directory.

for dirname, _, filenames in os.walk('.\data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

.\data\2019_nCoV_data.csv
.\data\countries-lat-lon.csv
.\data\countries.csv
.\data\countries_and_continents.csv
.\data\covid_19_data.csv
.\data\covid_19_processed.csv
.\data\Population.csv
.\data\sars.csv
.\data\time_series_covid_19_confirmed.csv
.\data\time_series_covid_19_deaths.csv
.\data\time_series_covid_19_recovered.csv


In [3]:
covid = pd.read_csv('./data/covid_19_processed.csv')
countries_loc = pd.read_csv('./data/countries-lat-lon.csv')
covid.head()

Unnamed: 0.1,Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,longitude,latitude,SARS_Cases,SARS_Deaths,SARS_Fatality,SARS,Population,Population Density
0,0,1,2020-01-22,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0,35.86166,104.195397,5327.0,349.0,6.6,1.0,1386395000.0,148.0
1,1,2,2020-01-22,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0,35.86166,104.195397,5327.0,349.0,6.6,1.0,1386395000.0,148.0
2,2,3,2020-01-22,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0,35.86166,104.195397,5327.0,349.0,6.6,1.0,1386395000.0,148.0
3,3,4,2020-01-22,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0,35.86166,104.195397,5327.0,349.0,6.6,1.0,1386395000.0,148.0
4,4,6,2020-01-22,Guangdong,Mainland China,1/22/2020 17:00,26.0,0.0,0.0,35.86166,104.195397,5327.0,349.0,6.6,1.0,1386395000.0,148.0



**Counting the number of duplicated rows for each country
to predict an expansion of the desease, I just need to know if a country had confirmed cases.**

In [4]:
covid = covid[covid['Confirmed']>0]
covid['Country/Region'].value_counts()

Mainland China          1160
US                       340
Australia                134
Canada                    92
South Korea               38
Thailand                  38
Japan                     38
Taiwan                    38
Macau                     38
Vietnam                   37
Singapore                 37
Hong Kong                 37
France                    36
Nepal                     35
Malaysia                  35
Cambodia                  33
Sri Lanka                 33
Germany                   32
United Arab Emirates      31
Finland                   31
Philippines               30
India                     30
Russia                    29
Italy                     29
UK                        29
Sweden                    29
Spain                     28
Belgium                   25
Egypt                     15
Iran                      10
                        ... 
Israel                     8
Oman                       5
Bahrain                    5
Afghanistan   

In [5]:
covid = covid.drop_duplicates("Country/Region")
covid = covid.reset_index(drop=True)

### Preparing World Countries & Location to Merge into COVID dataset

In [6]:
covid['SARS']  = covid['SARS'].fillna(0)
covid.head()

Unnamed: 0.1,Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,longitude,latitude,SARS_Cases,SARS_Deaths,SARS_Fatality,SARS,Population,Population Density
0,0,1,2020-01-22,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0,35.86166,104.195397,5327.0,349.0,6.6,1.0,1386395000.0,148.0
1,15,21,2020-01-22,Macau,Macau,1/22/2020 17:00,1.0,0.0,0.0,22.198745,113.543873,1.0,0.0,0.0,1.0,,
2,21,29,2020-01-22,Taiwan,Taiwan,1/22/2020 17:00,1.0,0.0,0.0,23.69781,120.960515,346.0,73.0,21.1,1.0,22701080.0,704.0
3,23,32,2020-01-22,Washington,US,1/22/2020 17:00,1.0,0.0,0.0,40.760537,-73.97889,27.0,0.0,0.0,1.0,325719200.0,36.0
4,26,36,2020-01-22,,Japan,1/22/2020 17:00,2.0,0.0,0.0,36.204824,138.252924,0.0,,,0.0,126785800.0,348.0


In [7]:
covid['Population Density'] = covid['Population Density'].fillna(covid['Population Density'].min())

In [8]:
covid['ObservationDate'] = pd.to_datetime(covid['ObservationDate'], format='%Y/%m/%d')

**Renaming Name column**

In [9]:
countries_loc = countries_loc.rename(columns={"name": "Country/Region"})
countries_loc = countries_loc.dropna()
countries_loc.head()

Unnamed: 0.1,Unnamed: 0,longitude,latitude,Country/Region
2,2,33.93911,67.709953,Afghanistan
3,3,41.153332,20.168331,Albania
4,4,28.033886,1.659626,Algeria
5,5,-14.270972,-170.132217,American Samoa
6,6,42.506285,1.521801,Andorra


**Correcting country names**

In [10]:
#countries_loc.loc[countries_loc['Country/Region'] == 'China', 'Country/Region'] = 'Mainland China'
#countries_loc.loc[countries_loc['Country/Region'] == 'Côte d’Ivoire', 'Country/Region'] = 'Ivory Coast'

#### Assigning Longitude and Latitude to listed countries

## Auxiliary Functions

**Using spherical geometry to calculate the surface distance between two
points(Countries).**

In [11]:
# Calculate the distance between confirmed locations and informed lon, Lat
def dist_to_confirmed(lon, lat, confirmed_loc):
    dists = []
    for lon2, lat2, country in confirmed_loc.values:
        d = geodesic((lon, lat), (lon2, lat2))
        dists.append(d.kilometers)
    return np.array(dists)

# Count the number of confirmed locations in a radius of 1k, 2k, and 3k kilometers
def calc_confirmed_radius(lon, lat, country, confirmed_loc):
    res = {"Country/Region": country}
    
    dists = dist_to_confirmed(lon, lat, confirmed_loc)
    
    res['avg_dist_to_confirmed'] = np.mean(dists)
    res['min_dist_to_confirmed'] = np.min(dists)
    res['confirmed_1k'] = (dists < 1000).sum()
    res['confirmed_2k'] = (dists < 2000).sum()
    res['confirmed_3k'] = (dists < 3000).sum()
    
    return res

### Country Dataset

In [12]:
dates = pd.date_range("2020-01-22", "2020-02-26")

In [13]:
#covid['SARS_Cases'] = covid['SARS_Cases'].astype(float).fillna(0)
covid


Unnamed: 0.1,Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,longitude,latitude,SARS_Cases,SARS_Deaths,SARS_Fatality,SARS,Population,Population Density
0,0,1,2020-01-22,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0,35.861660,104.195397,5327.0,349.0,6.6,1.0,1.386395e+09,148.0
1,15,21,2020-01-22,Macau,Macau,1/22/2020 17:00,1.0,0.0,0.0,22.198745,113.543873,1.0,0.0,0.0,1.0,,3.0
2,21,29,2020-01-22,Taiwan,Taiwan,1/22/2020 17:00,1.0,0.0,0.0,23.697810,120.960515,346.0,73.0,21.1,1.0,2.270108e+07,704.0
3,23,32,2020-01-22,Washington,US,1/22/2020 17:00,1.0,0.0,0.0,40.760537,-73.978890,27.0,0.0,0.0,1.0,3.257192e+08,36.0
4,26,36,2020-01-22,,Japan,1/22/2020 17:00,2.0,0.0,0.0,36.204824,138.252924,0.0,,,0.0,1.267858e+08,348.0
5,27,37,2020-01-22,,Thailand,1/22/2020 17:00,2.0,0.0,0.0,15.870032,100.992541,9.0,2.0,22.2,1.0,6.903751e+07,135.0
6,28,38,2020-01-22,,South Korea,1/22/2020 17:00,1.0,0.0,0.0,35.907757,127.766922,3.0,0.0,0.0,1.0,5.146620e+07,528.0
7,41,51,2020-01-23,Hong Kong,Hong Kong,1/23/20 17:00,2.0,0.0,0.0,22.396428,114.109497,1755.0,299.0,17.0,1.0,,3.0
8,64,77,2020-01-23,,Singapore,1/23/20 17:00,1.0,0.0,0.0,1.352083,103.819836,238.0,33.0,13.9,1.0,5.612253e+06,7916.0
9,65,80,2020-01-23,,Vietnam,1/23/20 17:00,2.0,0.0,0.0,14.058324,108.277199,63.0,5.0,7.9,1.0,9.554080e+07,308.0


In [14]:
all_data = dict()
for date in dates:
    print(date)
    confirmed_countries = covid[covid['ObservationDate'] <= date]['Country/Region']
    
    confirmed_loc = covid[covid['Country/Region'].isin(confirmed_countries)]
    confirmed_loc = confirmed_loc[['longitude', 'latitude', 'Country/Region']]

    data = covid.copy() ##
    
    next_confirmed = covid[covid['ObservationDate'] == date + pd.Timedelta(1,'D')]['Country/Region']
    
    if next_confirmed.shape[0] == 0:
        continue
    
    data['y'] = 0
    data.loc[data['Country/Region'].isin(next_confirmed), 'y'] = 1
    data.loc[data['Country/Region'].isin(confirmed_countries), 'y'] = 1 ##
    
    data['confirmed_past'] = 0 ##
    data.loc[data['Country/Region'].isin(confirmed_countries), 'confirmed_past'] = 1  ##
    data['date'] = date
    
    features = []
    for lon, lat, country in data[['longitude', 'latitude','Country/Region']].values:
        features.append(calc_confirmed_radius(lon, lat, country, confirmed_loc))
    
    features = pd.DataFrame(features)
    data = data.merge(features, how='left', on='Country/Region')
    data['SARS_Cases'] = data['SARS_Cases'].astype(float).fillna(0)
   
    #print(data.head())
    #data['Deaths'] = data['Deaths'].str.replace(',', '').astype(float).fillna(0)
    
    all_data[date] = data
    #print(all_data[date])
    print(data.shape[0], '\n')

2020-01-22 00:00:00
61 

2020-01-23 00:00:00
61 

2020-01-24 00:00:00
61 

2020-01-25 00:00:00
61 

2020-01-26 00:00:00
61 

2020-01-27 00:00:00
61 

2020-01-28 00:00:00
61 

2020-01-29 00:00:00
61 

2020-01-30 00:00:00
61 

2020-01-31 00:00:00
61 

2020-02-01 00:00:00
2020-02-02 00:00:00
2020-02-03 00:00:00
61 

2020-02-04 00:00:00
2020-02-05 00:00:00
2020-02-06 00:00:00
2020-02-07 00:00:00
2020-02-08 00:00:00
2020-02-09 00:00:00
2020-02-10 00:00:00
2020-02-11 00:00:00
2020-02-12 00:00:00
2020-02-13 00:00:00
61 

2020-02-14 00:00:00
2020-02-15 00:00:00
2020-02-16 00:00:00
2020-02-17 00:00:00
2020-02-18 00:00:00
61 

2020-02-19 00:00:00
2020-02-20 00:00:00
61 

2020-02-21 00:00:00
2020-02-22 00:00:00
2020-02-23 00:00:00
61 

2020-02-24 00:00:00
61 

2020-02-25 00:00:00
61 

2020-02-26 00:00:00
61 



## Predicting new cases
#### Calculating the distance between countries & confirmed cases 
#### Adding SARS features

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [16]:
features = ['avg_dist_to_confirmed', 'confirmed_1k', 'confirmed_2k', 'confirmed_3k', 'min_dist_to_confirmed', 'SARS']#, 'Cases']
sorted_date = sorted(all_data.keys())
print(sorted_date)
correct = {'Total': 0,'Baseline': 0, 'ML': 0, 'Rank Avg': 0, 'Baseline SARS': 0}
for i in range(len(sorted_date)-1) :
    date = sorted_date[i]
    next_date = sorted_date[i+1]
    
    Xtrain, ytrain = all_data[date][features], all_data[date]['y']
    print(Xtrain)
    
    Xval, yval = all_data[next_date][features], all_data[next_date]['y']
    Xval, yval = Xval[all_data[next_date]['confirmed_past'] == 0], yval[all_data[next_date]['confirmed_past'] == 0] ##
    
    pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0, C=0.2, class_weight='balanced'))
    #pipe = DecisionTreeClassifier(max_depth=3,class_weight='balanced', ccp_alpha=0.0)
    #pipe = ExtraTreesClassifier(n_estimators=1000, n_jobs=6, max_depth=3, class_weight='balanced', random_state=0)
    #pipe = RandomForestClassifier(n_estimators=100, n_jobs=6, max_depth=3, class_weight='balanced', random_state=0)
    
    
    pipe.fit(Xtrain, ytrain)
    p = pipe.predict_proba(Xval)[:,1]
    
    Xval['p'] = p
    Xval['y'] = yval
    Xval['country'] = all_data[next_date]['Country/Region']
    
    top_n = 40
    baseline = Xval.sort_values("avg_dist_to_confirmed").head(top_n)['y'].sum()
    baseline_ = Xval.sort_values("SARS", ascending=False).head(top_n)['y'].sum()
    lr_top_20 = Xval.sort_values("p", ascending=False).head(top_n)['y'].sum()# / yval.sum()
    
    Xval['rank_avg'] = 0.8 * Xval['avg_dist_to_confirmed'].rank() + 0.2*Xval['p'].rank(ascending=False)
    avg_in_top_20 = Xval.sort_values("rank_avg", ascending=True).head(top_n)['y'].sum() #/ yval.sum()
    
    correct['Total'] += yval.sum()
    correct['Baseline'] += baseline
    correct['ML'] += lr_top_20
    correct['Rank Avg'] += avg_in_top_20
    correct['Baseline SARS'] += baseline_
    
    str_result = "Prediction date: {}\nPositive in train: {}\nConfirmed next date: {}\nBaseline: {}\nML: {}\nRank avg top 20: {}\nBaseline SARS: {}\n".format( 
        date, ytrain.sum(), yval.sum(), baseline, lr_top_20, avg_in_top_20, baseline_)
    print(str_result)

[Timestamp('2020-01-22 00:00:00', freq='D'), Timestamp('2020-01-23 00:00:00', freq='D'), Timestamp('2020-01-24 00:00:00', freq='D'), Timestamp('2020-01-25 00:00:00', freq='D'), Timestamp('2020-01-26 00:00:00', freq='D'), Timestamp('2020-01-27 00:00:00', freq='D'), Timestamp('2020-01-28 00:00:00', freq='D'), Timestamp('2020-01-29 00:00:00', freq='D'), Timestamp('2020-01-30 00:00:00', freq='D'), Timestamp('2020-01-31 00:00:00', freq='D'), Timestamp('2020-02-03 00:00:00', freq='D'), Timestamp('2020-02-13 00:00:00', freq='D'), Timestamp('2020-02-18 00:00:00', freq='D'), Timestamp('2020-02-20 00:00:00', freq='D'), Timestamp('2020-02-23 00:00:00', freq='D'), Timestamp('2020-02-24 00:00:00', freq='D'), Timestamp('2020-02-25 00:00:00', freq='D'), Timestamp('2020-02-26 00:00:00', freq='D')]
    avg_dist_to_confirmed  confirmed_1k  confirmed_2k  confirmed_3k  \
0             3257.136416             1             2             5   
1             3132.465454             2             4            

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


Prediction date: 2020-01-26 00:00:00
Positive in train: 18
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline SARS: 1

    avg_dist_to_confirmed  confirmed_1k  confirmed_2k  confirmed_3k  \
0             4393.106957             1             3             9   
1             4158.214786             3             7            11   
2             4416.511580             3             5            10   
3            11325.225921             1             1             2   
4             5323.552852             2             2             5   
5             4238.053017             3             7            11   
6             4813.551949             2             4             6   
7             4169.634036             3             8            11   
8             4921.355467             2             5             8   
9             4245.530581             3             8            10   
10            8650.108236             1             1             1   
11         

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)


Prediction date: 2020-02-03 00:00:00
Positive in train: 29
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline SARS: 1

    avg_dist_to_confirmed  confirmed_1k  confirmed_2k  confirmed_3k  \
0             4981.669545             1             3            11   
1             5306.333781             3             8            12   
2             5584.248711             3             6            11   
3            10108.770603             1             1             2   
4             6213.756659             2             2             5   
5             5269.581245             3             7            13   
6             5695.241780             2             4             7   
7             5320.059061             3             9            12   
8             6168.313534             2             5             9   
9             5462.655514             3             9            11   
10            6691.272045             5             7             8   
11         

Prediction date: 2020-02-25 00:00:00
Positive in train: 49
Confirmed next date: 4
Baseline: 4
ML: 4
Rank avg top 20: 4
Baseline SARS: 4



  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [17]:
correct
#{'Baseline': 8, 'LR': 4, 'Rank Avg': 6} = 50/50

{'Total': 43, 'Baseline': 40, 'ML': 42, 'Rank Avg': 42, 'Baseline SARS': 41}

### List the predicted Countries  

#### What countries are the most probable to have a confirmed case of COVID-19. (Ordered by Rank Average)

In [18]:
Xval.sort_values("rank_avg", ascending=True).head(25)

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,min_dist_to_confirmed,SARS,p,y,country,rank_avg
53,4548.607762,1,9,19,842.999332,0.0,0.252248,0,Azerbaijan,2.6
54,4760.180989,3,12,19,889.966432,0.0,0.22514,0,Belarus,3.6
50,4916.05703,3,11,16,373.103938,0.0,0.340756,1,Estonia,4.0
52,5036.012161,9,14,20,230.178919,0.0,0.413739,1,San Marino,4.4
56,4840.637016,2,14,18,634.220658,0.0,0.214084,0,Lithuania,4.6
49,5163.60251,5,13,15,472.581056,0.0,0.323935,1,Denmark,6.0
51,5248.446916,7,15,16,190.033447,0.0,0.369965,1,Netherlands,6.2
60,5803.61153,2,11,16,372.508638,1.0,0.565536,0,Ireland,6.6
55,6200.95181,0,3,10,1366.659629,0.0,0.29594,0,Iceland,8.6
59,6967.653189,0,1,2,1575.829132,0.0,0.331185,0,Nigeria,9.0


#### What countries are the most probable to have a confirmed case of COVID-19. (Ordered by Distance of a confirmed case)

In [19]:
Xval.sort_values("avg_dist_to_confirmed").head(10)

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,min_dist_to_confirmed,SARS,p,y,country,rank_avg
53,4548.607762,1,9,19,842.999332,0.0,0.252248,0,Azerbaijan,2.6
54,4760.180989,3,12,19,889.966432,0.0,0.22514,0,Belarus,3.6
56,4840.637016,2,14,18,634.220658,0.0,0.214084,0,Lithuania,4.6
50,4916.05703,3,11,16,373.103938,0.0,0.340756,1,Estonia,4.0
52,5036.012161,9,14,20,230.178919,0.0,0.413739,1,San Marino,4.4
49,5163.60251,5,13,15,472.581056,0.0,0.323935,1,Denmark,6.0
51,5248.446916,7,15,16,190.033447,0.0,0.369965,1,Netherlands,6.2
60,5803.61153,2,11,16,372.508638,1.0,0.565536,0,Ireland,6.6
55,6200.95181,0,3,10,1366.659629,0.0,0.29594,0,Iceland,8.6
59,6967.653189,0,1,2,1575.829132,0.0,0.331185,0,Nigeria,9.0


In [20]:
Xval2 = Xval.sort_values("p", ascending=False).reset_index(drop=True)
Xval2.head(50)

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,min_dist_to_confirmed,SARS,p,y,country,rank_avg
0,5803.61153,2,11,16,372.508638,1.0,0.565536,0,Ireland,6.6
1,5036.012161,9,14,20,230.178919,0.0,0.413739,1,San Marino,4.4
2,5248.446916,7,15,16,190.033447,0.0,0.369965,1,Netherlands,6.2
3,4916.05703,3,11,16,373.103938,0.0,0.340756,1,Estonia,4.0
4,6967.653189,0,1,2,1575.829132,0.0,0.331185,0,Nigeria,9.0
5,5163.60251,5,13,15,472.581056,0.0,0.323935,1,Denmark,6.0
6,6200.95181,0,3,10,1366.659629,0.0,0.29594,0,Iceland,8.6
7,11518.39953,0,0,1,2148.627849,0.0,0.252456,0,Mexico,10.4
8,4548.607762,1,9,19,842.999332,0.0,0.252248,0,Azerbaijan,2.6
9,4760.180989,3,12,19,889.966432,0.0,0.22514,0,Belarus,3.6


## Predicting new cases
#### Calculating the distance between countries & confirmed cases 
#### Adding Population features

In [21]:
features = ['avg_dist_to_confirmed', 'confirmed_1k', 'confirmed_2k', 'confirmed_3k', 'min_dist_to_confirmed', 'Population Density']#, 'Cases']
sorted_date = sorted(all_data.keys())

correct = {'Total': 0,'Baseline': 0, 'ML': 0, 'Rank Avg': 0, 'Baseline Pop.Density': 0}
for i in range(len(sorted_date)-1) :

    date = sorted_date[i]
    next_date = sorted_date[i+1]
    
    Xtrain, ytrain = all_data[date][features], all_data[date]['y']
    Xval, yval = all_data[next_date][features], all_data[next_date]['y']
    Xval, yval = Xval[all_data[next_date]['confirmed_past'] == 0], yval[all_data[next_date]['confirmed_past'] == 0] ##
    
    pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0, C=0.2, class_weight='balanced'))
    #pipe = DecisionTreeClassifier(max_depth=3,class_weight='balanced', ccp_alpha=0.0)
    #pipe = ExtraTreesClassifier(n_estimators=1000, n_jobs=6, max_depth=3, class_weight='balanced', random_state=0)
    #pipe = RandomForestClassifier(n_estimators=100, n_jobs=6, max_depth=3, class_weight='balanced', random_state=0)
    
    
    pipe.fit(Xtrain, ytrain)
    p = pipe.predict_proba(Xval)[:,1]
    
    Xval['p'] = p
    Xval['y'] = yval
    Xval['country'] = all_data[next_date]['Country/Region']
    
    top_n = 40
    baseline = Xval.sort_values("avg_dist_to_confirmed").head(top_n)['y'].sum()# / yval.sum()
    baseline_ = Xval.sort_values("Population Density", ascending=False).head(top_n)['y'].sum()
    lr_top_20 = Xval.sort_values("p", ascending=False).head(top_n)['y'].sum()# / yval.sum()
    
    Xval['rank_avg'] = 0.9*Xval['avg_dist_to_confirmed'].rank() + 0.1*Xval['p'].rank(ascending=False)
    avg_in_top_20 = Xval.sort_values("rank_avg", ascending=True).head(top_n)['y'].sum() #/ yval.sum()
    
    correct['Total'] += yval.sum()
    correct['Baseline'] += baseline
    correct['ML'] += lr_top_20
    correct['Rank Avg'] += avg_in_top_20
    correct['Baseline Pop.Density'] += baseline_
    
    str_result = "Prediction date: {}\nPositive in train: {}\nConfirmed next date: {}\nBaseline: {}\nML: {}\nRank avg top 20: {}\nBaseline Pop.Density: {}\n".format( 
        date, ytrain.sum(), yval.sum(), baseline, lr_top_20, avg_in_top_20, baseline_)
    print(str_result)

Prediction date: 2020-01-22 00:00:00
Positive in train: 10
Confirmed next date: 1
Baseline: 0
ML: 1
Rank avg top 20: 0
Baseline Pop.Density: 1

Prediction date: 2020-01-23 00:00:00
Positive in train: 11
Confirmed next date: 3
Baseline: 3
ML: 2
Rank avg top 20: 3
Baseline Pop.Density: 2



  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Prediction date: 2020-01-24 00:00:00
Positive in train: 14
Confirmed next date: 1
Baseline: 0
ML: 1
Rank avg top 20: 0
Baseline Pop.Density: 0

Prediction date: 2020-01-25 00:00:00
Positive in train: 15
Confirmed next date: 3
Baseline: 2
ML: 2
Rank avg top 20: 2
Baseline Pop.Density: 2

Prediction date: 2020-01-26 00:00:00
Positive in train: 18
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline Pop.Density: 1

Prediction date: 2020-01-27 00:00:00
Positive in train: 19
Confirmed next date: 2
Baseline: 2
ML: 2
Rank avg top 20: 2
Baseline Pop.Density: 2

Prediction date: 2020-01-28 00:00:00
Positive in train: 21
Confirmed next date: 2
Baseline: 2
ML: 2
Rank avg top 20: 2
Baseline Pop.Density: 2

Prediction date: 2020-01-29 00:00:00
Positive in train: 23
Confirmed next date: 4
Baseline: 4
ML: 4
Rank avg top 20: 4
Baseline Pop.Density: 4



  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


Prediction date: 2020-01-30 00:00:00
Positive in train: 27
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline Pop.Density: 1

Prediction date: 2020-01-31 00:00:00
Positive in train: 28
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline Pop.Density: 1

Prediction date: 2020-02-03 00:00:00
Positive in train: 29
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline Pop.Density: 1

Prediction date: 2020-02-13 00:00:00
Positive in train: 30
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline Pop.Density: 1

Prediction date: 2020-02-18 00:00:00
Positive in train: 31
Confirmed next date: 2
Baseline: 2
ML: 2
Rank avg top 20: 2
Baseline Pop.Density: 2

Prediction date: 2020-02-20 00:00:00
Positive in train: 33
Confirmed next date: 5
Baseline: 5
ML: 5
Rank avg top 20: 5
Baseline Pop.Density: 5

Prediction date: 2020-02-23 00:00:00
Positive in train: 38
Confirmed next date: 4
Baseline: 4
ML: 4
Rank avg top 20: 4
Baseline Pop.Dens

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [22]:
correct
#{'Baseline': 8, 'LR': 4, 'Rank Avg': 6} = 50/50

{'Total': 43,
 'Baseline': 40,
 'ML': 41,
 'Rank Avg': 40,
 'Baseline Pop.Density': 40}

#### What countries are the most probable to have a confirmed case of COVID-19. (Ordered by Rank Average)

In [23]:
Xval.sort_values("rank_avg", ascending=True).head(10)

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,min_dist_to_confirmed,Population Density,p,y,country,rank_avg
53,4548.607762,1,9,19,842.999332,119.0,0.295133,0,Azerbaijan,1.8
54,4760.180989,3,12,19,889.966432,47.0,0.266653,0,Belarus,2.9
56,4840.637016,2,14,18,634.220658,45.0,0.272331,0,Lithuania,3.7
50,4916.05703,3,11,16,373.103938,31.0,0.408164,1,Estonia,3.9
52,5036.012161,9,14,20,230.178919,557.0,0.483711,1,San Marino,4.6
49,5163.60251,5,13,15,472.581056,137.0,0.39348,1,Denmark,6.0
51,5248.446916,7,15,16,190.033447,509.0,0.46042,1,Netherlands,6.5
60,5803.61153,2,11,16,372.508638,70.0,0.405252,0,Ireland,7.6
55,6200.95181,0,3,10,1366.659629,3.0,0.34316,0,Iceland,8.9
59,6967.653189,0,1,2,1575.829132,210.0,0.394659,0,Nigeria,9.5


#### What countries are the most probable to have a confirmed case of COVID-19. (Ordered by Distance of a confirmed case)

In [24]:
Xval.sort_values("avg_dist_to_confirmed").head(10)

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,min_dist_to_confirmed,Population Density,p,y,country,rank_avg
53,4548.607762,1,9,19,842.999332,119.0,0.295133,0,Azerbaijan,1.8
54,4760.180989,3,12,19,889.966432,47.0,0.266653,0,Belarus,2.9
56,4840.637016,2,14,18,634.220658,45.0,0.272331,0,Lithuania,3.7
50,4916.05703,3,11,16,373.103938,31.0,0.408164,1,Estonia,3.9
52,5036.012161,9,14,20,230.178919,557.0,0.483711,1,San Marino,4.6
49,5163.60251,5,13,15,472.581056,137.0,0.39348,1,Denmark,6.0
51,5248.446916,7,15,16,190.033447,509.0,0.46042,1,Netherlands,6.5
60,5803.61153,2,11,16,372.508638,70.0,0.405252,0,Ireland,7.6
55,6200.95181,0,3,10,1366.659629,3.0,0.34316,0,Iceland,8.9
59,6967.653189,0,1,2,1575.829132,210.0,0.394659,0,Nigeria,9.5


In [25]:
Xval2 = Xval.sort_values("p", ascending=False).reset_index(drop=True)
Xval2.head(50)

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,min_dist_to_confirmed,Population Density,p,y,country,rank_avg
0,5036.012161,9,14,20,230.178919,557.0,0.483711,1,San Marino,4.6
1,5248.446916,7,15,16,190.033447,509.0,0.46042,1,Netherlands,6.5
2,4916.05703,3,11,16,373.103938,31.0,0.408164,1,Estonia,3.9
3,5803.61153,2,11,16,372.508638,70.0,0.405252,0,Ireland,7.6
4,6967.653189,0,1,2,1575.829132,210.0,0.394659,0,Nigeria,9.5
5,5163.60251,5,13,15,472.581056,137.0,0.39348,1,Denmark,6.0
6,11518.39953,0,0,1,2148.627849,66.0,0.355936,0,Mexico,10.6
7,6200.95181,0,3,10,1366.659629,3.0,0.34316,0,Iceland,8.9
8,4548.607762,1,9,19,842.999332,119.0,0.295133,0,Azerbaijan,1.8
9,4840.637016,2,14,18,634.220658,45.0,0.272331,0,Lithuania,3.7


## Predicting new cases
#### Calculating the distance between countries & confirmed cases 
#### Adding SARS and Population features

In [26]:
features = ['avg_dist_to_confirmed', 'confirmed_1k', 'confirmed_2k', 'confirmed_3k', 'min_dist_to_confirmed', 'SARS', 'Population Density']#, 'Cases']
sorted_date = sorted(all_data.keys())

correct = {'Total': 0,'Baseline': 0, 'ML': 0, 'Rank Avg': 0, 'Baseline SARS + Pop.Density': 0}
for i in range(len(sorted_date)-1) :

    date = sorted_date[i]
    next_date = sorted_date[i+1]
    
    Xtrain, ytrain = all_data[date][features], all_data[date]['y']
    Xval, yval = all_data[next_date][features], all_data[next_date]['y']
    Xval, yval = Xval[all_data[next_date]['confirmed_past'] == 0], yval[all_data[next_date]['confirmed_past'] == 0] ##
    
    pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0, C=0.2, class_weight='balanced'))
    #pipe = DecisionTreeClassifier(max_depth=3,class_weight='balanced', ccp_alpha=0.0)
    #pipe = ExtraTreesClassifier(n_estimators=1000, n_jobs=6, max_depth=3, class_weight='balanced', random_state=0)
    #pipe = RandomForestClassifier(n_estimators=100, n_jobs=6, max_depth=3, class_weight='balanced', random_state=0)
    
    
    pipe.fit(Xtrain, ytrain)
    p = pipe.predict_proba(Xval)[:,1]
    
    Xval['p'] = p
    Xval['y'] = yval
    Xval['country'] = all_data[next_date]['Country/Region']
    
    top_n = 40
    baseline = Xval.sort_values("avg_dist_to_confirmed").head(top_n)['y'].sum()# / yval.sum()
    baseline_ = Xval.sort_values("SARS", ascending=False).head(top_n)['y'].sum()
    lr_top_20 = Xval.sort_values("p", ascending=False).head(top_n)['y'].sum()# / yval.sum()
    
    Xval['rank_avg'] = 0.9*Xval['avg_dist_to_confirmed'].rank() + 0.1*Xval['p'].rank(ascending=False)
    avg_in_top_20 = Xval.sort_values("rank_avg", ascending=True).head(top_n)['y'].sum() #/ yval.sum()
    
    correct['Total'] += yval.sum()
    correct['Baseline'] += baseline
    correct['ML'] += lr_top_20
    correct['Rank Avg'] += avg_in_top_20
    correct['Baseline SARS + Pop.Density'] += baseline_
    
    str_result = "Prediction date: {}\nPositive in train: {}\nConfirmed next date: {}\nBaseline: {}\nML: {}\nRank avg top 20: {}\nBaseline SARS + Pop.Density: {}\n".format( 
        date, ytrain.sum(), yval.sum(), baseline, lr_top_20, avg_in_top_20, baseline_)
    print(str_result)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)


Prediction date: 2020-01-22 00:00:00
Positive in train: 10
Confirmed next date: 1
Baseline: 0
ML: 1
Rank avg top 20: 0
Baseline SARS + Pop.Density: 1

Prediction date: 2020-01-23 00:00:00
Positive in train: 11
Confirmed next date: 3
Baseline: 3
ML: 3
Rank avg top 20: 3
Baseline SARS + Pop.Density: 2

Prediction date: 2020-01-24 00:00:00
Positive in train: 14
Confirmed next date: 1
Baseline: 0
ML: 1
Rank avg top 20: 1
Baseline SARS + Pop.Density: 1

Prediction date: 2020-01-25 00:00:00
Positive in train: 15
Confirmed next date: 3
Baseline: 2
ML: 2
Rank avg top 20: 2
Baseline SARS + Pop.Density: 2

Prediction date: 2020-01-26 00:00:00
Positive in train: 18
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline SARS + Pop.Density: 1

Prediction date: 2020-01-27 00:00:00
Positive in train: 19
Confirmed next date: 2
Baseline: 2
ML: 2
Rank avg top 20: 2
Baseline SARS + Pop.Density: 2



  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


Prediction date: 2020-01-28 00:00:00
Positive in train: 21
Confirmed next date: 2
Baseline: 2
ML: 2
Rank avg top 20: 2
Baseline SARS + Pop.Density: 2

Prediction date: 2020-01-29 00:00:00
Positive in train: 23
Confirmed next date: 4
Baseline: 4
ML: 4
Rank avg top 20: 4
Baseline SARS + Pop.Density: 4

Prediction date: 2020-01-30 00:00:00
Positive in train: 27
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline SARS + Pop.Density: 1

Prediction date: 2020-01-31 00:00:00
Positive in train: 28
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline SARS + Pop.Density: 1

Prediction date: 2020-02-03 00:00:00
Positive in train: 29
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline SARS + Pop.Density: 1

Prediction date: 2020-02-13 00:00:00
Positive in train: 30
Confirmed next date: 1
Baseline: 1
ML: 1
Rank avg top 20: 1
Baseline SARS + Pop.Density: 1

Prediction date: 2020-02-18 00:00:00
Positive in train: 31
Confirmed next date: 2
Baseline: 2


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [27]:
correct
#{'Baseline': 8, 'LR': 4, 'Rank Avg': 6} = 50/50

{'Total': 43,
 'Baseline': 40,
 'ML': 42,
 'Rank Avg': 41,
 'Baseline SARS + Pop.Density': 41}

#### What countries are the most probable to have a confirmed case of COVID-19. (Ordered by Rank Average)

In [28]:
Xval.sort_values("rank_avg", ascending=True).head(10)

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,min_dist_to_confirmed,SARS,Population Density,p,y,country,rank_avg
53,4548.607762,1,9,19,842.999332,0.0,119.0,0.252904,0,Azerbaijan,1.7
54,4760.180989,3,12,19,889.966432,0.0,47.0,0.223704,0,Belarus,2.8
56,4840.637016,2,14,18,634.220658,0.0,45.0,0.213253,0,Lithuania,3.8
50,4916.05703,3,11,16,373.103938,0.0,31.0,0.337332,1,Estonia,4.0
52,5036.012161,9,14,20,230.178919,0.0,557.0,0.417825,1,San Marino,4.7
49,5163.60251,5,13,15,472.581056,0.0,137.0,0.321582,1,Denmark,6.0
51,5248.446916,7,15,16,190.033447,0.0,509.0,0.374252,1,Netherlands,6.6
60,5803.61153,2,11,16,372.508638,1.0,70.0,0.561411,0,Ireland,7.3
55,6200.95181,0,3,10,1366.659629,0.0,3.0,0.29375,0,Iceland,8.8
59,6967.653189,0,1,2,1575.829132,0.0,210.0,0.332304,0,Nigeria,9.5


#### What countries are the most probable to have a confirmed case of COVID-19. (Ordered by Distance of a confirmed case)

In [29]:
Xval.sort_values("avg_dist_to_confirmed").head(10)

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,min_dist_to_confirmed,SARS,Population Density,p,y,country,rank_avg
53,4548.607762,1,9,19,842.999332,0.0,119.0,0.252904,0,Azerbaijan,1.7
54,4760.180989,3,12,19,889.966432,0.0,47.0,0.223704,0,Belarus,2.8
56,4840.637016,2,14,18,634.220658,0.0,45.0,0.213253,0,Lithuania,3.8
50,4916.05703,3,11,16,373.103938,0.0,31.0,0.337332,1,Estonia,4.0
52,5036.012161,9,14,20,230.178919,0.0,557.0,0.417825,1,San Marino,4.7
49,5163.60251,5,13,15,472.581056,0.0,137.0,0.321582,1,Denmark,6.0
51,5248.446916,7,15,16,190.033447,0.0,509.0,0.374252,1,Netherlands,6.6
60,5803.61153,2,11,16,372.508638,1.0,70.0,0.561411,0,Ireland,7.3
55,6200.95181,0,3,10,1366.659629,0.0,3.0,0.29375,0,Iceland,8.8
59,6967.653189,0,1,2,1575.829132,0.0,210.0,0.332304,0,Nigeria,9.5


In [30]:
Xval2 = Xval.sort_values("p", ascending=False).reset_index(drop=True)
Xval2.head(50)

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,min_dist_to_confirmed,SARS,Population Density,p,y,country,rank_avg
0,5803.61153,2,11,16,372.508638,1.0,70.0,0.561411,0,Ireland,7.3
1,5036.012161,9,14,20,230.178919,0.0,557.0,0.417825,1,San Marino,4.7
2,5248.446916,7,15,16,190.033447,0.0,509.0,0.374252,1,Netherlands,6.6
3,4916.05703,3,11,16,373.103938,0.0,31.0,0.337332,1,Estonia,4.0
4,6967.653189,0,1,2,1575.829132,0.0,210.0,0.332304,0,Nigeria,9.5
5,5163.60251,5,13,15,472.581056,0.0,137.0,0.321582,1,Denmark,6.0
6,6200.95181,0,3,10,1366.659629,0.0,3.0,0.29375,0,Iceland,8.8
7,4548.607762,1,9,19,842.999332,0.0,119.0,0.252904,0,Azerbaijan,1.7
8,11518.39953,0,0,1,2148.627849,0.0,66.0,0.251239,0,Mexico,10.8
9,4760.180989,3,12,19,889.966432,0.0,47.0,0.223704,0,Belarus,2.8
