In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from datetime import date
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

train = pd.read_csv("/kaggle/input/covid19-global-forecasting-week-4/train.csv")
test = pd.read_csv("/kaggle/input/covid19-global-forecasting-week-4/test.csv")
submit = pd.read_csv("/kaggle/input/covid19-global-forecasting-week-4/submission.csv")
weather = pd.read_csv("/kaggle/input/weather-data/training_data_with_weather_info_week_4.csv")
country = pd.read_csv("/kaggle/input/countryinfo/covid19countryinfo.csv")
population = pd.read_csv("/kaggle/input/population-sizes-worldwide/population_sizes.csv")
c2 = pd.read_csv("/kaggle/input/covid19-forecasting-metadata/region_metadata.csv")

/kaggle/input/covid19-global-forecasting-week-4/submission.csv
/kaggle/input/covid19-global-forecasting-week-4/test.csv
/kaggle/input/covid19-global-forecasting-week-4/train.csv
/kaggle/input/population-sizes-worldwide/population_sizes.csv
/kaggle/input/population-sizes-worldwide/sources.csv
/kaggle/input/countryinfo/covid19countryinfo.csv
/kaggle/input/countryinfo/covid19tests.csv
/kaggle/input/covid19-forecasting-metadata/region_date_metadata.csv
/kaggle/input/covid19-forecasting-metadata/region_metadata.csv
/kaggle/input/weather-data/__results__.html
/kaggle/input/weather-data/__resultx__.html
/kaggle/input/weather-data/training_data_with_weather_info_week_4.csv
/kaggle/input/weather-data/training_data_with_weather_info_week_1.csv
/kaggle/input/weather-data/__notebook__.ipynb
/kaggle/input/weather-data/custom.css
/kaggle/input/weather-data/__output__.json
/kaggle/input/weather-data/__results___files/__results___2_1.png
/kaggle/input/weather-data/__results___files/__results___3_0.png

# Data Transformation
1.  Combine the 'Country_Region' and 'Province_State' columns into 'country_province'.
2.  Calculate the cumulative cases and fatalities for each country_province as 'cumCases' and 'cumDeath'
3.  Chagnge the datatype of 'Date' to datetime  
4.  Log transform the cases and deaths
5.  Add 'prevCases' and 'prevDeath' for train data
6.  Add 'Days' since Jan-22-2020

In [3]:
# Steps 1 to 6
def dataTrans(df, purpose):
    # 1. Combine the Country_Region and Province_State columns into country_province.
    df.Province_State[df['Province_State'].isnull()] = '' # change the null to empty string
    df['country_province'] = df.apply(lambda x: x.Country_Region+'-' if x.Province_State == '' else x.Country_Region+'-'+x.Province_State, axis = 1)
    # 3. Chagnge the datatype of Date to datetime
    df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
   
    ########################################################################################################################################
     # 7.  Add a column for the days since the first cases
    df["Days"] = (df.Date - pd.Timestamp('2020-01-22 00:00:00')).dt.days
    ########################################################################################################################################
    
    # 4. Log transform the cases and deaths.
    df['log_ConfirmedCases'] = np.log(df['ConfirmedCases'])
    df['log_Fatalities'] = np.log(df['Fatalities'])
    
    ##################################################################################################
    # for training dataset only
    if (purpose == 'train'):
        # 2. Calculate the cumulative cases and fatalities for each country_province for the train data
        # 5.  Add column for prevCases and prevDeath
        cumCases = pd.Series()
        cumDeath = pd.Series()
        prevCases = pd.Series()
        prevDeath = pd.Series()
        
        for region in df.country_province.unique():
            cases = df.log_ConfirmedCases[df.country_province==region].cumsum() # cumulative sum of log cases
            death = df.log_Fatalities[df.country_province==region].cumsum() # cumulative sum of log deaths
            
            cumCases = pd.concat([cumCases,cases])
            cumDeath = pd.concat([cumDeath,death])
            prevCases = pd.concat([prevCases,pd.Series([0]),cases.iloc[:len(cases)-1]])
            prevDeath = pd.concat([prevDeath,pd.Series([0]),death.iloc[:len(death)-1]])
            
        prevCases = prevCases.reset_index(drop=True)
        prevDeath = prevDeath.reset_index(drop=True)
        #print(len(cumCases), len(cumDeath), df.shape[0])
        df_cum = pd.concat([df,cumCases,cumDeath,prevCases,prevDeath], axis=1)
        #print(df.shape, df_cum.shape)
        df_cum = df_cum.rename(columns={0:'cumCases', 1:'cumDeath', 2:'prevCases', 3:'prevDeath'})
        return df_cum
    ######################################################################################################
    # for testing dataset
    else:
        return df

df_train = dataTrans(train,'train')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
  result = getattr(ufunc, method)(*inputs, **kwargs)


# Additional Dataset
1. weather
2. country info
3. population

## Additional dataset: Weather
Here is the weather data:
The weather data is only available until April-11-2020

- temp: Mean temperature for the day in degrees Fahrenheit to tenths.
- max: Maximum temperature reported during the day in Fahrenheit to tenths--time of max temp report varies by country and region, so this will sometimes not be the max for the calendar day.
- min: Minimum temperature reported during the day in Fahrenheit to tenths--time of min temp report varies by country and region, so this will sometimes not be the min for the calendar day.
- stp: Mean station pressure for the day in millibars to tenths.
- slp: Mean sea level pressure for the day in millibars to tenths.
- dewp: Mean dew point for the day in degrees Fahrenheit to tenths.
- rh: relative humidity as ratio between actual vapour pressure (computed from dewpoint temperature)
- ah: absolute humidity from the gas law of vapour calcuated from the actual vapour pressure (in pascals). (ah = mass / volume = pressure / (constant * temperature))
- wdsp: Mean wind speed for the day in knots to tenths.
- prcp: Total precipitation (rain and/or melted snow) reported during the day in inches and hundredths; will usually not end with the midnight observation--i.e., may include latter part of previous day. .00 indicates no measurable precipitation (includes a trace).
- fog: Indicators (1 = yes, 0 = no/not reported) for the occurrence during the day

In [6]:
c2.columns

Index(['Country_Region', 'Province_State', 'lat', 'lon', 'continent',
       'population', 'area', 'density'],
      dtype='object')

In [8]:
len(train.Country_Region.unique()), len(c2.Country_Region.unique())
c2.head()

Unnamed: 0,Country_Region,Province_State,lat,lon,continent,population,area,density
0,Afghanistan,,33.0,65.0,Asia,38041754,652230,58.33
1,Albania,,41.1533,20.1683,Europe,2880917,28748,100.21
2,Algeria,,28.0339,1.6596,Africa,43053054,2381741,18.08
3,Andorra,,42.5063,1.5218,Europe,77142,468,164.83
4,Angola,,-11.2027,17.8739,Africa,31825295,1246700,25.53


In [4]:

northamerica = ['Antigua and Barbuda','Bahamas','Barbados','Belize','Canada','Costa Rica','Cuba', 'Dominica', \
                'Dominican Republic','El Salvador', 'Grenada', 'Guatemala','Haiti','Honduras', 'Jamaica', \
                'Mexico', 'Nicaragua', 'Panama','Saint Kitts and Nevis', 'Saint Lucia','Saint Vincent and the Grenadines',\
                'Trinidad and Tobago', 'US']
southamerica = ['Argentina','Bolivia', 'Brazil','Chile', 'Colombia', 'Ecuador','Guyana','Paraguay','Peru',\
                'Suriname','Uruguay', 'Venezuela']
assert len(c2[c2.continent == "Americas"].Country_Region.unique()) == (len(northamerica)+len(southamerica))

In [20]:
c2['continent'] = c2.apply(lambda x: 'North_America' if x.Country_Region.isin northamerica else ('South_America' if x.Country_Region in southamerica else x.continent)) # 
c2.continent.unique()

AttributeError: ("'Series' object has no attribute 'Country_Region'", 'occurred at index Country_Region')

In [None]:
# Clean out the filler values for NAs and outliers in particular columns
weather['ah'] = weather['ah'].apply(lambda x: np.nan if x==np.inf else x)
weather['wdsp'] = weather['wdsp'].apply(lambda x: np.nan if x==999.9 else x)
weather['prcp'] = weather['prcp'].apply(lambda x: np.nan if x==99.99 else x)

# replace np.nan with mean
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
weather.loc[:,'temp':'prcp']=imputer.fit_transform(weather.loc[:,'temp':'prcp'])

# check that there is no outliers in the weather dataset
for col in weather.columns:
    if col in ['temp','min','max','stp','slp','dewp','rh', 'ah','wdsp','prcp','fog']:
        #print(col,sorted(weather[col].unique(),reverse=True)[:5])
        print(col,sum(weather[col].isna()))

## Additional data: Country info
It currently contains:

1. Population (2020)
2. Density: The number of people who lives per square meter. (2020)
3. Median age (2020)
4. Urban population: the % of the population who lives in urban areas. (2020)
5. Hospital beds per 1K people: I assume that the higher this number is, the lower the fatalities number would be. (2020, 2018)
6. Forced quarantine policy initial date: I believe that a couple of weeks after this specific date, we can assume there would be a reduction of the infection rate. (updated on a daily basis)
7. School closure policy initial date: Same as (6). (updated on a daily basis)
8. Public places (bars, restaurants, movie theatres, etc.) closure policy initial date (4/3/2020)
9. The maximum amount of people allowed in gatherings and the initial date of the policy (4/3/2020)
10. Non-essential house leaving - initial date of the restriction (4/3/2020)
11. Sex ratio grouped by age groups (amount of males per female). (2020)
12. Lung disease death rate per 100k people, separated by sex. (2020)
13. % of smokers within the population: The higher this number is, the higher the fatalities number would be. (2019)
14. Amount of COVID detection test made per day: I collected this information for about 50 countries, missing 120 more. (3/22/2020)
15. GDP-nominal (2019)
16. Health expenses in international USD (2019, 2017, 2015)
17. Health expenses divided by population (2020 - population), (2019, 2017, 2015 - health expenses)
18. Average amount of children per woman - I find it as an important feature when it comes in interaction with density and school restriction variables. (2017)
19. First patient detection date
20. Total confirmed cases (4/3/2020)
21. Total active cases (4/3/2020)
22. New confirmed cases (4/3/2020)
23. Total deaths (4/3/2020)
24. New deaths (4/3/2020)
25. Total recovered (4/3/2020)
26. Amount of patients in critical situation (4/3/2020)
27. Total cases / 1 million population (4/3/2020)
28. Total deaths / 1 million population (4/3/2020)
29. Average temperature (Celsius) measured between January and April. (2020)
30. Average percentage of humidity measured between January and April. (2020)

In [None]:
# Longitude and latitude from the weather data
weather.rename(columns={'country+province':'country_province'}, inplace=True)
df_train = pd.merge(df_train, weather[['country_province','Lat','Long']], on='country_province', how='outer')
df_train.head()

In [None]:
# Population data
population.Province_State[population.Province_State.isnull()] = ''
population['country_province'] = population.apply(lambda x: x.Country_Region+'-' 
                                                  if x.Province_State == '' 
                                                  else x.Country_Region+'-'+x.Province_State, 
                                                  axis=1)
df_train = pd.merge(df_train, population[['country_province','Population']], on='country_province', how='outer')
df_train.head()

In [None]:
df_train_weather = df_train[df_train.Date <= pd.Timestamp('2020-04-11 00:00:00')]
df_train_weather.info()

In [None]:
# check the temp, min, and max of one country
df = df_train_weather[df_train_weather.country_province == "Germany-"]
figsize = (20,12)
df.temp.plot(figsize=figsize)
df['min'].plot(figsize=figsize)
df['max'].plot(figsize=figsize)
plt.legend()

In [None]:
figsize = (10,7)
df_train_weather.prcp.plot(figsize=figsize) 

In [None]:
df_train_weather.wdsp.plot(figsize=(15,8))

In [None]:
# First, I would like to try the random forest regressor on dat before 2020-04-11 through cross validation. 
#weather.Date.max(), df_train.Date.max(), test.Date.max()

predictors =['Days','prevCases','prevDeath','temp','stp','wdsp','prcp','fog','min','max','dewp','rh','ah','slp'] # 
targets = ['cumCases']

#from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
model = RandomForestRegressor(n_estimators=100)
#kf = model_selection.KFold(n_splits=3) # shuffle = True
predictedCases = []
count = 0
for region in df_train_weather.country_province.unique():
    #print(region)
    df = (df_train_weather[df_train_weather.country_province == region]
        .reset_index(drop=True))
    
    X_train = df.loc[list(range(75)),predictors] # day 0 to day 74
    X_test = df.loc[list(range(75,81)),predictors] # day 75 to 81
    
    y_train = df.loc[list(range(75)),targets]
    y_test = df.loc[list(range(75,81)),targets]
    #print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    # For training, fit() is used
    #print(X_train)
    #print(y_train)
    model.fit(X_train, y_train)
    
    #print(X_test)
    #print(y_test)
    # Default metric is R2 for regression, which can be accessed by score()
    print('Score of training model:',model.score(X_test, y_test))
    
    # For other metrics, we need the predictions of the model
    y_pred = model.predict(X_test)
    predictedCases.append(list(X_test))
    predictedCases.append(y_pred)
    #print('Mean Squared Error:',metrics.mean_squared_error(y_test, y_pred))
    #print('R2:',metrics.r2_score(y_test, y_pred))
    
    #print(predictedCases)
    #print(df.cumCases)
    count += 1
    if (count >= 5):
        break