<a href="https://colab.research.google.com/github/Magnsta/RandomForest/blob/main/Lasso%26RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [None]:
#Load the dataset used for prediction and forecasting.
#Last updated on 9 December 2021.
#Data is downloaded from https://github.com/owid/covid-19-data/tree/master/public/data/
#The file name is relative, so the CSV must sit in the current working directory.
#get_country() later filters this frame by its 'location' column.
covid_data_all = pd.read_csv('owid-covid-data.csv')

In [None]:
<script>
    getData()

    // Fetch the Norway cases CSV, log the raw text, then log the first two
    // comma-separated fields (day, cases) of every line.
    async function getData(){
        const response = await fetch('covid_cases_norway.csv');
        const data = await response.text();
        console.log(data);

        for (const line of data.split('\n')) {
            const [day, cases] = line.split(',');
            console.log(day, cases);
        }
    }
    </script>

# Random Forest Regressor Model for Forecasting
Currently set to a 7-day forecast. Every country in the world has its own forecast model.

The features generated for the Random Forest to train on are:
1. year
2. month
3. new_cases (Number of cases last known day)
4. lag_new_cases (Number of cases three days prior)
5. rolling_mean (Rolling mean over a window of size 3) 

The forecast target is:
1. new_cases_week (the distance the model forecasts ahead can be changed by adjusting the `shift(x)` value).

The accuracy of the model is good enough to provide a trend for most countries. Due to the way Random Forest, like any forest algorithm, works, some countries get less accurate results than others.

In [None]:
#Returns the specified country with all columns that have at most 10% NaN values
def get_country(df,country):
  """Return the rows of ``df`` for one location, keeping only well-populated columns.

  Args:
    df: DataFrame with a ``location`` column.
    country: Value of ``location`` whose rows are selected.

  Returns:
    DataFrame with the matching rows, restricted by ``percentage_of_null`` to
    the columns that pass its 10 percent missing-value threshold, and
    re-indexed from 0.
  """
  current_country = df[df.get('location')==country]
  filtered = percentage_of_null(current_country,10)
  # reset_index returns a new frame; the previous code discarded that result,
  # so the re-indexing never took effect.
  return filtered.reset_index(drop=True)

In [None]:
#Check how many NaN values each column of the country has.
#Discard every column where more than `percentage` percent of the values are NaN.
def percentage_of_null(country,percentage):
  """Drop the columns of ``country`` that have too many missing values.

  Args:
    country: DataFrame to filter.
    percentage: Largest share of NaN values (0-100) a column may have and
      still be kept; the comparison is inclusive.

  Returns:
    DataFrame with the same rows as ``country`` and only the columns whose
    NaN share is at most ``percentage``, in their original order.
  """
  null_share = 100 * country.isnull().sum() / len(country)
  # Select columns directly with a boolean mask instead of round-tripping
  # through a transposed helper frame.
  return country.loc[:, null_share <= percentage]


In [None]:
#Function is called when training new RandomForest for forecasting for a new country
def forecastCountry(country):
  """Train a forest for ``country`` and plot its forecast against the actual values.

  Args:
    country: Location name passed to ``get_country``.

  Raises:
    ValueError: If ``prepere_data`` returns its error message instead of the
      train/validation pair.
  """
  country_data = get_country(covid_data_all,country)
  prepared = prepere_data(country_data)
  if isinstance(prepared, str):
    # prepere_data signals failure by returning a message; indexing into that
    # string would only fail later with an unrelated error.
    raise ValueError(f"{country}: {prepared}")
  train, valid = prepared

  # trainRandomForest returns (validation predictions, last-two-rows predictions);
  # only the first element is a series that can be plotted against yval.
  prediction, _ = trainRandomForest(valid, train)
  yval = valid['new_cases_week'].reset_index(level=0,drop=True)

  plt.plot(prediction, label = "Weekly Forecasting")
  plt.plot(yval, label = "Actual")
  plt.legend()
  plt.show()

In [None]:
#Create features for forecasting
#Shift set to -7 as we are forecasting the trend a week ahead.
#Add lag feature
#Add diff feature
def _add_forecast_features(frame):
  """Return a copy of ``frame`` with the date, rolling, target and lag columns added."""
  frame = frame.copy()
  frame['date'] = pd.to_datetime(frame['date'],format='%Y-%m-%d')

  frame['year'] = frame['date'].dt.year
  frame['month'] = frame['date'].dt.month
  frame['day'] = frame['date'].dt.day
  # NOTE: despite its name this column holds the day of the week.
  frame['week_num'] = frame['date'].dt.dayofweek

  frame['rolling_mean'] = frame['new_cases'].rolling(3,min_periods=1).mean()

  # Target column: new_cases seven rows ahead. The last seven rows have no
  # target, and dropna() removes them along with any row that has a NaN in
  # another column.
  frame['new_cases_week'] = frame['new_cases'].shift(-7)
  frame = frame.dropna().copy()

  # Lag and difference features are added after dropna(), so their leading
  # NaN values stay in the frame.
  frame['lag_new_cases'] = frame['new_cases'].shift(3)
  # NOTE(review): named lag7 but shifted by 5 rows - confirm which is intended.
  frame['lag7_new_cases'] = frame['new_cases'].shift(5)
  frame['diff_new_cases'] = frame['new_cases'].diff(1)
  return frame


def prepere_data(country):
  """Scale a country's case columns and build the train/validation feature frames.

  Args:
    country: DataFrame for one country containing ``new_cases``,
      ``new_cases_smoothed``, ``date`` and ``total_cases``. It is not modified.

  Returns:
    ``[country_train, country_valid]``, where the validation frame is built
    from the last 30 rows and the training frame from everything before them.
    If any required column is missing, the string
    "DataFrame does not contain enough data for forecasting" is returned
    instead.
  """
  features = ['new_cases','new_cases_smoothed','date','total_cases']
  scale_features = ['new_cases','new_cases_smoothed','total_cases']
  # Every required column must be present; the previous
  # `features[0] and features[1] in country` only tested the second one.
  if not all(name in country for name in features):
    return "DataFrame does not contain enough data for forecasting"

  # Work on a copy so the caller's frame is left untouched.
  country = country.copy()

  # NOTE(review): the scaler is fitted on the whole series, including the
  # validation window. Kept as-is because callers compare differences of the
  # scaled values against fixed thresholds.
  scaler = MinMaxScaler()
  country[scale_features] = scaler.fit_transform(country[scale_features])

  country_train = _add_forecast_features(country.iloc[:-30])  #Divide into train and test set
  country_valid = _add_forecast_features(country.iloc[-30:])

  return [country_train,country_valid]

In [None]:
#Training RandomForest model
#Currently uses year, month, new_cases for that day, new cases 3 days prior and rolling mean window 3 to predict trend for the next 7 days.

def trainRandomForest(country_valid, country_train, n_estimators=1000, random_state=0, n_jobs=6):
  """Fit a random forest on the training frame and predict the validation frame.

  Args:
    country_valid: Validation frame containing the feature columns.
    country_train: Training frame containing the feature columns and the
      ``new_cases_week`` target.
    n_estimators: Number of trees in the forest.
    random_state: Seed passed to the forest.
    n_jobs: Number of parallel jobs used by the forest.

  Returns:
    ``(p, covid_trend)``: ``p`` holds the predictions for every validation
    row, ``covid_trend`` the predictions for the last two validation rows.
    Callers compare the two ``covid_trend`` values to label the trend as
    increasing, decreasing or flat.
  """
  features = ['year','new_cases','month','lag_new_cases','rolling_mean']

  # Missing feature values (e.g. the leading NaN of the lag column) are filled
  # by the imputer, which is fitted on the training rows only.
  imputer = SimpleImputer()
  Xtr = imputer.fit_transform(country_train[features])
  ytr = country_train['new_cases_week']

  model = RandomForestRegressor(n_estimators=n_estimators,random_state=random_state,n_jobs=n_jobs,bootstrap=True)
  model.fit(Xtr,ytr)

  Xval = imputer.transform(country_valid[features])
  p = model.predict(Xval)

  # The imputer transforms each row independently, so the last two imputed
  # rows can be reused instead of imputing that slice a second time.
  covid_trend = model.predict(Xval[-2:])

  return p, covid_trend

In [None]:
def forecastCountryForest(country):
  """Train a forest for ``country``, plot forecast vs. actual, and return the trend pair.

  Args:
    country: Location name passed to ``get_country``.

  Returns:
    The second value returned by ``trainRandomForest``: the predictions for
    the last two validation rows.

  Raises:
    ValueError: If ``prepere_data`` returns its error message instead of the
      train/validation pair.
  """
  country_data = get_country(covid_data_all,country)
  prepared = prepere_data(country_data)
  if isinstance(prepared, str):
    # prepere_data signals failure by returning a message; indexing into that
    # string would only fail later with an unrelated error.
    raise ValueError(f"{country}: {prepared}")
  train, valid = prepared

  prediction, next_week = trainRandomForest(valid, train)
  yval = valid['new_cases_week'].reset_index(level=0,drop=True)

  plt.plot(prediction, label = "Weekly Forecasting")
  plt.plot(yval, label = "Actual")
  plt.legend()
  plt.show()
  return next_week

In [None]:
#Single-country smoke run of the forecasting pipeline.
#Alternative countries used during development:
#country = "Finland"  
#country = "Sweden" 
#country = "Russia"                          #Development purpose
#country = "Norway"                          #Development purpose
#NOTE: `country` receives the return value of forecastCountryForest (the trend predictions), not a country name.
country = forecastCountryForest("Sri Lanka")

In [None]:
#List of countries that have data that we can forecast on.
#The per-country forecasting loop iterates over this list.
item_counts=['Afghanistan', 'Africa', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia',
       'Asia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 
       'Bahrain','Bangladesh', 'Barbados', 'Belarus', 'Belgium', 
       'Belize', 'Benin','Bolivia',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
        'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cameroon', 'Canada', 'Cape Verde','Chile',
       'China', 'Colombia', 'Comoros', 'Congo',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba',
       'Cyprus', 'Czechia', 'Democratic Republic of Congo', 
       'Denmark','Djibouti', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Europe', 'European Union',
        'Finland', 'France',
        'Gabon', 'Gambia', 'Georgia', 'Germany', 'Greece',
       'Haiti',  'Honduras', 'Hong Kong', 'Hungary',
       'Iceland', 'India', 'Indonesia', 'International', 'Iran', 
       'Iraq','Ireland', 'Israel', 'Italy', 'Jamaica', 
       'Japan', 'Jordan', 'Kazakhstan', 'Kenya',  'Kosovo',
       'Kuwait', 'Kyrgyzstan',  'Latvia', 'Lebanon', 'Lesotho',
       'Liberia', 'Libya', 'Liechtenstein', 'Lithuania',
       'Luxembourg','Madagascar',
       'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta',
        'Moldova', 'Monaco',
       'Montenegro', 'Morocco','Netherlands',
       'New Zealand', 'Nicaragua', 'Niger', 'Nigeria',
       'North America', 'North Macedonia', 'Norway',
       'Oceania', 'Oman', 'Pakistan',  'Palestine', 'Panama',
        'Paraguay', 'Philippines',
       'Poland', 'Portugal', 'Romania', 'Russia',
         'San Marino','Saudi Arabia', 'Senegal', 'Serbia',
       'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 
       'Somalia', 'South Africa', 'South America',
        'South Sudan', 'Spain', 'Sri Lanka', 'Sudan',
       'Suriname', 'Sweden', 'Switzerland', 'Syria', 'Taiwan',
       'Tajikistan', 'Tanzania', 'Thailand', 'Togo',
        'Trinidad and Tobago', 'Tunisia', 'Turkey',
       'Ukraine', 'United Arab Emirates', 'United Kingdom',
       'United States', 'Uruguay', 'Uzbekistan',
        'Venezuela','Yemen', 'Zambia', 'Zimbabwe']

In [None]:
#Initialize the random forest training with a single country.
#This cell creates `forecast_model`, a one-row DataFrame with one column per
#country, holding that country's trend label.

countryy = "United Kingdom"
forecast = forecastCountryForest(countryy)
# forecast holds the predictions for the last two validation rows; their
# difference (scaled by 10) decides the trend label.
b = forecast[1] - forecast[0]
b = b*10
print(b)
if b >= 0.1:
  trend = "Increasing Trend"
elif b <= -0.1:
  trend = "Decreasing Trend"
else:
  trend = "Flat Trend"

# Key the column by `countryy` so changing the country above is enough.
data = {countryy:[trend]}
forecast_model = pd.DataFrame(data)


In [None]:
#Train a random forest for every listed country and record its trend label
#as a column of forecast_model.
for country in item_counts:
  print(country)
  country_true = country
  forecast = forecastCountryForest(country_true)
  # Difference between the two last-row predictions, scaled by 10.
  b = 10*(forecast[1] - forecast[0])

  if b >= 0.1:
    forecast_model[country_true] = "Increasing Trend"
  elif b <= -0.1:
    forecast_model[country_true] = "Decreasing Trend"
  elif -0.1 < b < 0.1:
    forecast_model[country_true] = "Flat Trend"

In [None]:
#forecast_model2 = forecast_model.transpose()
#forecast_models = forecast_model.drop([1, 1])
#forecast_models = forecast_model.transpose()
#forecast_models.columns = ['Covid-19 Trend']

In [None]:
#forecast_models.to_excel('Forecast_Weekly.xlsx', index=True)

# Lasso Regression for new cases for all countries in the world
Fitted and trained a Lasso regression for all countries. Two sets of features are used because some countries have a more detailed dataset than others. This was done because we did not want to simply disregard countries with fewer features.

However, we still had to discard some countries that lacked even the most basic features.

In [None]:
from sklearn.linear_model import Lasso
def lasso_regression(data,true,false):
  """Fit a Lasso model of ``new_cases`` and predict it for every row of ``data``.

  Args:
    data: DataFrame containing ``new_cases`` and the feature columns.
    true: When truthy, ``new_tests`` is used as an additional feature.
    false: The model is only fitted when this is truthy.

  Returns:
    ``(y, predict)``: ``y`` is the ``new_cases`` column as a single-column
    DataFrame re-indexed from 0, and ``predict`` holds the model's
    predictions for all rows of ``data``. ``None`` when ``false`` is falsy.
  """
  if not false:
    return None

  if true:
    features = ['total_deaths','new_tests','new_deaths','total_cases']
  else:
    features = ['total_deaths','new_deaths','total_cases']

  X = data[features]
  y = data[['new_cases']]
  # Only the training part of the split is used; the model is fitted on 70%
  # of the rows and then applied to all of them.
  X_train, _, y_train, _ = train_test_split(X, y, test_size=0.30, random_state=42)

  min_max_scaler = MinMaxScaler(feature_range = (0,1))
  xtrain_ = min_max_scaler.fit_transform(X_train)
  # Reuse the scaling learned from the training rows. Re-fitting the scaler
  # on the full data would map the same raw value to a different scaled
  # value than the one the model was trained on.
  X_ = min_max_scaler.transform(X)

  # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
  # in 1.2; replacing it changes the fitted model, so confirm the installed
  # version before upgrading.
  model = Lasso(alpha=0.1,normalize=True,max_iter=5000)
  model.fit(xtrain_,y_train)
  predict = model.predict(X_)

  y = y.reset_index(level=0,drop=True)
  return y,predict


In [None]:
from sklearn.linear_model import Ridge
def ridge_regression(data,true,false):
  """Fit a Ridge model of ``new_cases`` and predict it for every row of ``data``.

  Args:
    data: DataFrame containing ``new_cases`` and the feature columns.
    true: When truthy, ``new_tests`` is used as an additional feature.
    false: The model is only fitted when this is truthy.

  Returns:
    The model's predictions for all rows of ``data``, or ``None`` when
    ``false`` is falsy.
  """
  if not false:
    return None

  if true:
    features = ['total_deaths','new_tests','new_deaths','total_cases']
  else:
    features = ['total_deaths','new_deaths','total_cases']

  X = data[features]
  y = data[['new_cases']]
  # Only the training part of the split is used; the model is fitted on 70%
  # of the rows and then applied to all of them.
  X_train, _, y_train, _ = train_test_split(X, y, test_size=0.30, random_state=42)

  min_max_scaler = MinMaxScaler(feature_range = (0,1))
  xtrain_ = min_max_scaler.fit_transform(X_train)
  # Reuse the scaling learned from the training rows. Re-fitting the scaler
  # on the full data would map the same raw value to a different scaled
  # value than the one the model was trained on.
  X_ = min_max_scaler.transform(X)

  # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
  # in 1.2; replacing it changes the fitted model, so confirm the installed
  # version before upgrading.
  model = Ridge(alpha=0.1,normalize=True,max_iter=10000)
  model.fit(xtrain_,y_train)
  predict = model.predict(X_)
  return predict


In [None]:
def predictCountryLasso(country):
  """Run the Lasso regression for one country with the richest feature set available.

  Args:
    country: Location name passed to ``get_country``.

  Returns:
    The ``(y, predict)`` pair returned by ``lasso_regression``.

  Raises:
    ValueError: If the country's data lacks one of the basic columns
      (``new_cases``, ``total_deaths``, ``new_deaths``, ``total_cases``).
  """
  base_features = ['new_cases','total_deaths','new_deaths','total_cases']
  a = get_country(covid_data_all,country)

  # Check every column that is selected below. Previously a country without
  # the basic columns fell through to ridge_regression(a,False,False), which
  # returns None and breaks the caller's `y, predict = ...` unpacking.
  missing = [name for name in base_features if name not in a]
  if missing:
    raise ValueError(f"{country}: missing columns required for Lasso regression: {missing}")

  has_tests = 'new_tests' in a
  if has_tests:
    features = ['new_cases','total_deaths','new_tests','new_deaths','total_cases']
  else:
    features = base_features
  return lasso_regression(a[features].dropna(),has_tests,True)

In [None]:
def predictCountryRidge(country):
  """Run the Ridge regression for one country with the richest feature set available.

  Args:
    country: Location name passed to ``get_country``.

  Returns:
    The predictions returned by ``ridge_regression``, or ``None`` when the
    country's data lacks one of the basic columns (``new_cases``,
    ``total_deaths``, ``new_deaths``, ``total_cases``).
  """
  base_features = ['new_cases','total_deaths','new_deaths','total_cases']
  a = get_country(covid_data_all,country)

  # Check every column that is selected below. The old fallback called
  # ridge_regression(a,False,False), which does nothing and returns None, so
  # the None is returned directly.
  if not all(name in a for name in base_features):
    return None

  has_tests = 'new_tests' in a
  if has_tests:
    features = ['new_cases','total_deaths','new_tests','new_deaths','total_cases']
  else:
    features = base_features
  return ridge_regression(a[features].dropna(),has_tests,True)


In [None]:
####LASSO
####RIDGE

In [None]:
#List of countries that have enough data for our Lasso regression model.
#The list is maintained by hand; the earlier assignment from
#covid_data_all.location.unique() was overwritten immediately and has been dropped.
item_counts=['Afghanistan', 'Africa', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia',
       'Asia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin'
       ,'Bolivia','Bosnia and Herzegovina', 'Botswana', 'Brazil',
        'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cameroon', 'Canada', 'Cape Verde','Chile',
       'China', 'Colombia', 'Comoros', 'Congo',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba',
       'Cyprus', 'Czechia', 'Democratic Republic of Congo', 'Denmark',
       'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Europe', 'European Union',
        'Finland', 'France',
        'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Greece','Haiti',  'Honduras', 'Hong Kong', 'Hungary',
       'Iceland', 'India', 'Indonesia', 'International', 'Iran', 'Iraq',
       'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 
       'Jordan', 'Kazakhstan', 'Kenya',  'Kosovo',
       'Kuwait', 'Kyrgyzstan',  'Latvia', 'Lebanon', 'Lesotho',
       'Liberia', 'Libya', 'Liechtenstein', 'Lithuania',
       'Luxembourg','Madagascar',
       'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta',
        'Moldova', 'Monaco','Montenegro', 'Morocco','Netherlands',
       'New Zealand', 'Nicaragua', 'Niger', 'Nigeria',
       'North America', 'North Macedonia', 'Norway',
       'Oceania', 'Oman', 'Pakistan',  'Palestine', 'Panama',
        'Paraguay', 'Peru', 'Philippines',
       'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia',
         'San Marino','Saudi Arabia', 'Senegal', 'Serbia',
       'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 
       'Somalia', 'South Africa', 'South America',
       'South Korea', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan',
       'Suriname', 'Sweden', 'Switzerland', 'Syria', 'Taiwan',
       'Tajikistan', 'Tanzania', 'Thailand', 'Togo',
        'Trinidad and Tobago', 'Tunisia', 'Turkey',
       'Ukraine', 'United Arab Emirates', 'United Kingdom',
       'United States', 'Uruguay', 'Uzbekistan',
        'Venezuela','Yemen', 'Zambia', 'Zimbabwe']

In [None]:
#Fitting the Lasso regression model for every country in item_counts.
#The result, lasso_model, has two columns per country: "<country>" with the
#model's predictions and "<country>_new_cases" with the actual values.
df_ridge = pd.DataFrame()
country_pd = covid_data_all.location.unique()

# Collect the columns first and build the frame once. Assigning column by
# column to a frame seeded with one country aligned every later country to
# that country's row count, silently truncating longer series.
lasso_columns = {}
for country in item_counts:
  print(country)
  country_new_case = country+"_new_cases"
  y,predict = predictCountryLasso(country)
  lasso_columns[country] = pd.Series(np.ravel(predict))
  # y is the single-column new_cases frame returned by lasso_regression.
  lasso_columns[country_new_case] = y.iloc[:, 0].reset_index(drop=True)
#ridge_model[country]  = pd.DataFrame(predictCountryRidge(country))

# Building from a dict of Series uses the union of their indexes, so shorter
# countries are padded with NaN instead of longer ones being cut.
lasso_model = pd.DataFrame(lasso_columns)

In [None]:
#Saving the models output. 
#lasso_model.to_excel('Latest_lasso.xlsx', index=False)
#forecast_model.to_excel('FOrecast_latest.xlsx', index=False)
