In [1]:
import warnings
# Ignore every FutureWarning raised after this point in the session
# (presumably to keep library deprecation notices out of the cell outputs — confirm
# this is still wanted before upgrading dependencies, since it also hides real notices).
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
import pandas as pd
data = pd.read_csv('./download.csv')

# read_csv already returns a DataFrame. Work on an explicit copy so the clean-up
# below never touches `data` and the cell gives the same result when re-run.
df = data.copy()

# Adjust below zero data: clamp negative values in every numeric column to 0.
# Done through the public API and assigned back explicitly, instead of mutating
# the result of the private `_get_numeric_data()` and hoping it is a view.
numeric_columns = df.select_dtypes(include='number').columns
df[numeric_columns] = df[numeric_columns].clip(lower=0)

# All country labels except the cruise-ship pseudo country.
# Filtering (rather than list.remove) does not raise when the label is absent.
excluded_country = 'Cases_on_an_international_conveyance_Japan'
countries = [
  name for name in df['countriesAndTerritories'].unique().tolist()
  if name != excluded_country
]

# Split sizes (rows per country): the last `test_days` rows are held out,
# the `train_days` rows right before them are used for training, and the
# last `train_days` rows (hold-out included) feed the final forecast.
test_days = 7
train_days = 50

training_data_list = []
testing_data_list = []
final_data_list = []
allow_country = [ 'Russia', 'Greece', 'India', 'United_States_of_America', 'Turkey' ]
for country in allow_country:
  # Reverse the row order for this country
  # (presumably the CSV lists the newest day first — verify against the file).
  country_rows = df.loc[df.countriesAndTerritories == country].iloc[::-1]

  # Hold-out set: last `test_days` rows, with a 0-based running index.
  testing_data = country_rows[-test_days:].reset_index()
  testing_data['idx'] = range(0, len(testing_data))
  testing_data_list.append(testing_data)

  # Training set: the `train_days` rows preceding the hold-out set, with a
  # 1-based running index. `.copy()` makes the slice an independent frame so
  # adding the column no longer triggers SettingWithCopyWarning.
  training_data = country_rows[-(train_days + test_days):-test_days].copy()
  training_data['idx'] = range(1, len(training_data) + 1)
  training_data = training_data.reset_index()
  training_data_list.append(training_data)

  # Final-fit set: the most recent `train_days` rows.
  final_data = country_rows[-train_days:].reset_index()
  final_data_list.append(final_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [3]:
pip install statsmodels --upgrade

Collecting statsmodels
  Downloading statsmodels-0.12.0-cp37-cp37m-macosx_10_13_x86_64.whl (9.6 MB)
[K     |████████████████████████████████| 9.6 MB 13.0 MB/s eta 0:00:01     |██████████████████████████████▊ | 9.2 MB 13.0 MB/s eta 0:00:01
Installing collected packages: statsmodels
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.11.0
    Uninstalling statsmodels-0.11.0:
      Successfully uninstalled statsmodels-0.11.0
Successfully installed statsmodels-0.12.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
from statsmodels.tsa.ar_model import AutoReg
import numpy as np
import matplotlib.pyplot as plt

def mape(y_pred, y_true):
  """Return the mean absolute percentage error of `y_pred` against `y_true`.

  Values are paired by position, so pandas Series with any index (or plain
  lists/tuples) can be passed. Entries whose true value is 0 contribute no
  error but are still counted in the denominator, i.e. the sum is divided by
  `len(y_true)`.

  Args:
    y_pred: iterable of predicted values, at least as long as `y_true`.
    y_true: iterable of observed values.

  Returns:
    The error as a fraction (0.1 == 10 %), or NaN when `y_true` is empty.

  Raises:
    IndexError: if `y_pred` is shorter than `y_true` and a missing position
      is needed.
  """
  # Materialise both inputs so that indexing below is positional rather than
  # label-based (a Series with a non 0-based index would otherwise mispair).
  predicted = list(y_pred)
  actual = list(y_true)

  # No observations: the mean is undefined; avoid a ZeroDivisionError.
  if not actual:
    return float('nan')

  error = 0
  for position, true_item in enumerate(actual):
    if true_item != 0:
      error += abs((predicted[position] - true_item) / true_item)
  return error / len(actual)

# Variable
window_size = 25         # lags 1 .. window_size - 1 are evaluated
country_window_map = {}  # country name -> lag count with the lowest hold-out MAPE
total_windows = []       # the selected lag counts, in country order


def _forecast_cases(series, lags, steps):
  """Fit AutoReg with `lags` lags on `series` and forecast `steps` values past its end.

  The forecast is re-indexed from 0, negative values are clamped to 0 and
  every value is rounded to a whole number.
  """
  model_fit = AutoReg(series, lags=lags).fit()
  forecast = model_fit.predict(start=len(series), end=len(series) + steps - 1)
  return forecast.reset_index(drop=True).clip(lower=0).round(0)


# NOTE(review): the lag count is chosen by its error on the hold-out rows, so the
# printed `lowest` is an optimistic estimate of out-of-sample error.
for (country_idx, country_data) in enumerate(training_data_list):
  cases_arr = country_data['cases']
  cases_arr_testing = testing_data_list[country_idx]['cases']
  horizon = len(cases_arr_testing)

  # Test different window_size and try to get the optimal result!
  # window_error[k] is the hold-out MAPE for k lags; slot 0 is never evaluated.
  window_error = [float('inf')] * window_size
  lowest = float('inf')
  lowest_idx = None

  for window_idx in range(1, window_size):
    # Lag(window) is the number of past observations the model regresses on.
    predict_arr = _forecast_cases(cases_arr, window_idx, horizon)
    window_error[window_idx] = mape(predict_arr, cases_arr_testing)

    # Strict '<' keeps the smallest lag on ties and skips NaN errors.
    if window_error[window_idx] < lowest:
      lowest = window_error[window_idx]
      lowest_idx = window_idx

  country_name = country_data['countriesAndTerritories'].iloc[0]
  if lowest_idx is None:
    # Every candidate produced a NaN/inf error; fail loudly instead of
    # continuing with a meaningless sentinel lag.
    raise ValueError(f'no usable lag found for {country_name}')

  print(f'country: {country_name}  lowest: {lowest}    lowest_idx: {lowest_idx}')
  country_window_map[country_name] = lowest_idx
  total_windows.append(lowest_idx)

  # Hold-out forecast with the selected lag. It is not consumed inside this
  # cell; it is kept so `predict_arr` still holds this forecast after the loop.
  predict_arr = _forecast_cases(cases_arr, lowest_idx, horizon)

country: Russia  lowest: 0.021235898138066447    lowest_idx: 4
country: Greece  lowest: 0.15968116119596099    lowest_idx: 8
country: India  lowest: 0.09953979307584036    lowest_idx: 12
country: United_States_of_America  lowest: 0.08126445459665425    lowest_idx: 9
country: Turkey  lowest: 0.029257857774600864    lowest_idx: 1


In [8]:
import csv
from datetime import date, timedelta

# Variable
predict_date = 7  # number of future rows to forecast per country
# Calendar day of the first forecast row; rows are labelled month/day starting
# at 10/9. Only month and day are written to the file — the year is an
# assumption needed for date arithmetic (TODO confirm).
forecast_start = date(2020, 10, 9)
predict_map = []   # one tuple of forecasts per country
country_list = []  # country names, same order as predict_map

for country_data in final_data_list:
  cases_arr = country_data['cases']
  country_name = country_data['countriesAndTerritories'].iloc[0]

  # Refit on the most recent rows with the lag count selected for this country.
  model = AutoReg(cases_arr, lags=country_window_map[country_name])
  model_fit = model.fit()
  predict_arr = model_fit.predict(start=len(cases_arr), end=len(cases_arr) + predict_date - 1)
  # Re-index from 0, clamp negative forecasts to 0 and round to whole numbers.
  predict_arr = predict_arr.reset_index(drop=True).clip(lower=0).round(0)

  country_list.append(country_name)
  predict_map.append(tuple(predict_arr))

# Layout of output.csv: header row with an empty first cell followed by the
# country names, then one row per forecast day: "<month>/<day>", value per country.
with open('output.csv', 'w', newline='') as csvfile:
  writer = csv.writer(csvfile)
  writer.writerow([''] + country_list)
  # zip(*predict_map) turns per-country tuples into per-day rows.
  for day_offset, day_values in enumerate(zip(*predict_map)):
    # Real date arithmetic keeps labels valid if the horizon crosses a month end.
    day = forecast_start + timedelta(days=day_offset)
    writer.writerow([f'{day.month}/{day.day}'] + list(day_values))