# Import

In [14]:
import numpy as np
import pandas as pd

# Data preparation

In [15]:
# Load the previously released data set (history up to the previous day)
file_name = 'release/covid19-russia-cases-scrf.csv'
rus_df = pd.read_csv(filepath_or_buffer=file_name)
rus_df.tail(5)

Unnamed: 0,Date,Region/City,Region/City-Eng,Region_ID,Day-Confirmed,Day-Deaths,Day-Recovered,Confirmed,Deaths,Recovered
4157,2020-05-22,Ханты-Мансийский АО,Khanty-Mansiysk,86.0,80.0,1.0,18.0,1510.0,13.0,520.0
4158,2020-05-22,Челябинская область,Chelyabinsk region,74.0,80.0,0.0,31.0,1993.0,11.0,725.0
4159,2020-05-22,Чеченская Республика,Chechen Republic,95.0,20.0,0.0,8.0,1046.0,11.0,643.0
4160,2020-05-22,Ямало-Ненецкий АО,Yamalo-Nenets Autonomous Okrug,89.0,87.0,0.0,74.0,1989.0,5.0,398.0
4161,2020-05-22,Ярославская область,Yaroslavl region,76.0,103.0,0.0,24.0,2309.0,13.0,444.0


In [16]:
# Load the update table produced by data_parsing_SCRF.ipynb
file_name = 'covid19-russia-cases-upd-scrf.csv'
upd_df = pd.read_csv(filepath_or_buffer=file_name)
# Preview the five rows with the largest cumulative 'Confirmed' value
upd_df.sort_values(by='Confirmed', ascending=False).head(5)

Unnamed: 0,Region/City,Confirmed,Recovered,Deaths,Date,Day-Confirmed,Day-Deaths,Day-Recovered
29,Москва,161397,47413,1934,2020-05-23,3190.0,67.0,3831.0
30,Московская область,32653,5163,324,2020-05-23,846.0,12.0,464.0
64,Санкт-Петербург,12955,3124,113,2020-05-23,363.0,6.0,265.0
33,Нижегородская область,7826,2257,60,2020-05-23,211.0,3.0,251.0
47,Республика Дагестан,3982,3002,70,2020-05-23,127.0,5.0,120.0


In [17]:
# Apply the same clean-up to the historical frame and to the update frame
for frame in (rus_df, upd_df):
    # Region names: cast to str and trim zero-width spaces (U+200B) from both ends
    frame['Region/City'] = frame['Region/City'].astype('str').str.strip('\u200b')
    # Dates: parse into datetime values
    frame['Date'] = pd.to_datetime(frame['Date'])

# Updating

In [18]:
# Attach the region ID and the English region name to every update row
regions_df = pd.read_csv('regions-info.csv')

# Left join on the Russian region name, so every update row is kept
right = regions_df.loc[:, ['Region_ID', 'Region', 'Region_eng']]
upd_df = pd.merge(upd_df, right, how='left', left_on='Region/City', right_on='Region')
upd_df['Region/City-Eng'] = upd_df['Region_eng']

# Select and order the output columns (drops the helper columns added by the join)
output_columns = [
    'Date', 'Region/City', 'Region/City-Eng', 'Region_ID',
    'Day-Confirmed', 'Day-Deaths', 'Day-Recovered',
    'Confirmed', 'Deaths', 'Recovered',
]
upd_df = upd_df[output_columns]

In [19]:
# Append the update rows to the history and renumber the index from 0.
# Note: re-running this cell appends the same update rows once more.
rus_df = pd.concat([rus_df, upd_df], ignore_index=True)
rus_df.tail(10)

Unnamed: 0,Date,Region/City,Region/City-Eng,Region_ID,Day-Confirmed,Day-Deaths,Day-Recovered,Confirmed,Deaths,Recovered
4237,2020-05-23,Тюменская область,Tyumen region,72.0,35.0,0.0,21.0,1366.0,7.0,591.0
4238,2020-05-23,Удмуртская Республика,Udmurt republic,18.0,15.0,0.0,16.0,492.0,12.0,323.0
4239,2020-05-23,Ульяновская область,Ulyanovsk region,73.0,98.0,0.0,26.0,1764.0,6.0,462.0
4240,2020-05-23,Хабаровский край,Khabarovsk region,27.0,63.0,1.0,23.0,1572.0,10.0,678.0
4241,2020-05-23,Ханты-Мансийский АО,Khanty-Mansiysk,86.0,58.0,2.0,25.0,1568.0,15.0,545.0
4242,2020-05-23,Челябинская область,Chelyabinsk region,74.0,80.0,0.0,29.0,2073.0,11.0,754.0
4243,2020-05-23,Чеченская Республика,Chechen Republic,95.0,20.0,1.0,16.0,1066.0,12.0,659.0
4244,2020-05-23,Чукотский автономный округ,Chukotka Autonomous Okrug,87.0,2.0,1.0,6.0,58.0,1.0,30.0
4245,2020-05-23,Ямало-Ненецкий АО,Yamalo-Nenets Autonomous Okrug,89.0,36.0,0.0,288.0,2025.0,5.0,686.0
4246,2020-05-23,Ярославская область,Yaroslavl region,76.0,115.0,0.0,49.0,2424.0,13.0,493.0


In [20]:
# Negative values fix
# Negative daily values can appear when some cases were removed from the statistics
def negval_fix(reg):
    """Zero out negative daily increments in the last row of one region's data.

    For each of 'Day-Confirmed', 'Day-Deaths' and 'Day-Recovered', a negative
    value in the last row is replaced by zero. When at least one value was
    corrected, the corrections and the region name are printed, and the
    matching cumulative columns ('Confirmed', 'Deaths', 'Recovered') of the
    second-to-last row are reduced by the corrected amounts.

    Parameters
    ----------
    reg : pandas.DataFrame
        Rows of a single region; the last row is the one that is checked.
        Must contain 'Region/City', the three 'Day-*' columns and the three
        cumulative columns.

    Returns
    -------
    pandas.DataFrame
        A corrected copy of ``reg`` with the same index and columns. The
        input frame itself is left unmodified. An empty frame is returned
        as is.
    """
    # Nothing to check in an empty frame (the original raised IndexError here).
    if len(reg) == 0:
        return reg

    # Work on a copy: groupby.apply does not support functions that mutate
    # the group they are given.
    reg = reg.copy()

    # (daily increment column, cumulative column) pairs.
    column_pairs = (
        ('Day-Confirmed', 'Confirmed'),
        ('Day-Deaths', 'Deaths'),
        ('Day-Recovered', 'Recovered'),
    )

    # Collect the corrections and reset negative daily values to zero.
    diffs = []
    for day_col, _ in column_pairs:
        day_pos = reg.columns.get_loc(day_col)
        value = reg.iloc[-1, day_pos]
        # NaN compares False, so missing values are left untouched.
        diff = -value if value < 0 else 0
        if diff:
            reg.iloc[-1, day_pos] = value + diff
        diffs.append(diff)

    if sum(diffs) > 0:
        region_pos = reg.columns.get_loc('Region/City')
        print(*diffs, reg.iloc[-1, region_pos])

        # Only adjust the previous row when there is one; a region with a
        # single row previously failed with IndexError on iloc[-2].
        if len(reg) > 1:
            for (_, total_col), diff in zip(column_pairs, diffs):
                if diff:
                    total_pos = reg.columns.get_loc(total_col)
                    reg.iloc[-2, total_pos] = reg.iloc[-2, total_pos] - diff

    return reg
    
# Apply the fix region by region.
# group_keys=False keeps the result indexed like the input, so the rows come
# back in their original order on every pandas version. With the default
# (group_keys=True) pandas >= 2.0 prepends the region to the index and returns
# the rows grouped by region, which reset_index(drop=True) would then make
# permanent.
rus_df = (
    rus_df
    .groupby('Region/City', group_keys=False)
    .apply(negval_fix)
    .reset_index(drop=True)
)

In [21]:
# Manual fixes if needed
# rus_df.loc[287, 'Confirmed'] = 5
# rus_df.loc[321, 'Confirmed'] = 5
# rus_df.loc[359, 'Day-Confirmed'] = 0

# rus_df.loc[3775, 'Deaths'] = 7
# rus_df.loc[3775, 'Day-Deaths'] = 0

# Data checking

In [22]:
# Leave the 'Diamond Princess' rows out of the consistency checks
rus_sum = rus_df.loc[rus_df['Region/City'] != 'Diamond Princess']
by_region = rus_sum.groupby('Region/City')

# Country-wide totals: sum of the per-region maxima of each cumulative column
# next to the sum of all daily increments of the matching 'Day-*' column
for metric in ('Confirmed', 'Deaths', 'Recovered'):
    print(f'Russia {metric}:', by_region[metric].max().sum(),
          '| By day:', by_region[f'Day-{metric}'].sum().sum())

# Per-region check: list the regions whose cumulative maximum differs from
# the sum of their daily increments
for metric in ('Confirmed', 'Recovered', 'Deaths'):
    rus_regs = by_region[metric].max().reset_index()
    rus_regs[f'{metric}-ByDay'] = by_region[f'Day-{metric}'].sum().reset_index(drop=True)
    print(rus_regs.loc[rus_regs[metric] != rus_regs[f'{metric}-ByDay']])

# rus_regs.to_csv('rus_regs.csv')

Russia Confirmed: 335882.0 | By day: 335882.0
Russia Deaths: 3393.0 | By day: 3388.0
Russia Recovered: 107944.0 | By day: 107936.0
Empty DataFrame
Columns: [Region/City, Confirmed, Confirmed-ByDay]
Index: []
        Region/City  Recovered  Recovered-ByDay
17  Камчатский край      179.0            171.0
             Region/City  Deaths  Deaths-ByDay
9    Воронежская область    16.0          14.0
17       Камчатский край     3.0           2.0
34  Новгородская область     7.0           5.0


# Data saving

In [24]:
# Write the updated data set to the release file, without the index column
file_name = 'release/covid19-russia-cases-scrf.csv'
rus_df.to_csv(path_or_buf=file_name, index=False)