# Import

In [27]:
import numpy as np
import pandas as pd

# Data preparation

In [28]:
# Read the dataset accumulated up to the previous day and preview its last rows
file_name = 'release/covid19-russia-cases.csv'
rus_df = pd.read_csv(filepath_or_buffer=file_name)
rus_df.tail(5)

Unnamed: 0,Date,Region/City,Region/City-Eng,Region_ID,Day-Confirmed,Day-Deaths,Day-Recovered,Confirmed,Deaths,Recovered
4494,2020-05-26,Челябинская область,Chelyabinsk region,74.0,72.0,1.0,64.0,2356.0,13.0,988.0
4495,2020-05-26,Чеченская Республика,Chechen Republic,95.0,21.0,0.0,30.0,1133.0,13.0,690.0
4496,2020-05-26,Чукотский автономный округ,Chukotka Autonomous Okrug,87.0,0.0,0.0,4.0,61.0,1.0,36.0
4497,2020-05-26,Ямало-Ненецкий АО,Yamalo-Nenets Autonomous Okrug,89.0,59.0,0.0,34.0,2151.0,6.0,758.0
4498,2020-05-26,Ярославская область,Yaroslavl region,76.0,94.0,0.0,66.0,2726.0,13.0,569.0


In [29]:
# Read the daily update (the file produced by data_parsing.ipynb)
# and preview the five rows with the highest Confirmed value
file_name = 'covid19-russia-cases-upd.csv'
upd_df = pd.read_csv(filepath_or_buffer=file_name)
upd_df.sort_values(by='Confirmed', ascending=False).head(5)

Unnamed: 0,Region/City,Confirmed,Date,Deaths,Recovered,Day-Confirmed,Day-Deaths,Day-Recovered
29,Москва,171443.0,2020-05-27,2183.0,67458.0,2140.0,73.0,5839.0
30,Московская область,35956.0,2020-05-27,359.0,6440.0,793.0,10.0,288.0
64,Санкт-Петербург,14463.0,2020-05-27,158.0,4458.0,387.0,8.0,714.0
33,Нижегородская область,8735.0,2020-05-27,78.0,2905.0,231.0,6.0,253.0
47,Республика Дагестан,4455.0,2020-05-27,130.0,4161.0,161.0,12.0,352.0


In [30]:
# Remove zero-width spaces (U+200B) from both ends of the region names
for frame in (rus_df, upd_df):
    frame['Region/City'] = frame['Region/City'].astype('str').str.strip('\u200b')

# Updating

In [31]:
def upd(row):
    """Rebuild the cumulative totals of one update row from the stored history.

    For each of Confirmed/Deaths/Recovered the new total is the largest value
    recorded for the row's region in the module-level ``rus_df`` plus the
    row's daily increment. When that sum is NaN (for example, ``rus_df`` has
    no rows for the region), the daily increment itself becomes the total.

    Parameters
    ----------
    row : pandas.Series
        One update record holding ``Region/City`` and the three ``Day-*``
        values.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with the totals filled in and ``Region/City``
        dropped; the caller's ``row`` is left untouched.
    """
    # Work on a copy so the Series passed in by the caller is not modified.
    row = row.copy()
    # Filter the history once instead of once per metric.
    history = rus_df[rus_df['Region/City'] == row['Region/City']]

    for total_col in ('Confirmed', 'Deaths', 'Recovered'):
        day_value = row['Day-' + total_col]
        total = history[total_col].max() + day_value
        # NaN means there was no previous total to build on.
        row[total_col] = day_value if np.isnan(total) else total

    return row.drop('Region/City')

# Enable when the update file lacks cumulative values, so they are rebuilt from the daily counts
# upd_df = upd_df.groupby('Region/City').apply(lambda df: upd(df.iloc[0])).reset_index()

In [32]:
# Parse the Date column of both frames with pd.to_datetime
for frame in (upd_df, rus_df):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [33]:
# Region names translation
# from googletrans import Translator
# translator = Translator()

# def translate(rus):
#     eng = translator.translate(rus, src='ru', dest='en')
#     return eng.text

# upd_df['Region/City-Eng'] = upd_df['Region/City'].apply(translate)

In [34]:
# Attach the region ID and English name from the regions reference file,
# then keep the columns in the same order as the main dataset
regions_df = pd.read_csv('release/regions-info.csv')
right = regions_df.loc[:, ['Region_ID', 'Region', 'Region_eng']]

output_columns = [
    'Date', 'Region/City', 'Region/City-Eng', 'Region_ID',
    'Day-Confirmed', 'Day-Deaths', 'Day-Recovered',
    'Confirmed', 'Deaths', 'Recovered',
]
upd_df = (
    pd.merge(upd_df, right, how='left', left_on='Region/City', right_on='Region')
    .assign(**{'Region/City-Eng': lambda merged: merged['Region_eng']})
    .loc[:, output_columns]
)

In [35]:
# Append the update rows to the history and renumber the index from 0
rus_df = pd.concat([rus_df, upd_df], ignore_index=True)
rus_df.tail(n=10)

Unnamed: 0,Date,Region/City,Region/City-Eng,Region_ID,Day-Confirmed,Day-Deaths,Day-Recovered,Confirmed,Deaths,Recovered
4574,2020-05-27,Тюменская область,Tyumen region,72.0,42.0,0.0,45.0,1532.0,7.0,694.0
4575,2020-05-27,Удмуртская Республика,Udmurt republic,18.0,10.0,0.0,13.0,559.0,12.0,367.0
4576,2020-05-27,Ульяновская область,Ulyanovsk region,73.0,70.0,0.0,35.0,2121.0,7.0,586.0
4577,2020-05-27,Хабаровский край,Khabarovsk region,27.0,67.0,0.0,44.0,1827.0,10.0,803.0
4578,2020-05-27,Ханты-Мансийский АО,Khanty-Mansiysk,86.0,75.0,0.0,34.0,1924.0,17.0,669.0
4579,2020-05-27,Челябинская область,Chelyabinsk region,74.0,82.0,1.0,69.0,2438.0,14.0,1057.0
4580,2020-05-27,Чеченская Республика,Chechen Republic,95.0,19.0,0.0,8.0,1152.0,13.0,698.0
4581,2020-05-27,Чукотский автономный округ,Chukotka Autonomous Okrug,87.0,2.0,0.0,1.0,63.0,1.0,37.0
4582,2020-05-27,Ямало-Ненецкий АО,Yamalo-Nenets Autonomous Okrug,89.0,51.0,0.0,30.0,2202.0,6.0,788.0
4583,2020-05-27,Ярославская область,Yaroslavl region,76.0,82.0,0.0,47.0,2808.0,13.0,616.0


In [36]:
# Fixing negative values
# Negative values can appear if some cases were removed from the statistics
def negval_fix(reg):
    """Reset negative daily counts in the last row of one region's data.

    A negative ``Day-*`` value in the last row is set to zero, and the amount
    removed is subtracted from the matching cumulative column of the row
    before it. Only the last row is inspected. Each correction is reported
    with ``print``.

    Parameters
    ----------
    reg : pandas.DataFrame
        Rows of a single region (one ``groupby('Region/City')`` group) with
        the ``Day-Confirmed``/``Day-Deaths``/``Day-Recovered`` columns, their
        cumulative counterparts and ``Region/City``. Assumed to be ordered
        oldest to newest -- TODO confirm, nothing here sorts by date.

    Returns
    -------
    pandas.DataFrame
        A corrected copy with the same index; ``reg`` itself is not modified.
    """
    # Work on a copy: a function passed to groupby.apply should not mutate
    # the group it receives.
    fixed = reg.copy()
    if fixed.empty:
        return fixed

    last_pos = len(fixed) - 1
    column_pairs = (
        ('Day-Confirmed', 'Confirmed'),
        ('Day-Deaths', 'Deaths'),
        ('Day-Recovered', 'Recovered'),
    )

    # Amount removed from each daily column, keyed by its cumulative column.
    diffs = {}
    for day_col, total_col in column_pairs:
        day_pos = fixed.columns.get_loc(day_col)
        day_value = fixed.iat[last_pos, day_pos]
        diff = day_value * (-1) if day_value < 0 else 0
        diffs[total_col] = diff
        fixed.iat[last_pos, day_pos] = day_value + diff

    diff_c, diff_d, diff_r = diffs['Confirmed'], diffs['Deaths'], diffs['Recovered']
    if diff_c + diff_d + diff_r > 0:
        region_name = fixed['Region/City'].iat[last_pos]
        print(f"Conf {diff_c} Death {diff_d} Rec {diff_r} {region_name}")
        # A region with a single row has no previous row to correct; indexing
        # position -2 there would raise IndexError.
        if last_pos >= 1:
            for total_col, diff in diffs.items():
                total_pos = fixed.columns.get_loc(total_col)
                fixed.iat[last_pos - 1, total_pos] -= diff

    return fixed
    
# group_keys=False keeps the rows in their original order. negval_fix returns
# frames with the same index as its input; since pandas 2.0 the default
# (group_keys=True) would still prepend the region to the result index,
# leaving the rows sorted by region once the index is dropped.
rus_df = rus_df.groupby('Region/City', group_keys=False).apply(negval_fix).reset_index(drop=True)

0 1.0 0 Новгородская область


In [37]:
# Manual fixes if needed
# rus_df.loc[321, 'Confirmed'] = 5
# rus_df.loc[359, 'Day-Confirmed'] = 0


# rus_df.loc[3775, 'Deaths'] = 7
# rus_df.loc[3775, 'Day-Deaths'] = 0

# Data checking

In [38]:
# Leave the 'Diamond Princess' rows out of the country-wide checks
rus_sum = rus_df.loc[rus_df['Region/City'] != 'Diamond Princess']
by_region = rus_sum.groupby('Region/City')

# Country totals: sum of per-region maxima next to the sum of all daily values
for label, total_col, day_col in (
    ('Russia Confirmed:', 'Confirmed', 'Day-Confirmed'),
    ('Russia Deaths:', 'Deaths', 'Day-Deaths'),
    ('Russia Recovered:', 'Recovered', 'Day-Recovered'),
):
    print(label, by_region[total_col].max().sum(),
          by_region[day_col].sum().sum())

# Per region: print the regions whose maximum cumulative value differs from
# the sum of their daily values (an empty frame means no mismatch)
for total_col, day_col, by_day_col in (
    ('Confirmed', 'Day-Confirmed', 'Confirmed-ByDay'),
    ('Recovered', 'Day-Recovered', 'Recovered-ByDay'),
    ('Deaths', 'Day-Deaths', 'Deaths-ByDay'),
):
    rus_regs = by_region[total_col].max().reset_index()
    rus_regs[by_day_col] = by_region[day_col].sum().reset_index(drop=True)
    print(rus_regs.loc[rus_regs[total_col] != rus_regs[by_day_col]])

# rus_regs.to_csv('rus_regs.csv')

Russia Confirmed: 370945.0 370945.0
Russia Deaths: 3975.0 3975.0
Russia Recovered: 142336.0 142336.0
Empty DataFrame
Columns: [Region/City, Confirmed, Confirmed-ByDay]
Index: []
Empty DataFrame
Columns: [Region/City, Recovered, Recovered-ByDay]
Index: []
Empty DataFrame
Columns: [Region/City, Deaths, Deaths-ByDay]
Index: []


In [40]:
# Show the last 10 rows stored for one region
region_mask = rus_df['Region/City'].eq('Новгородская область')
rus_df.loc[region_mask].tail(n=10)

Unnamed: 0,Date,Region/City,Region/City-Eng,Region_ID,Day-Confirmed,Day-Deaths,Day-Recovered,Confirmed,Deaths,Recovered
3775,2020-05-18,Новгородская область,Novgorod region,53.0,21.0,0.0,4.0,652.0,7.0,182.0
3859,2020-05-19,Новгородская область,Novgorod region,53.0,33.0,0.0,8.0,685.0,7.0,190.0
3943,2020-05-20,Новгородская область,Novgorod region,53.0,10.0,0.0,27.0,695.0,7.0,217.0
4026,2020-05-21,Новгородская область,Novgorod region,53.0,32.0,1.0,21.0,727.0,8.0,238.0
4111,2020-05-22,Новгородская область,Novgorod region,53.0,22.0,0.0,12.0,749.0,8.0,250.0
4195,2020-05-23,Новгородская область,Novgorod region,53.0,35.0,0.0,11.0,784.0,8.0,261.0
4279,2020-05-24,Новгородская область,Novgorod region,53.0,60.0,0.0,7.0,844.0,7.0,268.0
4363,2020-05-25,Новгородская область,Novgorod region,53.0,70.0,0.0,11.0,914.0,6.0,279.0
4448,2020-05-26,Новгородская область,Novgorod region,53.0,89.0,0.0,21.0,1003.0,6.0,300.0
4533,2020-05-27,Новгородская область,Novgorod region,53.0,46.0,0.0,19.0,1049.0,7.0,319.0


# Data saving

In [41]:
# Write rus_df to the release CSV, omitting the index column
file_name = 'release/covid19-russia-cases.csv'
rus_df.to_csv(path_or_buf=file_name, index=False)