In [50]:
import numpy as np
import pandas as pd
import datetime
import sys, os

# Data parsing

In [51]:
# Parse html file
# Read the clock once so DAY and MONTH always describe the same calendar date
# (two separate now() calls could disagree when run across midnight).
TODAY = datetime.datetime.now()
DAY = str(TODAY.day)      # day of month without zero padding, e.g. '5'
MONTH = str(TODAY.month)  # month number without zero padding, e.g. '5'
# Manual overrides for processing a page saved on another day / in a sub-folder:
# DAY = 20
# MONTH = 5
# SUB = '/scrf'
SUB = ''
file_name = 'Оперативные данные _ Коронавирус COVID–19_ Официальная информация о коронавирусе в России на портале – стопкоронавирус.рф.html'

# Save of "https://xn--80aesfpebagmfblc0a.xn--p1ai/information/" page
path = f'html_data/{DAY}-{MONTH}{SUB}/{file_name}'
# read_html returns every table found in the page; the first one is used here
parse_df = pd.read_html(path, encoding='utf-8')[0]

parse_df.head()

Unnamed: 0,Регион,Выявлено,Новые,Активные ? Число больных в настоящее время,Выздоровело,Умерло
0,Москва,178196,2367,97464,78324,2408
1,Московская область,38238,735,30094,7720,424
2,Санкт-Петербург,15580,365,10016,5373,191
3,Нижегородская область,9533,289,5803,3637,93
4,Свердловская область,5184,261,3016,2142,26


In [52]:
# Modifi data frame
parse_df = parse_df.drop(parse_df.columns[2:4] , axis=1)
parse_df.columns = ['Region/City', 'Confirmed', 'Recovered', 'Deaths']
parse_df['Date'] = datetime.datetime.now().strftime("%Y-%m-%d")
parse_df

Unnamed: 0,Region/City,Confirmed,Recovered,Deaths,Date
0,Москва,178196,78324,2408,2020-05-30
1,Московская область,38238,7720,424,2020-05-30
2,Санкт-Петербург,15580,5373,191,2020-05-30
3,Нижегородская область,9533,3637,93,2020-05-30
4,Свердловская область,5184,2142,26,2020-05-30
...,...,...,...,...,...
80,Севастополь,161,129,3,2020-05-30
81,Республика Алтай,116,71,0,2020-05-30
82,Сахалинская область,101,53,0,2020-05-30
83,Чукотский автономный округ,78,53,1,2020-05-30


# Preparing parsed data

In [53]:
# Load the accumulated per-region history and show its last rows
file_name = 'release/covid19-russia-cases-scrf.csv'
rus_df = pd.read_csv(filepath_or_buffer=file_name)
rus_df.tail(n=5)

Unnamed: 0,Date,Region/City,Region/City-Eng,Region_ID,Day-Confirmed,Day-Deaths,Day-Recovered,Confirmed,Deaths,Recovered
6730,2020-05-29,Челябинская область,Chelyabinsk region,74.0,141.0,1.0,49.0,2509.0,15.0,1106.0
6731,2020-05-29,Чеченская Республика,Chechen Republic,95.0,18.0,0.0,42.0,1192.0,13.0,787.0
6732,2020-05-29,Чукотский автономный округ,Chukotka Autonomous Okrug,87.0,7.0,0.0,10.0,73.0,1.0,50.0
6733,2020-05-29,Ямало-Ненецкий АО,Yamalo-Nenets Autonomous Okrug,89.0,31.0,2.0,42.0,2280.0,9.0,898.0
6734,2020-05-29,Ярославская область,Yaroslavl region,76.0,84.0,1.0,45.0,2976.0,14.0,698.0


In [54]:
# Add the per-day columns, initialised to zero
for day_col in ('Day-Confirmed', 'Day-Deaths', 'Day-Recovered'):
    parse_df[day_col] = 0

# Strip zero-width spaces (U+200B) from both ends of the region names in both frames
for frame in (rus_df, parse_df):
    frame['Region/City'] = frame['Region/City'].astype('str').str.strip('\u200b')

# Rename regions: map the names found in the parsed page to replacement names
# (presumably the spellings used in the history file)
rename_dict = {
    'Ямало-Ненецкий автономный округ': 'Ямало-Ненецкий АО',
    'Республика Северная Осетия — Алания': 'Республика Северная Осетия - Алания',
}

renamed_regions = parse_df['Region/City'].replace(rename_dict)
parse_df['Region/City'] = renamed_regions

In [55]:
# Turn the 'Date' strings into pandas datetime values
date_strings = parse_df['Date']
parse_df['Date'] = pd.to_datetime(date_strings)
# Minus a day if needed
# parse_df['Date'] = parse_df['Date'] - pd.Timedelta(days=1)

In [56]:
def upd(row, history=None):
    """Compute one region's per-day increments from its cumulative totals.

    For each of Confirmed / Deaths / Recovered, the increment is the row's
    cumulative value minus the sum of the matching ``Day-*`` column over all
    history rows of the same region.

    Parameters
    ----------
    row : pandas.Series
        Must contain 'Region/City', 'Confirmed', 'Deaths' and 'Recovered'.
        The caller's object is not modified; a copy is updated and returned.
    history : pandas.DataFrame, optional
        Frame with 'Region/City', 'Day-Confirmed', 'Day-Deaths' and
        'Day-Recovered' columns. Defaults to the notebook-level ``rus_df``.

    Returns
    -------
    pandas.Series
        Copy of ``row`` with the three ``Day-*`` entries filled in and the
        'Region/City' entry removed.
    """
    if history is None:
        history = rus_df  # fall back to the history frame loaded by the notebook

    row = row.copy()  # never write into the caller's Series
    reg = row['Region/City']

    # Filter the history once instead of once per metric
    region_history = history[history['Region/City'] == reg]

    for total_col in ('Confirmed', 'Deaths', 'Recovered'):
        day_col = f'Day-{total_col}'
        # .sum() over an empty selection is 0, so a region with no history
        # gets an increment equal to its cumulative total
        increment = row[total_col] - region_history[day_col].sum()
        # Safety net kept from the original: fall back to the cumulative value
        # if the difference is NaN
        row[day_col] = row[total_col] if pd.isna(increment) else increment

    return row.drop('Region/City')

# Keep one row per region (its first occurrence), ordered by region name, and
# compute the daily increments for each. The region name is kept both as the
# index and as a column so that `upd` can read it from the row; this avoids
# relying on groupby.apply handing the grouping column to the callback, which
# pandas 2.2 deprecates. `upd` drops the column again, so reset_index() can
# restore it as the first column.
parse_df = (
    parse_df.drop_duplicates(subset='Region/City')
            .set_index('Region/City', drop=False)
            .sort_index()
            .apply(upd, axis=1)
            .reset_index()
)

In [57]:
# Check for missed regions.
# A NaN test on 'Day-Confirmed' alone cannot catch a region whose name has no
# match in the history: the sum over an empty selection is 0, so such a region
# simply gets its cumulative total as the increment. Flag rows whose increment
# is NaN as well as rows whose region name does not occur in rus_df.
missed_mask = (
    parse_df['Day-Confirmed'].isna()
    | ~parse_df['Region/City'].isin(rus_df['Region/City'])
)
parse_df[missed_mask]

Unnamed: 0,Region/City,Confirmed,Recovered,Deaths,Date,Day-Confirmed,Day-Deaths,Day-Recovered


# Data checking

In [58]:
# Print, for each metric, the country-wide cumulative total (sum of per-region
# maxima) next to the sum of the per-day increments
by_region = parse_df.groupby('Region/City')
for metric in ('Confirmed', 'Deaths', 'Recovered'):
    cumulative_total = by_region[metric].max().sum()
    daily_total = by_region[f'Day-{metric}'].sum().sum()
    print(f'Russia {metric}:', cumulative_total,
          f'Day-{metric}:', daily_total)

# Disabled consistency check; it refers to `rus_sum`, which is not defined in
# the visible part of this notebook.
# rus_regs = rus_sum.groupby('Region/City')['Recovered'].max().reset_index()
# rus_regs['Recovered-ByDay'] = rus_sum.groupby('Region/City')['Day-Recovered'].sum().reset_index(drop=True)
# rus_regs.loc[rus_regs['Recovered'] != rus_regs['Recovered-ByDay']]
# # rus_regs.to_csv('rus_regs.csv')

Russia Confirmed: 396575 Day-Confirmed: 8952.0
Russia Deaths: 4555 Day-Deaths: 181.0
Russia Recovered: 167469 Day-Recovered: 8212.0


# Data saving

In [59]:
# Write the complete update table (all regions) to CSV, without the index column
full_update_path = 'covid19-russia-cases-upd-full-scrf.csv'
parse_df.to_csv(full_update_path, index=False)

In [60]:
# Keep only rows where at least one of the per-day columns is non-zero
has_day_change = (
    parse_df[['Day-Confirmed', 'Day-Deaths', 'Day-Recovered']]
    .ne(0)
    .any(axis=1)
)
parse_df = parse_df[has_day_change]

In [61]:
# Write the filtered update twice: a dated copy under old_data/ and the
# current update file in the working directory
for out_path in (f'old_data/covid19-russia-cases-upd{DAY}-{MONTH}-scrf.csv',
                 'covid19-russia-cases-upd-scrf.csv'):
    parse_df.to_csv(out_path, index=False)