In [1]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')

In [2]:
def reduce_mem_usage(df):
    """Reduce numeric

    Args:
        df (pandas data frame object): 

    Returns:
        obj: reduced pandas data frame
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    return df

In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/KonstantinKlepikov/covid-kaliningrad/main/data/data.csv')
# data.index = pd.to_datetime(data.index)
# data['дата'] = pd.to_datetime(data['дата'])
data = reduce_mem_usage(data)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 73 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   дата                                             304 non-null    object 
 1   всего                                            304 non-null    float16
 2   infection rate                                   304 non-null    object 
 3   ОРВИ                                             304 non-null    float16
 4   пневмония                                        304 non-null    float16
 5   без симптомов                                    304 non-null    float16
 6   кол-во тестов                                    304 non-null    float16
 7   пенсионеры                                       304 non-null    float16
 8   мед.работники                                    304 non-null    float16
 9   пром.предприятия                

In [5]:
data[['дата', 'всего', 'ОРВИ', 'пневмония', 'без симптомов']]

Unnamed: 0,дата,всего,ОРВИ,пневмония,без симптомов
0,2020-03-08,1.0,0.0,0.0,0.0
1,2020-03-09,0.0,0.0,0.0,0.0
2,2020-03-10,0.0,0.0,0.0,0.0
3,2020-03-11,0.0,0.0,0.0,0.0
4,2020-03-12,1.0,0.0,0.0,0.0
...,...,...,...,...,...
299,2021-01-01,221.0,179.0,35.0,7.0
300,2021-01-02,208.0,162.0,37.0,9.0
301,2021-01-03,216.0,157.0,49.0,10.0
302,2021-01-04,214.0,177.0,27.0,10.0


In [6]:
data.drop(['учебные учреждения'], axis=1)

Unnamed: 0,дата,всего,infection rate,ОРВИ,пневмония,без симптомов,кол-во тестов,пенсионеры,мед.работники,пром.предприятия,...,Светлогорский городской округ,Пионерский городской округ,Неманский городской округ,Полесский городской округ,Краснознаменский городской округ,Озёрский городской округ,Янтарный городской округ,Мамоновский городской округ,Советский городской окру,другие регионы
0,2020-03-08,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-03-09,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-03-10,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-03-11,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-03-12,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,2021-01-01,221.0,105,179.0,35.0,7.0,3080.0,27.0,35.0,5.0,...,5.0,3.0,7.0,1.0,1.0,2.0,1.0,2.0,17.0,0.0
300,2021-01-02,208.0,103,162.0,37.0,9.0,3248.0,54.0,31.0,8.0,...,3.0,3.0,1.0,2.0,4.0,1.0,0.0,1.0,13.0,0.0
301,2021-01-03,216.0,101,157.0,49.0,10.0,529.0,31.0,26.0,7.0,...,3.0,1.0,0.0,3.0,11.0,1.0,4.0,2.0,3.0,0.0
302,2021-01-04,214.0,098,177.0,27.0,10.0,585.0,41.0,19.0,8.0,...,7.0,2.0,3.0,2.0,2.0,0.0,0.0,2.0,5.0,0.0


In [46]:
# df_dict = dict.fromkeys(data.columns, 'col')
data1 = data.reset_index()
data1 = data.drop(['дата', 'учебные учреждения'], axis=1)
data1.columns = range(data1.shape[1])
# data1 = data1.rename(columns = df_dict)
data1.replace(to_replace=0, value=np.nan, inplace=True)
data1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,1.0,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,1.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,221.0,171.0,38.0,12.0,2100.0,37.0,40.0,6.0,11.0,6.0,...,6.0,5.0,3.0,8.0,1.0,6.0,3.0,5.0,18.0,
298,225.0,207.0,16.0,2.0,2408.0,96.0,13.0,7.0,13.0,7.0,...,,,,1.0,,,,2.0,1.0,
299,221.0,179.0,35.0,7.0,3080.0,27.0,35.0,5.0,16.0,13.0,...,5.0,3.0,7.0,1.0,1.0,2.0,1.0,2.0,17.0,
300,208.0,162.0,37.0,9.0,3248.0,54.0,31.0,8.0,4.0,11.0,...,3.0,3.0,1.0,2.0,4.0,1.0,,1.0,13.0,


In [47]:
profile = ProfileReport(data1)

In [None]:
profile.to_widgets()

Summarize dataset:   0%|          | 0/79 [00:00<?, ?it/s]

  ret = umr_sum(x, axis, dtype, out, keepdims)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
2021-01-04 02:39:14.139 INFO    matplotlib.category: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2021-01-04 02:39:14.140 INFO    matplotlib.category: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(data['дата'])

0      2020-03-08
1      2020-03-09
2      2020-03-10
3      2020-03-11
4      2020-03-12
5      2020-03-13
6      2020-03-14
7      2020-03-15
8      2020-03-16
9      2020-03-17
10     2020-03-18
11     2020-03-19
12     2020-03-20
13     2020-03-21
14     2020-03-22
15     2020-03-23
16     2020-03-24
17     2020-03-25
18     2020-03-26
19     2020-03-27
20     2020-03-28
21     2020-03-29
22     2020-03-30
23     2020-03-31
24     2020-04-01
25     2020-04-02
26     2020-04-03
27     2020-04-04
28     2020-04-05
29     2020-04-06
30     2020-04-07
31     2020-04-08
32     2020-04-09
33     2020-04-10
34     2020-04-11
35     2020-04-12
36     2020-04-13
37     2020-04-14
38     2020-04-15
39     2020-04-16
40     2020-04-17
41     2020-04-18
42     2020-04-19
43     2020-04-20
44     2020-04-21
45     2020-04-22
46     2020-04-23
47     2020-04-24
48     2020-04-25
49     2020-04-26
50     2020-04-27
51     2020-04-28
52     2020-04-29
53     2020-04-30
54     2020-05-01
55     202

In [8]:
data_test = data[['всего', 'ОРВИ', 'пневмония', 'без симптомов', 'кол-во тестов', 'выписали', 'умерли от ковид']].copy()

In [9]:
profile = ProfileReport(data_test)

In [10]:
profile.to_widgets()

Summarize dataset:   0%|          | 0/21 [00:00<?, ?it/s]

  ret = umr_sum(x, axis, dtype, out, keepdims)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [13]:
data.set_index('дата', inplace=True)

In [14]:
data[['infection rate', 'всего']]

Unnamed: 0_level_0,infection rate,всего
дата,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-08,0,1.0
2020-03-09,0,0.0
2020-03-10,0,0.0
2020-03-11,0,0.0
2020-03-12,0,1.0
...,...,...
2020-12-31,1057831325,225.0
2021-01-01,1052380952,221.0
2021-01-02,1025791325,208.0
2021-01-03,1006944444,216.0


In [15]:
data['infection rate']

дата
2020-03-08               0
2020-03-09               0
2020-03-10               0
2020-03-11               0
2020-03-12               0
                  ...     
2020-12-31     1,057831325
2021-01-01     1,052380952
2021-01-02     1,025791325
2021-01-03     1,006944444
2021-01-04    0,9783599089
Name: infection rate, Length: 303, dtype: object

In [11]:
data['infection rate'].reset_index()

Unnamed: 0,index,infection rate
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
299,299,105
300,300,103
301,301,101
302,302,098


In [12]:
source = pd.DataFrame(np.cumsum(np.random.randn(100, 3), 0).round(2),
                    columns=['alcohol', 'beer', 'coke'], index=pd.RangeIndex(100, name='x'))

In [13]:
source

Unnamed: 0_level_0,alcohol,beer,coke
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.07,0.27,-1.50
1,-0.83,2.41,-0.44
2,0.43,2.96,0.45
3,0.42,2.83,0.06
4,1.14,2.30,-0.17
...,...,...,...
95,-4.23,-7.27,-7.24
96,-2.49,-6.63,-6.46
97,-3.58,-4.63,-5.99
98,-4.23,-2.76,-4.65


In [14]:
source = source.reset_index().melt('x', var_name='category', value_name='y')
source

Unnamed: 0,x,category,y
0,0,alcohol,0.07
1,1,alcohol,-0.83
2,2,alcohol,0.43
3,3,alcohol,0.42
4,4,alcohol,1.14
...,...,...,...
295,95,coke,-7.24
296,96,coke,-6.46
297,97,coke,-5.99
298,98,coke,-4.65


In [15]:
df = data[['дата', 'infection rate']]
df['infection rate'] = df['infection rate'].apply(lambda x: x.replace(',', '.'))
df['infection rate'] = df['infection rate'].apply(lambda x: float(x))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['infection rate'] = df['infection rate'].apply(lambda x: x.replace(',', '.'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['infection rate'] = df['infection rate'].apply(lambda x: float(x))


Unnamed: 0,дата,infection rate
0,2020-03-08,0.00
1,2020-03-09,0.00
2,2020-03-10,0.00
3,2020-03-11,0.00
4,2020-03-12,0.00
...,...,...
299,2021-01-01,1.05
300,2021-01-02,1.03
301,2021-01-03,1.01
302,2021-01-04,0.98


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   дата            304 non-null    object 
 1   infection rate  304 non-null    float64
dtypes: float64(1), object(1)
memory usage: 4.9+ KB


In [19]:
data['дата'].iloc[-1]

'2021-01-05'

In [20]:
data['дата'].iloc[0]

'2020-03-08'