In [1]:
# IMPORTS
import numpy as np
import pandas as pd
import os
import git
import sqlalchemy
import pyodbc 

In [40]:
# CONSTANTS
# URL of the source repository.
SOURCE_URL = 'https://github.com/CSSEGISandData/COVID-19'

# Names of the two report tables in the database.
DAILY_REPORTS_TABLE_NAME = 'covid_daily_reports'
DAILY_REPORTS_US_TABLE_NAME = 'covid_daily_reports_us'

# Column names of the general daily-report table.
DAILY_REPORTS_COLUMN_NAMES = [
    'FIPS',
    'Admin2',
    'Province_State',
    'Country_Region',
    'Last_Update',
    'Lat',
    'Long_',
    'Confirmed',
    'Deaths',
    'Recovered',
    'Active',
    'Combined_Key',
    'Incident_Rate',
    'Case_Fatality_Ratio',
]

# Column names of the US daily-report table.
DAILY_REPORTS_US_COLUMN_NAMES = [
    'Province_State',
    'Country_Region',
    'Last_Update',
    'Lat',
    'Long_',
    'Confirmed',
    'Deaths',
    'Recovered',
    'Active',
    'FIPS',
    'Incident_Rate',
    'Total_Test_Results',
    'People_Hospitalized',
    'Case_Fatality_Ratio',
    'UID',
    'ISO3',
    'Testing_Rate',
    'Hospitalization_Rate',
]

# Output paths for the cleaned data written at the end of the notebook.
RESULT_CSV = '../processed_data/covid_daily'
RESULT_CSV_US = '../processed_data/covid_daily_US'

## **Подключение к БД и загрузка данных**

In [3]:
# DB CONFIG
# Both values can be overridden through environment variables so the notebook
# can run against another SQL Server instance without editing this cell.
# The defaults are the previously hard-coded values.
SERVER_NAME = os.environ.get('COVID_DB_SERVER', 'HOME-PC')
DB_NAME = os.environ.get('COVID_DB_NAME', 'covid')

In [4]:
# DB connect
from sqlalchemy.engine import URL

# ODBC connection string; it carries no user name or password
# (Trusted_Connection=yes is passed instead).
connection_string = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
    f'SERVER={SERVER_NAME};'
    f'DATABASE={DB_NAME};'
    'Trusted_Connection=yes;'
)

# The raw ODBC string is handed to SQLAlchemy through the "odbc_connect" query parameter.
connection_url = URL.create(
    "mssql+pyodbc",
    query={"odbc_connect": connection_string},
)

engine = sqlalchemy.create_engine(connection_url)

In [6]:
# Read both report tables from the database into DataFrames.
# The table names are taken from the constants defined near the top of the
# notebook rather than repeated as string literals.
covidDfUs = pd.read_sql(DAILY_REPORTS_US_TABLE_NAME, engine)
covidDf = pd.read_sql(DAILY_REPORTS_TABLE_NAME, engine)

## **Информация по пустым значениям**

In [35]:
# US table: number of missing values in each column
print(covidDfUs.isna().sum())

Province_State              0
Country_Region              0
Last_Update                10
Lat                      1090
Long_                    1090
Confirmed                   0
Deaths                      0
Recovered                   0
Active                      0
FIPS                        1
Incident_Rate            1090
Total_Test_Results       1108
People_Hospitalized     26476
Case_Fatality_Ratio       540
UID                         0
ISO3                        0
Testing_Rate             1108
Hospitalization_Rate    26476
dtype: int64


In [36]:
# General table: number of missing values in each column
print(covidDf.isna().sum())

FIPS                   360647
Admin2                 358208
Province_State          85874
Country_Region              0
Last_Update                 0
Lat                     40280
Long_                   40280
Confirmed                   0
Deaths                      0
Recovered                   0
Active                      0
Combined_Key                0
Incident_Rate           40898
Case_Fatality_Ratio     21225
dtype: int64


## **Функции**

In [9]:
def ChangeColumnType(column, typeName, df):
    """Cast one column of ``df`` to another dtype, modifying ``df`` in place.

    Args:
        column: Label of the column to convert.
        typeName: Target dtype, passed unchanged to ``Series.astype``.
        df: DataFrame whose column is replaced by the converted values.

    Returns:
        None; the converted column is written back into ``df``.
    """
    converted = df[column].astype(typeName)
    df[column] = converted

def DropLinesLessThanZero(df, columnName):
    """Return a copy of ``df`` without the rows whose ``columnName`` is negative.

    Rows are selected with a positional boolean mask rather than dropped by
    index label, so only the offending rows are removed even when the index
    contains duplicate labels (dropping by label would remove every row that
    shares a label with a negative row). Missing values are not negative and
    are kept. ``df`` itself is not modified.

    Args:
        df: Source DataFrame.
        columnName: Label of the numeric column to check.

    Returns:
        A new DataFrame holding the rows of ``df`` whose value in
        ``columnName`` is not below zero, with the original index labels.
    """
    # On nullable dtypes a comparison with a missing value is itself missing;
    # such rows count as "not negative".
    is_negative = df[columnName].lt(0).fillna(False)
    keep_mask = ~is_negative.to_numpy(dtype=bool)
    # .copy() returns an independent frame, so later column assignments on the
    # result do not raise pandas' SettingWithCopyWarning.
    return df.loc[keep_mask].copy()

def FillMeanValues(df, column):
    """Replace the missing values of one column with that column's mean.

    The filled column is written back into ``df`` (in-place change) and is
    also returned.

    Args:
        df: DataFrame to update.
        column: Label of the column whose gaps are filled.

    Returns:
        The updated column, read back from ``df``.
    """
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)
    return df[column]

## **Сразу удаляем строки с отсутствующими датами, так как их восстановить не получится**

In [None]:
# Drop the rows whose Last_Update date is missing.
covidDfUs = covidDfUs.dropna(axis='index', subset=['Last_Update'])

## **Смотрим корректность дат**
Не должно быть дат раньше 2020 года и позже 2021 года.

In [12]:
# Summaries of the Last_Update column of both tables, printed for a manual
# check of the date range.
general_dates = covidDf['Last_Update'].describe()
us_dates = covidDfUs['Last_Update'].describe()
print(f"covidDf: \n {general_dates}")
print(f"\ncovidDfUs: \n {us_dates}")

covidDf: 
 count                 1982397
unique                    577
top       2020-11-29 05:25:55
freq                     3995
first     2020-05-30 02:32:48
last      2021-10-11 04:21:31
Name: Last_Update, dtype: object

covidDfUs: 
 count                   31609
unique                    713
top       2020-04-25 06:32:46
freq                       59
first     2020-04-12 23:18:15
last      2021-10-11 04:31:18
Name: Last_Update, dtype: object


  print(f"covidDf: \n {covidDf['Last_Update'].describe()}")
  print(f"\ncovidDfUs: \n {covidDfUs['Last_Update'].describe()}")


## **Смотрим на отрицательные значения в полях Deaths, Recovered, Active**

In [32]:
def _print_negative_counts(label, frame):
    """Print how many rows of ``frame`` are negative in Deaths, Recovered and Active."""
    deaths = int((frame['Deaths'] < 0).sum())
    recovered = int((frame['Recovered'] < 0).sum())
    active = int((frame['Active'] < 0).sum())
    print(f"{label}( death_count: {deaths};\n recovered_count: {recovered};\n active_count: {active};\n)")


_print_negative_counts('covidDf', covidDf)
_print_negative_counts('covidDfUs', covidDfUs)

covidDf( death_count: 2;
 recovered_count: 3;
 active_count: 3529;
)
covidDfUs( death_count: 0;
 recovered_count: 0;
 active_count: 24;
)


## **Удаляем строки, в которых есть отрицательные значения**

In [33]:
# Remove from both tables every row that is negative in any of the three
# count columns; each filter is independent of the others.
for count_column in ('Deaths', 'Recovered', 'Active'):
    covidDf = DropLinesLessThanZero(covidDf, count_column)
    covidDfUs = DropLinesLessThanZero(covidDfUs, count_column)

## **Заполняем пропуски в полях Deaths, Recovered и Active средним значением по столбцу**

In [34]:
# Fill the gaps of the three count columns with the column mean, first in the
# general table and then in the US table.
for frame in (covidDf, covidDfUs):
    for count_column in ('Deaths', 'Recovered', 'Active'):
        frame[count_column] = FillMeanValues(frame, count_column)

## **Смотрим информацию по типам данных**

Некоторые строковые поля записаны как "object".
А некоторые целочисленные поля записаны как "float64".

In [37]:
# Column dtypes and non-null counts of the US table.
covidDfUs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31604 entries, 0 to 31627
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Province_State        31604 non-null  object        
 1   Country_Region        31604 non-null  object        
 2   Last_Update           31594 non-null  datetime64[ns]
 3   Lat                   30514 non-null  float64       
 4   Long_                 30514 non-null  float64       
 5   Confirmed             31604 non-null  int64         
 6   Deaths                31604 non-null  int64         
 7   Recovered             31604 non-null  float64       
 8   Active                31604 non-null  float64       
 9   FIPS                  31603 non-null  float64       
 10  Incident_Rate         30514 non-null  float64       
 11  Total_Test_Results    30496 non-null  float64       
 12  People_Hospitalized   5128 non-null   float64       
 13  Case_Fatality_Ra

In [38]:
# Column dtypes of the general table.
covidDf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1978865 entries, 0 to 1982396
Data columns (total 14 columns):
 #   Column               Dtype         
---  ------               -----         
 0   FIPS                 float64       
 1   Admin2               object        
 2   Province_State       object        
 3   Country_Region       object        
 4   Last_Update          datetime64[ns]
 5   Lat                  float64       
 6   Long_                float64       
 7   Confirmed            int64         
 8   Deaths               float64       
 9   Recovered            float64       
 10  Active               float64       
 11  Combined_Key         object        
 12  Incident_Rate        float64       
 13  Case_Fatality_Ratio  float64       
dtypes: datetime64[ns](1), float64(8), int64(1), object(4)
memory usage: 226.5+ MB


## **Исправляем неверные типы данных**

In [39]:
# Cast the three count columns to 64-bit integers in both tables.
for count_column in ('Deaths', 'Recovered', 'Active'):
    for frame in (covidDf, covidDfUs):
        ChangeColumnType(count_column, 'int64', frame)

# Convert every column still stored with the generic "object" dtype to the
# "string" dtype, first in the US table and then in the general one.
for frame in (covidDfUs, covidDf):
    for column in frame.columns:
        if frame[column].dtype != 'object':
            continue
        ChangeColumnType(column, 'string', frame)

## **Сохраняем данные в файлы**

In [42]:
# to_csv does not create missing directories, so make sure the target folders
# exist before writing.
for result_path in (RESULT_CSV_US, RESULT_CSV):
    result_dir = os.path.dirname(result_path)
    if result_dir:  # a bare file name has no directory part to create
        os.makedirs(result_dir, exist_ok=True)

# index=False keeps the DataFrame index out of the written files.
covidDfUs.to_csv(RESULT_CSV_US, index=False)
covidDf.to_csv(RESULT_CSV, index=False)
