### Import modules and packages

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime


In [17]:
# Load sheet '1a' of the COVID-19 infection survey workbook.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
df_postive_cases = pd.read_excel(r'/Users/ivan/Desktop/PycharmProjects/Data_Science_Mini/Data/raw/covid19infectionsurveydatasets2020122423122020174305.xlsx',
                                 sheet_name='1a',
                                 engine='openpyxl')

# remove unwanted rows (keep positional rows 17..40) and renumber them from 0;
# a bare .reindex() leaves the row labels untouched, reset_index actually renumbers
df_postive_cases = df_postive_cases[17:41].reset_index(drop=True)

# remove unwanted columns
column_list = ['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 5',
               'Unnamed: 6', 'Unnamed: 8', 'Unnamed: 9']
df_postive_cases = df_postive_cases.drop(column_list, axis=1)

# remove empty columns (keep everything up to and including 'Unnamed: 7')
df_postive_cases = df_postive_cases.loc[:, :'Unnamed: 7']

# rename columns
df_postive_cases = df_postive_cases.rename(columns={'Contents': 'Date',
                                                    'Unnamed: 4': 'Estimate of number testing positive',
                                                    'Unnamed: 7': 'Proportion of population that had COVID'})

In [18]:
# function to turn ratio (like '1 in 2000') into a decimal
def convert_ratio(x):
    """Convert a ratio string such as '1 in 2000' into a decimal fraction.

    Args:
        x: Text of the form '<numerator> in <denominator>'. Surrounding
            whitespace and thousands separators (commas) in either number
            are tolerated, so '1 in 2,000' is accepted as well.

    Returns:
        float: numerator divided by denominator.

    Raises:
        ValueError: If the text does not contain 'in' or either side is
            not an integer.
        ZeroDivisionError: If the denominator is zero.
    """
    numerator, separator, denominator = x.partition('in')
    if not separator:
        raise ValueError(f"expected a ratio like '1 in 2000', got {x!r}")
    # int() ignores surrounding whitespace but not commas, so drop those first
    return int(numerator.replace(',', '')) / int(denominator.replace(',', ''))

In [19]:
# Clean the ratio column in two passes.
ratio_column = 'Proportion of population that had COVID'

# pass 1: strip thousands separators from the text (for example 1,200 becomes 1200)
df_postive_cases[ratio_column] = df_postive_cases[ratio_column].map(lambda text: text.replace(',',''))

# pass 2: turn each 'a in b' string into its decimal value
df_postive_cases[ratio_column] = df_postive_cases[ratio_column].apply(convert_ratio)

# renumber the rows from 0, then display the frame
df_postive_cases.reset_index(drop=True, inplace=True)
df_postive_cases

Unnamed: 0,Date,Estimate of number testing positive,Proportion of population that had COVID
0,06 July to 12 July,23600,0.000435
1,13 July to 19 July,27700,0.0005
2,20 July to 26 July,35700,0.000667
3,27 July to 2 August,28300,0.000526
4,03 August to 09 August,28300,0.000526
5,07 August to 13 August,24600,0.000455
6,14 August to 20 August,28200,0.000526
7,19 August to 25 August,27100,0.0005
8,30 August to 05 September,39700,0.000714
9,04 September to 10 September,59800,0.001111


In [20]:
# function to convert '28 November' to 28/11/2020
def convert_date(x, year=2020):
    """Convert a day-and-month string into a 'dd/mm/YYYY' date string.

    The year is parsed together with the day and month rather than patched
    in afterwards: parsing without a year falls back to 1900, which is not
    a leap year, so '29 February' would be rejected even though it is a
    valid date in 2020.

    Args:
        x: Day and full month name, e.g. '28 November'.
        year: Four-digit year to attach to the date. Defaults to 2020.

    Returns:
        str: The date formatted as 'dd/mm/YYYY', e.g. '28/11/2020'.

    Raises:
        ValueError: If the text is not a valid '<day> <month name>' for
            the given year.
    """
    parsed = datetime.strptime(f"{x.strip()} {year}", "%d %B %Y")
    return parsed.strftime('%d/%m/%Y')

In [21]:
# Build the new values on a separate copy of the 'Date' column; writing into the
# copy avoids the warning 'A value is trying to be set on a copy of a slice from a DataFrame'
end_dates = df_postive_cases['Date'].copy()

# Each entry is a period such as '06 July to 12 July'; keep only its last day,
# formatted as a date (i.e. '06 July to 12 July' becomes 12/07/2020)
for position, period in enumerate(df_postive_cases['Date']):
    last_day = period.split(' to ')[1].strip()
    end_dates.iloc[position] = convert_date(last_day)

# put the converted column back into the dataframe
df_postive_cases['Date'] = end_dates

# render the estimates column with in-cell bars
df_postive_cases.style.bar(subset=['Estimate of number testing positive'], align='mid', color=['#ffAAAA'])

Unnamed: 0,Date,Estimate of number testing positive,Proportion of population that had COVID
0,12/07/2020,23600,0.000435
1,19/07/2020,27700,0.0005
2,26/07/2020,35700,0.000667
3,02/08/2020,28300,0.000526
4,09/08/2020,28300,0.000526
5,13/08/2020,24600,0.000455
6,20/08/2020,28200,0.000526
7,25/08/2020,27100,0.0005
8,05/09/2020,39700,0.000714
9,10/09/2020,59800,0.001111


### Save dataframe to be used in other notebooks (make Date column the index)

In [24]:
# Move 'Date' into the index before saving. The guard keeps this cell re-runnable:
# once 'Date' is the index it is no longer a column, so a second set_index call
# would raise KeyError.
if 'Date' in df_postive_cases.columns:
    df_postive_cases.set_index('Date', inplace=True)

# write the frame (with 'Date' as the index column) for use in other notebooks
df_postive_cases.to_csv('positive_cases.csv')
