In [1]:
import tika
from tika import parser
import pandas as pd
from pathlib import Path

In [2]:
file_date = '03_24_2020'
filePath = Path("data")   # the file path for data
full_path = '/Users/mark/Documents/github-public/covid-19/data/covid19_case_information_' + file_date + '.pdf'
full_path

'/Users/mark/Documents/github-public/covid-19/data/covid19_case_information_03_23_2020.pdf'

#### Read the PDF file into the Tika parser

In [3]:
raw = parser.from_file(full_path)
raw = raw['content'].lstrip().rstrip() # Remove leading and trailing spaces
raw=raw.replace('\n\n','\n')  # replace all double newline characters with one
raw

'COVID-19 case information for Kentucky \nAs of 4 p.m. March 23, COVID-19 patient information includes 124 who have tested positive. \n51 M Daviess \n28 M Calloway \n27 F Harrison \n67 F Harrison \n61 F Oldham \n68 M Harrison \n26 F Fayette \n60 M Harrison \n54 F Harrison \n51 M Harrison \n66 M Bourbon \n47 M Fayette \n76 F Madison \n49 M Clark \n51 M Montgomery \n69 M Jefferson \n69 M Lyon \n88 F Bourbon \n27 F Clark \n61 F Franklin \n50 M Harrison \n73 M Warren \n55 M Warren \n73 M Warren \n80 F Warren \n F Jefferson \n96 M Kenton \n74 M Jefferson \n54 M Jefferson \n66 M Jefferson \n66 F Kenton \n59 F Pulaski \n31 F Fayette \n56 M Montgomery \n40 F Fayette \n63 F Henderson \n23 F Breathitt \n68 F Harrison \n43 F Fayette \n21 F Kenton \n77 M Madison \n79 F Jefferson \n46 M Fayette \n48 F Jefferson \n34 F Jefferson \n61 F Christian \n45 F Jefferson \n36 M Spencer \n M Menifee \n F GRDHD \n64 F Jefferson \n46 F Jefferson \n17 F Jefferson \n33 F Kenton \n51 M Fayette \n45 F Fayette \n48 

#### Turn the string into a list, breaking on newline character

In [4]:
string_list = [x.split(',') for x in raw.split(' \n')]
string_list = string_list[2:-2]
string_list[:10]

[['51 M Daviess'],
 ['28 M Calloway'],
 ['27 F Harrison'],
 ['67 F Harrison'],
 ['61 F Oldham'],
 ['68 M Harrison'],
 ['26 F Fayette'],
 ['60 M Harrison'],
 ['54 F Harrison'],
 ['51 M Harrison']]

#### Convert the list of string lists into a list of lists

In [5]:
new_list=[]
new_string=''
for indx, items in enumerate(string_list):
    new_string=new_string + (str(items).replace(' ', ',', 2))
new_string = new_string.replace("'","").replace('][','\n').replace('[','')
new_list = [x.split(',') for x in new_string.split('\n')]
new_list[:10]

[['51', 'M', 'Daviess'],
 ['28', 'M', 'Calloway'],
 ['27', 'F', 'Harrison'],
 ['67', 'F', 'Harrison'],
 ['61', 'F', 'Oldham'],
 ['68', 'M', 'Harrison'],
 ['26', 'F', 'Fayette'],
 ['60', 'M', 'Harrison'],
 ['54', 'F', 'Harrison'],
 ['51', 'M', 'Harrison']]

#### Create a dataframe from the list of lists

In [6]:
df = pd.DataFrame(new_list,columns=['Age','M/F','County'])
df.head()

Unnamed: 0,Age,M/F,County
0,51,M,Daviess
1,28,M,Calloway
2,27,F,Harrison
3,67,F,Harrison
4,61,F,Oldham


#### Write dataframe to CSV file

In [7]:
file_name = 'ky_'+file_date+'.csv'
file_out = filePath.joinpath(file_name)  # path and filename

df.to_csv(file_out)  # output to csv

In [8]:
len(df)

124