In [1]:
import os
import io
import re
import spacy
import pandas as pd

from PIL import Image
import pytesseract
from wand.image import Image as wi
import gc
from nltk.tokenize import word_tokenize
from itertools import compress

sp = spacy.load('en_core_web_sm')
all_stopwords = sp.Defaults.stop_words
all_stopwords.add('fourteen')

In [2]:
# Provide a scanned pdf document
# scanned_pdf_path = './pdf/scanned/Update-on-COVID-19-as-at-September-18-2020.pdf'
# scanned_pdf_path = './pdf/scanned/UPDATE-ON-COVID-19-3RD-DECEMBER-1.pdf'
scanned_pdf_path = './pdf/scanned/Update-on-Coronavirus-Situation-in-the-Country.pdf'
# scanned_pdf_path = './pdf/scanned/September-4th-Covid-19-Statement.pdf'


In [3]:
def Extract_text_from_scanned_pdf(pdf_file_path):
    pdf_file = wi(filename = pdf_file_path,resolution = 300)
    pdfImage = pdf_file.convert('jpeg')
    
    imageBlobs = []
    extracted_text = []
    
    for img in pdfImage.sequence:
        page = wi(image = img)
        imageBlobs.append(page.make_blob('jpeg'))

    for imageBlob in imageBlobs:
        im = Image.open(io.BytesIO(imageBlob))
        text = pytesseract.image_to_string(im,lang = 'eng')
        extracted_text.append(text.strip().replace('\n', ' ').replace('’', '').replace('-', ''))

    return (extracted_text)

In [4]:
ExtractedText = Extract_text_from_scanned_pdf(scanned_pdf_path)
textString = ' '.join([str(elem).strip() for elem in ExtractedText])
textString  = re.sub(r"[()]",'',textString) # Clean the string data
textString = textString.replace(',','').strip() # replace commas with spaces

text_tokens = word_tokenize(textString) # Tokenize the text
tokens_without_sw = [word for word in text_tokens if not word in all_stopwords] # remove stopwords
textString = ' '.join([str(elem) for elem in tokens_without_sw]) # convert the list to strings


In [5]:
textString = textString.replace("Homa Bay", "HomaBay")
textString = textString.replace("West Pokot","WestPokot")
textString = textString.replace("Tharaka Nithi ","TharakaNithi ")
textString = textString.replace("Murang’a","Muranga") 
textString = textString.replace("Tana River","TanaRiver")
textString = textString.replace("Taita Taveta","TaitaTaveta")
textString = textString.replace("Trans Nzoia","TransNzoia")
textString = textString.replace("Uasin Gishu","UasinGishu")
textString = textString.replace("Elgeyo Marakwet","ElgeyoMarakwet")
textString = textString.replace("Scanned with CamScanner", "")

In [6]:
# List of all the counties
counties = ['Mombasa', 'Kwale', 'Kilifi', 'TanaRiver', 'Lamu', 'TaitaTaveta', 'Garissa', 'Wajir', 'Mandera', 'Marsabit', 
'Isiolo', 'Meru', 'TharakaNithi', 'Embu', 'Kitui', 'Machakos', 'Makueni', 'Nyandarua', 'Nyeri', 'Kirinyaga', 'Muranga', 
'Kiambu', 'Turkana', 'WestPokot', 'Samburu', 'TransNzoia', 'UasinGishu', 'ElgeyoMarakwet', 'Nandi', 'Baringo', 'Laikipia', 
'Nakuru', 'Narok', 'Kajiado', 'Kericho', 'Bomet', 'Kakamega', 'Vihiga', 'Bungoma', 'Busia', 'Siaya', 'Kisumu', 'HomaBay', 
'Migori', 'Kisii', 'Nyamira', 'Nairobi']

In [7]:
# check for counties that had positive cases in the day
def check_strings(counties, text):
    regexp = re.compile('|'.join([re.escape(x) for x in counties]))
    found = set(regexp.findall(text))
    return [x in found for x in counties]

In [8]:
counties_in_bool = check_strings(counties, textString)
counties_in_names = list(compress(counties, counties_in_bool)) # names of counties that had positive cases on the day

In [9]:
# Find a patter of names with numbers that correspond to county name and recorded number of positive cases
daily_nums = []
pattern = re.findall('([A-Za-z]+\s*[A-Za-z]*\s*[A-Za-z]*\s*[A-Za-z]*\s*[A-Za-z]*\s*[A-Za-z]*)\s?(\d+)',textString)
for p in pattern:
    n = len(p[0].split())
    if n == 1:
        if p[0].strip() in counties_in_names:
            daily_nums.append(p)
    else:
        m = p[0].split() # Get the number of words
        z = [i in counties_in_names for i in m] # Check if these words are in the counties_in_names
        if sum(z) == n:          
            for i in range(n):
                if (p[0].split())[i].strip() in counties_in_names:
                    daily_nums.append((p[0].split()[i],p[1]))
                    
county_numbers = set(daily_nums)

In [10]:
pd_day = pd.DataFrame(county_numbers, columns = ['County','New Positive cases'])
pd_day['New Positive cases'] = pd.to_numeric(pd_day['New Positive cases'],errors='coerce')

In [11]:
pd_day.sort_values(by='New Positive cases',ascending=False)

Unnamed: 0,County,New Positive cases
0,Nairobi,29
3,Mombasa,14
6,Turkana,6
2,Busia,4
4,TaitaTaveta,2
7,Kajiado,2
1,Kilifi,1
5,Kiambu,1
