In [None]:
# Package Author Mark Benmuvhar
# pdf_read_patent
# Version 2.0.0
# 10/16/2022

In [None]:
#pip install tabula-py
#pip install PyPDF2
import tabula
import PyPDF2
import pandas as pd
import numpy as np
import re
import os
import os.path

**Other references**
Zhu, Aaron. (2022, March 10).  Extract PDF text while preserving whitespace using python and pytesseract.  *Towards Data Science.*  https://towardsdatascience.com/pdf-text-extraction-while-preserving-whitespace-using-python-and-pytesseract-ec142743e805  

**API references:**
https://tabula-py.readthedocs.io/en/latest/tabula.html  


**Required supporting files**  

https://tesseract-ocr.github.io/tessdoc/Downloads.html  

https://github.com/tesseract-ocr/tessdoc  

https://github.com/oschwartz10612/poppler-windows/releases?page=1  

**Programming References**  

https://thewebdev.info/2022/04/17/how-to-fix-appending-turns-my-list-to-nonetype-with-python/#:~:text=To%20fix%20appending%20turns%20my%20list%20to%20NoneType,%5B%27a%27%2C%20%27b%27%2C%20%27c%27%2C%20%27d%27%5D%20a_list%20%3D%20a_list.append%20%28%27e%27%29  

https://stackoverflow.com/questions/65822875/referencing-the-last-page-in-a-pdf-with-tabula

https://www.geeksforgeeks.org/how-to-iterate-over-files-in-directory-using-python/



In [None]:
#Upversion to v2_0 for Rough Draft Submission, 10/16/2022

In [None]:
# In v1_4 from v1_3_4
# added default start page to allow modifications due to odd parsing of graphs in OB29
def input_pdf(pdf_doc, area, col, pd_opts, pages=[], start = 27):
    """Read fixed-layout tables from a PDF through ``tabula.read_pdf``.

    Parameters
    ----------
    pdf_doc : path of the PDF file to read.
    area : page region forwarded to tabula as ``area``; ``relative_area=True``
        is always set alongside it.
    col : column boundary positions forwarded to tabula as ``columns``.
    pd_opts : dict forwarded to tabula as ``pandas_options``.
    pages : selects which pages are read.
        * an ``int`` is passed to tabula unchanged;
        * an empty sequence (the default) reads from ``start`` up to the page
          count reported by PyPDF2;
        * otherwise the first two items are joined as ``'first-last'``.
        The default list is never mutated here.
    start : first page number used when ``pages`` is empty.

    Returns
    -------
    The object returned by ``tabula.read_pdf`` with
    ``output_format='dataframe'``; callers in this notebook index it with
    ``[0]``.
    """
    if isinstance(pages, int):
        read_pages = pages
    elif len(pages) == 0:
        # The context manager closes the handle even if PyPDF2 raises while
        # parsing a damaged file.
        with open(pdf_doc, mode='rb') as fh:
            reader = PyPDF2.PdfFileReader(fh)
            # NOTE(review): range() excludes its upper bound, so if tabula
            # page numbers are 1-based this stops one page before the end of
            # the document -- confirm that is intended before changing it.
            read_pages = list(range(start, reader.getNumPages()))
    else:
        read_pages = str(pages[0]) + '-' + str(pages[1])

    return tabula.read_pdf(
        input_path=pdf_doc,
        output_format='dataframe',
        pages=read_pages,
        guess=False,
        area=area,
        relative_area=True,
        stream=True,
        columns=col,
        pandas_options=pd_opts,
    )


In [None]:
def find_sections(X, opts = None):
    """Find the row positions of the section headers in a page-scan frame.

    Column 1 of ``X`` is searched row by row. The headers are looked for in a
    fixed order (drug list, OTC list, patent list, exclusivity terms); a later
    header is only searched for once the earlier ones have been found.

    Parameters
    ----------
    X : DataFrame whose second column holds the scanned text.
    opts : when left as ``None`` all four headers are searched for; any other
        value skips the drug-list and OTC-list searches.

    Returns
    -------
    tuple ``(drug_pg, otc_pg, patent_pg, terms_pg)`` of row positions in ``X``.
    Entries that were skipped or never found are ``None``. The tuple is always
    returned, including when the scan runs off the end of ``X``, so callers
    can unpack it safely.
    """
    drug_pg = None
    otc_pg = None
    patent_pg = None
    terms_pg = None

    # Search strings based on section headers or page numbers
    p2 = re.compile(r'3.*1.*of.*\d\d\d')
    p3 = re.compile(r'OTC DRUG PRODUCT LIST')
    p4 = re.compile(r'PRESCRIPTION AND OTC DRUG PRODUCT')
    p5 = re.compile(r'EXCLUSIVITY TERMS')

    for j in range(len(X)):
        cell = X.iloc[j, 1]
        if pd.isna(cell):
            continue
        # Non-string cells (e.g. a value parsed as a number) would make
        # re.search raise; search their text form instead.
        text = str(cell)

        if drug_pg is None and opts is None:
            if p2.search(text):
                drug_pg = j
        elif otc_pg is None and opts is None:
            if p3.search(text):
                otc_pg = j
        elif patent_pg is None:
            if p4.search(text):
                patent_pg = j
        elif p5.search(text):
            # Last header in the sequence: nothing left to look for.
            terms_pg = j
            break

    return drug_pg, otc_pg, patent_pg, terms_pg

In [None]:
##functions for drug product module
def get_col(X, n):
    """Concatenate the non-missing entries of ``X`` from position ``n`` onward.

    ``X`` is either a plain ``list`` (indexed directly) or any other object,
    which is read positionally through ``.iloc``. Entries for which
    ``pd.notna`` is false are skipped. Used for drug substance and drug
    product names. Returns the joined string ('' when nothing qualifies).
    """
    is_plain_list = type(X) == list
    joined = str()
    for pos in range(n, len(X)):
        cell = X[pos] if is_plain_list else X.iloc[pos]
        if pd.notna(cell):
            joined += cell
    return joined

# parse_page() looks for stray page header information
def parse_page(X, mode = 0):
    """Classify a text line as footer, stray page header, or content.

    Returns the string ``'footer'`` when the line contains ``Footnote:``.
    Otherwise returns ``True`` when none of the page-header keywords occur in
    the line and ``False`` when one does. With ``mode == 1`` two page-number
    patterns are checked in addition to the keywords.
    """
    if re.search(r'Footnote:', X):
        return 'footer'

    header_pattern = (
        r'PATENT|REQUESTED|CODES|APPROVED|PRESCRIPTION|EXPIRATION|report'
    )
    if mode == 1:
        # Extra page-number alternatives, only checked in mode 1.
        header_pattern += r'|3.*\d*.*of.*\d\d\d|ADA\.*\d*.*of.*\d\d\d'

    return re.search(header_pattern, X) is None
    
def chk_ml_head(X, i):
    """Report whether rows ``i`` and ``i + 1`` both have a value in column 0.

    Returns ``True`` only when both cells are non-missing. When row ``i`` is
    the last row of ``X`` there is no following row to compare against, so the
    result is ``False`` rather than an ``IndexError``.
    """
    if i + 1 >= len(X):
        return False
    return bool(pd.notna([X.iloc[i, 0], X.iloc[i + 1, 0]]).all())

# chk_ml_body() determines if line is part of multi-line entry
# For this, col should be present and oth should be empty
# Otherwise the next line is either a general body line or a header.  

def chk_ml_body(X, i, col, oth):
    """Check whether row ``i`` has a value in ``col`` and none in ``oth``.

    ``col`` and ``oth`` are column selections looked up with ``X.loc``. The
    result is true when every ``oth`` cell is missing and at least one ``col``
    cell is present. ``False`` is returned when ``i`` is not below ``len(X)``.
    """
    if not i < len(X):
        return False

    others_empty = pd.isna(X.loc[i, oth]).all()
    col_present = pd.notna(X.loc[i, col]).any()
    return others_empty & col_present


def chk_nda_fmt(s):
    """Match one word character followed by four digits at the start of ``s``.

    Returns the ``re.Match`` object on success. Returns ``None`` when the
    pattern does not match or when ``s`` is not exactly of type ``str``.
    """
    return re.match(r'\w\d\d\d\d', s) if type(s) == str else None
    
# parse_header() splits drug substance and drug product information
# in the patent module
def parse_header(X, mode):
    """Split a header line into its leading part and its final part.

    Parameters
    ----------
    X : header text.
    mode : ``1`` splits on every ``;`` (headers for the 25th to 29th edition);
        any other value splits once on the first hyphen separator (editions
        from 30 onwards). A hyphen that is the very last character of the
        line is not treated as a separator unless whitespace precedes it.

    Returns
    -------
    tuple ``(leading, last)``: ``last`` is the final piece of the split and
    ``leading`` is every earlier piece concatenated with no separator
    (``''`` when the line contained no separator at all).
    """
    if mode == 1:
        parts = re.split(r';', X)
    else:
        # maxsplit is passed by keyword: the positional form is deprecated
        # since Python 3.13.
        parts = re.split(r'\s-\s|-\s|\s-(?!\Z)|-(?!\Z)', X, maxsplit=1)

    # re.split without capture groups yields only strings, so a plain join
    # is sufficient here.
    return ''.join(parts[:-1]), parts[-1]

# parse_body returns a row of data frame body information
# Used for appending body text information onto row information
def parse_body(X, row, col):
    """Return the ``col`` entry of ``X``.

    ``row`` is accepted but not used.
    """
    selected = X[col]
    return selected

In [None]:
#Updated in V1_5_1
# Format parsed by column was creating incorrect values due to parsed literal "NA" text.
# This was leading to incomplete headers and string values in integer columns.  

def parse_pats(df, tf, start, end, mode=1):
    """Build the patent/exclusivity table from rows ``start``..``end - 1``.

    Parameters
    ----------
    df : indexable whose first element is the frame parsed into the patent
        columns (``X`` below).
    tf : indexable whose first element is the row-wise frame used as the
        source of header text (``T`` below). It is read at the same row
        position ``i`` as ``X`` -- assumes both frames are row-aligned;
        TODO confirm.
    start, end : row range walked with ``range(start, end)``.
    mode : forwarded to ``parse_page`` and ``parse_header``.

    Returns
    -------
    DataFrame with the columns DS_Name, DP_Name, Application_No, Product_No,
    Patent_No, Patent_Expiration_Date, Patent_Codes, Patent_Delist,
    Exclusivity and Exclusive_Expire. One row is added per body line. If no
    body line is stored, the single blank placeholder row is returned.
    Parsing stops early at the first line that ``parse_page`` reports as
    ``'footer'``.
    """
    
    # Inputs are lists of dataframes; only the first element of each is used.
    # X is the columnar format
    # T is row-wise format.
    X = df[0]
    T = tf[0]
    
    
    # Blank one-row frame: serves as the column template and as the
    # "nothing stored yet" marker checked further down.
    pats=pd.DataFrame(
        {
            'DS_Name':[''],
            'DP_Name':[''],
            'Application_No':[''],
            'Product_No':[''],
            'Patent_No':[''],
            'Patent_Expiration_Date':[''],
            'Patent_Codes':[''],
            'Patent_Delist':[''],
            'Exclusivity':[''],
            'Exclusive_Expire':['']  
        }
    )
    
    #temporary value holders
    DS_Name = None
    DP_Name = None
    head_line = str()
    # Working row, reused across iterations: values written for one line stay
    # in place until a later line overwrites them.
    otpt_row = pats.copy()
    
    #sequence flags
    flag_ml = False
      
    for i in range(start, end):
        ipt_line = get_col(X.iloc[i, : ],0)
        # True means the line carries no page-header keyword; 'footer' and
        # False both fail this comparison and fall through to the branches
        # at the bottom of the loop.
        if parse_page(ipt_line, mode) == True:

# Process drug sub / route / product header information
            if pd.notna(X.iloc[i, 0]):
                # Header text is taken from the row-wise frame and
                # accumulated while the header continues on the next row.
                head_line = head_line + get_col(T.iloc[i, : ],0)
                if not chk_ml_head(X, i):
                    DS_Name, DP_Name = parse_header(head_line, mode)
                    otpt_row.iloc[0, 0 : 2] = [DS_Name, DP_Name]
                    head_line = ''
                    
# Process body information
            else:
                ipt_line2 = X.iloc[i,:]
                #V1_5.  9/20/22
                #do direct link for name rather than use parse_body function.  
                if pd.notna(X.iloc[i,1]):
                    # Column 1 present: refresh everything from
                    # Application_No onward.
                    otpt_row.iloc[0, 2: ] = (
                        ipt_line2[~ipt_line2.index.isin(['DP_Name'])].copy()
                    )
                else:
                    # Column 1 empty: Application_No and Product_No are left
                    # as they were; only the remaining fields are refreshed.
                    otpt_row.iloc[0, 4: ] = (
                        ipt_line2[~ipt_line2.index.isin(['DP_Name',
                                                         'Application_No', 
                                                         'Product_No'])].copy())
                # While the first stored row still has an empty DS_Name the
                # table is replaced rather than extended.
                if pats.iloc[0,0] != '':
                    pats = pd.concat([pats, otpt_row])
                else:
                    pats = otpt_row.copy()
                head_line = ''
                
        # Footer line: parse_page returns 'footer' for any mode when the line
        # contains 'Footnote:', so this branch ends parsing and returns what
        # has been collected so far.
        elif parse_page(ipt_line,1) == 'footer':
            return(pats)
            break
        else:
            # Stray page-header line: discard any partially built header.
            head_line = ''          
    return(pats)


In [None]:
def main_pats():
    """Parse the patent listing of every PDF in the input folder to CSV.

    For each non-directory entry of ``directory``:

    1. the two-digit edition number is read from the file name;
    2. the file is scanned with a two-column layout and ``find_sections``
       locates the patent section;
    3. the file is parsed again with the column layout for that edition and
       ``parse_pats`` builds the table;
    4. the table is written to ``otpt_directory`` with the ``.pdf`` suffix
       replaced by ``_pat_2_0.csv``.

    A file whose scan fails is reported and skipped, as is a file in which
    the patent section cannot be located. Errors raised while parsing or
    writing the patent table are not caught.
    """
    # pdf_io -- backslashes are escaped explicitly; the runtime values are
    # the same as before but no longer rely on invalid escape sequences.
    directory = 'OrangeBook\\Testing'
    otpt_directory = 'OrangeBook\\Testing\\csv\\'

    # for page scans
    pg_col = [10]
    pg_area = [0, 5, 95, 95]
    pg_pd_opt = {
        'header': 0,
        'index_col': False,
        'names': [
            'Header_Txt',
            'other'
        ]
    }

    # for Patents
    pats_pg_area = [0, 5, 95, 95]

    # OB25 to 28;  Note that Delist is absent
    pats_col_25 = [49, 85, 100, 200, 290, 385, 386, 460]
    pats_col_26 = [65, 100, 125, 220, 295, 385, 386, 485]
    pats_col_bf29 = [65, 100, 125, 200, 290, 380, 381, 470]
    pats_col_29 = [67, 100, 125, 200, 290, 380, 381, 470]
    # Edit 8/10/22 for OB30
    pats_col_bf31 = [62, 105, 130, 200, 280, 355, 400, 475]
    pats_col_bf34 = [65, 110, 130, 200, 280, 355, 400, 475]
    pats_col_fr34 = [54, 100, 125, 200, 280, 350, 400, 460]
    pats_col_fr38 = [54, 100, 125, 200, 280, 350, 400, 475]

    pats_pd_opt = {
        'header': 0,
        'index_col': False,
        'names': [
            'DP_Name',
            'Application_No',
            'Product_No',
            'Patent_No',
            'Patent_Expiration_Date',
            'Patent_Codes',
            'Patent_Delist',
            'Exclusivity',
            'Exclusive_Expire'
        ]
    }

    # file naming variables
    # Only a literal '.pdf' suffix is replaced when naming the output.
    p = re.compile(r'\.pdf$')
    # Edition number: two digits followed by two non-digits.
    p_ed = re.compile(r'\d\d(?=\D\D)')

    # Same text as before; the second line keeps its 24-space indent.
    section_msg = (
        'Drug List between Pages: {} and {} \n'
        + ' ' * 24
        + 'Patent List between Pages: {} and {}'
    )

    def _pick_layout(edition):
        """Return (column boundaries, first page to read, header mode)."""
        if edition == 26:
            return pats_col_26, 854, 1
        if edition == 25:
            return pats_col_25, 871, 1
        if edition < 29:
            return pats_col_bf29, 871, 1
        if edition == 29:
            return pats_col_29, 871, 1
        if edition < 31:
            return pats_col_bf31, 871, 0
        if edition < 34:
            return pats_col_bf34, 871, 0
        if edition < 38:
            return pats_col_fr34, 871, 0
        return pats_col_fr38, 871, 0

    for filename in os.scandir(directory):
        if filename.is_dir():
            continue

        try:
            print(filename)
            # Parsed once per file; a name without an edition number raises
            # here and is reported below.
            edition = int(p_ed.search(filename.name)[0])
            pats_col, start_page, mode = _pick_layout(edition)
            tf = input_pdf(filename.path, pg_area, pg_col, pg_pd_opt,
                           start = start_page)
            print('in main, looking for pages in : ',filename.path)
            drug_pg, otc_pg, patent_pg, terms_pg = find_sections(tf[0] , opts = 1)
            print(section_msg.format(drug_pg, otc_pg, patent_pg, terms_pg))
        except Exception as e:
            print('File Input error in ',filename, '\n',e)
            # Without this the patent block below would run with variables
            # that are undefined or left over from the previous file.
            continue

        if patent_pg is None or terms_pg is None:
            print('Patent section not found in ', filename.name)
            continue

        # Patent and Exclusivity Parsing Block
        print('Patent Block')
        print('Filename is',filename.name)
        df = input_pdf(filename.path, pats_pg_area, pats_col,
                       pats_pd_opt, start = start_page)

        pats = parse_pats(df, tf, patent_pg, terms_pg, mode)
        rn_out_doc = p.sub('_pat_2_0.csv', filename.name)
        rn_out_doc = otpt_directory + rn_out_doc
        pats.to_csv(rn_out_doc)
        print('Completed parsing drug patent info from ', filename.name)

In [None]:
# Run the patent parser defined above.
main_pats()