In [None]:
# Package Author Mark Benmuvhar
# pdf_read_patent
# Version 2.0.0
# 10/16/2022

In [None]:
#pip install tabula-py
#pip install PyPDF2

In [None]:
import tabula
import PyPDF2
import pandas as pd
import numpy as np
import re
import os
import os.path

**Other references**
Zhu, Aaron. (2022, March 10).  Extract PDF text while preserving whitespace using python and pytesseract.  *Towards Data Science.*  https://towardsdatascience.com/pdf-text-extraction-while-preserving-whitespace-using-python-and-pytesseract-ec142743e805  

**API references:**
https://tabula-py.readthedocs.io/en/latest/tabula.html  


**Required supporting files**  

https://tesseract-ocr.github.io/tessdoc/Downloads.html  

https://github.com/tesseract-ocr/tessdoc  

https://github.com/oschwartz10612/poppler-windows/releases?page=1  

**Programming References**  

https://thewebdev.info/2022/04/17/how-to-fix-appending-turns-my-list-to-nonetype-with-python/#:~:text=To%20fix%20appending%20turns%20my%20list%20to%20NoneType,%5B%27a%27%2C%20%27b%27%2C%20%27c%27%2C%20%27d%27%5D%20a_list%20%3D%20a_list.append%20%28%27e%27%29  

https://stackoverflow.com/questions/65822875/referencing-the-last-page-in-a-pdf-with-tabula

https://www.geeksforgeeks.org/how-to-iterate-over-files-in-directory-using-python/



In [None]:
# In v1_4 from v1_3_4
# added default start page to allow modifications due to odd parsing of graphs in OB29
def input_pdf(pdf_doc, area, col, pd_opts, pages=[], start = 27):
    """Read fixed-column tables from a PDF with tabula.

    Parameters
    ----------
    pdf_doc : path of the PDF file to read.
    area : passed to tabula as ``area`` (interpreted as relative, since
        ``relative_area=True``).
    col : passed to tabula as ``columns`` (column split positions).
    pd_opts : passed to tabula as ``pandas_options``.
    pages : page selection.
        * an int is passed to tabula unchanged;
        * an empty sequence (the default) or None selects
          ``range(start, page_count)``, where the page count is read from
          the PDF itself;
        * otherwise the first two items are joined as ``'first-last'``.
    start : first page number used when ``pages`` is empty.

    Returns
    -------
    Whatever ``tabula.read_pdf`` returns for ``output_format='dataframe'``;
    callers in this notebook index the result with ``[0]``.
    """
    if isinstance(pages, int):
        read_pages = pages
    elif pages is None or len(pages) == 0:
        # 'with' guarantees the handle is closed even if PyPDF2 raises.
        with open(pdf_doc, mode='rb') as fh:
            if hasattr(PyPDF2, 'PdfReader'):
                # PyPDF2 >= 2.0 API; PdfFileReader was removed in 3.0.
                n_pages = len(PyPDF2.PdfReader(fh).pages)
            else:
                n_pages = PyPDF2.PdfFileReader(fh).getNumPages()
        # NOTE(review): range() stops at n_pages - 1, so a page whose number
        # equals the page count is never requested -- confirm that is intended.
        read_pages = list(range(start, n_pages))
    else:
        read_pages = str(pages[0])+'-'+str(pages[1])

    return tabula.read_pdf(
        input_path = pdf_doc,
        output_format = 'dataframe',
        pages = read_pages,
        guess = False,
        area = area,
        relative_area = True,
        stream = True,
        columns = col,
        pandas_options = pd_opts
    )


In [None]:
# Updated from pdf_read_fnl_v1_4 to catch multiple sections 
# discnt_v0_3 added options to skip drug product sections (fnl_v_1_5_1)
def find_dcn_sections(X, opts = None):
    """Locate the rows where each section of the parsed document begins.

    The second column of ``X`` is scanned top to bottom.  Sections are looked
    for strictly in order; on each row only the pattern of the next section
    still missing is tested.

    Parameters
    ----------
    X : DataFrame whose second column (position 1) holds the text to search.
    opts : when None, all six markers are searched in order.  Any other value
        skips the drug-list, OTC, patent and terms markers and stops as soon
        as the row matching 'DESIGNATIONS AND' has been found.

    Returns
    -------
    tuple ``(drug_pg, otc_pg, dcn_pg, ode_pg, patent_pg, terms_pg)`` of row
    positions.  A marker that was skipped or never found is None; the tuple
    is always returned so callers can unpack it safely.
    """
    # row positions of each section start
    drug_pg = None
    otc_pg = None
    dcn_pg = None
    ode_pg = None
    patent_pg = None
    terms_pg = None

    # Set search strings based on section headers or page numbers
    p2 = re.compile(r'3.*1.*of.*\d\d\d')
    p3 = re.compile(r'OTC DRUG PRODUCT LIST')
    p4 = re.compile(r'DISCONTINUED DRUG PRODUCT LIST')
    p5 = re.compile(r'DESIGNATIONS AND')
    p6 = re.compile(r'PRESCRIPTION AND OTC DRUG PRODUCT')
    p7 = re.compile(r'EXCLUSIVITY TERMS')

    full_scan = opts is None

    for j in range(len(X)):
        cell = X.iloc[j, 1]
        if pd.isna(cell):
            continue
        # Cells may be parsed as numbers; re needs a string.
        text = str(cell)

        if drug_pg is None and full_scan:
            if p2.search(text):
                drug_pg = j
        elif otc_pg is None and full_scan:
            if p3.search(text):
                otc_pg = j
        elif dcn_pg is None:
            if p4.search(text):
                dcn_pg = j
        elif ode_pg is None:
            if p5.search(text):
                ode_pg = j
                if not full_scan:
                    # Discontinued-only mode needs nothing past this point.
                    break
        elif patent_pg is None and full_scan:
            if p6.search(text):
                patent_pg = j
        elif terms_pg is None and full_scan:
            if p7.search(text):
                terms_pg = j
                break

    return drug_pg, otc_pg, dcn_pg, ode_pg, patent_pg, terms_pg

In [None]:
##functions for drug product module
# Function get_col()
# Returns combined string from values starting in column n
# Applies to drug substance and drug product name

def get_col(X, n):
    """Concatenate the non-missing items of X from position n onwards.

    X is either a plain list (indexed with ``X[k]``) or an object indexed
    positionally through ``.iloc``.  Items for which ``pd.notna`` is false
    are skipped; the remaining items are appended to an initially empty
    string with ``+`` and the result is returned.
    """
    joined = str()
    for pos in range(n, len(X)):
        item = X[pos] if type(X) == list else X.iloc[pos]
        if pd.notna(item):
            joined = joined + item
    return joined

# parse_page() looks for stray page header information
def parse_page(X, mode=0):
    """Classify a line of text as footer, stray page header, or content.

    Returns the string 'footer' when X contains 'Footnote:'.  Otherwise
    returns True when X contains none of the page-header keywords and False
    when it contains at least one.  With ``mode == 1`` the page-number
    pattern is added to the keyword list.
    """
    if re.search(r'Footnote:', X) is not None:
        return 'footer'

    header_terms = (r'PATENT|REQUESTED|CODES|APPROVED|PRESCRIPTION|'
                    r'EXPIRATION|report')
    if mode == 1:
        header_terms = header_terms + r'|3.*\d*.*of.*\d\d\d'

    return re.search(header_terms, X) is None
    
def chk_ml_head(X,i):
    """Tell whether row i and the row after it both have a first-column value.

    Returns a true value when the first column is non-missing on both row i
    and row i + 1 (positions, not labels).  When row i is the last row there
    is no following row, so the answer is False instead of an IndexError.
    """
    if i + 1 >= len(X):
        return False
    return pd.notna([X.iloc[i,0],
                     X.iloc[i+1,0]]
                   ).all()

# chk_ml_body() determines if line is part of multi-line entry
# For this, col should be present and oth should be empty
# Otherwise the next line is either a general body line or a header.  

def chk_ml_body(X,i,col, oth):
    """Tell whether row i looks like a continuation of a multi-line entry.

    The row qualifies when every column listed in ``oth`` is missing and at
    least one column listed in ``col`` is present.  Rows are addressed by
    label via ``.loc``.  An ``i`` at or beyond ``len(X)`` yields False.
    """
    if i >= len(X):
        return False
    others_blank = pd.isna(X.loc[i, oth]).all()
    any_filled = pd.notna(X.loc[i, col]).any()
    return others_blank & any_filled


def chk_nda_fmt(s):
    """Match the application-number shape at the start of s.

    Returns the ``re.match`` result for one word character followed by four
    digits when ``s`` is exactly of type ``str``; returns None for any other
    type or when the text does not match.
    """
    if type(s) != str:
        return None
    return re.match(r'\w\d\d\d\d', s)
    
# parse_header() splits drug substance and drug product information
# in the patent module
def parse_header(X, mode):
    """Split a header string into a leading part and a trailing part.

    Parameters
    ----------
    X : header text.
    mode : 1 splits on every ';'; any other value splits once on the first
        hyphen (with optional surrounding whitespace) that is not the final
        character of the string.

    Returns
    -------
    tuple ``(head, tail)`` where ``tail`` is the last piece of the split and
    ``head`` is all earlier pieces concatenated without a separator.  When
    nothing splits, ``head`` is '' and ``tail`` is the whole string.
    """
    #headers for 25th to 29 edition
    if mode == 1:
        pieces = re.split(r';', X)

    #headers for editions from 30 onwards
    else:
        # maxsplit is passed by keyword: the positional form is deprecated
        # since Python 3.13.
        pieces = re.split(r'\s-\s|-\s|\s-(?!\Z)|-(?!\Z)', X, maxsplit=1)

    # re.split without capture groups yields only strings, so a plain join
    # is enough to glue the leading pieces back together.
    return ''.join(pieces[:-1]), pieces[-1]

# parse_body returns a row of data frame body information
# Used for appending body text information onto row information
def parse_body(X, row, col):
    """Return the item(s) of X selected by ``col``.

    ``row`` is accepted but not used.
    """
    selected = X[col]
    return selected

In [None]:
# Discontinued Drug Product Module
# Modified from Drug Product Module v 1_4 by removing checks for BE presence in entry bodies

def parse_dcn(df, tf, start, end, dpn_pos, note_pos, mode):
    """Build a table of discontinued drug products from parsed PDF rows.

    Rows ``start`` .. ``end - 1`` of the column-wise frame ``df[0]`` are
    walked in order.  Header rows update the running drug substance / route /
    drug product name (their text is taken from the same row of the
    line-wise frame ``tf[0]``); body rows fill in the product fields.  A
    completed entry is appended to the output on the following row, and an
    entry completed on the very last row is appended after the loop.

    Parameters
    ----------
    df : sequence whose first item is the column-wise DataFrame.  Columns
        'be', 'route', 'dp_name', 'note', 'sponsor', 'strength', 'app_no'
        and 'app_date' are read by label, and rows are addressed both by
        position and by label, so a 0..n-1 index is expected.
    tf : sequence whose first item is the line-wise DataFrame, row-aligned
        with ``df[0]``.
    start, end : first row to process and the row at which to stop.
    dpn_pos, note_pos : accepted for interface compatibility; not used.
    mode : forwarded to ``parse_page`` to select the page-header patterns.

    Returns
    -------
    DataFrame with columns ds_name, route, dp_name, be, note, sponsor,
    strength, app_no, prod_no, app_date.  If no entry is found it holds a
    single row of empty strings.
    """
    X = df[0]
    T = tf[0]

    prods=pd.DataFrame(data={
                        'ds_name' : [''],
                        'route' : [''],
                        'dp_name': [''],
                        'be' : [''],
                        'note' : [''],
                        'sponsor' : [''],
                        'strength' : [''],
                        'app_no' : [''],
                        'prod_no' : [''],
                        'app_date' : ['']
                    }
                )

    #temporary value holders
    head_line = ''
    otpt_row = prods.copy()
    old_sponsor = ''

    fl_first_entry = True
    fl_proc_body = False
    fl_ml_body = False

    for i in range(start,end):
        ipt_line = X.iloc[i, : ]

        # Write out the entry completed on the previous row, if any.
        if fl_proc_body:
            if fl_first_entry == True:
                prods.iloc[0, :] = otpt_row
                fl_proc_body = False
                fl_first_entry = False
            else:
                prods = pd.concat([prods, otpt_row])
                fl_proc_body = False
                head_line = ''

        if parse_page(get_col(ipt_line, 0) ,mode) == True:

            # process header information
            # Any time we get to header after first iteration, the body
            # information should process
            # Start with the lowest level header and work upwards
            # Is it a drug product name, route, or drug substance
            # Only drug substance can be multi-line
            # Clear head_line information after updating

            if (chk_nda_fmt(X.loc[i, 'app_no']) is None) and (fl_ml_body == False):
                if pd.isna(X.loc[i, ['be', 'route']]).all():
                    otpt_row.loc[0 ,'dp_name'] = get_col(T.iloc[i, : ],1)
                    head_line = ''
                elif pd.isna(X.loc[i, 'be']):
                    otpt_row.loc[0, 'route'] = get_col(T.iloc[i, : ],1)
                    head_line = ''
                else:
                    if not chk_ml_head(X,i):
                        head_line = head_line + ' '+ get_col(T.iloc[i, :],1)
                        otpt_row.loc[0, 'ds_name'] = head_line
                        head_line = ''
                    else:
                        head_line = head_line + ' ' + get_col(T.iloc[i, :],1)

            # process body information
            # Start from bottom case and work up
            # Start from far right case and work left

            # Is row part of a multi-line entry?
            # Is it a multi-line strength and multi-line sponsor?
            # Is it a multi-line strength?
            # Is it a multi-line sponsor?
            # Is the line part of a block of sponsor information?
            # Is it the initial body entry with expected content?

            #Single columns to append and write

            elif pd.notna(X.loc[i, ['strength','sponsor']]).any() &\
                    (pd.isna(X.loc[i, ~X.columns.isin(['strength', 'sponsor'])]).all()):
                if pd.notna(X.loc[i, 'strength']):
                    #Possible to have literal 'N/A in text, so convert to string'
                    otpt_row.loc[0, 'strength'] = (
                        str(otpt_row.loc[0, 'strength']) + ' '+ str(X.loc[i, 'strength']))
                if pd.notna(X.loc[i, 'sponsor']):
                    otpt_row.loc[0, 'sponsor'] = (
                        otpt_row.loc[0, 'sponsor'] + ' ' + str(X.loc[i, 'sponsor']))
                if chk_ml_body(X, i+1, ['sponsor','strength'],
                               ['dp_name','route','app_no']):
                    fl_proc_body = False
                    fl_ml_body = True
                else:
                    fl_proc_body = True
                    fl_ml_body = False

            #Whole entries with or without repeated sponsor names
            else:
                # Remember the sponsor so it can be carried down when this
                # row leaves the sponsor column empty.
                old_sponsor = str(otpt_row.loc[0, 'sponsor'])
                otpt_row.loc[0, ~otpt_row.columns.isin(['ds_name', 'route', 'dp_name'])] = \
                             X.loc[i, ~X.columns.isin(['dp_name', 'route'])]
                if chk_ml_body(X, i+1, ['sponsor','strength'],
                               ['route','note','dp_name', 'app_no','app_date']):
                    fl_proc_body = False
                    fl_ml_body = True
                    if pd.isna(X.loc[i,'sponsor']):
                        otpt_row.loc[0, 'sponsor'] = old_sponsor

                #Alternately, add an analogous section for strength.
                elif pd.isna(X.loc[i,'sponsor']):
                    otpt_row.loc[0, 'sponsor'] = old_sponsor
                    fl_proc_body = True
                    fl_ml_body = False
                else:
                    fl_proc_body = True
                    fl_ml_body = False
        else:
            fl_ml_body = False

    # An entry completed on the final row has no following iteration to
    # write it out, so flush it here.
    if fl_proc_body:
        if fl_first_entry == True:
            prods.iloc[0, :] = otpt_row
        else:
            prods = pd.concat([prods, otpt_row])

    return(prods)

In [None]:
# Updated in v1_4 from v1_3_4
# pass initial parsed table (line-by-line rendering) to parse_prods 
# in addition to column-wise parse
# Use a defined start page for input_pdf due to odd handling of arrow graphics 
# in volume 25 and 29.

def main_dcn():
    """Extract the discontinued drug product list from every PDF in the input folder.

    For each ``.pdf`` file in ``OrangeBook/Testing`` the function:

    1. takes the edition number and year from the file name;
    2. reads the pages twice with ``input_pdf`` -- once split into two
       columns (line text) and once split into the product columns;
    3. blanks hidden 'XX' cells, drops empty rows and renumbers both frames;
    4. locates the discontinued section with ``find_dcn_sections``;
    5. parses it with ``parse_dcn`` and writes
       ``<name>_dscn_v2_0.csv`` into the ``csv`` sub-folder.

    Failures are reported per file and processing continues with the next
    file.  Nothing is returned.
    """

    def _drop_hidden_rows(frame):
        """Blank 'XX' cells in the first column, drop empty rows, renumber."""
        cleaned = frame.copy()
        first_col = cleaned.columns[0]
        # Explicit assignment: an in-place replace on an .iloc slice is not
        # guaranteed to write back to the parent frame.
        cleaned[first_col] = cleaned[first_col].mask(cleaned[first_col] == 'XX')
        return cleaned.dropna(axis = 0, how = 'all').reset_index(drop = True)

    # pdf_io
    directory = os.path.join('OrangeBook', 'Testing')
    otpt_directory = os.path.join(directory, 'csv')

    # file naming variables
    p_ed = re.compile(r'\d\d(?=\D\D)')
    p_yr = re.compile(r'\d\d\d\d')

    # for page scans
    pg_col = [10]
    pg_area = [0, 5,99,95]
    pg_pd_opt = {
            'header':0,
            'index_col':False,
            'names':[
                    'Header_Txt',
                    'other'
                ]
            }

    # For drug product
    #col_25 modified from read_fnl_v1_4
    dp_col_25 = [85,90,94,98,205,395,435,475]
    dp_col_bf34 = [88,90,97,100,205,400,450,475]
    dp_col_fr34 = [62,77,82,100,210,400,450,475]
    dp_pd_opt_bf34 = {
            'header':0,
            'index_col':False,
            'names':[
                'be',
                'route',
                'note',
                'dp_name',
                'sponsor',
                'strength',
                'app_no',
                'prod_no',
                'app_date'
                ]
            }

    dp_pd_opt_fr34 = {
            'header':0,
            'index_col':False,
            'names':[
                'be',
                'route',
                'dp_name',
                'note',
                'sponsor',
                'strength',
                'app_no',
                'prod_no',
                'app_date'
                ]
            }

    for filename in os.scandir(directory):
        if filename.is_dir():
            continue
        if not filename.name.lower().endswith('.pdf'):
            continue
        print(filename)

        ed_match = p_ed.search(filename.name)
        yr_match = p_yr.search(filename.name)
        if ed_match is None or yr_match is None:
            print('File Input error in ',filename, '\n',
                  'edition or year not found in file name')
            continue
        edition = int(ed_match[0])

        # First page to read; editions 25 and 29 start earlier because of odd
        # handling of arrow graphics (see notes above this function).
        if edition in (25, 29):
            start_pg = 398
        elif edition in (41, 42):
            print('for 41 and 42 tf')
            start_pg = 503
        else:
            start_pg = 407
            #418, 442

        # Column layout differs before and from edition 34.
        if edition < 34:
            dpn_pos = 3
            note_pos = 2
            mode = 1
            dp_col = dp_col_25 if edition == 25 else dp_col_bf34
            dp_pd_opt = dp_pd_opt_bf34
        else:
            mode = 0
            dpn_pos = 2
            note_pos = 3
            dp_col = dp_col_fr34
            dp_pd_opt = dp_pd_opt_fr34

        try:
            print('in main, looking for pages in : ',filename.path)
            tf = input_pdf(filename.path, pg_area, pg_col, pg_pd_opt, start = start_pg)
            df = input_pdf(filename.path, pg_area, dp_col, dp_pd_opt, start = start_pg)
        except Exception as e:
            print('File Input error in ',filename, '\n',e)
            # Without both frames there is nothing to parse for this file.
            continue

        # Drug Product Parsing Block
        #Note: difference in parsing OB41, P500 (Biologics).  df is seeing an extra line
        #Try deleting blank lines.  IF not, hard code the DCN pages...
        dcn_pg = None
        try:
            print('Drug Block')
            print('Filename is',filename.name)

            #Remove hidden 'XX' string from pdf
            df[0] = _drop_hidden_rows(df[0])
            tf[0] = _drop_hidden_rows(tf[0])

            # Section rows are located AFTER cleaning so that the positions
            # refer to the same renumbered rows parse_dcn will index.
            sections = find_dcn_sections(tf[0], opts = 1)
            if sections is None:
                raise ValueError('section headers not found')
            drug_pg, otc_pg, dcn_pg, ode_pg, patent_pg, terms_pg = sections
            print ('Drug List between Pages: {} and {} \n\
                        Discontinued products between Pages: {} and {} \n\
                        Patent List between Pages: {} and {}'\
                        .format(drug_pg, otc_pg, dcn_pg, ode_pg, patent_pg, terms_pg)
                 )
            if dcn_pg is None or ode_pg is None:
                raise ValueError('discontinued section boundaries not found')

            prods = parse_dcn(df, tf, dcn_pg, ode_pg, dpn_pos, note_pos, mode)
            prods['dcn'] = yr_match[0]

            # Swap only the real extension for the output suffix.
            stem = os.path.splitext(filename.name)[0]
            os.makedirs(otpt_directory, exist_ok = True)
            rn_out_doc = os.path.join(otpt_directory, stem + '_dscn_v2_0.csv')
            prods.to_csv(rn_out_doc)
            print('Completed parsing discontinued product info from ',filename.name)

        except Exception as e:
            print('Drug Product Parse error in ',filename, '\n',e)
            # Show the first rows of the section being parsed as a debugging
            # aid; fall back to the top of the frame if it was never located.
            if len(df) > 0:
                first_row = dcn_pg or 0
                with pd.option_context('display.max_rows', None):
                    print(df[0].iloc[first_row:(first_row+30),:].head(30))

In [None]:
# Run the discontinued-product extraction over the files in the input directory
main_dcn()