In [None]:
# Package Author Mark Benmuvhar
# thesis_filter_dcn
# Version 2.0.0
# 10/16/2022

In [1]:
import numpy as np
import pandas as pd
from datetime import date
import re
import os

In [2]:
# pdf_io
# Read every file in the input directory as CSV and stack the results into one
# frame, `df`. os.path.join keeps the path portable and avoids the invalid
# '\d' escape sequence that the literal 'OrangeData\dcn' relied on.
directory = os.path.join('OrangeData', 'dcn')

# NOTE(review): these dtype keys are capitalised (e.g. 'Route') while the
# cleaning cell addresses lowercase columns (e.g. 'route'). Confirm the keys
# match the CSV headers, otherwise the intended dtypes are not being applied.
column_dtypes = {
    'BE_Rating': str,
    'Route': str,
    'Note': str,
    'DP_Name': str,
    'Sponsor': str,
    'Strength': str,
    'Application_No': str,
    'Product_No': int,
    'Appl_Date': str,
}

# Sort by file name so the row order of `df` does not depend on the arbitrary
# order in which os.scandir yields directory entries.
with os.scandir(directory) as entries:
    csv_entries = sorted(
        (entry for entry in entries if not entry.is_dir()),
        key=lambda entry: entry.name,
    )

# Fail here with a clear message instead of leaving `df` undefined (or stale
# from an earlier kernel state) for the cells below.
if not csv_entries:
    raise FileNotFoundError('No input files found in ' + directory)

# Load all frames first and concatenate once; concatenating inside the loop
# re-copies every previously loaded row on each iteration.
frames = [
    pd.read_csv(entry.path, dtype=column_dtypes, parse_dates=True)
    for entry in csv_entries
]
df = pd.concat(frames)

In [4]:
# find_use_pat looks for a use-code flag (U- followed by digits) via regex
# Intended for use in an apply statement using the Patent_Code column containing str

def find_use_pat(s):
    """Return the first use-code token (``U-`` followed by digits) in ``s``.

    Written for use with ``Series.apply`` on a column of strings.

    Parameters
    ----------
    s : str or missing value
        Text to search. Missing values (``None``/``NaN``) are accepted.

    Returns
    -------
    str or float
        The matched text (e.g. ``'U-123'``), or ``np.nan`` when ``s`` is
        missing, contains no use code, or cannot be searched as text.
    """
    try:
        if pd.isna(s):
            return np.nan
        m = re.search(r'U-\d+', s)
    except (TypeError, ValueError):
        # Reached for values that cannot be searched as text. The message no
        # longer reads the match object: it is never bound when the search
        # itself fails, so referencing it here raised a second error.
        print('error getting use code from ', s)
        return np.nan
    return m[0] if m else np.nan

In [5]:
def find_app_type(s):
    """Return the first ``'A'`` or ``'N'`` character found in ``s``.

    Parameters
    ----------
    s : str or missing value
        Text to search. Missing values (``None``/``NaN``) are accepted.

    Returns
    -------
    str or float
        ``'A'`` or ``'N'``, whichever occurs first in ``s``; ``np.nan`` when
        ``s`` is missing, contains neither letter, or cannot be searched as
        text.
    """
    try:
        if pd.isna(s):
            return np.nan
        m = re.search(r'A|N', s)
    except (TypeError, ValueError):
        # Report the unusable value, then return the same missing marker as
        # every other "nothing found" path instead of an implicit None.
        print('error finding application type in ', s)
        return np.nan
    return m[0] if m else np.nan

In [6]:
def find_app_nums(s):
    """Return the first run of digits found in ``s``.

    Parameters
    ----------
    s : str or missing value
        Text to search. Missing values (``None``/``NaN``) are accepted.

    Returns
    -------
    str or float
        The first contiguous digit run, leading zeros included (e.g.
        ``'012345'``); ``np.nan`` when ``s`` is missing, contains no digits,
        or cannot be searched as text.
    """
    try:
        if pd.isna(s):
            return np.nan
        m = re.search(r'\d+', s)
    except (TypeError, ValueError):
        print('error finding application numbers in ', s)
        return np.nan
    # A string without digits yields no match; treat it as missing rather
    # than indexing into None.
    return m[0] if m else np.nan

In [8]:
# Work on a copy so the concatenated input frame `df` stays untouched and this
# cell always starts from the same state when it is re-run.
dfw = df.copy()

# Application type letter extracted from the raw application number.
app_type = dfw['app_no'].apply(find_app_type)
dfw['app_type'] = app_type

# Product number -> text, zero-padded to three characters.
# The NaN replacement is assigned back to the column: calling
# replace(..., inplace=True) on dfw['prod_no'] can act on a temporary object
# and leave dfw itself unchanged.
# NOTE(review): the r'00+\s*' pattern is not anchored, so a run of two or more
# zeros is removed wherever it occurs (e.g. '100' -> '1', which is then padded
# to '001'). Confirm no product number contains such a run outside its
# leading padding.
dfw['prod_no'] = (
    dfw['prod_no']
    .replace(np.nan, None)
    .transform(str)
    .str.replace(r'00+\s*', '', regex=True)
    .str.zfill(3)
)

# Strength: drop '**' markers and any trailing text starting at 'Federal',
# then trim surrounding whitespace.
dfw['strength'] = (
    dfw['strength']
    .str.replace(r'\*\*', '', regex=True)
    .str.replace(r'Federal.*', '', regex=True)
    .str.strip()
)

dfw['ds_name'] = dfw['ds_name'].str.lstrip()

# Application number: first digit run of app_no, zero-padded to six characters.
dfw['app_num'] = dfw['app_no'].apply(find_app_nums).str.zfill(6)

dfw = dfw.drop(['app_no', 'be', 'Unnamed: 0'], axis=1)

# Put a space after ';' (and ',' for the name columns) whenever the separator
# is followed directly by a word character.
for column, separators in (('route', ';'), ('ds_name', ';,'), ('dp_name', ';,')):
    for separator in separators:
        dfw[column] = dfw[column].str.replace(
            pat=separator + r'(?=\w)', repl=separator + ' ', regex=True
        )

# Order by application and dcn so that keep='first' retains the row with the
# lowest dcn among rows that are identical in every column except dcn and
# sponsor.
dfw = dfw.sort_values(by=['app_num', 'dcn'])
dedupe_columns = dfw.columns[~dfw.columns.isin(['dcn', 'sponsor'])]
dfw = dfw.drop_duplicates(keep='first', subset=dedupe_columns)
dfw = dfw.reset_index(drop=True)

In [10]:
# Reconcile application types: an application number that appears with both
# type 'A' and type 'N' is relabelled 'A' on every one of its rows.

is_anda = dfw['app_type'].eq('A')
is_nda = dfw['app_type'].eq('N')
anda = dfw.loc[is_anda, ['app_num', 'app_type']]
nda = dfw.loc[is_nda, ['app_num']]

# The inner join keeps only application numbers present under both types.
to_update = pd.merge(nda, anda, how='inner', on=['app_num'])
to_update = to_update.drop_duplicates(keep='first')

has_both_types = dfw['app_num'].isin(to_update['app_num'])
dfw.loc[has_both_types, 'app_type'] = 'A'
dfw = dfw.reset_index(drop=True)

In [11]:
# Cast Notes to reference substance (!) or reference product (+) boolean
# Reference products will lump together combined RS/RP designation (!+)
# For this project, only consider reference product case
# When a ref product gets pulled from market, fda can designate a ref. subs.
# This complicates the analysis and is outside the simple comparison of
# time after reference product we are considering.

# Literal '+' search (regex=False), so no escaping is needed; rows with a
# missing note count as False.
ref_prod = dfw['note'].str.contains('+', regex=False, na=False)

# Assign the mask in one step so the new column has bool dtype; filling it
# through two partial .loc assignments leaves an object-dtype column.
dfw['ref_prod'] = ref_prod
dfw = dfw.drop(columns='note')


In [12]:
# Simplify the reference product codes.
# Reference status can be granted after the original application, or lost
# when a product leaves the market. For this study a row is flagged if its
# application number carries the reference flag on any row.
# NOTE(review): matching below is by app_num only ('route' is selected but not
# used for the match), so the flag spreads to every row of the application.
# Confirm that application-level propagation is intended.

# Rows currently flagged, reduced to distinct (app_num, route, ref_prod) rows.
currently_flagged = dfw['ref_prod']
ref_prod = dfw.loc[currently_flagged, ['app_num', 'route', 'ref_prod']]
ref_prod = ref_prod.drop_duplicates(keep='first')

# Flag every row whose application number appears among the flagged rows.
in_flagged_application = dfw['app_num'].isin(ref_prod['app_num'])
dfw.loc[in_flagged_application, 'ref_prod'] = True

In [13]:
# Write the cleaned table for the analysis step. The path is built with
# os.path.join so it does not depend on backslash separators (the original
# literal mixed an escaped '\\' with the invalid escape '\d').
# The row index is written as well (pandas default for to_csv).
output_dir = os.path.join('OrangeData', 'for_analysis')
os.makedirs(output_dir, exist_ok=True)  # create the folder on a fresh checkout
dfw.to_csv(os.path.join(output_dir, 'dcn_v_2_0.csv'))

In [21]:
# Preview the first rows of dfw as a quick visual check.
dfw.head()

Unnamed: 0,ds_name,route,dp_name,sponsor,strength,prod_no,app_date,dcn,app_type,app_num,ref_prod
0,HYDROXYAMPHETAMINE HYDROBROMIDE,SOLUTION/DROPS; OPHTHALMIC,PAREDRINE,AKORN,1%,4,,2005,N,4,False
1,SULFAPYRIDINE,TABLET; ORAL,SULFAPYRIDINE,LILLY,500MG,1,,2005,N,159,False
2,HEPARIN SODIUM,INJECTABLE; INJECTION,HEPARIN SODIUM,ORGANON USA INC,"1,000 UNITS/ML",8,,2005,N,552,False
3,HEPARIN SODIUM,INJECTABLE; INJECTION,HEPARIN SODIUM,ORGANON USA INC,"5,000 UNITS/ML",9,,2005,N,552,False
4,HEPARIN SODIUM,INJECTABLE; INJECTION,HEPARIN SODIUM,ORGANON USA INC,"10,000 UNITS/ML",10,,2005,N,552,False
