In [15]:
import pandas as pd

pd.read_csv("/scratch/harsha.vasamsetti/Drug_Safety_Labels_Output.csv")

Unnamed: 0,Drug,Application Number,Date,Boxed Warning,Contraindications,Warnings and Precautions,Adverse Reactions,Drug Interactions,Use in Specific Populations,PCI/PI/MG
0,Drug Safety-related Labeling Changes,,11/30/2022(SUPPL-48),,,,x,,x,
1,Drug Safety-related Labeling Changes,,02/05/2020(SUPPL-44),,,,x,,,
2,Drug Safety-related Labeling Changes,,02/05/2020(SUPPL-45),,,,x,,,
3,Drug Safety-related Labeling Changes,,08/07/2019(SUPPL-43),,,,,,x,
4,Drug Safety-related Labeling Changes,,02/23/2017(SUPPL-42),,,x,x,,,
5,Drug Safety-related Labeling Changes,,08/18/2016(SUPPL-41),,,x,x,,,x
6,Drug Safety-related Labeling Changes,,01/15/2016(SUPPL-40),,,,x,,,
7,Drug Safety-related Labeling Changes,,02/05/2020(SUPPL-35),,,,x,,,
8,Drug Safety-related Labeling Changes,,02/05/2020(SUPPL-36),,,,x,,,
9,Drug Safety-related Labeling Changes,,08/07/2019(SUPPL-34),,,,,,x,


In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import logging
import random
import time
from tqdm import tqdm
import os

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the input CSV file
input_file_path = '/scratch/harsha.vasamsetti/Drug Safety-related Labeling Changes (SrLC).csv'
drug_data = pd.read_csv(input_file_path)

# Define columns for the output CSV
columns = [
    'Drug', 'Application Number', 'Boxed Warning', 'Contraindications',
    'Warnings and Precautions', 'Adverse Reactions', 'Drug Interactions',
    'Use in Specific Populations', 'PCI/PI/MG'
]

# List of User-Agent strings
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
]

# Set up a requests Session
session = requests.Session()

# Common headers for a real browser
base_headers = {
    'User-Agent': random.choice(user_agents),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-User': '?1',
    'Host': 'www.accessdata.fda.gov',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache'
}

def extract_sections(url):
    section_data = {
        'Boxed Warning': '',
        'Contraindications': '',
        'Warnings and Precautions': '',
        'Adverse Reactions': '',
        'Drug Interactions': '',
        'Use in Specific Populations': '',
        'PCI/PI/MG': ''
    }
    try:
        # Randomize the User-Agent for each request
        headers = base_headers.copy()
        headers['User-Agent'] = random.choice(user_agents)
        
        # Request the detail page
        response = session.get(url, headers=headers, timeout=10, allow_redirects=True)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract Drug Name and Application Number
            h3_tags = soup.find_all('h3')
            for h3 in h3_tags:
                if 'small' in str(h3):
                    drug_app_info = h3.get_text(strip=True)
                    drug_name = drug_app_info.split(' (')[0]
                    app_number = drug_app_info.split('(NDA-')[1].split(')')[0]
                    section_data['Drug'] = drug_name
                    section_data['Application Number'] = app_number
                    break
            
            # Check for sections in h2, h3, h4
            for header in soup.find_all(['h2', 'h3', 'h4']):
                header_text = header.get_text(strip=True).lower()
                if 'boxed warning' in header_text:
                    section_data['Boxed Warning'] = 'x'
                elif 'contraindications' in header_text:
                    section_data['Contraindications'] = 'x'
                elif 'warnings and precautions' in header_text:
                    section_data['Warnings and Precautions'] = 'x'
                elif 'adverse reactions' in header_text:
                    section_data['Adverse Reactions'] = 'x'
                elif 'drug interactions' in header_text:
                    section_data['Drug Interactions'] = 'x'
                elif 'use in specific populations' in header_text:
                    section_data['Use in Specific Populations'] = 'x'
                elif 'pci' in header_text or 'patient counseling information' in header_text:
                    section_data['PCI/PI/MG'] = 'x'
        else:
            logging.error(f"Failed to retrieve URL {url}: Status code {response.status_code}")
    except Exception as e:
        logging.error(f"Error processing URL {url}: {e}")
    
    return section_data

# Define the output file path
output_file_path = 'Drug_Safety_Labels_Output.csv'

# Check if the file exists; if not, write the header
if not os.path.exists(output_file_path):
    pd.DataFrame(columns=columns).to_csv(output_file_path, index=False)

# Process each drug in the input file
for index, row in tqdm(drug_data.iterrows(), total=len(drug_data)):
    drug = row.get('Drug', '')
    application_number = row.get('Application Number', '')
    link = row.get('Link', '')

    if pd.notna(link) and isinstance(link, str) and link.strip():
        logging.info(f"Processing {drug} ({application_number}) => {link}")
        sections = extract_sections(link)
        row_data = {
            'Drug': sections.get('Drug', ''),
            'Application Number': sections.get('Application Number', ''),
            **sections
        }
        # Create a DataFrame for the single row
        df = pd.DataFrame([row_data], columns=columns)
        # Append the DataFrame to the CSV file without the header
        df.to_csv(output_file_path, mode='a', header=False, index=False)
        # Random delay (2-5 seconds) to reduce chances of being blocked
        time.sleep(random.uniform(2, 5))

logging.info(f"Data extraction complete. Output saved to {output_file_path}")

  0%|          | 0/2993 [00:00<?, ?it/s]2025-01-18 17:42:10,275 - INFO - Processing  (21436) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=38
  0%|          | 1/2993 [00:04<3:56:39,  4.75s/it]2025-01-18 17:42:15,022 - INFO - Processing  (21713) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=39
  0%|          | 2/2993 [00:08<3:25:43,  4.13s/it]2025-01-18 17:42:18,716 - INFO - Processing  (21729) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=40
  0%|          | 3/2993 [00:11<2:57:24,  3.56s/it]2025-01-18 17:42:21,602 - INFO - Processing  (21866) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=41
  0%|          | 4/2993 [00:15<3:05:16,  3.72s/it]2025-01-18 17:42:25,565 - INFO - Processing  (202971) => https://www.accessdata.

KeyboardInterrupt: 

In [None]:
Drug,                Application Number,  Date,                   Boxed Warning,  Contraindications,  Warnings and Precautions,  Adverse Reactions,  Drug Interactions,  Use in Specific Populations,  PCI/PI/MG
ABILIFY(NDA-021436), 021436,              11/30/2022,             ,                 ,                       ,                     x,                    ,                          x,                     ,
ABILIFY(NDA-021436), 021436,              02/05/2020,             ,                 ,                       ,                     x,                    ,                           ,                     ,
ABILIFY(NDA-021436), 021436,              02/05/2020,             ,                 ,                       ,                     x,                    ,                           ,                     ,
ABILIFY(NDA-021436), 021436,              08/07/2019,             ,                 ,                       ,                      ,                    ,                          x,                     ,
ABILIFY(NDA-021436), 021436,              02/23/2017,             ,                 ,                      x,                     x,                    ,                           ,                     , 
ABILIFY(NDA-021436), 021436,              08/18/2016,             ,                 ,                      x,                     x,                    ,                           ,                    x,
ABILIFY(NDA-021436), 021436,              01/15/2016,             ,                 ,                       ,                     x,                    ,                           ,                     ,
ABILIFY(NDA-021713), 021713,              02/05/2020              ,                 ,                       ,                     x,                    ,                           ,                     ,
ABILIFY(NDA-021713), 021713,              02/05/2020              ,                 ,                       ,                     x,                    ,                           ,                     ,
ABILIFY(NDA-021713), 021713,              08/07/2019              ,                 ,                       ,                      ,                    ,                          x,                     ,
ABILIFY(NDA-021713), 021713,              02/23/2017              ,                 ,                      x,                     x,                    ,                           ,                     ,
ABILIFY(NDA-021713), 021713,              08/18/2016              ,                 ,                      x,                     x,                    ,                           ,                    x,
ABILIFY(NDA-021713), 021713,              01/15/2016              ,                 ,                       ,                     x,                    ,                           ,                     ,                           

In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import logging
import random
import time
from tqdm import tqdm
import os
import re

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the input CSV file
input_file_path = '/scratch/harsha.vasamsetti/Drug Safety-related Labeling Changes (SrLC).csv'
drug_data = pd.read_csv(input_file_path)

# Define columns for the output CSV
columns = [
    'Drug', 'Application Number', 'Date', 'Boxed Warning', 'Contraindications',
    'Warnings and Precautions', 'Adverse Reactions', 'Drug Interactions',
    'Use in Specific Populations', 'PCI/PI/MG'
]

# List of User-Agent strings
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
]

# Set up a requests Session
session = requests.Session()

# Common headers for a real browser
base_headers = {
    'User-Agent': random.choice(user_agents),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-User': '?1',
    'Host': 'www.accessdata.fda.gov',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache'
}

def extract_sections(url):
    updates = []
    try:
        headers = base_headers.copy()
        headers['User-Agent'] = random.choice(user_agents)
        response = session.get(url, headers=headers, timeout=10, allow_redirects=True)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find the h3 tag containing the drug name and application number
            h3_tags = soup.find_all('h3')
            drug_name = ''
            app_number = ''
            for h3 in h3_tags:
                if 'NDA-' in h3.get_text():
                    drug_app_info = h3.get_text(strip=True)
                    drug_name = drug_app_info.split(' (')[0].strip()
                    app_number_match = re.search(r'NDA-(\d+)', drug_app_info)
                    if app_number_match:
                        app_number = app_number_match.group(1)
                    break
            # Find all update headers
            update_headers = soup.find_all('h3', class_='background_text accordion-header ui-accordion-header ui-helper-reset ui-state-default ui-accordion-icons ui-corner-all')
            for header in update_headers:
                date_str = header.find(text=True, recursive=False).strip()
                content_div = header.find_next('div', class_='ui-accordion-content ui-helper-reset ui-widget-content ui-corner-bottom')
                if content_div:
                    sections = {
                        'Boxed Warning': '',
                        'Contraindications': '',
                        'Warnings and Precautions': '',
                        'Adverse Reactions': '',
                        'Drug Interactions': '',
                        'Use in Specific Populations': '',
                        'PCI/PI/MG': ''
                    }
                    for h4 in content_div.find_all('h4'):
                        header_text = h4.get_text(strip=True).lower()
                        if 'boxed warning' in header_text:
                            sections['Boxed Warning'] = 'x'
                        elif 'contraindications' in header_text:
                            sections['Contraindications'] = 'x'
                        elif 'warnings and precautions' in header_text:
                            sections['Warnings and Precautions'] = 'x'
                        elif 'adverse reactions' in header_text:
                            sections['Adverse Reactions'] = 'x'
                        elif 'drug interactions' in header_text:
                            sections['Drug Interactions'] = 'x'
                        elif 'use in specific populations' in header_text:
                            sections['Use in Specific Populations'] = 'x'
                        elif 'pci' in header_text or 'patient counseling information' in header_text:
                            sections['PCI/PI/MG'] = 'x'
                    update_data = {
                        'Drug': drug_name,
                        'Application Number': app_number,
                        'Date': date_str,
                        **sections
                    }
                    updates.append(update_data)
        else:
            logging.error(f"Failed to retrieve URL {url}: Status code {response.status_code}")
    except Exception as e:
        logging.error(f"Error processing URL {url}: {e}")
    return updates

# Define the output file path
output_file_path = 'Drug_Safety_Labels_Output.csv'

# Check if the file exists; if not, write the header
if not os.path.exists(output_file_path):
    pd.DataFrame(columns=columns).to_csv(output_file_path, index=False, encoding='utf-8')

# Process each drug in the input file
for index, row in tqdm(drug_data.iterrows(), total=len(drug_data)):
    drug = row.get('Drug', '')
    application_number = row.get('Application Number', '')
    link = row.get('Link', '')

    if pd.notna(link) and isinstance(link, str) and link.strip():
        logging.info(f"Processing {drug} ({application_number}) => {link}")
        updates = extract_sections(link)
        for update in updates:
            df = pd.DataFrame([update], columns=columns)
            df.to_csv(output_file_path, mode='a', header=False, index=False, encoding='utf-8')
        time.sleep(random.uniform(2, 5))

logging.info(f"Data extraction complete. Output saved to {output_file_path}")

  0%|          | 0/2993 [00:00<?, ?it/s]2025-01-18 18:00:36,547 - INFO - Processing  (21436) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=38


  date_str = header.find(text=True, recursive=False).strip()
  0%|          | 1/2993 [00:04<3:53:49,  4.69s/it]2025-01-18 18:00:41,236 - INFO - Processing  (21713) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=39
  0%|          | 2/2993 [00:09<3:44:43,  4.51s/it]2025-01-18 18:00:45,619 - INFO - Processing  (21729) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=40
  0%|          | 3/2993 [00:14<3:57:05,  4.76s/it]2025-01-18 18:00:50,674 - INFO - Processing  (21866) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=41
  0%|          | 4/2993 [00:19<4:08:01,  4.98s/it]2025-01-18 18:00:55,991 - INFO - Processing  (202971) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=24
  0%|          | 5/2993 [00:24<4:02:21,  4.87s/it]2025-01-

KeyboardInterrupt: 

In [17]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import logging
import random
import time
from tqdm import tqdm
import os
import re

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the input CSV file
input_file_path = '/scratch/harsha.vasamsetti/Drug Safety-related Labeling Changes (SrLC).csv'
drug_data = pd.read_csv(input_file_path)

# Define columns for the output CSV
columns = [
    'Drug', 'Application Number', 'Date', 'Boxed Warning', 'Contraindications',
    'Warnings and Precautions', 'Adverse Reactions', 'Drug Interactions',
    'Use in Specific Populations', 'PCI/PI/MG'
]

# List of User-Agent strings
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
]

# Set up a requests Session
session = requests.Session()

# Common headers for a real browser
base_headers = {
    'User-Agent': random.choice(user_agents),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-User': '?1',
    'Host': 'www.accessdata.fda.gov',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache'
}

def extract_sections(url):
    updates = []
    try:
        headers = base_headers.copy()
        headers['User-Agent'] = random.choice(user_agents)
        response = session.get(url, headers=headers, timeout=10, allow_redirects=True)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find the h3 tag containing the drug name and application number
            h3_tags = soup.find_all('h3')
            drug_name = ''
            app_number = ''
            for h3 in h3_tags:
                if 'NDA-' in h3.get_text() or 'BLA-' in h3.get_text():
                    drug_app_info = h3.get_text(strip=True)
                    logging.debug(f"Drug application info: {drug_app_info}")
                    drug_name = drug_app_info.split(' (')[0].strip()
                    app_number_match = re.search(r'(NDA|BLA)-(\d+)', drug_app_info)
                    if app_number_match:
                        app_number = app_number_match.group(2)
                    break
            # Find all update headers
            update_headers = soup.find_all('h3', class_='background_text accordion-header ui-accordion-header ui-helper-reset ui-state-default ui-accordion-icons ui-corner-all')
            for header in update_headers:
                date_str = header.find(text=True, recursive=False).strip()
                content_div = header.find_next('div', class_='ui-accordion-content ui-helper-reset ui-widget-content ui-corner-bottom')
                if content_div:
                    sections = {
                        'Boxed Warning': '',
                        'Contraindications': '',
                        'Warnings and Precautions': '',
                        'Adverse Reactions': '',
                        'Drug Interactions': '',
                        'Use in Specific Populations': '',
                        'PCI/PI/MG': ''
                    }
                    for h4 in content_div.find_all('h4'):
                        header_text = h4.get_text(strip=True).lower()
                        if 'boxed warning' in header_text:
                            sections['Boxed Warning'] = 'x'
                        elif 'contraindications' in header_text:
                            sections['Contraindications'] = 'x'
                        elif 'warnings and precautions' in header_text:
                            sections['Warnings and Precautions'] = 'x'
                        elif 'adverse reactions' in header_text:
                            sections['Adverse Reactions'] = 'x'
                        elif 'drug interactions' in header_text:
                            sections['Drug Interactions'] = 'x'
                        elif 'use in specific populations' in header_text:
                            sections['Use in Specific Populations'] = 'x'
                        elif 'pci' in header_text or 'patient counseling information' in header_text:
                            sections['PCI/PI/MG'] = 'x'
                    update_data = {
                        'Drug': drug_name,
                        'Application Number': app_number,
                        'Date': date_str,
                        **sections
                    }
                    updates.append(update_data)
        else:
            logging.error(f"Failed to retrieve URL {url}: Status code {response.status_code}")
    except Exception as e:
        logging.error(f"Error processing URL {url}: {e}")
    return updates

# Define the output file path
output_file_path = 'Drug_Safety_Labels_Output1.csv'

# Check if the file exists; if not, write the header
if not os.path.exists(output_file_path):
    pd.DataFrame(columns=columns).to_csv(output_file_path, index=False, encoding='utf-8')

# Process each drug in the input file
for index, row in tqdm(drug_data.iterrows(), total=len(drug_data)):
    drug = row.get('Drug', '')
    application_number = row.get('Application Number', '')
    link = row.get('Link', '')

    if pd.notna(link) and isinstance(link, str) and link.strip():
        logging.info(f"Processing {drug} ({application_number}) => {link}")
        updates = extract_sections(link)
        for update in updates:
            df = pd.DataFrame([update], columns=columns)
            df.to_csv(output_file_path, mode='a', header=False, index=False, encoding='utf-8')
        time.sleep(random.uniform(2, 5))

logging.info(f"Data extraction complete. Output saved to {output_file_path}")

  0%|          | 0/2993 [00:00<?, ?it/s]2025-01-18 18:30:08,322 - INFO - Processing  (21436) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=38
  date_str = header.find(text=True, recursive=False).strip()
  0%|          | 1/2993 [00:05<4:37:52,  5.57s/it]2025-01-18 18:30:13,895 - INFO - Processing  (21713) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=39
  0%|          | 2/2993 [00:08<3:31:57,  4.25s/it]2025-01-18 18:30:17,223 - INFO - Processing  (21729) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=40
  0%|          | 3/2993 [00:14<3:57:02,  4.76s/it]2025-01-18 18:30:22,580 - INFO - Processing  (21866) => https://www.accessdata.fda.gov/scripts/cder/safetylabelingchanges/index.cfm?event=searchdetail.page&DrugNameID=41
  0%|          | 4/2993 [00:18<3:38:00,  4.38s/it]2025-01-18 18:30:26

KeyboardInterrupt: 