# SFDA_scraping project

- Step 1: Get the list of drug IDs (~6000 expected; the saved run below loaded 8250)
- Step 2: For each drug ID, fetch the drug information from https://sdi.sfda.gov.sa/Home/Result?drugId=X


In [1]:
#!pip install ipywidgets --upgrade
#!pip install tqdm --upgrade

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm

# Step 1: 
Get the list of drug IDs (~6000 expected; the saved run below loaded 8250)


In [2]:
# Search/listing endpoint of the SFDA drug site; the (currently commented-out)
# ID-scraping code appends "?page=<n>" to it to walk the result pages.
base_url = "https://sdi.sfda.gov.sa/Home/AdvancedSearch"

In [3]:
# # only needed once - comment out after running
# drug_ids = []
# # scrape the drug IDs from the website
# for page_number in range(1, 551):  # pages 1-550 (range end is exclusive)
#     url = base_url + "?page=" + str(page_number)
#     print("Page", page_number, url)

#     web_response = requests.get(url)
#     soup = BeautifulSoup(web_response.content, "html.parser")

#     links = soup.find_all("a", href=True)

#     for link in links:
#         href = link["href"]
#         if href.startswith("/Home/Result?drugId="):
#             drug_id = href.split("drugId=")[-1]
#             drug_ids.append(drug_id)

#     print("Length of Drug IDs so far", len(drug_ids))

# # Save the drug IDs to a CSV file
# drug_ids_df = pd.DataFrame(drug_ids, columns=["SFDA Drug ID"])
# drug_ids_df.to_csv("drug_ids.csv", index=False)
# print("Drug IDs saved to drug_ids.csv")
# print(drug_ids)

In [4]:
# Load the drug IDs from the CSV file written by the one-off scraping step.
# The file must contain a "SFDA Drug ID" column.
# No placeholder `drug_ids = []` beforehand: if the read fails, the cell should
# raise rather than leave an empty list that later cells treat as "nothing to do".
drug_ids_df = pd.read_csv("drug_ids.csv")
drug_ids = drug_ids_df["SFDA Drug ID"].tolist()
print("Loaded", len(drug_ids), "Drug IDs")

Loaded 8250 Drug IDs


# Step 2: 
for each drugId, get the drug information from https://sdi.sfda.gov.sa/Home/Result?drugId=X


In [5]:
# Fields to extract: each pair is (Arabic label text looked up on the drug page,
# English name used as the key/column in the collected data).
fields = dict(
    [
        ("رقم التسجيل", "Registration Number"),
        ("سنة التسجيل", "Registration Year"),
        ("الاسم التجاري", "Brand Name"),
        ("الاسم العلمي", "Generic Name"),
        ("التركيز", "Strength"),
        ("وحدة التركيز", "Unit"),
        ("طريقة الإستعمال", "Route of Administration"),
        ("الشكل الصيدلاني", "Dosage Form"),
        ("حجم العبوة", "Package Size"),
        ("انواع العبوات", "Package Types"),
        ("طريقة الصرف", "Dispensing Method"),
        ("حالة المراقبة", "Monitoring Status"),
        ("نوع الدواء", "Drug Type"),
        ("مدة الصلاحية بالأشهر", "Shelf Life (Months)"),
        ("شروط التخزين", "Storage Conditions"),
        ("سعر الجمهور بالريال", "Public Price (SAR)"),
        ("المصنع", "Manufacturer"),
        ("الوكيل", "Agent"),
        ("الشركة المسوقة", "Marketing Company"),
    ]
)

In [6]:
# URL prefix of a single drug's detail page; get_drug_info() appends the drug ID to it.
drug_page_base_url = "https://sdi.sfda.gov.sa/Home/Result?drugId="

In [7]:
# Look up a field's value on the page via the text of its <label>
def get_field_value(soup, label_text):
    """Return the stripped text of the value element that follows a label.

    Args:
        soup: Parsed page (or any object exposing a compatible ``find``).
        label_text: Text the ``<label>`` must match, as passed to ``string=``.

    Returns:
        The whitespace-stripped text of the first ``<div class="form-line">``
        found after the matching label, or ``None`` when either the label or
        that div is missing.
    """
    label_tag = soup.find("label", string=label_text)
    if not label_tag:
        return None

    container = label_tag.find_next("div", class_="form-line")
    if not container:
        return None

    return container.text.strip()


def get_drug_info(drug_id, timeout=30):
    """Fetch one drug's detail page and extract the configured fields.

    Args:
        drug_id: SFDA drug identifier; its string form is appended to
            ``drug_page_base_url`` to build the page URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scraping loop forever.

    Returns:
        dict with one entry per English name in ``fields`` (value is the
        extracted text, or ``None`` when the label is not found on the page),
        plus "SFDA Drug URL" and "SFDA Drug ID".

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status (4xx/5xx).
    """
    url = drug_page_base_url + str(drug_id)

    # Send a GET request to the URL
    web_response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors instead of parsing an error page: otherwise the result
    # is a row of None values that is saved and the ID counted as processed.
    web_response.raise_for_status()
    soup = BeautifulSoup(web_response.content, "html.parser")

    # Extract the data: page label (Arabic) -> output key (English)
    data = {
        english_name: get_field_value(soup, arabic_label)
        for arabic_label, english_name in fields.items()
    }
    # add drug ID and URL
    data["SFDA Drug URL"] = url
    data["SFDA Drug ID"] = drug_id
    return data

In [8]:
# Records for all drugs collected so far (list of dicts, one per drug).
# Resume from the Excel checkpoint when it exists; on a first run the file has
# not been written yet, so start from an empty list instead of failing.
try:
    all_drugs_data_df = pd.read_excel("drug_info.xlsx")
except FileNotFoundError:
    all_drugs_data_df = pd.DataFrame()
    all_drugs_data = []
else:
    all_drugs_data = all_drugs_data_df.to_dict("records")
print("Loaded", len(all_drugs_data), "Drug data")

Loaded 9214 Drug data


In [9]:
# Skip drug IDs that are already present in the loaded data.
# A set makes each membership test O(1); with a list, every ID in drug_ids
# triggered a linear scan over all previously processed records.
processed_drug_ids = {drug["SFDA Drug ID"] for drug in all_drugs_data}
drug_ids = [drug_id for drug_id in drug_ids if drug_id not in processed_drug_ids]
print("Remaining", len(drug_ids), "Drug IDs")

Remaining 152 Drug IDs


In [10]:
# Helper: persist the collected records to the Excel checkpoint file
def save_data_to_excel(drugs_data):
    """Write ``drugs_data`` to "drug_info.xlsx" and print a confirmation.

    Args:
        drugs_data: Data accepted by ``pd.DataFrame``; in this notebook, the
            list of per-drug dicts.

    The file is written without the DataFrame index column.
    """
    pd.DataFrame(drugs_data).to_excel("drug_info.xlsx", index=False)
    print("Data saved to drug_info.xlsx")


# Fetch drug information from the SFDA website
SAVE_EVERY = 10  # checkpoint to disk after this many successfully added records
failed_drug_ids = []  # IDs whose iteration raised; kept so they can be inspected or retried
i = 0  # number of records added successfully in this run

try:
    for drug_id in tqdm(drug_ids):
        try:
            # Add the data to the list
            data = get_drug_info(drug_id)
            all_drugs_data.append(data)

            # Save data to file every SAVE_EVERY records
            i += 1
            if i % SAVE_EVERY == 0:
                save_data_to_excel(all_drugs_data)

        except Exception as e:
            # Best effort: report the problem, remember the ID, move on to the next one
            print(f"Error processing drug ID {drug_id}: {e}")
            failed_drug_ids.append(drug_id)
finally:
    # Runs on normal completion and also when the loop is interrupted
    # (e.g. KeyboardInterrupt), so records fetched since the last checkpoint
    # are not lost.
    save_data_to_excel(all_drugs_data)

if failed_drug_ids:
    print(f"{len(failed_drug_ids)} drug IDs failed: {failed_drug_ids}")

  0%|          | 0/152 [00:00<?, ?it/s]

Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx
Data saved to drug_info.xlsx


In [11]:
# View the collected records as a DataFrame; as the cell's last expression,
# the notebook renders it as the cell output.
pd.DataFrame(all_drugs_data)

Unnamed: 0,Registration Number,Registration Year,Brand Name,Generic Name,Strength,Unit,Route of Administration,Dosage Form,Package Size,Package Types,...,Monitoring Status,Drug Type,Shelf Life (Months),Storage Conditions,Public Price (SAR),Manufacturer,Agent,Marketing Company,SFDA Drug URL,SFDA Drug ID
0,1507245577,2024.0,Rezurock,BELUMOSUDIL MESYLATE,200,mg,Oral use,Film-coated tablet,30,Bottle,...,Uncontrolled,NCE,24.0,do not store above 30°c,,UPM Pharmaceuticals,Sanofi Arabia Trading Co. Ltd,SANOFI WINTHROP INDUSTRIE,https://sdi.sfda.gov.sa/Home/Result?drugId=12641,12641
1,1407245571,2024.0,Elbag,ELTROMBOPAG OLAMINE,25,mg,Oral use,Film-coated tablet,28,Blister,...,Uncontrolled,Generic,24.0,do not store above 30°c,,Synthon Hispania SL,"SAJA Pharmaceutical Co., Ltd.",SAJA-SAUDI ARABIAN JAPANESE PHARMACEUTICAL CO,https://sdi.sfda.gov.sa/Home/Result?drugId=12635,12635
2,1407245569,2024.0,Elbag,ELTROMBOPAG OLAMINE,50,mg,Oral use,Film-coated tablet,28,Blister,...,Uncontrolled,Generic,24.0,do not store above 30°c,,Synthon Hispania SL,"SAJA Pharmaceutical Co., Ltd.",SAJA-SAUDI ARABIAN JAPANESE PHARMACEUTICAL CO,https://sdi.sfda.gov.sa/Home/Result?drugId=12634,12634
3,1407245570,2024.0,Omeprex Plus,"OMEPRAZOLE,SODIUM BICARBONATE",401100,mg,Oral use,Capsule,30,Blister,...,Uncontrolled,Generic,24.0,store below 30°c,,SAJA-SAUDI ARABIAN JAPANESE PHARMACEUTICAL CO,"Farouk, Maamoun Tamer & CO",SAJA-SAUDI ARABIAN JAPANESE PHARMACEUTICAL CO,https://sdi.sfda.gov.sa/Home/Result?drugId=12633,12633
4,1206245415,,Akeega,"NIRAPARIB,ABIRATERONE",50500,mg,Oral use,Film-coated tablet,56,Blister,...,Uncontrolled,NCE,30.0,store below 30°c,,PATHEON,Cigalah Group,Janssen-Cilag International NV,https://sdi.sfda.gov.sa/Home/Result?drugId=12579,12579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9361,0202233207,2013,AMLOR CAPS 5MG,AMLODIPINE BESILATE,5,mg,Oral use,"Capsule, hard",30,Blister,...,Uncontrolled,NCE,48,store below 30°c,,PFIZER SAUDI LIMITED,Viatris Arabia Limited,Viatris Arabia Limited,https://sdi.sfda.gov.sa/Home/Result?drugId=3169,3169
9362,0202233206,2013,AMLOR 10MG CAPSULES,AMLODIPINE BESILATE,10,mg,Oral use,"Capsule, hard",30,Blister,...,Uncontrolled,NCE,48,store below 30°c,,PFIZER SAUDI LIMITED,Viatris Arabia Limited,Viatris Arabia Limited,https://sdi.sfda.gov.sa/Home/Result?drugId=3170,3170
9363,11-539-08,2008,AMISTOP 5MG-5ML SUSPENSION,DOMPERIDONE,0.1,,Oral use,Suspension,1,Bottle,...,Uncontrolled,Generic,24,store below 30°c,,DEEF PHARMACEUTICAL INDUSTRIES,DEEF MARKITING COMPANY,DEEF PHARMACEUTICAL INDUSTRIES,https://sdi.sfda.gov.sa/Home/Result?drugId=3174,3174
9364,1005233628,2011,AMPIPLUS 1.5GM POWDER FOR INGECTION,"AMPICILLIN,SULBACTAM",1000500,mg,Parenteral use,powder for solution for injection,50,Vial,...,Uncontrolled,Generic,24,store below 25°c,,ANTIBIOTIC S.E,AL HOBAIL MEDICAL OFFICE COMPANY LTD.,ANTIBIOTIC S.E,https://sdi.sfda.gov.sa/Home/Result?drugId=3189,3189
