In [None]:
"""
Check that PharmNet-DZ is reachable by sending a single GET request to the
A-letter listing page and reporting the HTTP status that comes back.
"""

import requests

# Listing page for the medicines whose name starts with the letter 'A'
url = "https://www.pharmnet-dz.com/alphabet.aspx?char=A"

# Send a browser-like User-Agent to mimic a real user
headers = {"User-Agent": "Mozilla/5.0"}

# requests applies no timeout by default, so a stalled server would otherwise
# block this cell indefinitely.
REQUEST_TIMEOUT = 30  # seconds

try:
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
except requests.RequestException as exc:
    # Connection-level failure (DNS, refused connection, timeout, ...):
    # there is no HTTP status to inspect, so report the error itself.
    print(f"Request failed before a response was received: {exc}")
else:
    # Report the outcome based on the HTTP status code
    if response.status_code == 200:
        print("Access allowed! The site does not block basic requests.")
        print("First 500 characters of the page content:")
        print(response.text[:500])

    elif response.status_code == 403:
        print("Access denied. Site may block bots.")

    elif response.status_code == 404:
        print("Page not found. Check URL.")

    else:
        print(f"Unexpected status code: {response.status_code}")


✅ Access allowed! The site does not block basic requests.
🔹 First 500 characters of the page content:

<!DOCTYPE html>
<html>
<head><title>
	
    Liste des médicaments (Ordre alphabétique) | PharmNet - Encyclopédie des médicaments en Algérie | Propriété Sarl ESAHTI

</title><link href="/css/Pro/pharmnet_05_10.css" rel="stylesheet" /><meta http-equiv="Content-Type" content="text/html;charset=ISO-8859-1" /><meta charset="UTF-8" /><link rel="shortcut icon" href="/img/favicons/fav3.ico" /><meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta name="description" co


In [None]:
"""
Scrape every medicine entry from PharmNet-DZ (letters A to Z, all result pages
of each letter): medicine name, profile link and notice link.
The collected rows are then saved to a CSV file.
"""

import string
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


base_url = "https://www.pharmnet-dz.com/"
alphabet_url = "https://www.pharmnet-dz.com/alphabet.aspx?char={}&p={}"

headers = {"User-Agent": "Mozilla/5.0"}

# requests applies no timeout by default; without one a stalled response would
# freeze the whole crawl.
REQUEST_TIMEOUT = 30  # seconds
# Pause between two consecutive listing pages.
PAGE_DELAY = 1  # seconds

CSV_COLUMNS = ["Medicine Name", "Medicine URL", "Notice URL"]


def parse_medicine_rows(html, site_root=base_url):
    """Extract the medicine records found in one listing page.

    Args:
        html: HTML text of a listing page.
        site_root: URL that relative hrefs are resolved against.

    Returns:
        A list of dicts with the keys "Medicine Name", "Medicine URL" and
        "Notice URL". "Notice URL" is the placeholder "No Notice" when no cell
        of the row links to notice.ashx.
    """
    soup = BeautifulSoup(html, "html.parser")
    records = []

    # Each <tr> represents a row in the table
    for row in soup.find_all("tr"):
        td_list = row.find_all("td")
        if len(td_list) < 2:
            continue

        # The first cell is expected to hold the link to the medicine profile
        med_anchor = td_list[0].find("a", href=True)
        if not med_anchor:
            continue

        # Quote characters are stripped from the stored name
        medicine_name = med_anchor.text.strip().replace('"', "").replace("'", "")
        # urljoin copes with relative, root-relative ("/x") and absolute hrefs,
        # where plain concatenation would yield "//" or a malformed URL.
        medicine_link = urljoin(site_root, med_anchor["href"])

        # Look at the first link of every cell until one points to a notice
        notice_link = "No Notice"
        for td in td_list:
            notice_tag = td.find("a", href=True)
            if notice_tag and "notice.ashx" in notice_tag["href"]:
                notice_link = urljoin(site_root, notice_tag["href"])
                break

        records.append({
            "Medicine Name": medicine_name,
            "Medicine URL": medicine_link,
            "Notice URL": notice_link,
        })

    return records


all_medicines = []

# Looping over all letters A-Z
for letter in string.ascii_uppercase:
    page = 1
    previous_page_urls = None

    while True:
        print(f"Scraping letter {letter}, page {page}")

        url = alphabet_url.format(letter, page)
        try:
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Treated like a bad status: give up on this letter but keep
            # everything collected so far.
            print(f"Could not access {url}. Error: {exc}")
            break

        if response.status_code != 200:
            print(f"Could not access {url}. Status Code: {response.status_code}")
            break

        page_medicines = parse_medicine_rows(response.text)

        if not page_medicines:
            print(f"No more pages for letter {letter}. Moving to next letter...")
            break

        # Safety net against an endless loop: if the site answers an
        # out-of-range page number with the same listing again, stop here.
        page_urls = [record["Medicine URL"] for record in page_medicines]
        if page_urls == previous_page_urls:
            print(f"Page {page} repeats page {page - 1} for letter {letter}. Moving to next letter...")
            break
        previous_page_urls = page_urls

        all_medicines.extend(page_medicines)

        page += 1
        time.sleep(PAGE_DELAY)


# Passing the columns explicitly keeps the CSV header present even when
# nothing was scraped, so later reads of "Notice URL" do not fail.
df = pd.DataFrame(all_medicines, columns=CSV_COLUMNS)
df.to_csv("all_medicines_AZ_with_notices_final.csv", index=False)



In [None]:

# Download the scraped CSV from the Google Colab runtime to the local machine.

from google.colab import files

scraped_csv_name = "all_medicines_AZ_with_notices_final.csv"
files.download(scraped_csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
"""
Load the scraped CSV file and report how many medicines have a notice versus
how many do not.
"""

from pathlib import Path

import pandas as pd

# Prefer the CSV that the scraping step writes into the working directory (the
# same relative name the other cells read). Fall back to the original
# machine-specific location so an existing local setup keeps working.
csv_candidates = [
    Path("all_medicines_AZ_with_notices_final.csv"),
    Path(r"\Dataa\all_medicines_AZ_with_notices_final.csv"),
]
csv_filename = str(
    next((path for path in csv_candidates if path.exists()), csv_candidates[0])
)
df = pd.read_csv(csv_filename)


# Rows whose "Notice URL" is not the "No Notice" placeholder have a notice
total_medicines = len(df)
medicines_with_notice = df[df["Notice URL"] != "No Notice"]
count_with_notice = len(medicines_with_notice)
count_without_notice = total_medicines - count_with_notice


print(f"Total Medicines Scraped: {total_medicines}")
print(f"Medicines with Notices: {count_with_notice}")
print(f"Medicines without Notices: {count_without_notice}")
print("Sample Medicines with Notices:")
print(medicines_with_notice.head(10))


Total Medicines Scraped: 5032
Medicines with Notices: 1802
Medicines without Notices: 3230
Sample Medicines with Notices:
                                        Medicine Name  \
0   ABASAGLAR 100UI/ML (3,64MG/ML) SOL. INJ. EN ST...   
5                             ABILIFY 10MG COMP. B/28   
6                             ABILIFY 15MG COMP. B/28   
15                           ABUFENE 400MG COMP. B/30   
19           ACEBUTOLOL SANDOZ 200MG COMP. ENRO. B/30   
31  ACICLOVIR MYLAN 250MG/FL. DE PDRE. PDRE. SOL. ...   
32  ACICLOVIR MYLAN 500MG/FL. DE PDRE. PDRE. SOL. ...   
40  ACIFIX 175MG/200MG/5ML SUSP. BUV. FLACON FL./2...   
42                    ACIFUDAL 0.02 CREME DERM. T/15G   
44   ACLASTA 5MG/100ML SOL INJ P/PERF B/01FL DE 100ML   

                                         Medicine URL  \
0   https://www.pharmnet-dz.com/m-5866-abasaglar-1...   
5   https://www.pharmnet-dz.com/m-2288-abilify-10m...   
6   https://www.pharmnet-dz.com/m-2290-abilify-15m...   
15  https://www.pharmn

In [None]:
"""
Download every available medication notice PDF from the scraped URLs into a
local folder, then zip that folder and download the archive.
"""
import os
import shutil

import pandas as pd
import requests
from google.colab import files


csv_filename = "all_medicines_AZ_with_notices_final.csv"
df = pd.read_csv(csv_filename)


pdf_folder = "medicine_notices"
os.makedirs(pdf_folder, exist_ok=True)

# Same browser-like User-Agent as the one used to scrape the listing pages
headers = {"User-Agent": "Mozilla/5.0"}
# requests applies no timeout by default; one stalled download would otherwise
# block the whole loop.
REQUEST_TIMEOUT = 60  # seconds

downloaded_files = 0
failed_downloads = 0
# Sanitized names already used in this run, to avoid overwriting a PDF when
# two different rows sanitize to the same file name.
used_names = set()


for index, row in df.iterrows():
    medicine_name = row["Medicine Name"]
    notice_url = row["Notice URL"]

    # Placeholder written by the scraper when a medicine has no notice
    if notice_url == "No Notice":
        continue

    # Keep alphanumerics, space, "_" and "-"; replace everything else with "_"
    safe_name = "".join(
        c if c.isalnum() or c in " _-" else "_" for c in str(medicine_name)
    )
    if safe_name in used_names:
        # Disambiguate with the row index instead of silently overwriting
        safe_name = f"{safe_name}_{index}"
    used_names.add(safe_name)
    pdf_path = os.path.join(pdf_folder, f"{safe_name}.pdf")

    try:
        # The context manager releases the streamed connection on every path
        with requests.get(
            notice_url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT
        ) as response:
            if response.status_code != 200:
                print(f"Skipping {medicine_name}: HTTP {response.status_code}")
                failed_downloads += 1
                continue

            with open(pdf_path, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)
        downloaded_files += 1

    except (requests.RequestException, OSError) as e:
        # Network failure or a problem writing the file: report and move on
        print(f"Error downloading {medicine_name}: {str(e)}")
        failed_downloads += 1

print(f"All available notices downloaded. Total: {downloaded_files}")
if failed_downloads:
    print(f"Notices that could not be downloaded: {failed_downloads}")


# Pure-Python replacement for `zip -r`: the archive contains the folder itself
# (medicine_notices/...), and the cell no longer depends on a shell `zip` binary.
zip_filename = "medicine_notices.zip"
shutil.make_archive(
    os.path.splitext(zip_filename)[0], "zip", root_dir=".", base_dir=pdf_folder
)
files.download(zip_filename)


In [None]:
"""
Write the names of all medicines that have no notice to a text file and download it,
so that notices for some of them can be added later from other sources.
"""

from google.colab import files

csv_filename = "all_medicines_AZ_with_notices_final.csv"
no_notice_filename = "medicines_without_notices.txt"

df = pd.read_csv(csv_filename)

# Mask of the rows carrying the "No Notice" placeholder; keep only their name column
lacks_notice = df["Notice URL"].eq("No Notice")
medicines_without_notice = df.loc[lacks_notice, "Medicine Name"]

# Written without the index column and without a header row
medicines_without_notice.to_csv(no_notice_filename, header=False, index=False)

files.download(no_notice_filename)


