In [None]:
pip install beautifulsoup4 requests



In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import re
import os

# ---------------------------------------------------------------------------
# Scraper settings
# ---------------------------------------------------------------------------

# Endpoint used for both the listing pages and the per-warning detail pages,
# and the headers sent with every request.
BASE_URL = "https://www.iosco.org/i-scan/"
HEADERS = {"User-Agent": "Mozilla/5.0"}

# CSV columns filled from each row of the listing table.
SUMMARY_FIELDS = [
    "Name_company",
    "Corporate Name",
    "site_url",
    "Authority_Name",
    "Authority_link",
    "date",
    "id",
]

# CSV columns filled from the detail page; the names must match the row
# headers of the detail table once the trailing colon is removed.
DETAIL_FIELDS = [
    "Warning ID",
    "NCA",
    "Date Published At NCA",
    "Date Published At IOSCO",
    "Last Updated",
    "NCA URL",
    "Commercial Name",
    "URL",
    "Other URL",
    "Category",
    "Additional Information",
]

# File holding the number of the next listing page to scrape.
CHECKPOINT_FILE = "checkpoint.txt"

def save_checkpoint(page):
    """Write ``page`` to CHECKPOINT_FILE, replacing whatever was stored before."""
    with open(CHECKPOINT_FILE, mode="w") as checkpoint:
        page_text = str(page)
        checkpoint.write(page_text)

def load_checkpoint():
    """Return the page number stored in CHECKPOINT_FILE, defaulting to 1.

    Returns:
        int: The stored page number, or 1 when the file is missing, empty,
        not an integer, or holds a value below 1.
    """
    try:
        with open(CHECKPOINT_FILE, "r") as f:
            raw = f.read().strip()
    except FileNotFoundError:
        return 1  # Start from page 1 if no checkpoint

    try:
        page = int(raw)
    except ValueError:
        # An interrupted write can leave the file empty or truncated; restart
        # from the first page instead of crashing on int("").
        print(f"[WARN] Ignoring unreadable checkpoint {raw!r}; starting from page 1")
        return 1

    # Listing pages are numbered from 1, so never resume below that.
    return max(page, 1)

def extract_detail_page(warning_id, retries=3):
    """Fetch the detail page of one warning and return its table as a dict.

    Each table row contributes one entry: the header text (colons removed) is
    the key, and the value is the ``href`` of the first link in the cell when
    there is one, otherwise the cell text.

    Args:
        warning_id: Value placed in the ``id`` query parameter of BASE_URL.
        retries: Maximum number of fetch attempts.

    Returns:
        dict: One entry per name in DETAIL_FIELDS. A field is None when the
        page has no matching row; every field is None when all attempts fail.
    """
    url = f"{BASE_URL}?id={warning_id}"

    for attempt in range(retries):
        # Exponential backoff (1s, 2s, 4s, ... capped at 60s): a fixed short
        # delay just re-triggers the server's rate limiting (HTTP 429/403).
        wait = float(min(2 ** attempt, 60))
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            time.sleep(0.5)  # fixed pause after every request to pace the crawl

            if response.status_code != 200:
                # Honour a numeric Retry-After header when the server sends one.
                retry_after = str(response.headers.get("Retry-After", "")).strip()
                if retry_after.isdecimal():
                    wait = max(wait, float(min(int(retry_after), 60)))
                raise ValueError(f"HTTP {response.status_code}")

            soup = BeautifulSoup(response.content, "html.parser")
            table_div = soup.find("div", class_="table-responsive")
            if not table_div:
                raise ValueError("No table-responsive div")

            table = table_div.find("table")
            if not table:
                raise ValueError("No table found")

            raw_data = {}
            for row in table.find_all("tr"):
                th = row.find("th")
                td = row.find("td")
                if not th or not td:
                    continue
                key = th.get_text(strip=True).replace(":", "")
                # Only anchors that actually carry an href count as links; an
                # anchor without one falls back to the cell text instead of
                # raising KeyError and burning a retry.
                link = td.find("a", href=True)
                raw_data[key] = link["href"] if link else td.get_text(strip=True)

            return {key: raw_data.get(key) for key in DETAIL_FIELDS}

        except Exception as e:
            # Deliberately broad: network errors, non-200 responses and
            # unexpected page layouts are all handled by trying again.
            print(f"[Retry {attempt + 1}] Detail fetch failed for ID {warning_id}: {e}")
            if attempt + 1 < retries:
                time.sleep(wait)  # no point waiting after the last attempt

    print(f"[ERROR] Final failure for ID {warning_id}")
    return {key: None for key in DETAIL_FIELDS}


def _fetch_listing_rows(page_url, retries):
    """Download a listing page and return the ``<tr>`` rows of its table, retrying on failure."""
    last_error = None
    for attempt in range(retries):
        # Exponential backoff (1s, 2s, 4s, ... capped at 60s) between attempts.
        wait = float(min(2 ** attempt, 60))
        try:
            response = requests.get(page_url, headers=HEADERS, timeout=10)
            time.sleep(0.2)  # fixed pause after every request to pace the crawl

            if response.status_code != 200:
                # Honour a numeric Retry-After header when the server sends one.
                retry_after = str(response.headers.get("Retry-After", "")).strip()
                if retry_after.isdecimal():
                    wait = max(wait, float(min(int(retry_after), 60)))
                raise ValueError(f"HTTP {response.status_code}")

            soup = BeautifulSoup(response.content, "html.parser")
            container = soup.find("div", class_="table-responsive")
            if container is None:
                # Without this check the next line fails with
                # "'NoneType' object has no attribute 'find_all'".
                raise ValueError("No table-responsive div")
            return container.find_all("tr")

        except Exception as e:
            last_error = e
            print(f"[Retry {attempt + 1}] Listing fetch failed for {page_url}: {e}")
            if attempt + 1 < retries:
                time.sleep(wait)

    raise RuntimeError(f"Could not load listing page {page_url}: {last_error}") from last_error


def extract_main_page_summary(page_url, retries=3):
    """Scrape one listing page and return a combined record per table row.

    Every row with at least six cells yields a dict holding the SUMMARY_FIELDS
    read from the listing plus the DETAIL_FIELDS fetched from the row's detail
    page. Detail fields are all None when the row exposes no warning id.

    Args:
        page_url: URL of the listing page.
        retries: Maximum number of attempts for fetching the listing page.

    Returns:
        list[dict]: One record per parsed row, in page order.

    Raises:
        RuntimeError: If the listing table cannot be loaded after ``retries``
            attempts (for example because the server keeps answering 429/403).
    """
    rows = _fetch_listing_rows(page_url, retries)

    all_data = []

    for row in rows:
        try:
            cells = row.find_all("td")
            if len(cells) < 6:
                continue  # header rows and malformed rows carry no record

            name = cells[0].get_text(strip=True)
            corporate_name = cells[1].get_text(strip=True)
            site_url = cells[2].get_text(strip=True)
            authority_elem = cells[3].find("a")
            # .get() keeps the row even if the authority link has no href.
            authority_link = authority_elem.get("href") if authority_elem else None
            authority_text = authority_elem.get_text(strip=True) if authority_elem else None
            date = cells[4].get_text(strip=True)

            # The warning id is only available inside the button's onclick handler.
            btn = cells[5].find("button", onclick=True)
            btn_id = None
            if btn:
                m = re.search(r"id=(\d+)", btn["onclick"])
                if m:
                    btn_id = m.group(1)

            summary = {
                "Name_company": name,
                "Corporate Name": corporate_name,
                "site_url": site_url,
                "Authority_Name": authority_text,
                "Authority_link": authority_link,
                "date": date,
                "id": btn_id
            }

            detail = extract_detail_page(btn_id) if btn_id else {k: None for k in DETAIL_FIELDS}
            all_data.append({**summary, **detail})

        except Exception as e:
            # Best effort per row: report the problem and keep the rest of the page.
            print(f"[ERROR] Failed to parse row: {e}")
            continue

    return all_data


# 🚀 Run Script
if __name__ == "__main__":
    # Number of listing pages scraped per execution; the checkpoint file lets
    # the next execution continue where this one stopped.
    PAGES_PER_RUN = 20

    # The column order is the same for every page, so build it once.
    final_fields = SUMMARY_FIELDS + DETAIL_FIELDS

    start_page = load_checkpoint()
    end_page = start_page + PAGES_PER_RUN - 1

    for page_number in range(start_page, end_page + 1):
        print(f"\n Scraping page {page_number}...")
        page_url = f"{BASE_URL}?SUBSECTION=main&page={page_number}"
        try:
            data = extract_main_page_summary(page_url)
        except Exception as e:
            # The checkpoint is only advanced after a page is written, so it
            # still points at this page; say so before surfacing the error.
            print(f"[ERROR] Page {page_number} failed ({e}); re-run to resume from page {page_number}.")
            raise

        output_file = f"combined_output_page{page_number}.csv"

        with open(output_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=final_fields)
            writer.writeheader()
            writer.writerows(data)

        print(f"✅ Page {page_number} done. {len(data)} records saved to {output_file}")
        save_checkpoint(page_number + 1)



 Scraping page 21...
[Retry 1] Detail fetch failed for ID 39383: HTTP 429
[Retry 1] Detail fetch failed for ID 39382: HTTP 403
[Retry 1] Detail fetch failed for ID 39381: HTTP 429
[Retry 1] Detail fetch failed for ID 39379: HTTP 429
[Retry 2] Detail fetch failed for ID 39379: HTTP 403
[Retry 1] Detail fetch failed for ID 39375: HTTP 429
[Retry 1] Detail fetch failed for ID 39373: HTTP 429
[Retry 1] Detail fetch failed for ID 39370: HTTP 429
[Retry 2] Detail fetch failed for ID 39370: HTTP 403
✅ Page 21 done. 20 records saved to combined_output_page21.csv

 Scraping page 22...
[Retry 1] Detail fetch failed for ID 39368: HTTP 429
[Retry 1] Detail fetch failed for ID 39366: HTTP 429
[Retry 1] Detail fetch failed for ID 39364: HTTP 429
[Retry 1] Detail fetch failed for ID 39362: HTTP 403
[Retry 1] Detail fetch failed for ID 39361: HTTP 429
[Retry 1] Detail fetch failed for ID 39358: HTTP 429
[Retry 1] Detail fetch failed for ID 39357: HTTP 429
[Retry 1] Detail fetch failed for ID 39355: H

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
# Collect the per-page scraper outputs found in /content, sorted by name.
# Files are selected by filename rather than by their position in the
# directory listing, so unrelated entries cannot shift the selection, and the
# built-in ``dir`` is no longer shadowed by a variable.
newdir = sorted(
    entry
    for entry in os.listdir("/content")
    if entry.startswith("combined_output_page") and entry.endswith(".csv")
)
newdir

['combined_output_page1.csv',
 'combined_output_page10.csv',
 'combined_output_page11.csv',
 'combined_output_page12.csv',
 'combined_output_page13.csv',
 'combined_output_page14.csv',
 'combined_output_page15.csv',
 'combined_output_page16.csv',
 'combined_output_page17.csv',
 'combined_output_page18.csv',
 'combined_output_page19.csv',
 'combined_output_page2.csv',
 'combined_output_page20.csv',
 'combined_output_page21.csv',
 'combined_output_page22.csv',
 'combined_output_page23.csv',
 'combined_output_page3.csv',
 'combined_output_page4.csv',
 'combined_output_page5.csv',
 'combined_output_page6.csv',
 'combined_output_page7.csv',
 'combined_output_page8.csv',
 'combined_output_page9.csv']

In [None]:
from google.colab import files
import zipfile
import os

# Step 1: Zip all CSV files in the current directory
# ZIP_DEFLATED compresses the entries; the ZipFile default (ZIP_STORED) would
# only bundle them uncompressed. Sorting makes the archive order deterministic.
zip_filename = "all_csvs.zip"
with zipfile.ZipFile(zip_filename, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
    for filename in sorted(os.listdir()):
        if filename.endswith(".csv"):
            zipf.write(filename)

# Step 2: Download the zip
files.download(zip_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import os

# Step 1: List the per-page CSV files (combined_output_page<N>.csv)
# Only names whose <N> part is a plain number are kept, so a stray file with
# the same prefix cannot break the numeric sort below.
page_prefix = "combined_output_page"
page_suffix = ".csv"
numbered_files = []
for f in os.listdir():
    if f.startswith(page_prefix) and f.endswith(page_suffix):
        page_text = f[len(page_prefix):-len(page_suffix)]
        if page_text.isdecimal():
            numbered_files.append((int(page_text), f))

# Step 2: Sort by page number to keep pages in order (page2 before page10)
csv_files = [f for _, f in sorted(numbered_files)]

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not csv_files:
    raise FileNotFoundError(
        "No combined_output_page<N>.csv files found in the current directory"
    )

# Step 3: Read and append all CSVs
combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# Step 4: Save to a final combined file
combined_df.to_csv("iosco_combined.csv", index=False)

print(f"✅ Combined {len(csv_files)} CSVs into 'iosco_combined.csv' with {len(combined_df)} rows.")


✅ Combined 23 CSVs into 'iosco_combined.csv' with 460 rows.
