In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [5]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [3]:
!pip install pikepdf

Collecting pikepdf
  Downloading pikepdf-9.9.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)
Collecting Deprecated (from pikepdf)
  Downloading Deprecated-1.2.18-py2.py3-none-any.whl.metadata (5.7 kB)
Downloading pikepdf-9.9.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Deprecated-1.2.18-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: Deprecated, pikepdf
Successfully installed Deprecated-1.2.18 pikepdf-9.9.0


In [7]:
!pip install FPDF

Collecting FPDF
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: FPDF
  Building wheel for FPDF (setup.py) ... [?25l[?25hdone
  Created wheel for FPDF: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=10ea6d90e66bea301ff4fc1fddb06a6ff5bb2f66506892bba7aeb8eb8fc74f72
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built FPDF
Installing collected packages: FPDF
Successfully installed FPDF-1.7.2


In [None]:
import os
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from googlesearch import search
import fitz
from fpdf import FPDF
import pandas as pd

# Directory that receives the downloaded PDFs and the generated CSV/PDF dataset.
OUTPUT_DIR = "nde_dataset"
# exist_ok=True keeps the cell re-runnable when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Function to download PDF from a URL
def download_pdf(url, save_dir):
    """Download the PDF at ``url`` into ``save_dir``.

    Args:
        url: Address of the document. Only URLs whose *path* ends in ``.pdf``
            (case-insensitive) are fetched; a query string after the path is
            allowed and ignored when naming the file.
        save_dir: Existing directory the file is written into.

    Returns:
        Path of the saved file, or ``None`` when the URL is not a PDF link or
        the download failed. Failures are printed, never raised.
    """
    # Local import: the notebook's import cell only brings in ``urljoin``.
    from urllib.parse import urlparse

    try:
        # Test the extension on the path component so "file.pdf?download=1"
        # is still recognised as a PDF link.
        url_path = urlparse(url).path
        if not url_path.lower().endswith(".pdf"):
            return None
        filepath = os.path.join(save_dir, os.path.basename(url_path))
        response = requests.get(url, timeout=10)
        # Without this check a 404/403 error page would be stored as a ".pdf".
        response.raise_for_status()
        with open(filepath, "wb") as f:
            f.write(response.content)
        return filepath
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return None

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Args:
        pdf_path: Path of a PDF file readable by PyMuPDF (``fitz``).

    Returns:
        The page texts joined in page order, or ``""`` if the file cannot be
        opened or read. Failures are printed, never raised.
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so a bad PDF does not leak an open file handle.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"Failed to extract text from {pdf_path}: {e}")
        return ""

# Function to scrape and extract text from a webpage
def scrape_text_from_web(url):
    """Fetch ``url`` and return its title and the text of its ``<p>`` elements.

    Args:
        url: Address of the HTML page to scrape.

    Returns:
        ``(title, content)`` where ``content`` is the paragraph texts joined
        with newlines and stripped, and ``title`` falls back to ``"Untitled"``
        when the page has no usable ``<title>``. Returns ``(None, None)`` if
        the request or parsing fails; the failure is printed, not raised.
    """
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")
        content = "\n".join(p.get_text() for p in soup.find_all('p'))
        # ``.string`` is None for an empty <title> or one with nested markup;
        # calling .strip() on it used to discard the whole page.
        raw_title = soup.title.string if soup.title else None
        title = raw_title.strip() if raw_title else "Untitled"
        return title, content.strip()
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None, None

# Function to save dataset to a final PDF
def save_dataset_to_pdf(dataset, output_pdf_path):
    """Render the dataset as a PDF with one page-starting section per row.

    Args:
        dataset: DataFrame with ``title`` and ``content`` columns.
        output_pdf_path: Destination path of the generated PDF.

    Each row contributes a bold "Section: <title>" heading followed by the
    first 3000 characters of its content. Characters that cannot be encoded
    as Latin-1 are replaced with ``?``, and non-string content (e.g. NaN) is
    rendered as empty text.
    """
    def _latin1(text):
        # The PDF is written with the built-in "Arial" font; text is forced
        # into Latin-1 here so a stray Unicode character (Greek letters,
        # dashes, emoji from scraped pages) cannot abort the export.
        return text.encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)

    if dataset.empty:
        # Keep producing a single blank page for an empty dataset.
        pdf.add_page()

    for _, row in dataset.iterrows():
        # Start the page *before* writing the row so no blank page trails
        # the final section.
        pdf.add_page()
        pdf.set_font("Arial", 'B', 14)
        pdf.multi_cell(0, 10, _latin1(f"Section: {row['title']}"))
        pdf.set_font("Arial", size=12)
        body = row['content'] if isinstance(row['content'], str) else ""
        pdf.multi_cell(0, 10, _latin1(body[:3000]))

    pdf.output(output_pdf_path)

# Google search for NDE content
# Search phrases; each one is combined with the PDF / .org / .edu filter below.
topics = ["Non Destructive Evaluation", "Ultrasonic Testing", "Radiographic Testing",
          "Eddy Current Testing", "Magnetic Particle Inspection", "Thermography NDE"]

dataset = []      # collected {"title", "content"} records
visited = set()   # links already handled, so shared results are fetched once

for topic in topics:
    print(f"🔍 Searching for: {topic}")
    try:
        links = list(search(f"{topic} filetype:pdf OR site:org OR site:edu", stop=50))
    except OSError as e:
        # urllib's HTTPError/URLError and socket errors all derive from
        # OSError. A failed search (e.g. rate limiting) now skips this topic
        # instead of aborting the run and losing everything collected so far.
        print(f"Search failed for {topic}: {e}")
        continue
    for link in links:
        if link in visited:
            continue
        visited.add(link)
        if link.lower().endswith(".pdf"):
            pdf_path = download_pdf(link, OUTPUT_DIR)
            if pdf_path:
                content = extract_text_from_pdf(pdf_path)
                # PDFs with no extractable text would only add empty rows.
                if content:
                    dataset.append({"title": os.path.basename(pdf_path), "content": content})
        else:
            title, content = scrape_text_from_web(link)
            if title and content:
                dataset.append({"title": title, "content": content})
        # Pause between fetches to stay polite to the remote hosts.
        time.sleep(1)

# Save CSV and PDF
if dataset:
    df = pd.DataFrame(dataset)
    df.to_csv(os.path.join(OUTPUT_DIR, "nde_dataset.csv"), index=False)
    save_dataset_to_pdf(df, os.path.join(OUTPUT_DIR, "nde_dataset.pdf"))
    print("✅ Internet-wide NDE dataset created successfully!")
else:
    # Do not report success (or write empty artefacts) when nothing was collected.
    print("No usable data collected; nothing was saved.")


In [None]:
import os
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from googlesearch import search
import fitz  # PyMuPDF
from fpdf import FPDF
import pandas as pd
import unicodedata
import ssl
import urllib3
from urllib.error import HTTPError, URLError
import socket

# Silence urllib3's InsecureRequestWarning; the helpers below call requests with
# verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# NOTE(review): this does more than hide warnings. It replaces the ssl module's
# private default-HTTPS-context factory with the unverified one, which turns off
# certificate verification for code relying on that default (presumably the
# urllib-based googlesearch calls; confirm). Downloaded content is therefore
# not authenticated.
ssl._create_default_https_context = ssl._create_unverified_context

# Directory that receives the downloaded PDFs and the generated CSV/PDF dataset.
OUTPUT_DIR = "3nde_dataset"
# exist_ok=True keeps the cell re-runnable when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Normalize text for PDF

def normalize_text(text):
    """Return ``text`` reduced to characters that are encodable as Latin-1.

    Characters that Latin-1 can represent (including accented letters such as
    "é" and symbols such as "½") are kept as-is. Anything else is replaced by
    the Latin-1 part of its NFKD decomposition ("ā" -> "a", "ﬁ" -> "fi") or
    dropped when it has none (e.g. an em dash).

    Args:
        text: The string to sanitise.

    Returns:
        The sanitised string, or ``"[Text could not be normalized]"`` if
        ``text`` cannot be processed (for example when it is not a string).
    """
    try:
        kept = []
        # NFC first so "e" + combining accent becomes the single Latin-1 "é"
        # instead of losing the accent.
        for ch in unicodedata.normalize('NFC', text):
            if ord(ch) < 256:
                kept.append(ch)
            else:
                # Fall back to the compatibility decomposition only for
                # characters Latin-1 cannot hold.
                folded = unicodedata.normalize('NFKD', ch)
                kept.append(folded.encode('latin-1', 'ignore').decode('latin-1'))
        return "".join(kept)
    except Exception:
        return "[Text could not be normalized]"

# Save dataset to PDF

def save_dataset_to_pdf(dataset, output_pdf_path):
    """Write every non-blank row of ``dataset`` to its own page of a PDF.

    Args:
        dataset: DataFrame with string ``title`` and ``content`` columns.
        output_pdf_path: Destination path of the generated PDF.

    Rows whose content is empty or whitespace-only are skipped. Each remaining
    row gets a new page holding a bold "Section: <title>" heading and the
    first 3000 characters of its content, both passed through
    ``normalize_text``.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)

    for _, row in dataset.iterrows():
        body = row['content']
        if body.strip():
            pdf.add_page()

            # Heading
            heading = normalize_text(f"Section: {row['title']}")
            pdf.set_font("Arial", 'B', 14)
            pdf.multi_cell(0, 10, heading)

            # Body excerpt
            excerpt = normalize_text(body[:3000])
            pdf.set_font("Arial", size=12)
            pdf.multi_cell(0, 10, excerpt)

    pdf.output(output_pdf_path)

# Download PDF

def download_pdf(url, save_dir):
    """Download the PDF at ``url`` into ``save_dir``.

    Args:
        url: Address of the document. Only URLs whose *path* ends in ``.pdf``
            (case-insensitive) are fetched; a query string after the path is
            allowed and ignored when naming the file.
        save_dir: Existing directory the file is written into.

    Returns:
        Path of the saved file, or ``None`` when the URL is not a PDF link,
        the server answers with a status other than 200, or the request
        fails. Problems are printed, never raised.
    """
    # Local import: the notebook's import cell only brings in ``urljoin``.
    from urllib.parse import urlparse

    try:
        # Test the extension on the path component; checking the whole URL
        # rejected every "file.pdf?x=1" link and left the query stripping
        # unreachable.
        url_path = urlparse(url).path
        if not url_path.lower().endswith(".pdf"):
            return None
        filepath = os.path.join(save_dir, os.path.basename(url_path))
        # verify=False: certificate checks are skipped for this request.
        response = requests.get(url, timeout=10, verify=False)
        if response.status_code != 200:
            print(f"Non-200 response for {url}: {response.status_code}")
            return None
        with open(filepath, "wb") as f:
            f.write(response.content)
        return filepath
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return None

# Extract text from PDF

def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at ``pdf_path``, one page per line group.

    Args:
        pdf_path: Path of a PDF file readable by PyMuPDF (``fitz``).

    Returns:
        The non-empty page texts joined with newlines and stripped of leading
        and trailing whitespace, or ``""`` if the file cannot be opened or
        read. Failures are printed, never raised.
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so a bad PDF does not leak an open file handle.
        with fitz.open(pdf_path) as doc:
            page_texts = [page.get_text() for page in doc]
        # Pages without text (e.g. scanned images) are skipped.
        return "\n".join(t for t in page_texts if t).strip()
    except Exception as e:
        print(f"Failed to extract text from {pdf_path}: {e}")
        return ""

# Scrape text from web page

def scrape_text_from_web(url):
    """Fetch ``url`` and return its title and the text of its ``<p>`` elements.

    Args:
        url: Address of the HTML page to scrape.

    Returns:
        ``(title, content)``: the stripped ``<title>`` string (or
        ``"Untitled"`` when it is missing or empty) and the non-blank
        paragraph texts joined with newlines. Returns ``(None, None)`` if the
        request or parsing fails; the failure is printed, not raised.
    """
    try:
        response = requests.get(
            url,
            timeout=10,
            headers={"User-Agent": "Mozilla/5.0"},
            verify=False,
        )
        soup = BeautifulSoup(response.content, "html.parser")

        # Keep only paragraphs that contain something besides whitespace.
        kept = []
        for p in soup.find_all('p'):
            if p.get_text().strip():
                kept.append(p.get_text())
        content = "\n".join(kept)

        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        else:
            title = "Untitled"

        return title, content.strip()
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None, None

# Topics to search
# Search phrases; each is combined with the PDF / .org / .edu filter below.
topics = [
    "Non Destructive Evaluation",
    "Ultrasonic Testing",
    "Radiographic Testing",
    "Eddy Current Testing",
    "Magnetic Particle Inspection",
    "Thermography NDE",
    "Acoustic Emission Testing",
    "Leak Testing NDT",
    "Visual Testing NDT",
    "Infrared Thermography",
    "Computed Tomography NDT",
    "Phased Array Ultrasonic Testing"
]

dataset = []      # collected {"title", "content"} records
visited = set()   # links already handled, so shared results are fetched once

for topic in topics:
    print(f"🔍 Searching for: {topic}")
    query = f"{topic} filetype:pdf OR site:org OR site:edu"
    try:
        links = list(search(query, stop=20, pause=4))
    except (HTTPError, URLError, socket.gaierror, socket.timeout) as e:
        # A failed search only skips this topic; earlier results are kept.
        print(f"HTTP error while searching {topic}: {e}")
        continue

    for link in links:
        if link in visited:
            continue
        visited.add(link)

        record = None
        if not link.lower().endswith(".pdf"):
            # Ordinary web page: keep it only if both title and text came back.
            title, content = scrape_text_from_web(link)
            if title and content:
                record = {"title": title, "content": content}
        else:
            # Direct PDF link: download it, then keep it only if text was extracted.
            pdf_path = download_pdf(link, OUTPUT_DIR)
            if pdf_path:
                content = extract_text_from_pdf(pdf_path)
                if content:
                    record = {"title": os.path.basename(pdf_path), "content": content}

        if record is not None:
            dataset.append(record)
        # Pause after every newly visited link, whether or not it yielded data.
        time.sleep(2)

# Save results
if not dataset:
    print("⚠️ No usable data collected. Consider increasing pause time.")
else:
    df = pd.DataFrame(dataset)
    # Drop rows whose content is blank after stripping whitespace.
    has_text = df['content'].str.strip().astype(bool)
    df = df[has_text]
    csv_path = os.path.join(OUTPUT_DIR, "nde_dataset.csv")
    pdf_out_path = os.path.join(OUTPUT_DIR, "nde_dataset.pdf")
    df.to_csv(csv_path, index=False)
    save_dataset_to_pdf(df, pdf_out_path)
    print("✅ NDE dataset PDF and CSV saved with real content.")
