# PDF with image descriptions

In [8]:
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import re

def scrape_wikipedia(url, timeout=30):
    """Scrape section text and figure captions from a Wikipedia article.

    The page is walked in document order over its ``<p>``, ``<h2>`` and
    ``<figure>`` elements. Every ``<h2>`` starts a new section; paragraphs
    and figure captions are attributed to the most recent ``<h2>``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(content, image_descriptions)`` tuple.

        ``content`` maps each ``<h2>`` heading text to the text of its
        paragraphs, each followed by a newline, with numeric citation
        markers such as ``[12]`` removed. Paragraphs that appear before the
        first heading are skipped.

        ``image_descriptions`` maps a heading to a list of
        ``{'figure': 'Figure<N>', 'description': <caption>}`` dicts, where
        ``N`` counts captions from 0 across the whole page. Captions found
        before the first heading are stored under the key ``""``.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Stop here on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    citation_marker = re.compile(r'\[\d+\]')
    paragraphs = {}
    image_descriptions = {}
    current_heading = ""
    figure_counter = 0

    for element in soup.find_all(['p', 'h2', 'figure']):
        caption = None

        if element.name == 'h2':
            current_heading = element.text.strip()
            # A repeated heading text starts over, discarding earlier text.
            paragraphs[current_heading] = []
        elif element.name == 'p':
            if current_heading:
                paragraph_text = citation_marker.sub('', element.text.strip())
                paragraphs[current_heading].append(paragraph_text)
        elif element.name == 'figure':
            figcaption = element.find('figcaption')
            if figcaption:
                caption = figcaption.text.strip()

        if caption is not None:
            image_descriptions.setdefault(current_heading, []).append(
                {'figure': f'Figure{figure_counter}', 'description': caption}
            )
            figure_counter += 1

    content = {
        heading: ''.join(text + '\n' for text in texts)
        for heading, texts in paragraphs.items()
    }
    return content, image_descriptions

def count_images(image_descriptions):
    """Return how many caption entries exist across all headings.

    ``image_descriptions`` maps a heading to a list of caption entries; the
    result is the combined length of those lists.
    """
    caption_total = 0
    for caption_list in image_descriptions.values():
        caption_total += len(caption_list)
    return caption_total

def _latin1(text):
    """Return *text* with characters outside Latin-1 replaced by '?'."""
    return str(text).encode('latin-1', 'replace').decode('latin-1')


def _write_captions(pdf, captions):
    """Write each caption entry as an italic, wrapped 'FigureN: text' line."""
    pdf.set_font("Arial", style='I', size=10)
    for image_data in captions:
        figure = image_data['figure']
        description = image_data['description']
        # Start every caption at the left margin so the full-width
        # multi_cell has room regardless of where the previous cell ended.
        pdf.set_x(pdf.l_margin)
        # multi_cell wraps long captions instead of running off the page.
        pdf.multi_cell(0, 10, txt=_latin1(f"{figure}: {description}"))


def create_pdf(content, image_descriptions, pdf_file):
    """Write section text and image captions to a PDF file.

    Args:
        content: Mapping of section heading to section body text.
        image_descriptions: Mapping of section heading to a list of dicts
            with ``'figure'`` and ``'description'`` keys.
        pdf_file: Path of the PDF to write.

    For every section the heading is written centred in bold, followed by
    the body text and then that section's captions in italics. Captions
    whose heading has no entry in ``content`` are written first, ahead of
    all sections, so that every caption included in the printed total also
    appears in the document.

    All text is passed through the same Latin-1 replacement, so headings
    and captions are treated exactly like body text.

    Side effects: prints the total caption count and writes ``pdf_file``.
    """
    pdf = FPDF()
    pdf.add_page()

    # Captions with no matching section would otherwise be counted below
    # but never rendered.
    for heading, captions in image_descriptions.items():
        if heading not in content:
            _write_captions(pdf, captions)

    for heading, body in content.items():
        pdf.set_font("Arial", style='B', size=12)
        pdf.cell(200, 10, txt=_latin1(heading), ln=True, align='C')

        pdf.set_font("Arial", size=10)
        pdf.multi_cell(0, 10, txt=_latin1(body))
        pdf.ln(5)

        # Add image descriptions to the PDF with figures
        if heading in image_descriptions:
            _write_captions(pdf, image_descriptions[heading])

    total_images = count_images(image_descriptions)
    print(f"Total Number of Images: {total_images}")

    pdf.output(pdf_file)

def main():
    """Scrape the Wikipedia "Cat" article and save it as ``cat_d.pdf``."""
    output_path = 'cat_d.pdf'
    sections, captions = scrape_wikipedia("https://en.wikipedia.org/wiki/Cat")
    create_pdf(sections, captions, output_path)
    print(f'PDF generated: {output_path}')


# Run the export only when executed as a script or notebook cell.
if __name__ == "__main__":
    main()


Total Number of Images: 21
PDF generated: cat_d.pdf


# Additional feature: also collect infobox image captions

In [9]:
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import re

def scrape_wikipedia(url, timeout=30):
    """Scrape section text and image captions from a Wikipedia article.

    The page is walked in document order over its ``<p>``, ``<h2>``,
    ``<figure>`` and ``<div>`` elements. Every ``<h2>`` starts a new
    section; paragraphs and captions are attributed to the most recent
    ``<h2>``. Captions come from ``<figcaption>`` elements inside
    ``<figure>`` and from ``<div>`` elements carrying the
    ``infobox-caption`` class.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(content, image_descriptions)`` tuple.

        ``content`` maps each ``<h2>`` heading text to the text of its
        paragraphs, each followed by a newline, with numeric citation
        markers such as ``[12]`` removed. Paragraphs that appear before the
        first heading are skipped.

        ``image_descriptions`` maps a heading to a list of
        ``{'figure': 'Figure<N>', 'description': <caption>}`` dicts, where
        ``N`` counts captions from 0 across the whole page. Captions found
        before the first heading are stored under the key ``""``.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Stop here on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    citation_marker = re.compile(r'\[\d+\]')
    paragraphs = {}
    image_descriptions = {}
    current_heading = ""
    figure_counter = 0

    for element in soup.find_all(['p', 'h2', 'figure', 'div']):
        # Set by whichever branch finds a caption; recorded once below.
        caption = None

        if element.name == 'h2':
            current_heading = element.text.strip()
            # A repeated heading text starts over, discarding earlier text.
            paragraphs[current_heading] = []
        elif element.name == 'p':
            if current_heading:
                paragraph_text = citation_marker.sub('', element.text.strip())
                paragraphs[current_heading].append(paragraph_text)
        elif element.name == 'figure':
            figcaption = element.find('figcaption')
            if figcaption:
                caption = figcaption.text.strip()
        elif element.name == 'div' and 'infobox-caption' in element.get('class', []):
            caption = element.text.strip()

        if caption is not None:
            image_descriptions.setdefault(current_heading, []).append(
                {'figure': f'Figure{figure_counter}', 'description': caption}
            )
            figure_counter += 1

    content = {
        heading: ''.join(text + '\n' for text in texts)
        for heading, texts in paragraphs.items()
    }
    return content, image_descriptions

def count_images(image_descriptions):
    """Return how many caption entries exist across all headings.

    ``image_descriptions`` maps a heading to a list of caption entries; the
    result is the combined length of those lists.
    """
    caption_total = 0
    for caption_list in image_descriptions.values():
        caption_total += len(caption_list)
    return caption_total

def _latin1(text):
    """Return *text* with characters outside Latin-1 replaced by '?'."""
    return str(text).encode('latin-1', 'replace').decode('latin-1')


def _write_captions(pdf, captions):
    """Write each caption entry as an italic, wrapped 'FigureN: text' line."""
    pdf.set_font("Arial", style='I', size=10)
    for image_data in captions:
        figure = image_data['figure']
        description = image_data['description']
        # Start every caption at the left margin so the full-width
        # multi_cell has room regardless of where the previous cell ended.
        pdf.set_x(pdf.l_margin)
        # multi_cell wraps long captions instead of running off the page.
        pdf.multi_cell(0, 10, txt=_latin1(f"{figure}: {description}"))


def create_pdf(content, image_descriptions, pdf_file):
    """Write section text and image captions to a PDF file.

    Args:
        content: Mapping of section heading to section body text.
        image_descriptions: Mapping of section heading to a list of dicts
            with ``'figure'`` and ``'description'`` keys.
        pdf_file: Path of the PDF to write.

    For every section the heading is written centred in bold, followed by
    the body text and then that section's captions in italics. Captions
    whose heading has no entry in ``content`` are written first, ahead of
    all sections, so that every caption included in the printed total also
    appears in the document.

    All text is passed through the same Latin-1 replacement, so headings
    and captions are treated exactly like body text.

    Side effects: prints the total caption count and writes ``pdf_file``.
    """
    pdf = FPDF()
    pdf.add_page()

    # Captions with no matching section would otherwise be counted below
    # but never rendered.
    for heading, captions in image_descriptions.items():
        if heading not in content:
            _write_captions(pdf, captions)

    for heading, body in content.items():
        pdf.set_font("Arial", style='B', size=12)
        pdf.cell(200, 10, txt=_latin1(heading), ln=True, align='C')

        pdf.set_font("Arial", size=10)
        pdf.multi_cell(0, 10, txt=_latin1(body))
        pdf.ln(5)

        # Add image descriptions to the PDF with figures
        if heading in image_descriptions:
            _write_captions(pdf, image_descriptions[heading])

    total_images = count_images(image_descriptions)
    print(f"Total Number of Images: {total_images}")

    pdf.output(pdf_file)

def main():
    """Scrape the Wikipedia "Nike, Inc." article and save it as a PDF."""
    output_path = 'Nike_Inc_Info.pdf'
    sections, captions = scrape_wikipedia("https://en.wikipedia.org/wiki/Nike,_Inc.")
    create_pdf(sections, captions, output_path)
    print(f'PDF generated: {output_path}')


# Run the export only when executed as a script or notebook cell.
if __name__ == "__main__":
    main()


Total Number of Images: 24
PDF generated: Nike_Inc_Info.pdf
