In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from PIL import Image
from urllib.parse import urljoin, unquote
from io import BytesIO
import re
import json

# The URL of the webpage you want to scrape
url = 'https://en.wikipedia.org/wiki/India'

# Folder where the images and their alt texts will be stored
images_folder = 'images_india'
os.makedirs(images_folder, exist_ok=True)

# Fetch the page.
# - A descriptive User-Agent identifies this client; Wikimedia asks for one and
#   may reject generic library defaults. Add your own contact details if you
#   run this regularly.
# - The timeout prevents the cell from hanging forever on a stalled connection.
# - raise_for_status() stops here on an HTTP error instead of silently parsing
#   an error page as if it were the article.
response = requests.get(
    url,
    headers={'User-Agent': 'india-scraper-notebook/1.0 (educational use; python-requests)'},
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Write the page's headings and paragraphs, in document order, to a Markdown file.
with open('scraped_content_india.md', 'w', encoding='utf-8') as md_file:
    for element in soup.find_all(['p', 'h1', 'h2', 'h3']):
        text = element.get_text()
        if element.name.startswith('h'):
            # Keep the heading depth (h1 -> '#', h2 -> '##', h3 -> '###') so the
            # Markdown preserves the page's section hierarchy instead of
            # flattening every heading to level 1.
            marker = '#' * int(element.name[1:])
            md_file.write(f"\n{marker} {text}\n\n")
        else:
            md_file.write(f"{text}\n\n")  # Paragraph

# Download every image referenced by the page and store its alt text.
#
# For each usable <img> tag this writes two files into `images_folder`
# (the image itself, plus a .txt file with the same stem holding the alt text)
# and appends one {"title", "url", "text"} record to `images_info`, which is
# dumped to images_info.json once the loop finishes.
# Relies on `soup`, `url` and `images_folder` defined earlier in this cell.

titles_set = set()   # titles already handed out, used to keep titles unique
images_info = []     # one metadata record per successfully saved image

# A single Session reuses connections across the many small downloads.
with requests.Session() as session:
    # Identify the client; Wikimedia may reject generic library User-Agents.
    session.headers.update(
        {'User-Agent': 'india-scraper-notebook/1.0 (educational use; python-requests)'}
    )

    for img_tag in soup.find_all('img'):
        # Not every <img> carries a src attribute; img_tag['src'] would raise KeyError.
        src = img_tag.get('src')
        if not src:
            continue

        img_url = urljoin(url, src)
        # requests can only fetch http(s) URLs (this skips e.g. inline data: URIs).
        if not img_url.startswith(('http://', 'https://')):
            continue

        # The last URL segment is used both as the file name and as the "url" field.
        image_name = os.path.basename(img_url)
        if not image_name:
            continue  # URL ends with '/', so there is no file name to save under

        try:
            img_response = session.get(img_url, timeout=30)
            img_response.raise_for_status()
        except requests.RequestException as exc:
            # Skip this image instead of aborting the run or saving an error
            # page as if it were image data.
            print(f"Skipping {img_url}: {exc}")
            continue

        # Alt text; a single space stands in for a missing or empty alt attribute.
        alt_text = img_tag.get('alt') or " "

        title = img_tag.get('title')
        if not title:
            # No title attribute: derive one from the file name by dropping the
            # extension and any "<N>px" size marker, then turning '_' and '-'
            # into spaces.
            clean_title = re.sub(r'\d+px', '', unquote(os.path.splitext(image_name)[0]))
            title = clean_title.replace('_', ' ').replace('-', ' ').strip()
            # Names like "Flag.svg.png" still end in '.svg' after splitext; remove it.
            if title.endswith('.svg'):
                title = title[:-4]

        # Ensure title uniqueness by appending " (2)", " (3)", ... on collisions.
        original_title = title
        count = 1
        while title in titles_set:
            count += 1
            title = f"{original_title} ({count})"
        titles_set.add(title)

        images_info.append({
            "title": title,
            "url": image_name,
            "text": alt_text
        })

        image_path = os.path.join(images_folder, image_name)
        with open(image_path, 'wb') as img_file:
            img_file.write(img_response.content)

        # Save the alt text in a separate file with the same name but a .txt extension.
        alt_text_path = os.path.join(images_folder, f"{os.path.splitext(image_name)[0]}.txt")
        with open(alt_text_path, 'w', encoding='utf-8') as text_file:
            text_file.write(alt_text)

with open('images_info.json', 'w', encoding='utf-8') as json_file:
    json.dump(images_info, json_file, indent=4, ensure_ascii=False)

In [2]:
# Shell command (IPython "!" syntax): recursively zip the images_india/ folder,
# i.e. the downloaded images and their alt-text .txt files, into images_india.zip.
!zip -r images_india.zip images_india/

  adding: images_india/ (stored 0%)
  adding: images_india/23px-Flag_of_Iran.svg.png (deflated 6%)
  adding: images_india/27px-Wikibooks-logo.svg.png (stored 0%)
  adding: images_india/23px-Flag_of_Cambodia.svg.txt (stored 0%)
  adding: images_india/220px-Women_at_work%2C_Gujarat_%28cropped%29.txt (stored 0%)
  adding: images_india/23px-Flag_of_Kyrgyzstan.svg.txt (stored 0%)
  adding: images_india/23px-Flag_of_Belarus.svg.txt (stored 0%)
  adding: images_india/23px-Flag_of_Iraq.svg.png (deflated 9%)
  adding: images_india/180px-Nehru_gandhi.txt (stored 0%)
  adding: images_india/208px-Rashtrapati_Bhavan_Wide_New_Delhi_India.txt (stored 0%)
  adding: images_india/208px-Jawaharlal_Nehru%2C_Nasser_and_Tito_at_the_Conference_of_Non-Aligned_Nations_held_in_Belgrade.txt (stored 0%)
  adding: images_india/440px-Sachin_Tendulkar_about_to_score_14000th_run_in_test_cricket.jpg (deflated 2%)
  adding: images_india/23px-Flag_of_Uzbekistan.svg.txt (stored 0%)
  adding: images_india/152px-Qutb_minar

In [3]:
# Google Colab only: `google.colab` is not available in a plain Jupyter kernel.
# Hands images_india.zip (created by the zip cell above) to Colab's download
# helper, which is expected to trigger a browser download of the archive.
from google.colab import files
files.download('images_india.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
import requests

# The base URL of the website
base_url = 'https://igod.gov.in/'

# Construct the URL for the robots.txt file.
# base_url already ends with '/', so strip it before appending the path;
# otherwise the result is 'https://igod.gov.in//robots.txt' (double slash).
robots_url = f"{base_url.rstrip('/')}/robots.txt"

# Fetch the content of the robots.txt file; the timeout keeps the cell from
# hanging indefinitely if the server does not respond.
response = requests.get(robots_url, timeout=30)

if response.status_code == 200:
    # Print the contents of the robots.txt file
    print(response.text)
else:
    print("Failed to retrieve robots.txt")

User-agent: *
Disallow:/docs/

