In [1]:
!pip install google-auth google-auth-oauthlib google-api-python-client requests beautifulsoup4




In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the cells in this notebook read the service-account key from
# /content/ and reach Drive through the API client, so nothing visible here
# reads from the mounted path -- confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
from google.oauth2 import service_account
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
from io import BytesIO
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload

# Path to the service-account JSON key used to authenticate against the Drive
# API. Update this to wherever the key file lives in your runtime.
SERVICE_ACCOUNT_FILE = '/content/website-crawler-gcf-a0fa227f070b.json'

# Google Drive file ID of the text file whose content is the crawl start URL.
FILE_ID = '1YUFAMR-28EddEz9IAdvdY-T9JPdgYOQ2'

# Scopes passed as `scopes=` when the service-account credentials are built.
# NOTE(review): this looks like the broad Drive scope -- confirm a narrower
# one would not be sufficient.
SCOPES = ['https://www.googleapis.com/auth/drive']

# URLs the crawler has already requested. Module-level, so it is shared by
# every scan_website_recursively call and keeps its content until it is
# reassigned or cleared.
visited_urls = set()

def is_valid_url(url):
    """Report whether ``url`` parses into both a scheme and a network location.

    Returns ``True`` only when ``urlparse`` yields a non-empty scheme *and* a
    non-empty netloc; returns ``False`` otherwise, including when ``urlparse``
    rejects the input with ``ValueError``.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        return False

    # Both components are required: relative paths lack a scheme, and forms
    # such as "mailto:..." lack a netloc.
    for component in (parsed.scheme, parsed.netloc):
        if not component:
            return False
    return True

def get_domain(url):
    """Return the network-location part (``netloc``) of ``url``.

    The value is whatever ``urlparse`` reports, so it is an empty string when
    the URL has no ``//host`` part.
    """
    parsed = urlparse(url)
    return parsed.netloc

def _strip_query_and_fragment(url):
    """Return ``url`` reduced to scheme, host and path (query/fragment dropped)."""
    # urljoin(url, "") hands ``url`` back untouched, so an empty path has to be
    # replaced by "/" or the query string and fragment would survive.
    return urljoin(url, urlparse(url).path or '/')


def scan_website_recursively(start_url, base_domain, max_depth, current_depth=0,
                             request_delay=0.5):
    """Depth-first crawl of same-domain links reachable from ``start_url``.

    Args:
        start_url: Page to fetch. Its query string and fragment are dropped
            before it is requested.
        base_domain: Only links whose netloc equals this value are collected
            and followed.
        max_depth: Deepest level that is still fetched; the start page is
            level ``current_depth``.
        current_depth: Level of ``start_url``; callers normally leave it at 0.
        request_delay: Seconds to sleep after every HTTP request, successful
            or not.

    Returns:
        A list of internal URLs (query/fragment stripped) in first-seen order,
        without duplicates. The list is empty when the depth limit is
        exceeded, the page was already visited, or the request failed.

    Side effects:
        Adds every requested URL to the module-level ``visited_urls`` set and
        prints progress. Because that set outlives a single crawl, it must be
        emptied before crawling the same site again.

    Note:
        Pages are explored depth-first and marked visited on first request, so
        a page first reached at ``max_depth`` is not expanded again if a
        shallower page links to it later.
    """
    internal_links = []
    if current_depth > max_depth:
        return internal_links

    normalized_url = _strip_query_and_fragment(start_url)
    if normalized_url in visited_urls:
        return internal_links

    visited_urls.add(normalized_url)
    print(f"Crawling: {normalized_url} (Depth: {current_depth})")

    try:
        response = requests.get(normalized_url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {normalized_url}: {e}")
        return internal_links
    finally:
        # Polite delay after *every* request, including failed ones, so a
        # server that is already rejecting us is not hit even faster.
        time.sleep(request_delay)

    soup = BeautifulSoup(response.text, 'html.parser')

    for tag in soup.find_all('a', href=True):
        try:
            full_url = urljoin(normalized_url, tag['href'])
            clean_url = _strip_query_and_fragment(full_url)
        except ValueError:
            # Malformed href (e.g. an unbalanced IPv6 bracket); skip it rather
            # than abort the whole crawl.
            continue

        if not is_valid_url(clean_url):
            continue

        if get_domain(clean_url) == base_domain and clean_url not in visited_urls:
            internal_links.append(clean_url)
            internal_links += scan_website_recursively(
                clean_url, base_domain, max_depth, current_depth + 1, request_delay)

    # Links found at the depth limit are never fetched and therefore never
    # enter visited_urls, so the same URL can be appended more than once.
    # Collapse duplicates while keeping first-seen order.
    return list(dict.fromkeys(internal_links))

def get_url_from_drive():
    """Download the Drive file ``FILE_ID`` and return its text as the start URL.

    Authenticates with the service-account key at ``SERVICE_ACCOUNT_FILE``
    using ``SCOPES``, downloads the file content chunk by chunk into memory,
    and returns it decoded as UTF-8 with surrounding whitespace removed. The
    value is also printed.

    Returns:
        The stripped text content of the Drive file.
    """
    creds = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    service = build('drive', 'v3', credentials=creds)

    request = service.files().get_media(fileId=FILE_ID)
    fh = BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        # Only the completion flag matters here; progress is not reported.
        _, done = downloader.next_chunk()

    # "utf-8-sig" also accepts plain UTF-8 but drops a leading byte-order
    # mark. str.strip() does not remove a BOM, and a URL that starts with one
    # no longer parses with a scheme.
    url_text = fh.getvalue().decode("utf-8-sig").strip()
    print(f"Start URL read from Drive file: {url_text}")
    return url_text

def upload_to_drive(content):
    """Create ``crawled_links.txt`` on Drive from ``content`` and return its ID.

    Builds a Drive v3 client from the service-account key at
    ``SERVICE_ACCOUNT_FILE``, uploads ``content`` (UTF-8 encoded) as a
    ``text/plain`` file, prints the new file's ID and returns it.
    """
    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    drive_client = build('drive', 'v3', credentials=credentials)

    metadata = {'name': 'crawled_links.txt', 'mimeType': 'text/plain'}
    payload = MediaIoBaseUpload(BytesIO(content.encode('utf-8')), mimetype='text/plain')

    # Only the 'id' field of the created file is requested back.
    create_call = drive_client.files().create(
        body=metadata, media_body=payload, fields='id')
    new_file_id = create_call.execute()['id']

    print(f"Uploaded crawled links file to Drive with File ID: {new_file_id}")
    return new_file_id

def main(max_depth=2):
    """Read the start URL from Drive, crawl its domain and upload the links.

    Args:
        max_depth: Deepest link level the crawler still fetches (the start
            page is level 0). Defaults to 2.

    Steps:
        1. Fetch the start URL via ``get_url_from_drive`` and stop with a
           message if it is not a valid absolute URL.
        2. Crawl links on the start URL's domain.
        3. Upload the newline-joined result via ``upload_to_drive``.
    """
    start_url = get_url_from_drive()
    if not is_valid_url(start_url):
        print("Invalid URL in Drive file.")
        return

    base_domain = get_domain(start_url)
    print(f"Crawling domain: {base_domain}")

    # The crawler de-duplicates against the module-level visited_urls set,
    # which survives between executions in a live kernel. Without this reset a
    # second run finds the start URL already visited and reports 0 links.
    visited_urls.clear()

    results = scan_website_recursively(start_url, base_domain, max_depth=max_depth)
    print(f"Found {len(results)} internal links.")

    content = "\n".join(results)
    upload_to_drive(content)

# Start the crawl when this code runs as the top-level program. Merely
# importing the definitions does not trigger it.
if __name__ == "__main__":
    main()


Start URL read from Drive file: https://www.w3schools.com
Crawling domain: www.w3schools.com
Crawling: https://www.w3schools.com (Depth: 0)
Crawling: https://www.w3schools.com/academy/index.php (Depth: 1)
Crawling: https://www.w3schools.com/spaces/index.php (Depth: 2)
Crawling: https://www.w3schools.com/plus/index.php (Depth: 2)
Crawling: https://www.w3schools.com/html/default.asp (Depth: 2)
Crawling: https://www.w3schools.com/tags/default.asp (Depth: 2)
Crawling: https://www.w3schools.com/css/default.asp (Depth: 2)
Crawling: https://www.w3schools.com/cssref/default.asp (Depth: 2)
Crawling: https://www.w3schools.com/css/css_rwd_intro.asp (Depth: 2)
Crawling: https://www.w3schools.com/bootstrap/bootstrap_ver.asp (Depth: 2)
Crawling: https://www.w3schools.com/w3css/default.asp (Depth: 2)
Crawling: https://www.w3schools.com/w3css/w3css_references.asp (Depth: 2)
Crawling: https://www.w3schools.com/sass/default.php (Depth: 2)
Crawling: https://www.w3schools.com/sass/sass_functions_string.ph

In [8]:
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload, MediaInMemoryUpload
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
from io import BytesIO

# Path to the service-account JSON key, and the Drive file ID of the text file
# that holds the crawl start URL. Adjust both for your own runtime / Drive.
SERVICE_ACCOUNT_FILE = '/content/website-crawler-gcf-a0fa227f070b.json'  # path to your key file
FILE_ID = '1YUFAMR-28EddEz9IAdvdY-T9JPdgYOQ2'  # Drive ID of your url.txt file
# Scopes passed as `scopes=` when the service-account credentials are built.
SCOPES = ['https://www.googleapis.com/auth/drive']

# URLs the crawler has already requested. Re-running this cell replaces the
# set with an empty one, which resets the crawl state.
visited_urls = set()


In [9]:
def is_valid_url(url):
    """Return ``True`` when ``url`` has both a scheme and a network location.

    Input that ``urlparse`` rejects with ``ValueError`` counts as invalid.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        return False
    # An absolute URL needs both pieces; either one missing means "not valid".
    return bool(parts.scheme) and bool(parts.netloc)

def get_domain(url):
    """Extract the ``netloc`` component of ``url`` (empty string if absent)."""
    # Index 1 of the parsed result is the netloc field.
    return urlparse(url)[1]

def _strip_query_and_fragment(url):
    """Return ``url`` reduced to scheme, host and path (query/fragment dropped)."""
    # urljoin(url, "") hands ``url`` back untouched, so an empty path has to be
    # replaced by "/" or the query string and fragment would survive.
    return urljoin(url, urlparse(url).path or '/')


def scan_website_recursively(start_url, base_domain, max_depth, current_depth=0,
                             request_delay=0.5):
    """Depth-first crawl of same-domain links reachable from ``start_url``.

    Args:
        start_url: Page to fetch. Its query string and fragment are dropped
            before it is requested.
        base_domain: Only links whose netloc equals this value are collected
            and followed.
        max_depth: Deepest level that is still fetched; the start page is
            level ``current_depth``.
        current_depth: Level of ``start_url``; callers normally leave it at 0.
        request_delay: Seconds to sleep after every HTTP request, successful
            or not.

    Returns:
        A list of internal URLs (query/fragment stripped) in first-seen order,
        without duplicates. The list is empty when the depth limit is
        exceeded, the page was already visited, or the request failed.

    Side effects:
        Adds every requested URL to the module-level ``visited_urls`` set.
        Because that set outlives a single crawl, it must be emptied before
        crawling the same site again.

    Note:
        Pages are explored depth-first and marked visited on first request, so
        a page first reached at ``max_depth`` is not expanded again if a
        shallower page links to it later.
    """
    internal_links = []
    if current_depth > max_depth:
        return internal_links

    normalized_url = _strip_query_and_fragment(start_url)
    if normalized_url in visited_urls:
        return internal_links

    visited_urls.add(normalized_url)

    try:
        response = requests.get(normalized_url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Best-effort crawl: an unreachable page simply contributes no links.
        return internal_links
    finally:
        # Polite delay after *every* request, including failed ones, so a
        # server that is already rejecting us is not hit even faster.
        time.sleep(request_delay)

    soup = BeautifulSoup(response.text, 'html.parser')

    for tag in soup.find_all('a', href=True):
        try:
            full_url = urljoin(normalized_url, tag['href'])
            clean_url = _strip_query_and_fragment(full_url)
        except ValueError:
            # Malformed href (e.g. an unbalanced IPv6 bracket); skip it rather
            # than abort the whole crawl.
            continue

        if not is_valid_url(clean_url):
            continue

        if get_domain(clean_url) == base_domain and clean_url not in visited_urls:
            internal_links.append(clean_url)
            internal_links += scan_website_recursively(
                clean_url, base_domain, max_depth, current_depth + 1, request_delay)

    # Links found at the depth limit are never fetched and therefore never
    # enter visited_urls, so the same URL can be appended more than once.
    # Collapse duplicates while keeping first-seen order.
    return list(dict.fromkeys(internal_links))


In [10]:
def get_url_from_drive():
    """Download the Drive file ``FILE_ID`` and return its text as the start URL.

    Authenticates with the service-account key at ``SERVICE_ACCOUNT_FILE``
    using ``SCOPES``, downloads the file content chunk by chunk into memory,
    and returns it decoded as UTF-8 with surrounding whitespace removed.

    Returns:
        The stripped text content of the Drive file.
    """
    creds = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    service = build('drive', 'v3', credentials=creds)

    request = service.files().get_media(fileId=FILE_ID)
    fh = BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        # Only the completion flag matters here; progress is not reported.
        _, done = downloader.next_chunk()

    # "utf-8-sig" also accepts plain UTF-8 but drops a leading byte-order
    # mark. str.strip() does not remove a BOM, and a URL that starts with one
    # no longer parses with a scheme.
    return fh.getvalue().decode("utf-8-sig").strip()


In [11]:
def share_file_with_user(service, file_id, user_email):
    """Grant ``user_email`` the ``reader`` role on the Drive file ``file_id``.

    Args:
        service: Drive API client exposing ``permissions().create(...)``.
        file_id: ID of the file to share.
        user_email: Address of the user who receives read access.
    """
    reader_grant = dict(type='user', role='reader', emailAddress=user_email)
    permissions_api = service.permissions()
    permissions_api.create(fileId=file_id, body=reader_grant).execute()


In [12]:
def upload_to_drive(content, share_with='erigalareddy123@gmail.com'):
    """Upload ``content`` as ``crawled_links.txt`` and optionally share it.

    Args:
        content: Text to store; encoded as UTF-8 before upload.
        share_with: Email address that receives read access to the new file.
            Pass ``None`` (or an empty string) to skip sharing.
            NOTE(review): the default recipient is hard-coded for backward
            compatibility -- consider moving it to the configuration cell.

    Returns:
        The Drive file ID of the newly created file.

    Raises:
        Whatever ``share_file_with_user`` raises; the file ID is printed
        first so the already-uploaded file can still be located.
    """
    creds = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    service = build('drive', 'v3', credentials=creds)

    file_metadata = {'name': 'crawled_links.txt', 'mimeType': 'text/plain'}
    media = MediaInMemoryUpload(content.encode('utf-8'), mimetype='text/plain')
    uploaded_file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()

    file_id = uploaded_file['id']
    if not share_with:
        print(f"Uploaded file with File ID: {file_id}")
        return file_id

    try:
        share_file_with_user(service, file_id, share_with)
    except Exception:
        # The upload already succeeded; surface the ID before propagating so
        # the file is not lost track of.
        print(f"Uploaded file with File ID: {file_id}, but sharing with {share_with} failed.")
        raise
    print(f"Uploaded and shared file with File ID: {file_id}")
    return file_id


In [13]:
# Driver cell: read the start URL from Drive, crawl its domain, upload the
# collected links.
start_url = get_url_from_drive()
# Fail loudly on a bad start URL. Otherwise the request error is swallowed by
# the crawler and an empty file is uploaded and shared.
if not is_valid_url(start_url):
    raise ValueError(f"Invalid start URL in Drive file: {start_url!r}")

# visited_urls lives at module level and survives between cell executions.
# Reset it so that re-running this cell alone does not return zero links.
visited_urls.clear()

base_domain = get_domain(start_url)
results = scan_website_recursively(start_url, base_domain, max_depth=2)
content = "\n".join(results)
# Left as the last bare expression so the notebook displays the new file ID.
upload_to_drive(content)


Uploaded and shared file with File ID: 1c24zMzHi-S5HNr2-jZls4UP60Rhs3tRa


'1c24zMzHi-S5HNr2-jZls4UP60Rhs3tRa'