## micenter

In [1]:
import json
import os
import shutil
import sys
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Output locations: saved pages, the filename -> URL map and the log all
# live under SAVE_DIR
SAVE_DIR = 'micenter_site'
LOG_FILE = os.path.join(SAVE_DIR, 'log.txt')
URL_MAP_FILE = os.path.join(SAVE_DIR, 'url_map.json')

# Crawl entry point, and the domain that followed links must belong to
DOMAIN = 'micenter.lt'
BASE_URL = 'https://micenter.lt/en'

# Crawl state: the queue of URLs still to fetch (consumed from the front,
# extended at the back) and the set of URLs already handled
to_visit = [BASE_URL]
visited = set()

class TeeLogger:
    """Minimal file-like object that duplicates output to several streams."""

    def __init__(self, *streams):
        # Targets are kept as given, in call order
        self.streams = streams

    def write(self, msg):
        """Write `msg` to every target stream, in order."""
        for stream in self.streams:
            stream.write(msg)

    def flush(self):
        """Flush every target stream, in order."""
        for stream in self.streams:
            stream.flush()

# Create the output directory first: the log file and the URL map live in it
os.makedirs(SAVE_DIR, exist_ok=True)

# Mirror everything printed to stdout into the log file. The original stdout
# is remembered so the redirection can be undone once the crawl is over.
log_file = open(LOG_FILE, 'w', encoding='utf-8')
original_stdout = sys.stdout
sys.stdout = TeeLogger(sys.stdout, log_file)

# Load the filename -> URL map left by a previous run, if any, after copying
# it to a '.bak' file (the map is rewritten while crawling). Start with an
# empty map on a first run.
if os.path.exists(URL_MAP_FILE):
    shutil.copy(URL_MAP_FILE, URL_MAP_FILE + '.bak')
    with open(URL_MAP_FILE, 'r', encoding='utf-8') as f:
        url_map = json.load(f)
else:
    url_map = {}

def sanitize_filename(url):
    """
    Convert a URL into a flat, safe filename ending in '.html'.

    The name is built as '<netloc>_<path>[_<query>].html', where:
      * slashes in the path become '_' (an empty path becomes 'index');
      * in the query, '=' becomes '-', while '&' and '/' become '_'.

    Slashes in the query must be replaced as well: otherwise the result
    would contain a path separator and point into a subdirectory of the
    save directory that does not exist, so the page could not be written.
    """
    parsed = urlparse(url)
    query = parsed.query.replace('=', '-').replace('&', '_').replace('/', '_')
    path = parsed.path.strip('/').replace('/', '_') or 'index'

    if query:
        return f"{parsed.netloc}_{path}_{query}.html"
    return f"{parsed.netloc}_{path}.html"

def save_html(content, filename):
    """
    Write `content` as UTF-8 text to `filename` inside SAVE_DIR,
    replacing any existing file of that name.
    """
    target = os.path.join(SAVE_DIR, filename)
    with open(target, mode='w', encoding='utf-8') as out:
        out.write(content)

def _is_same_domain(url):
    """Return True if the host of `url` is DOMAIN itself or a subdomain of it."""
    # Compare the host name, not a substring of netloc: a plain
    # `DOMAIN in netloc` test would also accept foreign hosts such as
    # 'notmicenter.lt' or 'micenter.lt.example.com'.
    host = urlparse(url).hostname or ''
    return host == DOMAIN or host.endswith('.' + DOMAIN)


def crawl():
    """
    Crawl the site starting from the URLs queued in `to_visit`.

    URLs are taken from the front of `to_visit`. Each one is fetched; a
    page whose file does not exist yet is saved under SAVE_DIR, the
    filename -> URL map is updated and rewritten to URL_MAP_FILE, and every
    same-domain link found on the page (with its fragment removed) is
    appended to the queue unless it is already visited or queued.

    A page whose file already exists is still fetched so its links can be
    followed, but the saved file is left untouched.

    Responses with a status other than 200, and any exception raised while
    handling a URL, are reported with print() and the crawl continues with
    the next URL.
    """
    # Set mirror of the queue, so "already queued?" is O(1) instead of a
    # scan over the whole list for every link on every page
    queued = set(to_visit)

    while to_visit:
        print(f"Progress: {len(visited)} visited | {len(to_visit)} queued")
        url = to_visit.pop(0)
        queued.discard(url)
        if url in visited:
            continue

        filename = sanitize_filename(url)
        already_saved = os.path.exists(os.path.join(SAVE_DIR, filename))
        if already_saved:
            print(f"Already downloaded: {url}")
            visited.add(url)

        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                print(f"Failed to retrieve {url} (Status code: {response.status_code})")
                continue

            html_content = response.text
            soup = BeautifulSoup(html_content, 'html.parser')
            if not already_saved:
                save_html(html_content, filename)
                print(f"Saved: {url}")

            # If the URL was saved before, its map entry is overridden
            url_map[filename] = url
            # Rewritten after every page so the map on disk stays current
            with open(URL_MAP_FILE, 'w', encoding='utf-8') as f:
                json.dump(url_map, f, indent=2, ensure_ascii=False)

            visited.add(url)
            # Extract and process all anchor tags
            for link in soup.find_all('a', href=True):
                # Absolute URL, normalized by dropping the fragment
                full_url = urljoin(url, link['href']).split('#')[0]
                if (_is_same_domain(full_url)
                        and full_url not in visited
                        and full_url not in queued):
                    to_visit.append(full_url)
                    queued.add(full_url)
            # Polite delay to prevent overwhelming the server
            time.sleep(1)

        except Exception as e:
            # Best-effort crawl: report the problem and move on
            print(f"Error processing {url}: {e}")

# Run the crawl, then always undo the stdout redirection and close the log,
# even if the crawl is interrupted (e.g. KeyboardInterrupt). Otherwise
# sys.stdout would stay redirected and the log file would stay open.
try:
    if __name__ == "__main__":
        crawl()
finally:
    sys.stdout = original_stdout
    log_file.close()

Progress: 0 visited | 1 queued
Saved: https://micenter.lt/en
Progress: 1 visited | 71 queued
Saved: https://micenter.lt/lt
Progress: 2 visited | 137 queued
Saved: https://micenter.lt/ru
Progress: 3 visited | 204 queued
Saved: https://micenter.lt/en/learn-lithuanian
Progress: 4 visited | 207 queued
Saved: https://micenter.lt/en/about-us
Progress: 5 visited | 206 queued
Saved: https://micenter.lt/
Progress: 6 visited | 205 queued
Saved: https://micenter.lt/en/main-information
Progress: 7 visited | 204 queued
Saved: https://micenter.lt/en/interesting-facts
Progress: 8 visited | 203 queued
Saved: https://micenter.lt/en/migration-statistics
Progress: 9 visited | 202 queued
Saved: https://micenter.lt/en/travel-to-lithuania
Progress: 10 visited | 201 queued
Saved: https://micenter.lt/en/schengen-visa
Progress: 11 visited | 200 queued
Saved: https://micenter.lt/en/visa-d
Progress: 12 visited | 199 queued
Saved: https://micenter.lt/en/temporary-residence-permit
Progress: 13 visited | 198 queued

## ...