In [None]:
# Banner shown at start-up (and reused later in the notebook via `ascii_art`).
ANSI_PURPLE = '\033[95m'  # ANSI escape: bright magenta foreground
ANSI_RESET = '\033[0m'    # ANSI escape: restore default terminal colours

ascii_art = """
███████╗ ██████╗ ███╗   ███╗██████╗ ██╗████████╗██╗  ██╗ ██████╗ ██╗  ██╗
╚══███╔╝██╔═══██╗████╗ ████║██╔══██╗██║╚══██╔══╝╚██╗██╔╝██╔════╝ ██║  ██║
  ███╔╝ ██║   ██║██╔████╔██║██████╔╝██║   ██║    ╚███╔╝ ███████╗ ███████║
 ███╔╝  ██║   ██║██║╚██╔╝██║██╔══██╗██║   ██║    ██╔██╗ ██╔═══██╗╚════██║
███████╗╚██████╔╝██║ ╚═╝ ██║██████╔╝██║   ██║   ██╔╝ ██╗╚██████╔╝     ██║
╚══════╝ ╚═════╝ ╚═╝     ╚═╝╚═════╝ ╚═╝   ╚═╝   ╚═╝  ╚═╝ ╚═════╝      ╚═╝
"""

# Wrap the banner in the colour codes so only the banner is tinted.
print(f"{ANSI_PURPLE}{ascii_art}{ANSI_RESET}")

# 🚀 ZombitX64 Web Scraper with Local Hugging Face Models

This notebook demonstrates:
1. 🔗 Connecting to Google Drive
2. 📦 Setting up Hugging Face transformers
3. 🌐 Web scraping functionality
4. 🤖 Local model inference
5. 💾 Saving results to Google Drive

## Setup and Dependencies

In [None]:
!pip install transformers beautifulsoup4 requests
!pip install torch torchvision torchaudio

## Connect to Google Drive

In [None]:
import os

from google.colab import drive

# Mount Google Drive under /content/drive so results outlive the Colab session.
drive.mount('/content/drive')

# Folder inside the mounted Drive where the notebook writes its results;
# exist_ok=True makes re-running this cell harmless.
output_dir = '/content/drive/MyDrive/ZombitX64_Output'
os.makedirs(output_dir, exist_ok=True)
print("✅ Output directory created at: " + output_dir)

## Import Required Libraries

In [None]:
import json
import re
import urllib.parse
import urllib.robotparser
from datetime import datetime

import requests
import torch
from bs4 import BeautifulSoup
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

## Load Local Hugging Face Model

In [None]:
# Load model and tokenizer
model_name = "facebook/bart-large-cnn"  # You can change this to other models
print(f"🤖 Loading model: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# AutoModelForSeq2SeqLM is the transformers auto-class for encoder-decoder
# generation models; "AutoModelForSeq2SeqGeneration" does not exist.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
print(f"✅ Model loaded successfully on: {device}")

## Web Scraping Functions

In [None]:
def check_robots_txt(url, user_agent="*", timeout=10):
    """Return True when the site's robots.txt permits fetching ``url``.

    The robots.txt file is downloaded from the root of the URL's host and
    evaluated with the standard-library ``urllib.robotparser``, so path-specific
    rules (e.g. ``Disallow: /admin/``) only block matching paths instead of
    being mistaken for a site-wide ban.

    Args:
        url: Page URL to check. A URL without a scheme/host part is retried
            with an ``http://`` prefix.
        user_agent: User-agent token the rules are evaluated for.
        timeout: Seconds to wait for the robots.txt request.

    Returns:
        bool: False only when robots.txt was fetched and disallows ``url``.
        Any failure (network error, HTTP error status, ...) is reported and
        treated as "allowed".
    """
    try:
        parsed_url = urllib.parse.urlparse(url)
        if not parsed_url.netloc:
            # Scheme-less input such as "example.com/page": assume plain HTTP.
            url = "http://" + url
            parsed_url = urllib.parse.urlparse(url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        response = requests.get(robots_url, timeout=timeout)
        response.raise_for_status()

        rules = urllib.robotparser.RobotFileParser()
        rules.parse(response.text.splitlines())
        return rules.can_fetch(user_agent, url)
    except Exception as e:
        # Deliberately fail open: an unreachable robots.txt does not block scraping.
        print(f"⚠️ Error checking robots.txt: {str(e)}")
        return True

def scrape_content(url, timeout=10):
    """Download ``url`` and extract its title and main text.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the page request before giving up.

    Returns:
        dict | None: ``{"title", "text", "metadata": {"url", "crawled_at"}}``
        on success. None when robots.txt disallows the URL or when fetching
        or parsing raises; the reason is printed in both cases.
    """
    if not check_robots_txt(url):
        print("❌ Crawling disallowed by robots.txt")
        return None

    try:
        print(f"🌐 Fetching content from: {url}")
        # Without a timeout a stalled server would block the cell indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title (first <h1>, looked up once)
        heading = soup.find('h1')
        title = heading.text.strip() if heading else "No Title Found"

        # Find main content; fall back to <body> (or the whole document) so
        # pages without <article>/.entry-content still yield text.
        container = (
            soup.find('article')
            or soup.find(class_='entry-content')
            or soup.body
            or soup
        )

        content = []
        for elem in container.find_all(['p', 'h2', 'h3', 'ul', 'ol']):
            # A " " separator keeps inline fragments and list items from
            # being glued together ("Helloworld") when each piece is stripped.
            text = elem.get_text(" ", strip=True)
            if text:
                content.append(text)

        return {
            "title": title,
            "text": "\n\n".join(content),
            "metadata": {
                "url": url,
                "crawled_at": datetime.now().isoformat()
            }
        }
    except Exception as e:
        print(f"❌ Error scraping content: {str(e)}")
        return None

## Text Summarization Function

In [None]:
def summarize_text(text, max_length=1024, summary_max_length=150, summary_min_length=40):
    """Summarize ``text`` with the notebook's loaded tokenizer and model.

    Args:
        text: Source text to summarize.
        max_length: Token limit applied to the input; longer input is truncated.
        summary_max_length: ``max_length`` passed to ``model.generate``.
        summary_min_length: ``min_length`` passed to ``model.generate``.

    Returns:
        str: The decoded summary, or ``"Summary generation failed."`` when the
        input is empty/whitespace-only or generation raises.
    """
    if not text or not text.strip():
        # Nothing to condense; running the model here would only yield
        # text that is unrelated to the (empty) input.
        print("⚠️ No text to summarize.")
        return "Summary generation failed."

    try:
        print("🤖 Generating summary...")
        inputs = tokenizer(text, max_length=max_length, truncation=True, return_tensors="pt").to(device)

        # Generate summary; the attention mask is forwarded explicitly so
        # generate() does not have to guess it from the input ids.
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=summary_max_length,
            min_length=summary_min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        print("✅ Summary generated successfully!")
        return summary
    except Exception as e:
        print(f"❌ Error generating summary: {str(e)}")
        return "Summary generation failed."

## Save Results to Google Drive

In [None]:
def save_results(content, summary):
    """Write the scraped page and its summary to ``output_dir``.

    Two files sharing a timestamped stem are produced: a ``.json`` file holding
    ``content`` plus a ``"summary"`` key, and a ``.md`` file with the title,
    URL, crawl time, summary and full text.

    Args:
        content: Mapping with ``"title"``, ``"text"`` and
            ``"metadata"`` (``"url"``, ``"crawled_at"``) entries.
        summary: Summary text to store alongside the content.
    """
    stem = f"{output_dir}/scraped_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    json_path = stem + ".json"
    md_path = stem + ".md"

    # JSON: original content with the summary added as one more key.
    payload = dict(content, summary=summary)
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(payload, json_file, indent=2, ensure_ascii=False)

    # Markdown: header lines, summary section, then the full text.
    meta = content['metadata']
    md_parts = [
        f"# {content['title']}\n\n",
        f"URL: {meta['url']}\n\n",
        f"Crawled at: {meta['crawled_at']}\n\n",
        f"## Summary\n\n{summary}\n\n",
        content['text'],
    ]
    with open(md_path, 'w', encoding='utf-8') as md_file:
        md_file.write("".join(md_parts))

    print("✅ Results saved to Google Drive:")
    print(f"📄 JSON: {json_path}")
    print(f"📝 Markdown: {md_path}")

## Main Execution

In [None]:
# Input URL; surrounding whitespace is stripped so blank or whitespace-only
# input falls back to the default instead of being used as a URL.
url = input("🌐 Enter the URL to scrape (or press Enter for default): ").strip() or "https://example.com"

# Scrape content (None when robots.txt forbids it or the fetch fails)
content = scrape_content(url)

if content:
    # Generate summary
    summary = summarize_text(content['text'])

    # Save results
    save_results(content, summary)

    # Show ASCII art again for completion
    print('\n\033[95m' + ascii_art + '\033[0m')
    print("✨ Task completed successfully! ✨")
else:
    print("❌ Failed to scrape content")