# 🌙 CrewColabDarkX64 - Web Scraping with Local AI

**Dark Mode Edition**

This notebook provides:
1. 🔗 Google Drive integration
2. 🤖 Local AI model usage
3. 🌐 Web scraping functionality
4. 💾 Automated content saving

In [None]:
# ANSI escape codes used to colorize console output (chosen for dark themes)
class Colors:
    """ANSI SGR escape sequences for styling printed text.

    Typical use is ``f"{Colors.WARNING}text{Colors.ENDC}"``: a style code
    switches the style on and ``ENDC`` resets everything to the default.
    """

    # Bright foreground colors
    HEADER = "\x1b[95m"   # bright magenta
    OKBLUE = "\x1b[94m"   # bright blue
    OKGREEN = "\x1b[92m"  # bright green
    WARNING = "\x1b[93m"  # bright yellow
    FAIL = "\x1b[91m"     # bright red

    # Standard foreground colors
    PURPLE = "\x1b[35m"   # magenta
    CYAN = "\x1b[36m"     # cyan

    # Text attributes
    BOLD = "\x1b[1m"
    DIM = "\x1b[2m"

    # Reset all colors and attributes
    ENDC = "\x1b[0m"

# Build the startup banner: block-letter art preceded by the purple color code
# and followed by the reset code, so output printed afterwards is not tinted.
ascii_art = f"""{Colors.PURPLE}
███████╗ ██████╗ ███╗   ███╗██████╗ ██╗████████╗██╗  ██╗ ██████╗ ██╗  ██╗
╚══███╔╝██╔═══██╗████╗ ████║██╔══██╗██║╚══██╔══╝╚██╗██╔╝██╔════╝ ██║  ██║
  ███╔╝ ██║   ██║██╔████╔██║██████╔╝██║   ██║    ╚███╔╝ ███████╗ ███████║
 ███╔╝  ██║   ██║██║╚██╔╝██║██╔══██╗██║   ██║    ██╔██╗ ██╔═══██╗╚════██║
███████╗╚██████╔╝██║ ╚═╝ ██║██████╔╝██║   ██║   ██╔╝ ██╗╚██████╔╝     ██║
╚══════╝ ╚═════╝ ╚═╝     ╚═╝╚═════╝ ╚═╝   ╚═╝   ╚═╝  ╚═╝ ╚═════╝      ╚═╝
{Colors.ENDC}"""

# Show the banner at the top of this cell's output.
print(ascii_art)

# Print the bilingual (English / Thai) usage disclaimer.
print(f"\n{Colors.WARNING}⚠️ DISCLAIMER AND WARNING / คำเตือนและข้อจำกัดความรับผิดชอบ ⚠️{Colors.ENDC}")
# Separator rule: emit the color code once, repeat only the "=" character, and
# reset afterwards. Multiplying the whole f-string would repeat the escape
# sequence 80 times and, without a reset, tint the following line as well.
print(f"{Colors.WARNING}{'=' * 80}{Colors.ENDC}")

# English section
print(f"{Colors.BOLD}🔒 [EN] IMPORTANT DISCLAIMER:{Colors.ENDC}")
print("This tool is for educational purposes only.")
print("Users are fully responsible for their own actions.")
print("The developers assume no liability and are not responsible for any misuse or damage.")
print("\n🚫 By using this tool, you agree to:")
print("1. Use it legally and ethically")
print("2. Respect website terms of service")
print("3. Not use for malicious purposes")
print("4. Take full responsibility for your actions")

# Thai section (same content as the English section above)
print(f"\n{Colors.BOLD}🔒 [TH] คำเตือนสำคัญ:{Colors.ENDC}")
print("โปรแกรมนี้ถูกพัฒนาขึ้นเพื่อการศึกษาเท่านั้น")
print("ผู้ใช้ต้องรับผิดชอบต่อการกระทำทั้งหมดด้วยตนเอง")
print("ผู้พัฒนาไม่รับผิดชอบต่อความเสียหายใดๆ ที่เกิดจากการใช้งานโปรแกรมนี้")
print("\n⚠️ การใช้งานถือว่าคุณยอมรับเงื่อนไขต่อไปนี้:")
print("1. ใช้งานอย่างถูกกฎหมายและมีจริยธรรม")
print("2. เคารพข้อกำหนดการใช้งานของเว็บไซต์")
print("3. ไม่ใช้เพื่อวัตถุประสงค์ที่เป็นอันตราย")
print("4. รับผิดชอบต่อผลกระทบที่อาจเกิดขึ้นทั้งหมด")
print(f"\n{Colors.FAIL}❗ การใช้งานผิดวัตถุประสงค์อาจมีความผิดทางกฎหมาย ❗{Colors.ENDC}")

# Closing separator rule, followed by a blank line.
print(f"{Colors.WARNING}{'=' * 80}{Colors.ENDC}\n")

## 📦 Setup Dependencies

In [None]:
!pip install transformers beautifulsoup4 requests
!pip install torch torchvision torchaudio

## 🔗 Connect to Google Drive

In [None]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive so files written there persist on Drive.
drive.mount('/content/drive')

# Folder that receives all saved results; created (with parents) if missing,
# and left untouched if it already exists.
output_dir = '/content/drive/MyDrive/ZombitX64_Output'
os.makedirs(output_dir, exist_ok=True)

print(f"{Colors.OKGREEN}✅ Output directory created: {output_dir}{Colors.ENDC}")

## 📚 Import Libraries

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqGeneration
from bs4 import BeautifulSoup
import requests
import json
from datetime import datetime
import re
import urllib.parse

print(f"{Colors.OKGREEN}✅ Libraries imported successfully{Colors.ENDC}")

## 🤖 Load Local AI Model

In [None]:
# Load model and tokenizer
model_name = "facebook/bart-large-cnn"  # You can change this to other models

print(f"{Colors.OKBLUE}🤖 Loading model: {model_name}{Colors.ENDC}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqGeneration.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

print(f"{Colors.OKGREEN}✨ Model loaded on {device}{Colors.ENDC}")

## 🌐 Web Scraping Functions

In [None]:
def check_robots_txt(url, timeout=10):
    """Return whether the site's robots.txt permits fetching *url*.

    The site's ``/robots.txt`` is downloaded and its rules are evaluated for
    the wildcard user agent (``*``) against the path of *url*. A plain
    substring test for ``"Disallow: /"`` is not used because it also matches
    path-specific rules such as ``Disallow: /private`` and would reject
    almost every site.

    Args:
        url: Page URL to check. A URL without a scheme is treated as
            ``http://<url>``.
        timeout: Seconds to wait for the robots.txt request.

    Returns:
        ``True`` if fetching is allowed, or if robots.txt could not be
        retrieved (missing file, network error); ``False`` if a rule
        disallows the URL.
    """
    # Imported locally: the notebook's import cell only loads urllib.parse.
    from urllib.robotparser import RobotFileParser

    try:
        target_url = url
        parsed_url = urllib.parse.urlparse(target_url)
        if not parsed_url.netloc:
            # No scheme given: re-parse with a default so the host is found.
            target_url = "http://" + url
            parsed_url = urllib.parse.urlparse(target_url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"

        print(f"{Colors.DIM}🔍 Checking: {robots_url}{Colors.ENDC}")
        response = requests.get(robots_url, timeout=timeout)
        response.raise_for_status()

        rules = RobotFileParser()
        rules.parse(response.text.splitlines())
        # Pass the normalized URL so the parser sees a real path component.
        return rules.can_fetch("*", target_url)
    except (requests.RequestException, ValueError) as e:
        # Best effort: an unreachable or missing robots.txt does not block
        # scraping, it is only reported.
        print(f"{Colors.WARNING}⚠️ Error checking robots.txt: {str(e)}{Colors.ENDC}")
        return True

def scrape_content(url, timeout=10):
    """Download *url* and extract its title and main text.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the page request.

    Returns:
        A dict with keys ``"title"`` (text of the first ``<h1>`` or
        ``"No Title Found"``), ``"text"`` (extracted fragments joined by
        blank lines) and ``"metadata"`` (``"url"`` and ISO-formatted
        ``"crawled_at"``); ``None`` if robots.txt disallows the URL or the
        request fails.
    """
    print(f"\n{Colors.HEADER}🔍 Starting to scrape: {url}{Colors.ENDC}")

    if not check_robots_txt(url):
        print(f"{Colors.FAIL}❌ Crawling disallowed by robots.txt{Colors.ENDC}")
        return None

    try:
        print(f"{Colors.OKBLUE}📡 Fetching content...{Colors.ENDC}")
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"{Colors.FAIL}❌ Error scraping content: {str(e)}{Colors.ENDC}")
        return None

    # Parse the raw bytes rather than response.text: when the server sends no
    # charset, requests may fall back to ISO-8859-1 and garble non-Latin pages
    # (e.g. Thai). Only an explicitly declared charset is passed on as a hint;
    # otherwise BeautifulSoup detects the encoding from the document itself.
    content_type = response.headers.get("Content-Type", "").lower()
    declared_encoding = response.encoding if "charset" in content_type else None
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

    heading = soup.find('h1')
    title = heading.text.strip() if heading else "No Title Found"

    # Prefer a dedicated article container; fall back to the whole body so
    # pages without one (such as the default example URL) still yield text.
    container = soup.find('article') or soup.find(class_='entry-content') or soup.body

    content = []
    if container:
        for elem in container.find_all(['p', 'h2', 'h3', 'ul', 'ol']):
            fragment = elem.get_text(strip=True)
            if fragment:  # skip empty elements so no blank paragraphs are emitted
                content.append(fragment)

    result = {
        "title": title,
        "text": "\n\n".join(content),
        "metadata": {
            "url": url,
            "crawled_at": datetime.now().isoformat()
        }
    }

    print(f"{Colors.OKGREEN}✨ Successfully scraped content ({len(result['text'])} characters){Colors.ENDC}")
    return result

print(f"{Colors.OKGREEN}✅ Functions defined successfully{Colors.ENDC}")

## 🤖 Text Summarization Function

In [None]:
def summarize_text(text, max_length=1024, summary_max_length=150, summary_min_length=40):
    """Generate a summary of *text* with the loaded seq2seq model.

    Args:
        text: Content to summarize.
        max_length: Maximum number of input tokens; longer input is truncated.
        summary_max_length: Upper bound on the generated summary length in tokens.
        summary_min_length: Lower bound on the generated summary length in tokens.

    Returns:
        The decoded summary string, or ``"Failed to generate summary."`` when
        there is no text to summarize or generation raises an error.
    """
    # With empty input the model would still generate text, producing a
    # summary unrelated to the page; report the failure instead.
    if not text or not text.strip():
        print(f"{Colors.WARNING}⚠️ No text to summarize{Colors.ENDC}")
        return "Failed to generate summary."

    try:
        print(f"{Colors.OKBLUE}📊 Processing text...{Colors.ENDC}")
        inputs = tokenizer(text, max_length=max_length, truncation=True, return_tensors="pt").to(device)

        print(f"{Colors.DIM}⏳ Generating summary...{Colors.ENDC}")
        # Unpack the whole encoding so the attention mask is passed along with
        # the input ids.
        summary_ids = model.generate(
            **inputs,
            max_length=summary_max_length,
            min_length=summary_min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        print(f"{Colors.OKGREEN}✨ Summary generated successfully!{Colors.ENDC}")
        return summary

    except Exception as e:
        # Deliberately broad: any tokenizer/model failure is reported and the
        # caller gets the fallback text instead of a crash.
        print(f"{Colors.FAIL}❌ Error generating summary: {str(e)}{Colors.ENDC}")
        return "Failed to generate summary."

print(f"{Colors.OKGREEN}✅ Summarization function ready{Colors.ENDC}")

## 💾 Save Results Function

In [None]:
def save_results(content, summary):
    """Write the scraped page and its summary under ``output_dir``.

    Two files sharing a ``scraped_<YYYYmmdd_HHMMSS>`` name are produced: a
    JSON file holding *content* plus a ``"summary"`` key, and a Markdown file
    with title, URL, crawl time, summary and the page text.

    Args:
        content: Scrape result with ``"title"``, ``"text"`` and
            ``"metadata"`` (``"url"``, ``"crawled_at"``) entries.
        summary: Summary text to store alongside the content.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_stem = f"scraped_{stamp}"

    # --- JSON: the original fields plus the summary ---
    print(f"\n{Colors.OKBLUE}💾 Saving JSON...{Colors.ENDC}")
    json_path = f"{output_dir}/{file_stem}.json"
    payload = dict(content)
    payload["summary"] = summary
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(payload, json_file, indent=2, ensure_ascii=False)

    # --- Markdown: human-readable report ---
    print(f"{Colors.OKBLUE}📝 Saving Markdown...{Colors.ENDC}")
    md_path = f"{output_dir}/{file_stem}.md"
    meta = content['metadata']
    sections = (
        f"# {content['title']}\n\n",
        f"URL: {meta['url']}\n\n",
        f"Crawled at: {meta['crawled_at']}\n\n",
        f"## Summary\n\n{summary}\n\n",
        content['text'],
    )
    with open(md_path, 'w', encoding='utf-8') as md_file:
        md_file.writelines(sections)

    # Report where both files were written.
    print(f"\n{Colors.OKGREEN}✨ Results saved successfully:{Colors.ENDC}")
    print(f"{Colors.DIM}    📄 JSON: {json_path}")
    print(f"    📝 Markdown: {md_path}{Colors.ENDC}")

print(f"{Colors.OKGREEN}✅ Save function ready{Colors.ENDC}")

## 🚀 Main Execution

In [None]:
print(f"{Colors.CYAN}🌙 Dark Mode Web Scraping Tool{Colors.ENDC}")

# Input URL. Surrounding whitespace is stripped so that a padded paste is
# cleaned up and a whitespace-only entry falls back to the default URL.
url = input(f"{Colors.OKBLUE}📌 Enter URL (or press Enter for default): {Colors.ENDC}").strip() or "https://example.com"
print(f"\n{Colors.CYAN}🔗 URL: {url}{Colors.ENDC}")

# Scrape content; None means robots.txt disallowed it or the fetch failed.
content = scrape_content(url)

if content:
    # Generate summary
    summary = summarize_text(content['text'])

    # Save results
    save_results(content, summary)

    print(f"\n{Colors.OKGREEN}🎉 Operation completed successfully!{Colors.ENDC}")
else:
    print(f"\n{Colors.FAIL}❌ Failed to crawl content{Colors.ENDC}")