# 🧠 Smart Web Crawler Assistant – Full Colab Backend

In [None]:
# Mount Google Drive at /content/drive; the config and output folders used by
# the following cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Drive-backed working folders: JSON configs are read from CONFIG_DIR and
# results are written to OUTPUT_DIR.
CONFIG_DIR = '/content/drive/MyDrive/assistant_configs'
OUTPUT_DIR = '/content/drive/MyDrive/assistant_outputs'

# Create both folders if they are missing; existing folders are left as they are.
for _folder in (CONFIG_DIR, OUTPUT_DIR):
    os.makedirs(_folder, exist_ok=True)
print('✅ Folders ready.')

In [None]:
import json

# Let the user pick one JSON config from CONFIG_DIR and load it into `config`.
# Any failure here raises, so later cells never run with a missing or stale
# `config`.

# List available config files. Sorted so the menu numbering is stable between
# runs (os.listdir returns entries in arbitrary order).
config_files = sorted(f for f in os.listdir(CONFIG_DIR) if f.endswith('.json'))
if not config_files:
    raise FileNotFoundError('❌ No config files found in assistant_configs.')

print('📁 Available Config Files:')
for i, file in enumerate(config_files, start=1):
    print(f"{i}: {file}")

# Validate the choice explicitly: without the range check, entering 0 would
# become index -1 and silently load the last file in the list.
raw_selection = input('Select a config file number: ').strip()
try:
    selection = int(raw_selection) - 1
except ValueError:
    raise ValueError(
        f'❌ Invalid selection {raw_selection!r}: '
        f'enter a number between 1 and {len(config_files)}.'
    ) from None
if not 0 <= selection < len(config_files):
    raise ValueError(
        f'❌ Selection {raw_selection} is out of range: '
        f'enter a number between 1 and {len(config_files)}.'
    )
CONFIG_PATH = os.path.join(CONFIG_DIR, config_files[selection])

try:
    # Explicit UTF-8 so decoding does not depend on the runtime's locale.
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = json.load(f)
except FileNotFoundError:
    print('❌ File not found.')
    raise  # re-raise: continuing would leave `config` undefined for later cells
except json.JSONDecodeError:
    print('❌ Invalid JSON format.')
    raise
else:
    print('✅ Config Loaded Successfully!')

In [None]:
# Pull the crawl settings out of the loaded config; missing keys fall back to
# empty values.
prompt = config.get('prompt', '')
url = config.get('url', '')

# 'filters' may be given as a comma-separated string ("title, price"), as a
# JSON list (["title", "price"]), or as null. Normalise all of these to a list
# of stripped, non-empty field names instead of failing on `.split`.
raw_filters = config.get('filters') or ''
if isinstance(raw_filters, str):
    raw_filters = raw_filters.split(',')
filters = [str(f).strip() for f in raw_filters if str(f).strip()]

print(f"Prompt: {prompt}\nURL: {url}\nFilters: {filters}")

In [None]:
# Placeholder for scraping logic using requests/BeautifulSoup or Scrapy
import requests
from bs4 import BeautifulSoup

# Fetch the configured page. The timeout matters: requests waits indefinitely
# by default, so a stalled server would otherwise hang this cell forever.
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
# Fail loudly on 4xx/5xx responses rather than parsing an error page and
# reporting zero scraped items as if the crawl had succeeded.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Example logic (customize per site)
items = []
for item in soup.select('.item-cell'):
    title = item.select_one('.item-title')
    price = item.select_one('.price-current')
    # A missing element yields None so every row has the same keys.
    data = {
        'title': title.text.strip() if title else None,
        'price': price.text.strip() if price else None,
    }
    # An empty filter list means "keep every field"; otherwise keep only the
    # fields named in `filters`.
    filtered = {k: v for k, v in data.items() if not filters or k in filters}
    items.append(filtered)

print(f'✅ Scraped {len(items)} items.')

In [None]:
import pandas as pd
from datetime import datetime

# Turn the scraped rows into a table and write it to OUTPUT_DIR as a CSV whose
# name carries the current local date and time.
df = pd.DataFrame(items)

_now = datetime.now()
timestamp = _now.strftime('%Y%m%d_%H%M%S')
_csv_name = f'smart_web_crawler_output_{timestamp}.csv'
output_path = os.path.join(OUTPUT_DIR, _csv_name)

# index=False keeps the DataFrame's row index out of the file.
df.to_csv(output_path, index=False)
print(f'📁 Output saved to: {output_path}')

In [None]:
# Preview the first rows of the scraped table as the cell's output.
df.head()