# In [None]:
import json
import logging
import os
import queue
import re
import sqlite3
import threading
import time
import urllib.parse
import urllib.robotparser
from collections import Counter, defaultdict
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, as_completed, wait

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from requests.adapters import HTTPAdapter
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from urllib3.util.retry import Retry

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Download NLTK data.
# 'punkt_tab' is the tokenizer resource word_tokenize loads on NLTK >= 3.8.2;
# 'punkt' is kept so older NLTK releases continue to work.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

class WebCrawler:
    """Multi-threaded, depth-limited web crawler that stores pages in SQLite.

    Pages are fetched by a thread pool, parsed with BeautifulSoup, summarized
    (title, first 1000 characters of text, top keywords) and written to the
    ``pages`` table of ``crawler_results.db``. Every discovered link is also
    recorded as an edge in ``link_graph``.

    Args:
        start_urls: Seed URLs, crawled at depth 0.
        max_depth: Deepest link distance from a seed that is still crawled.
        max_urls: Upper bound on the number of URLs attempted.
        num_threads: Size of the worker thread pool.
        respect_robots: When true, URLs disallowed by robots.txt are skipped.
        user_agent: Value sent in the ``User-Agent`` header and matched
            against robots.txt groups.
    """

    def __init__(self, start_urls, max_depth=3, max_urls=100, num_threads=10, respect_robots=True, user_agent='PythonWebCrawler/1.0'):
        self.start_urls = start_urls
        self.max_depth = max_depth
        self.max_urls = max_urls
        self.num_threads = num_threads
        self.respect_robots = respect_robots
        self.user_agent = user_agent
        self.queue = queue.PriorityQueue()
        self.visited = set()
        # Guards visited / url_count / link_graph, which workers share.
        self.lock = threading.Lock()
        # Serializes writes on the shared SQLite connection (opened with
        # check_same_thread=False, so sqlite3 does no serialization for us).
        self.db_lock = threading.Lock()
        self.url_count = 0
        self.db_conn = sqlite3.connect('crawler_results.db', check_same_thread=False)
        self.create_database()
        self.robots_cache = {}      # site root -> raw robots.txt text ("" if unavailable)
        self._robots_parsers = {}   # site root -> parsed RobotFileParser
        self.link_graph = nx.DiGraph()
        self.session = requests.Session()
        retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
        self.session.mount('http://', HTTPAdapter(max_retries=retries))
        self.session.mount('https://', HTTPAdapter(max_retries=retries))

    def create_database(self):
        """Create the ``pages`` table if it does not exist yet."""
        cursor = self.db_conn.cursor()
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS pages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE,
            title TEXT,
            content TEXT,
            links INTEGER,
            depth INTEGER,
            status_code INTEGER,
            content_type TEXT,
            crawl_time REAL,
            keywords TEXT
        )
        ''')
        self.db_conn.commit()

    def save_to_database(self, url, title, content, links, depth, status_code, content_type, crawl_time, keywords):
        """Insert or replace the row for ``url`` in the ``pages`` table.

        ``keywords`` is stored as a JSON string. The write and commit happen
        under ``db_lock`` because worker threads share one connection.
        """
        row = (url, title, content, links, depth, status_code, content_type, crawl_time, json.dumps(keywords))
        with self.db_lock:
            cursor = self.db_conn.cursor()
            cursor.execute('''
            INSERT OR REPLACE INTO pages (url, title, content, links, depth, status_code, content_type, crawl_time, keywords)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', row)
            self.db_conn.commit()

    def crawl(self):
        """Crawl from the seed URLs until the frontier is exhausted or ``max_urls`` is hit.

        The scheduler keeps at most ``num_threads`` tasks in flight. When the
        queue is empty but tasks are still running it waits for one to finish
        instead of stopping, because running tasks are what add new links to
        the queue. Exceptions raised by a task are re-raised here.
        """
        for url in self.start_urls:
            self.queue.put((0, url, 0))  # (priority, url, depth)

        pending = set()
        with ThreadPoolExecutor(max_workers=self.num_threads) as executor:
            while True:
                # Top up the in-flight set from the priority queue.
                while len(pending) < self.num_threads and self.url_count < self.max_urls:
                    try:
                        _, url, depth = self.queue.get_nowait()
                    except queue.Empty:
                        break
                    if url in self.visited or depth > self.max_depth:
                        continue
                    pending.add(executor.submit(self.process_url, url, depth))

                # Nothing running and nothing (eligible) queued: crawl is done.
                if not pending:
                    break

                done, pending = wait(pending, return_when=FIRST_COMPLETED)
                for future in done:
                    future.result()  # This will raise any exceptions that occurred

        logging.info(f"Crawling complete. Processed {self.url_count} URLs.")

    def process_url(self, url, depth):
        """Fetch one URL, store its summary and enqueue its outgoing links.

        The URL is claimed (marked visited and counted) atomically, so a URL
        queued more than once is processed by a single worker only. Any error
        raised while fetching or parsing is logged and swallowed so one bad
        page cannot abort the crawl.
        """
        with self.lock:
            if url in self.visited or self.url_count >= self.max_urls:
                return
            self.visited.add(url)
            self.url_count += 1

        try:
            if self.respect_robots and not self.can_fetch(url):
                logging.info(f"Skipping {url} due to robots.txt restrictions")
                return

            start_time = time.time()
            response = self.session.get(url, timeout=10, headers={'User-Agent': self.user_agent})
            content_type = response.headers.get('Content-Type', '').split(';')[0]

            if 'text/html' not in content_type:
                logging.info(f"Skipping non-HTML content: {url} (Content-Type: {content_type})")
                return

            soup = BeautifulSoup(response.text, 'html.parser')
            # <title> may be missing, or have no single string child
            # (.string is None then); fall back in both cases.
            title_tag = soup.title
            if title_tag is not None and title_tag.string is not None:
                title = str(title_tag.string)
            else:
                title = "No title"
            content = soup.get_text()
            links = self.extract_links(soup, url)
            keywords = self.extract_keywords(content)

            crawl_time = time.time() - start_time

            self.save_to_database(url, title, content[:1000], len(links), depth, response.status_code, content_type, crawl_time, keywords)

            logging.info(f"Processed: {url} (Depth: {depth}, Links: {len(links)}, Status: {response.status_code})")

            with self.lock:
                for link in links:
                    self.link_graph.add_edge(url, link)
                    if link not in self.visited and depth < self.max_depth:
                        priority = self.calculate_priority(link)
                        self.queue.put((priority, link, depth + 1))

        except Exception as e:
            logging.error(f"Error processing {url}: {str(e)}")

    def extract_links(self, soup, base_url):
        """Return absolute http(s) URLs for every ``<a href>`` in ``soup``.

        Relative hrefs are resolved against ``base_url``. Fragments are
        stripped so ``page#a`` and ``page#b`` are not crawled as two pages.
        Hrefs that cannot be parsed are skipped.
        """
        links = []
        for a_tag in soup.find_all('a', href=True):
            try:
                joined = urllib.parse.urljoin(base_url, a_tag['href'])
                full_url = urllib.parse.urldefrag(joined).url
                scheme = urllib.parse.urlparse(full_url).scheme
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal); ignore it.
                continue
            if scheme in ('http', 'https'):
                links.append(full_url)
        return links

    def can_fetch(self, url):
        """Return whether robots.txt allows ``user_agent`` to fetch ``url``.

        robots.txt is downloaded once per site root and cached. A site whose
        robots.txt is missing, non-200 or unreachable is treated as allowing
        everything. Rules are evaluated by ``urllib.robotparser``, so
        user-agent groups are honoured and an empty ``Disallow:`` means
        "allow all".
        """
        parsed_url = urllib.parse.urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

        parser = self._robots_parsers.get(base_url)
        if parser is None:
            # Two workers may race to build the same entry; the duplicate
            # work is harmless and the last writer wins.
            if base_url not in self.robots_cache:
                self.robots_cache[base_url] = self._fetch_robots_txt(base_url)
            parser = urllib.robotparser.RobotFileParser()
            parser.parse(self.robots_cache[base_url].splitlines())
            self._robots_parsers[base_url] = parser

        return parser.can_fetch(self.user_agent, url)

    def _fetch_robots_txt(self, base_url):
        """Download robots.txt for ``base_url``; return "" when unavailable."""
        robots_url = urllib.parse.urljoin(base_url, "/robots.txt")
        try:
            response = self.session.get(robots_url, timeout=5, headers={'User-Agent': self.user_agent})
        except requests.RequestException as e:
            logging.warning(f"Could not fetch {robots_url}: {str(e)}")
            return ""
        return response.text if response.status_code == 200 else ""

    def extract_keywords(self, content):
        """Return the 10 most common alphanumeric, non-stop-word tokens.

        Returns:
            A list of ``(word, count)`` pairs, most frequent first.
        """
        words = word_tokenize(content.lower())
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
        return Counter(filtered_words).most_common(10)

    def calculate_priority(self, url):
        """Return the queue priority for ``url``; lower values are crawled first.

        The current policy prefers shorter URLs. Override to prioritize
        specific domains or paths.
        """
        return len(url)

    def close(self):
        """Release the HTTP session and the SQLite connection."""
        self.session.close()
        self.db_conn.close()

class ResultAnalyzer:
    """Report statistics, charts and clusters for a finished crawl.

    Args:
        db_path: Path of the SQLite database containing the ``pages`` table.
        link_graph: Directed graph of page-to-page links (networkx DiGraph).

    Every analysis step tolerates an empty database or empty graph by
    logging a warning and skipping the step instead of raising.
    """

    def __init__(self, db_path, link_graph):
        self.db_conn = sqlite3.connect(db_path)
        self.link_graph = link_graph

    def analyze(self):
        """Run every analysis step in sequence."""
        self.basic_stats()
        self.generate_visualizations()
        self.analyze_content()
        self.analyze_link_structure()
        self.cluster_pages()

    def basic_stats(self):
        """Log link, depth, status-code and timing summaries for the crawl."""
        cursor = self.db_conn.cursor()

        # AVG() returns NULL on an empty table, which cannot be formatted
        # as a float, so bail out early when there is nothing to report.
        cursor.execute("SELECT COUNT(*) FROM pages")
        if cursor.fetchone()[0] == 0:
            logging.warning("No pages in database. Skipping basic statistics.")
            return

        # Most linked pages
        cursor.execute("SELECT url, links FROM pages ORDER BY links DESC LIMIT 5")
        most_linked = cursor.fetchall()
        logging.info("Top 5 most linked pages:")
        for url, links in most_linked:
            logging.info(f"{url}: {links} links")

        # Average links per page
        cursor.execute("SELECT AVG(links) FROM pages")
        avg_links = cursor.fetchone()[0]
        if avg_links is not None:
            logging.info(f"Average links per page: {avg_links:.2f}")

        # Pages by depth
        cursor.execute("SELECT depth, COUNT(*) FROM pages GROUP BY depth")
        logging.info("Pages by depth:")
        for depth, count in cursor.fetchall():
            logging.info(f"Depth {depth}: {count} pages")

        # Status code distribution
        cursor.execute("SELECT status_code, COUNT(*) FROM pages GROUP BY status_code")
        logging.info("Status code distribution:")
        for status, count in cursor.fetchall():
            logging.info(f"Status {status}: {count} pages")

        # Average crawl time
        cursor.execute("SELECT AVG(crawl_time) FROM pages")
        avg_crawl_time = cursor.fetchone()[0]
        if avg_crawl_time is not None:
            logging.info(f"Average crawl time per page: {avg_crawl_time:.3f} seconds")

    def _save_bar_chart(self, rows, title, xlabel, filename):
        """Save ``(label, count)`` rows as a bar chart; skip when there are none."""
        if not rows:
            logging.warning(f"No data for '{title}'. Skipping {filename}.")
            return
        labels, counts = zip(*rows)
        plt.figure(figsize=(10, 5))
        plt.bar(labels, counts)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Number of Pages')
        plt.savefig(filename)
        plt.close()

    def generate_visualizations(self):
        """Write depth, status-code and link-graph charts as PNG files."""
        cursor = self.db_conn.cursor()

        # Depth distribution
        cursor.execute("SELECT depth, COUNT(*) FROM pages GROUP BY depth")
        self._save_bar_chart(cursor.fetchall(), 'Page Distribution by Depth', 'Depth', 'depth_distribution.png')

        # Status code distribution. Codes are categories, not quantities:
        # plot them as strings so 200 and 404 sit side by side.
        cursor.execute("SELECT status_code, COUNT(*) FROM pages GROUP BY status_code")
        status_rows = [(str(status), count) for status, count in cursor.fetchall()]
        self._save_bar_chart(status_rows, 'Page Distribution by Status Code', 'Status Code', 'status_distribution.png')

        # Link graph visualization
        if self.link_graph is None or self.link_graph.number_of_nodes() == 0:
            logging.warning("Link graph is empty. Skipping link graph visualization.")
            return
        plt.figure(figsize=(12, 8))
        pos = nx.spring_layout(self.link_graph)
        nx.draw(self.link_graph, pos, node_size=10, node_color='blue', with_labels=False, arrows=True)
        plt.title('Web Page Link Structure')
        plt.savefig('link_graph.png')
        plt.close()

    def analyze_content(self):
        """Log the most common words across all pages and save a word cloud."""
        cursor = self.db_conn.cursor()
        cursor.execute("SELECT content FROM pages")
        # A NULL content column is treated as empty text.
        all_content = ' '.join(row[0] or '' for row in cursor.fetchall())

        # Tokenize and remove stop words
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(all_content.lower())
        filtered_words = [word for word in words if word.isalnum() and word not in stop_words]

        # Get most common words
        word_freq = Counter(filtered_words)
        most_common = word_freq.most_common(10)

        logging.info("Most common words across all pages:")
        for word, count in most_common:
            logging.info(f"{word}: {count}")

        # WordCloud rejects an empty frequency table.
        if not word_freq:
            logging.warning("No words found. Skipping word cloud generation.")
            return

        # Generate word cloud
        try:
            from wordcloud import WordCloud
        except ImportError:
            logging.warning("WordCloud not installed. Skipping word cloud generation.")
            return
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Crawled Content')
        plt.savefig('word_cloud.png')
        plt.close()

    def analyze_link_structure(self):
        """Log the top pages by PageRank and strongly-connected-component stats."""
        if self.link_graph is None or self.link_graph.number_of_nodes() == 0:
            logging.warning("Link graph is empty. Skipping link structure analysis.")
            return

        # Calculate PageRank
        pagerank = nx.pagerank(self.link_graph)
        top_pages = sorted(pagerank.items(), key=lambda item: item[1], reverse=True)[:10]

        logging.info("Top 10 pages by PageRank:")
        for url, rank in top_pages:
            logging.info(f"{url}: {rank:.4f}")

        # Identify strongly connected components
        components = list(nx.strongly_connected_components(self.link_graph))
        logging.info(f"Number of strongly connected components: {len(components)}")
        logging.info(f"Largest component size: {len(max(components, key=len))}")

    def cluster_pages(self):
        """Group pages into at most 5 clusters by TF-IDF similarity and log them."""
        cursor = self.db_conn.cursor()
        cursor.execute("SELECT url, content FROM pages")
        pages = cursor.fetchall()

        if not pages:
            logging.warning("No pages in database. Skipping clustering.")
            return

        page_urls = [url for url, _ in pages]
        contents = [content or '' for _, content in pages]

        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        try:
            tfidf_matrix = vectorizer.fit_transform(contents)
        except ValueError as e:
            # Raised when every document is empty or contains only stop words.
            logging.warning(f"Could not vectorize page content ({e}). Skipping clustering.")
            return

        num_clusters = min(5, len(pages))
        # n_init is pinned so results do not depend on the installed
        # scikit-learn version's default.
        kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
        kmeans.fit(tfidf_matrix)

        clusters = defaultdict(list)
        for url, label in zip(page_urls, kmeans.labels_):
            clusters[label].append(url)

        logging.info("Page clusters:")
        for label, members in clusters.items():
            logging.info(f"Cluster {label}:")
            for url in members[:5]:  # Show up to 5 URLs per cluster
                logging.info(f"  {url}")
            if len(members) > 5:
                logging.info(f"  ... and {len(members) - 5} more")

    def export_results(self, filename='crawler_results.json'):
        """Dump every row of ``pages`` to ``filename`` as a JSON list of objects."""
        cursor = self.db_conn.cursor()
        cursor.execute("SELECT * FROM pages")
        columns = [description[0] for description in cursor.description]
        results = [dict(zip(columns, row)) for row in cursor.fetchall()]

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        logging.info(f"Results exported to {filename}")

    def close(self):
        """Close the SQLite connection."""
        self.db_conn.close()

def main():
    """Crawl the seed URLs, analyze the stored results and export them to JSON.

    Both SQLite connections are closed even if crawling or analysis raises.
    """
    start_time = time.time()

    start_urls = [
        "https://en.wikipedia.org/wiki/Python_(programming_language)",
        "https://www.python.org/",
        "https://docs.python.org/3/",
    ]

    crawler = WebCrawler(start_urls, max_depth=3, max_urls=100, num_threads=10, respect_robots=True)
    try:
        crawler.crawl()
    finally:
        # Release the crawler's writer connection before the analyzer opens
        # its own connection to the same database file.
        crawler.db_conn.close()

    analyzer = ResultAnalyzer('crawler_results.db', crawler.link_graph)
    try:
        analyzer.analyze()
        analyzer.export_results()
    finally:
        analyzer.close()

    end_time = time.time()
    logging.info(f"Total execution time: {end_time - start_time:.2f} seconds")

# Run the crawl-and-analyze pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()