In [11]:
import requests
from bs4 import BeautifulSoup
from collections import deque
from urllib.parse import urljoin
from elasticsearch import Elasticsearch
from datetime import datetime
# real-----------------------------------------

class Spider:
    """Breadth-first crawler that stores pages and their link graph in Elasticsearch.

    Starting from ``start_url`` the spider visits at most ``max_pages`` URLs.
    Every successfully fetched page is written to the ``web_pages`` index, and
    every (parent, child) hyperlink is written to both the
    ``web_page_structure`` and ``reverse_web_page_structure`` indices.

    Note: constructing a Spider deletes and recreates all three indices.
    """

    # Seconds to wait for a server before giving up on a page; without a
    # timeout a single unresponsive host would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, start_url, max_pages):
        """Set up the crawl state and (re)create the Elasticsearch indices.

        Args:
            start_url: URL the crawl starts from.
            max_pages: Maximum number of URLs to visit (failed fetches count).
        """
        self.start_url = start_url
        self.max_pages = max_pages
        self.visited_urls = set()  # URLs that have already been dequeued and processed
        self.url_map = {}  # url -> page_id
        self.to_visit_urls = deque([start_url])
        self.page_count = 0
        self.page_id_counter = 1  # next unused page_id
        # Elasticsearch client
        self.es = Elasticsearch("http://localhost:9200")
        # Index holding the page documents
        self.create_index('web_pages', {
            "mappings": {
                "properties": {
                    "page_id": {"type": "integer"},
                    "url": {"type": "keyword"},
                    "title": {"type": "text"},
                    "content": {"type": "text"},
                    "last_modify_time": {"type": "date"},
                    "size": {"type": "integer"}
                }
            }
        })
        # Index holding the link structure (parent -> child)
        self.create_index('web_page_structure', {
            "mappings": {
                "properties": {
                    "parent_page_id": {"type": "integer"},
                    "child_page_id": {"type": "integer"}
                }
            }
        })
        # Index holding the reverse link structure (child -> parent)
        self.create_index('reverse_web_page_structure', {
            "mappings": {
                "properties": {
                    "child_page_id": {"type": "integer"},
                    "parent_page_id": {"type": "integer"}
                }
            }
        })

    def create_index(self, index_name, mapping):
        """Create ``index_name`` with ``mapping``, dropping any existing index first.

        Args:
            index_name: Name of the Elasticsearch index.
            mapping: Request body passed to the index-creation call.
        """
        # Start from a clean slate: remove the index if it is already there.
        if self.es.indices.exists(index=index_name):
            self.es.indices.delete(index=index_name)
            print(f"索引 {index_name} 已删除")
        self.es.indices.create(index=index_name, body=mapping)
        print(f"索引 {index_name} 创建成功")

    @staticmethod
    def _parse_last_modified(header_value):
        """Return the ``Last-Modified`` header as a datetime, or "now" as a fallback.

        The current local time is used when the header is missing or does not
        match the expected HTTP date layout, so one malformed header cannot
        abort the whole crawl.
        """
        if not header_value:
            return datetime.now()
        try:
            return datetime.strptime(header_value, '%a, %d %b %Y %H:%M:%S %Z')
        except ValueError:
            return datetime.now()

    @staticmethod
    def _page_size(headers, body):
        """Return the page size in bytes.

        Prefers a valid, non-negative ``Content-Length`` header and falls back
        to the length of the downloaded ``body``.
        """
        try:
            declared = int(headers.get('Content-Length', ''))
        except (TypeError, ValueError):
            return len(body)
        return declared if declared >= 0 else len(body)

    def _page_id_for(self, url):
        """Return the page_id of ``url``, allocating a fresh one on first sight."""
        page_id = self.url_map.get(url)
        if page_id is None:
            page_id = self.page_id_counter
            self.page_id_counter += 1
            self.url_map[url] = page_id
        return page_id

    def fetch_page_links(self, url):
        """Download ``url``, index the page, and return its outgoing links.

        Args:
            url: Absolute URL of the page to fetch.

        Returns:
            ``(page_id, links)`` where ``links`` are the raw ``href`` values of
            the page's anchors, or ``(None, [])`` when the request fails or the
            response status is not 200.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return None, []
        if response.status_code != 200:
            return None, []

        last_modify_time = self._parse_last_modified(
            response.headers.get('Last-Modified'))
        soup = BeautifulSoup(response.text, 'html.parser')
        # <title> may be absent, or contain nested markup (``.string`` is None).
        title = (soup.title.string if soup.title else '') or ''
        # html.parser does not invent a <body>; fall back to the whole document.
        text_root = soup.body if soup.body is not None else soup
        content = text_root.get_text().replace('\n', '').replace('\r', '')
        size = self._page_size(response.headers, response.content)

        # Reuse the id handed out when the URL was first seen as a link, if any.
        page_id = self._page_id_for(url)
        self.store_page_info(page_id, url, title, content, last_modify_time, size)

        links = [anchor.get('href') for anchor in soup.find_all('a', href=True)]
        return page_id, links

    def store_page_info(self, page_id, url, title, content, last_modify_time, size):
        """Index one page document into ``web_pages`` using ``page_id`` as its id."""
        doc = {
            "page_id": page_id,
            "url": url,
            "title": title,
            "content": content,
            "last_modify_time": last_modify_time,
            "size": size
        }
        self.es.index(index='web_pages', id=page_id, body=doc)

    def store_page_structure(self, parent_page_id, child_page_id):
        """Record a parent -> child link in both the forward and reverse indices."""
        doc = {
            "parent_page_id": parent_page_id,
            "child_page_id": child_page_id
        }
        self.es.index(index='web_page_structure', body=doc)
        # Mirror the edge into the reverse (child -> parent) index.
        reverse_doc = {
            "child_page_id": child_page_id,
            "parent_page_id": parent_page_id
        }
        self.es.index(index='reverse_web_page_structure', body=reverse_doc)

    def crawl(self):
        """Run the breadth-first crawl until the queue is empty or the limit is hit.

        Each dequeued URL counts toward ``max_pages`` whether or not the fetch
        succeeds. Every distinct child of a fetched page is stored once as a
        (parent, child) edge; children seen for the first time are queued.
        """
        while self.to_visit_urls and self.page_count < self.max_pages:
            url = self.to_visit_urls.popleft()
            if url in self.visited_urls:
                continue
            print(f"Crawling: {url}")
            self.visited_urls.add(url)
            self.page_count += 1
            parent_page_id, links = self.fetch_page_links(url)
            if parent_page_id is None:
                continue

            linked_ids = set()  # children already stored for this parent
            for link in links:
                new_full_url = urljoin(url, link)
                first_sighting = new_full_url not in self.url_map
                child_page_id = self._page_id_for(new_full_url)
                # A URL that already has an id is either visited or still in
                # the queue, so only first sightings need to be enqueued.
                if first_sighting and new_full_url not in self.visited_urls:
                    self.to_visit_urls.append(new_full_url)
                # A page may link to the same target several times; keep one edge.
                if child_page_id in linked_ids:
                    continue
                linked_ids.add(child_page_id)
                self.store_page_structure(parent_page_id, child_page_id)

    def _print_index(self, index_name):
        """Print the ``_source`` of up to 1000 documents from ``index_name``."""
        query = {
            "query": {
                "match_all": {}
            },
            "size": 1000
        }
        result = self.es.search(index=index_name, body=query)
        for hit in result['hits']['hits']:
            print(hit['_source'])

    def view_web_pages_data(self):
        """Print the documents stored in the ``web_pages`` index."""
        self._print_index('web_pages')

    def view_web_page_structure_data(self):
        """Print the documents stored in the ``web_page_structure`` index."""
        self._print_index('web_page_structure')

    def view_reverse_web_page_structure_data(self):
        """Print the documents stored in the ``reverse_web_page_structure`` index."""
        self._print_index('reverse_web_page_structure')


if __name__ == "__main__":
    # Crawl configuration: seed page and the maximum number of URLs to visit.
    start_url, max_pages = (
        'https://www.cse.ust.hk/~kwtleung/COMP4321/testpage.htm',
        10,
    )
    # Keep the spider in a module-level name so later cells can inspect it.
    spider = Spider(start_url, max_pages)
    spider.crawl()

索引 web_pages 已删除
索引 web_pages 创建成功
索引 web_page_structure 已删除
索引 web_page_structure 创建成功


  self.es.indices.create(index=index_name, body=mapping)


索引 reverse_web_page_structure 已删除
索引 reverse_web_page_structure 创建成功
Crawling: https://www.cse.ust.hk/~kwtleung/COMP4321/testpage.htm


  self.es.index(index='web_pages', id=page_id, body=doc)
  self.es.index(index='web_page_structure', body=doc)
  self.es.index(index='reverse_web_page_structure', body=reverse_doc)


Crawling: https://www.cse.ust.hk/~kwtleung/COMP4321/ust_cse.htm
Crawling: https://www.cse.ust.hk/~kwtleung/COMP4321/news.htm
Crawling: https://www.cse.ust.hk/~kwtleung/COMP4321/books.htm
Crawling: https://www.cse.ust.hk/~kwtleung/COMP4321/Movie.htm
Crawling: https://www.cse.ust.hk/~kwtleung/COMP4321/ust_cse/PG.htm
Crawling: https://www.cse.ust.hk/~kwtleung/COMP4321/ust_cse/UG.htm
Crawling: https://www.cse.ust.hk/~kwtleung/COMP4321/news/bbc.htm
Crawling: https://www.cse.ust.hk/~kwtleung/COMP4321/news/cnn.htm
Crawling: https://www.cse.ust.hk/~kwtleung/COMP4321/books/book1.htm


In [12]:
spider.page_count

10

In [13]:
spider.view_web_pages_data()

{'page_id': 1, 'url': 'https://www.cse.ust.hk/~kwtleung/COMP4321/testpage.htm', 'title': 'Test page', 'content': 'This is the Test page for a crawlerBefore getting the Admission ofCSE department of HKUST, You should read through these international news and thesebooks.Here is my Movie List (New)', 'last_modify_time': '2023-05-16T05:03:16', 'size': 603}
{'page_id': 2, 'url': 'https://www.cse.ust.hk/~kwtleung/COMP4321/ust_cse.htm', 'title': 'CSE department of HKUST', 'content': 'CSE department of HKUSTPG AdmissionUG AdmissionBack to main', 'last_modify_time': '2023-05-16T05:03:16', 'size': 392}
{'page_id': 3, 'url': 'https://www.cse.ust.hk/~kwtleung/COMP4321/news.htm', 'title': 'News', 'content': 'NewsBBC newsCNN newsBack to main', 'last_modify_time': '2023-05-16T05:03:16', 'size': 384}
{'page_id': 4, 'url': 'https://www.cse.ust.hk/~kwtleung/COMP4321/books.htm', 'title': 'books', 'content': 'Books and articles to readAutomated Keyword Classification for Information RetrievalGeneration an

  result = self.es.search(index='web_pages', body=query)


In [14]:
spider.view_web_page_structure_data()

{'parent_page_id': 1, 'child_page_id': 2}
{'parent_page_id': 1, 'child_page_id': 3}
{'parent_page_id': 1, 'child_page_id': 4}
{'parent_page_id': 1, 'child_page_id': 5}
{'parent_page_id': 2, 'child_page_id': 6}
{'parent_page_id': 2, 'child_page_id': 7}
{'parent_page_id': 2, 'child_page_id': 1}
{'parent_page_id': 3, 'child_page_id': 8}
{'parent_page_id': 3, 'child_page_id': 9}
{'parent_page_id': 3, 'child_page_id': 1}
{'parent_page_id': 4, 'child_page_id': 10}
{'parent_page_id': 4, 'child_page_id': 11}
{'parent_page_id': 4, 'child_page_id': 12}
{'parent_page_id': 4, 'child_page_id': 1}
{'parent_page_id': 5, 'child_page_id': 1}
{'parent_page_id': 5, 'child_page_id': 13}
{'parent_page_id': 5, 'child_page_id': 14}
{'parent_page_id': 5, 'child_page_id': 15}
{'parent_page_id': 5, 'child_page_id': 16}
{'parent_page_id': 5, 'child_page_id': 17}
{'parent_page_id': 5, 'child_page_id': 18}
{'parent_page_id': 5, 'child_page_id': 19}
{'parent_page_id': 5, 'child_page_id': 20}
{'parent_page_id': 5, '

  result = self.es.search(index='web_page_structure', body=query)


In [15]:
spider.view_reverse_web_page_structure_data()

{'child_page_id': 2, 'parent_page_id': 1}
{'child_page_id': 3, 'parent_page_id': 1}
{'child_page_id': 4, 'parent_page_id': 1}
{'child_page_id': 5, 'parent_page_id': 1}
{'child_page_id': 6, 'parent_page_id': 2}
{'child_page_id': 7, 'parent_page_id': 2}
{'child_page_id': 1, 'parent_page_id': 2}
{'child_page_id': 8, 'parent_page_id': 3}
{'child_page_id': 9, 'parent_page_id': 3}
{'child_page_id': 1, 'parent_page_id': 3}
{'child_page_id': 10, 'parent_page_id': 4}
{'child_page_id': 11, 'parent_page_id': 4}
{'child_page_id': 12, 'parent_page_id': 4}
{'child_page_id': 1, 'parent_page_id': 4}
{'child_page_id': 1, 'parent_page_id': 5}
{'child_page_id': 13, 'parent_page_id': 5}
{'child_page_id': 14, 'parent_page_id': 5}
{'child_page_id': 15, 'parent_page_id': 5}
{'child_page_id': 16, 'parent_page_id': 5}
{'child_page_id': 17, 'parent_page_id': 5}
{'child_page_id': 18, 'parent_page_id': 5}
{'child_page_id': 19, 'parent_page_id': 5}
{'child_page_id': 20, 'parent_page_id': 5}
{'child_page_id': 21, '

  result = self.es.search(index='reverse_web_page_structure', body=query)


#### env
Kibana 7.3.1; Elasticsearch 7.3.1; Python 3.9.1

In [4]:
!pip list --format=freeze > requirements.txt

In [2]:
! pip list

Package            Version
------------------ -----------
aiohappyeyeballs   2.4.4
aiohttp            3.8.4
aiosignal          1.3.1
APScheduler        3.2.0
asttokens          3.0.0
async-timeout      4.0.2
attrs              24.3.0
backcall           0.2.0
beautifulsoup4     4.12.2
blinker            1.6.2
Bottleneck         1.4.2
Brotli             1.0.9
bs4                0.0.1
cchardet           2.1.7
certifi            2023.5.7
cffi               1.15.1
chardet            3.0.4
charset-normalizer 3.1.0
click              8.0.1
colorama           0.4.6
comm               0.2.2
cryptography       41.0.3
cssselect          1.2.0
debugpy            1.8.14
decorator          5.2.1
elastic-transport  7.16.0
elasticsearch      7.17.9
environs           9.5.0
executing          2.1.0
fake-headers       1.0.2
Flask              2.3.2
Flask-SQLAlchemy   3.0.5
frozenlist         1.3.3
gevent             21.12.0
greenlet           1.1.3.post0
gunicorn           19.9.0
html5lib           1.1
