In [13]:
import os
import re
import logging  
import requests  
import numpy as np
import cssutils
import zipfile
import nltk  
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup, Comment  
from urllib.parse import urljoin  
import gensim.downloader as api
from gensim.models import KeyedVectors 
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity  
 
# 忽略所有警告
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'

# 配置日志记录  
logging.basicConfig(level=logging.INFO)  
logger = logging.getLogger(__name__)  
  
def clean_html(html_content):  
    # 使用BeautifulSoup解析HTML  
    soup = BeautifulSoup(html_content, 'html.parser')  
      
    # 移除多余的空白和换行  
    for tag in soup.findAll(text=True):  
        if tag.strip().replace("\n", "") == "" and tag.parent.name not in ['pre', 'textarea']:  
            tag.extract()  
      
    # 移除注释  
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))  
    [comment.extract() for comment in comments]  
      
    # 提取并返回清理后的HTML  
    cleaned_html = str(soup).replace(">\n<", "><")  # 移除标签间的换行  
    cleaned_html = re.sub(r">\s+<", "><", cleaned_html)  # 移除标签间多余的空白字符
    cleaned_html = re.sub(r'\s+', ' ', cleaned_html)  # 将所有连续的空白字符替换为一个空格  
    cleaned_html = cleaned_html.replace(' {', '{').replace(' }', '}')  # 去除大括号内外的空格  
    cleaned_html = cleaned_html.replace(': ', ':').replace('; ', ';')  # 去除属性和值之间以及分号后的空格  
    cleaned_html = cleaned_html.strip()  # 去除字符串两端的空白字符
    
    return cleaned_html  

def clean_css(css_content):
    css_text = css_content
    cleaned_css = re.sub(r'>\s+<', '><', css_text)  # 去除标签之间的多余空格  
    cleaned_css = re.sub(r'\s+', ' ', cleaned_css)  # 将所有连续的空白字符替换为一个空格  
    cleaned_css = cleaned_css.replace(' {', '{').replace(' }', '}')  # 去除大括号内外的空格  
    cleaned_css = cleaned_css.replace(': ', ':').replace('; ', ';')  # 去除属性和值之间以及分号后的空格  
    cleaned_css = cleaned_css.strip()  # 去除字符串两端的空白字符  
    return cleaned_css 
  
def fetch_and_clean_webpage(response):  
    try:          
        # 解析HTML内容  
        soup = BeautifulSoup(response.text, 'html.parser')  
          
        # 提取并清理内部CSS样式（如果有）  
        internal_css = ''  
        for style_tag in soup.find_all('style'):  
            internal_css += style_tag.get_text()  
        cleaned_internal_css = clean_css(internal_css)  
          
        # 分离HTML和CSS（外部样式表）  
        css_links = soup.find_all('link', {'rel': 'stylesheet'})  
        css_contents = ""  
          
        # 获取并清理所有外部CSS文件内容  
        for css_link in css_links:  
            css_url = css_link.get('href')  
            if css_url:  
                full_css_url = urljoin(url, css_url)  # 使用urljoin构建完整的CSS URL  
                try:  
                    css_response = requests.get(full_css_url, timeout=100)  
                    css_response.raise_for_status()  
                    css_content = css_response.text  
                    cleaned_css = clean_css(css_content)  
                    css_contents += cleaned_css  
                except requests.RequestException as e:  
                    logger.error(f"Error fetching CSS from {full_css_url}: {e}")  
          
        # 清理并返回HTML内容以及CSS内容（包括内部和外部样式）  
        cleaned_html = clean_html(str(soup))  
        return cleaned_html, css_contents + cleaned_internal_css  
    except requests.RequestException as e:  
        logger.error(f"Error fetching webpage from {url}: {e}")  
        raise  # Reraise the exception after logging  
    except Exception as e:  
        logger.exception(f"An unexpected error occurred while processing {url}: {e}")  
        raise  # Reraise the exception after logging  

def main(url):
    
    # 若本地缺乏该模型，则需激活该行语句进行下载
    print("欢迎使用网页相似度对比小程序！本程序需运行Google Word2Vec模型,")
    print("您本地若不存在该模型，则系统将自动为您下载，请保证网络通畅！")
    print("模型下载完毕后，请解压该模型包并将模型文件放入Jupyter根目录并重新运行本程序！") 
    try:
        # 加载Google News的Word2Vec模型 
        print("模型已检测到！请耐心等待模型加载...") 
        model_path = 'GoogleNews-vectors-negative300.bin'  
        model = KeyedVectors.load_word2vec_format(model_path, binary=True)  
    except:
        wv = api.load('word2vec-google-news-300')
        print("Please restart the program!")
        exit()
    
    url = ''
    # 获取并清理指定网页的内容 
    print("模型已加载完毕！") 
    print("请输入第一个网址:")
    url = 'http://' + input().strip()
    response = requests.get(url, timeout=100)
    try:  
        cleaned_html, cleaned_css = fetch_and_clean_webpage(response)
    except Exception as e:  
        print(f"An error occurred: {e}")
    
    print("请输入第二个网址:")
    url = 'http://' + input().strip()  
    response = requests.get(url, timeout=100)

    try:  
        cleaned_html_c, cleaned_css_c = fetch_and_clean_webpage(response)
    except Exception as e:  
        print(f"An error occurred: {e}")

    words1 = word_tokenize(cleaned_html) 
    words2 = word_tokenize(cleaned_html_c)     

    # 将词汇转换为向量并计算平均向量  
    vector1 = np.mean([model[word] for word in words1 if word in model], axis=0)  
    vector2 = np.mean([model[word] for word in words2 if word in model], axis=0)  

    # 计算余弦相似度  
    similarity = cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))[0][0]  
    print(f'两个网页的相似度为: {similarity}')
    
if __name__== "__main__" :
    url = ''
    main(url)

INFO:gensim.models.keyedvectors:loading projection weights from GoogleNews-vectors-negative300.bin


欢迎使用网页相似度对比小程序！本程序需运行Google Word2Vec模型,
您本地若不存在该模型，则系统将自动为您下载，请保证网络通畅！
模型下载完毕后，请解压该模型包并将模型文件放入Jupyter根目录并重新运行本程序！
模型已检测到！请耐心等待模型加载...


INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from GoogleNews-vectors-negative300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-05-22T10:21:02.339894', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'load_word2vec_format'}


模型已加载完毕！
请输入第一个网址:
www.google.com


ERROR:__main__:Error fetching CSS from /xjs/_/ss/k=xjs.hp.eg0cCuMGRO8.L.X.O/am=AQAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAgQAAAAAAACAAAAAAIgAACAI/d=1/ed=1/rs=ACT90oFsGfnLe8OJfaH_K4qEI_DqP_66nQ/m=sb_he,d: Invalid URL '/xjs/_/ss/k=xjs.hp.eg0cCuMGRO8.L.X.O/am=AQAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAgQAAAAAAACAAAAAAIgAACAI/d=1/ed=1/rs=ACT90oFsGfnLe8OJfaH_K4qEI_DqP_66nQ/m=sb_he,d': No scheme supplied. Perhaps you meant http:///xjs/_/ss/k=xjs.hp.eg0cCuMGRO8.L.X.O/am=AQAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAgQAAAAAAACAAAAAAIgAACAI/d=1/ed=1/rs=ACT90oFsGfnLe8OJfaH_K4qEI_DqP_66nQ/m=sb_he,d?


请输入第二个网址:
www.baidu.com
两个网页的相似度为: 0.769131064414978
