In [13]:
import os
import re
import logging  
import requests  
import numpy as np
import cssutils
import zipfile
import nltk  
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup, Comment  
from urllib.parse import urljoin  
import gensim.downloader as api
from gensim.models import KeyedVectors 
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity  
 
# Configure logging
logging.basicConfig(level=logging.INFO)  
logger = logging.getLogger(__name__)  
  
def clean_html(html_content):  
    #Parsing HTML using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')  
      
    # Remove excess space and line breaks
    for tag in soup.findAll(text=True):  
        if tag.strip().replace("\n", "") == "" and tag.parent.name not in ['pre', 'textarea']:  
            tag.extract()  
      
    # Remove comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))  
    [comment.extract() for comment in comments]  
      
    # Extract and return cleaned HTML
    cleaned_html = str(soup).replace(">\n<", "><")  # Remove excess spaces between labels 
    cleaned_html = re.sub(r">\s+<", "><", cleaned_html)  # Remove excess spaces between labels
    cleaned_html = re.sub(r'\s+', ' ', cleaned_html)  # Replace all consecutive white space characters with one space   
    cleaned_html = cleaned_html.replace(' {', '{').replace(' }', '}')  # Remove spaces inside and outside of curly braces
    cleaned_html = cleaned_html.replace(': ', ':').replace('; ', ';')  # Remove spaces between attributes and values, as well as spaces after semicolons 
    cleaned_html = cleaned_html.strip()  # Remove space characters from both ends of the string
    
    return cleaned_html  

def clean_css(css_content):
    css_text = css_content
    cleaned_css = re.sub(r'>\s+<', '><', css_text)  # Remove excess spaces between labels
    cleaned_css = re.sub(r'\s+', ' ', cleaned_css)  # Replace all consecutive white space characters with one space  
    cleaned_css = cleaned_css.replace(' {', '{').replace(' }', '}')  # Remove spaces inside and outside of curly braces
    cleaned_css = cleaned_css.replace(': ', ':').replace('; ', ';')  # Remove spaces between attributes and values, as well as spaces after semicolons 
    cleaned_css = cleaned_css.strip()  # Remove space characters from both ends of the string
    return cleaned_css 
  
def fetch_and_clean_webpage(response):  
    try:          
        # Parsing HTML content
        soup = BeautifulSoup(response.text, 'html.parser')  
          
        # Extract and clean internal CSS styles (if any)
        internal_css = ''  
        for style_tag in soup.find_all('style'):  
            internal_css += style_tag.get_text()  
        cleaned_internal_css = clean_css(internal_css)  
          
        # Separate HTML and CSS (external style sheets) 
        css_links = soup.find_all('link', {'rel': 'stylesheet'})  
        css_contents = ""  
          
        # Get and clean up all external CSS file contents
        for css_link in css_links:  
            css_url = css_link.get('href')  
            if css_url:  
                full_css_url = urljoin(url, css_url)  # Build a complete CSS URL using urljoin  
                try:  
                    css_response = requests.get(full_css_url, timeout=100)  
                    css_response.raise_for_status()  
                    css_content = css_response.text  
                    cleaned_css = clean_css(css_content)  
                    css_contents += cleaned_css  
                except requests.RequestException as e:  
                    logger.error(f"Error fetching CSS from {full_css_url}: {e}")  
          
        # Clean up and return HTML content and CSS content (including internal and external styles) 
        cleaned_html = clean_html(str(soup))  
        return cleaned_html, css_contents + cleaned_internal_css  
    except requests.RequestException as e:  
        logger.error(f"Error fetching webpage from {url}: {e}")  
        raise  # Reraise the exception after logging  
    except Exception as e:  
        logger.exception(f"An unexpected error occurred while processing {url}: {e}")  
        raise  # Reraise the exception after logging  

def main(url):
    
    print("Welcome to the webpage similarity comparison program! This program needs to run the Google Word2Vec model,")
    print("If the model does not exist locally, the system will automatically download it for you. Please ensure smooth network connection!")
    print("After downloading the model, please unzip the model package and place the model file in the Jupyter root directory and run this program again!") 
    try:
        # Load Google News的Word2Vec Model 
        print("Model detected! Please be patient while the model loads...") 
        model_path = 'GoogleNews-vectors-negative300.bin'  
        model = KeyedVectors.load_word2vec_format(model_path, binary=True)  
    except:
        wv = api.load('word2vec-google-news-300')
        print("Please restart the program!")
        exit()
    
    url = ''
    # Locate the contents of the destinated webpage
    print("The model has been loaded completely！") 
    print("Please enter the first website address:")
    url = 'http://' + input().strip()
    response = requests.get(url, timeout=100)
    try:  
        cleaned_html, cleaned_css = fetch_and_clean_webpage(response)
    except Exception as e:  
        print(f"An error occurred: {e}")
    
    print("Please enter the second website address:")
    url = 'http://' + input().strip()  
    response = requests.get(url, timeout=100)

    try:  
        cleaned_html_c, cleaned_css_c = fetch_and_clean_webpage(response)
    except Exception as e:  
        print(f"An error occurred: {e}")

    words1 = word_tokenize(cleaned_html) 
    words2 = word_tokenize(cleaned_html_c)     

    # Convert vocabulary into vectors and calculate the average vector
    vector1 = np.mean([model[word] for word in words1 if word in model], axis=0)  
    vector2 = np.mean([model[word] for word in words2 if word in model], axis=0)  

    # Calculate cosine similarity 
    similarity = cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))[0][0]  
    print(f'The similarity between the two websites is: {similarity}')
    
if __name__== "__main__" :
    url = ''
    main(url)

INFO:gensim.models.keyedvectors:loading projection weights from GoogleNews-vectors-negative300.bin


欢迎使用网页相似度对比小程序！本程序需运行Google Word2Vec模型,
您本地若不存在该模型，则系统将自动为您下载，请保证网络通畅！
模型下载完毕后，请解压该模型包并将模型文件放入Jupyter根目录并重新运行本程序！
模型已检测到！请耐心等待模型加载...


INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from GoogleNews-vectors-negative300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-05-22T10:21:02.339894', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'load_word2vec_format'}


模型已加载完毕！
请输入第一个网址:
www.google.com


ERROR:__main__:Error fetching CSS from /xjs/_/ss/k=xjs.hp.eg0cCuMGRO8.L.X.O/am=AQAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAgQAAAAAAACAAAAAAIgAACAI/d=1/ed=1/rs=ACT90oFsGfnLe8OJfaH_K4qEI_DqP_66nQ/m=sb_he,d: Invalid URL '/xjs/_/ss/k=xjs.hp.eg0cCuMGRO8.L.X.O/am=AQAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAgQAAAAAAACAAAAAAIgAACAI/d=1/ed=1/rs=ACT90oFsGfnLe8OJfaH_K4qEI_DqP_66nQ/m=sb_he,d': No scheme supplied. Perhaps you meant http:///xjs/_/ss/k=xjs.hp.eg0cCuMGRO8.L.X.O/am=AQAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAgQAAAAAAACAAAAAAIgAACAI/d=1/ed=1/rs=ACT90oFsGfnLe8OJfaH_K4qEI_DqP_66nQ/m=sb_he,d?


请输入第二个网址:
www.baidu.com
两个网页的相似度为: 0.769131064414978
