In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
class Article:
    """Plain record pairing an article's URL with its title."""

    article_url: str
    original_title: str

    def __init__(self, url: str, title: str) -> None:
        """Store the link and the title exactly as given.

        Args:
            url: Link target of the article.
            title: Title text of the article.
        """
        self.article_url, self.original_title = url, title

class ArticleCollection:
    """An ordered, in-memory list of Article objects with small reporting helpers."""

    articles: list[Article]

    def __init__(self):
        """Start with an empty list of articles."""
        self.articles = list()

    def add_article(self, article: Article):
        """Append ``article`` to the end of the collection."""
        self.articles.append(article)

    def info(self):
        """Print how many articles the collection currently holds."""
        total = len(self.articles)
        print(f"There are {total} titles.")

    def display_titles(self):
        """Print each article's title and URL, followed by a dashed separator."""
        rule = "-" * 15
        for entry in self.articles:
            print(f"Original Title: {entry.original_title}")
            print(f"URL: {entry.article_url}")
            print(rule, '\n')

In [3]:
class ArticleCollectionFromUrl(ArticleCollection):
    """ArticleCollection that fills itself by scraping listing pages.

    Each URL in ``urls`` is downloaded and parsed; every
    ``<li class="announcement-item">`` whose ``<h2>`` contains an ``<a href>``
    becomes one Article (title = link text, URL = href).
    """

    # Listing-page URLs to scrape. Always set per instance in __init__; no
    # class-level list is kept so instances can never share one by accident.
    urls: list[str]

    def __init__(self, urls=None):
        """Create an empty collection bound to the given listing-page URLs.

        Args:
            urls: Optional list of page URLs to scrape. When omitted, the
                instance starts with its own empty list.
        """
        # A literal ``[]`` default would be created once and shared by every
        # instance constructed without arguments.
        self.urls = [] if urls is None else urls
        super().__init__()

    def fetch_articles(self, timeout=10):
        """Download every URL in ``self.urls`` and collect the articles found.

        Pages that cannot be fetched (network error, timeout, or a non-200
        status) are reported on stdout and skipped. List items without an
        ``<h2>``, an ``<a>`` inside it, or an ``href`` on that link are skipped.

        Args:
            timeout: Seconds to wait on each HTTP request before giving up;
                without it a stalled server would block the crawl forever.

        Returns:
            The number of articles held by the collection after the run.
        """
        for url in tqdm(self.urls):
            try:
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as exc:
                # Same best-effort policy as for HTTP errors: report, move on.
                print(f"無法訪問網站：{exc}")
                continue

            if response.status_code != 200:
                print(f"無法訪問網站，狀態碼：{response.status_code}")
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            for item in soup.find_all("li", class_="announcement-item"):
                heading = item.find("h2")
                link_tag = heading.find("a") if heading is not None else None
                if link_tag is None:
                    continue
                href = link_tag.get("href")
                if href is None:
                    continue
                self.add_article(Article(url=href, title=link_tag.text.strip()))
        return len(self.articles)

In [5]:
# Number of listing pages to crawl (pages 1..PAGES inclusive).
PAGES = 20

# One listing-page URL per page number, handed to the collection up front.
Articles = ArticleCollectionFromUrl(
    urls=[f"https://www.cs.nycu.edu.tw/announcements?page={page}" for page in range(1, PAGES + 1)]
)

# Scrape every page, then report how many titles were collected.
Articles.fetch_articles()
Articles.info()

100%|██████████| 20/20 [00:01<00:00, 11.11it/s]

There are 207 titles.





In [None]:
import requests
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import numpy as np
import nltk
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Text preprocessing
def preprocess_text(text):
    """Lower-case and tokenize ``text``, dropping punctuation and stopwords.

    Args:
        text: Input string.

    Returns:
        The tokens of the lower-cased text, in order, that are alphanumeric
        (``str.isalnum``) and not in NLTK's English stopword list.
    """
    english_stopwords = set(stopwords.words('english'))
    return [
        token
        for token in word_tokenize(text.lower())
        # isalnum() removes punctuation tokens; the set lookup removes stopwords.
        if token.isalnum() and token not in english_stopwords
    ]

class ArticleSearch:
    """Rank article titles by embedding similarity to a free-text query.

    Titles are encoded once at construction time; each query is encoded on
    demand and compared with every title vector using cosine similarity.
    """

    def __init__(self, articles, model_name='paraphrase-MiniLM-L6-v2'):
        """Load the embedding model and encode every article title.

        Args:
            articles: Sequence of objects exposing an ``original_title`` attribute.
            model_name: Model name passed to ``SentenceTransformer``.
        """
        self.articles = articles
        self.model = SentenceTransformer(model_name)
        self.article_titles = [article.original_title for article in articles]
        self.article_vectors = self.model.encode(self.article_titles)

    def Kth_max(self, arr, k=1):
        """Return the indices of the ``k`` largest values in ``arr``, best first.

        Note that indices are returned, not the values or vectors themselves.

        Args:
            arr: 1-D array-like of scores.
            k: Number of indices wanted. Values larger than ``len(arr)`` return
                every index; values below zero are treated as zero.

        Returns:
            Integer index array of length ``min(k, len(arr))``.
        """
        # A negative slice bound would silently mean "all but the last |k|".
        if k is not None and k < 0:
            k = 0
        return np.argsort(-np.asarray(arr), axis=0)[:k]

    def cosine_similarity_custom(self, A, B):
        """Cosine similarity ``A.B / (|A| |B|)`` computed with NumPy.

        Args:
            A: Vector, or a matrix of row vectors (in which case the norm used
                is that of the whole matrix, as returned by ``np.linalg.norm``).
            B: Vector.

        Returns:
            The similarity, with the same shape as ``np.dot(A, B)``. When
            either input has zero norm the similarity is defined as 0 instead
            of dividing by zero and producing NaN.
        """
        dot_product = np.dot(A, B)
        denom = np.linalg.norm(A) * np.linalg.norm(B)
        if denom == 0:
            return np.zeros_like(dot_product, dtype=float)
        return dot_product / denom

    def search(self, query):
        """Score every article title against ``query``.

        Args:
            query: Free-text query string.

        Returns:
            1-D array with one cosine similarity per article, in article order
            (empty when there are no articles).
        """
        # encode() is given a one-element list, so the result has a leading
        # axis of length 1; hence the ``[0]`` on each similarity below.
        query_vector = self.model.encode([query])
        similarities = np.array([self.cosine_similarity_custom(query_vector, b)[0] for b in self.article_vectors])
        return similarities

    def get_suggestions(self, query, k=5):
        """Return the indices of the top-``k`` titles and all similarity scores.

        Returns:
            ``(suggestions, similarities)`` where ``suggestions`` indexes into
            ``self.article_titles`` (best first) and ``similarities`` holds the
            score of every article.
        """
        similarities = self.search(query)
        suggestions = self.Kth_max(similarities, k)
        return suggestions, similarities

    def print_suggestions(self, query, k=5):
        """Print the top-``k`` matches as ``index<TAB>score : title`` lines.

        The best match is repeated on a final line. When there is nothing to
        show (no articles, or ``k`` of zero) a short notice is printed instead
        of failing on the missing first element.
        """
        suggestions, similarities = self.get_suggestions(query, k)
        if len(suggestions) == 0:
            print("找不到相似的文章")
            return
        for index in suggestions:
            print(f"{index}\t{similarities[index]:.4f} : {self.article_titles[index]}")
        print(f"最相似的文章是: {self.article_titles[suggestions[0]]}")
        
# Build the searcher once: constructing it loads the embedding model and
# encodes every collected title.
article_search = ArticleSearch(Articles.articles)

In [7]:
# Example query ("scholarship"): print the 5 titles most similar to it.
query = "獎學金"
article_search.print_suggestions(query, k=5)

156	0.8431 : 財團法人福琳工商發展基金會工商學生清寒獎學金
127	0.8124 : 戴夫寇爾全國資訊安全獎學金計畫
105	0.7936 : 力積電獎學金及學/暑期實習說明會(報名至10/8 中午12:00截止)
16	0.7906 : 【趨勢科技-新鮮人擴大徵才】
42	0.7845 : 國泰儲備/海外人才實習說明
最相似的文章是: 財團法人福琳工商發展基金會工商學生清寒獎學金


In [None]:
# Sanity check: list every collected title that contains "UIUC", with its index.
for idx, i in enumerate(Articles.articles):
    if "UIUC" in i.original_title:
        print(idx, i.original_title)