In [23]:
pip install beautifulsoup4 lxml nltk scikit-learn requests



In [37]:
from bs4 import BeautifulSoup
import requests
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
from collections import deque

# Download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
def tokenize_and_normalize(html_text):
    # Parse HTML and extract text
    soup = BeautifulSoup(html_text, 'lxml')
    text = soup.get_text()

    # Remove numbers and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase and split into tokens
    tokens = text.lower().split()

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

def stem_terms(terms):
    stemmer = PorterStemmer()
    stemmed_terms = [stemmer.stem(term.lower()) for term in terms]
    return stemmed_terms

def compute_tfidf(corpus, terms):
    vectorizer = TfidfVectorizer(vocabulary=terms, token_pattern=r'(?u)\b\w+\b')
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer

def measure_article_relevance_with_tfidf(article_html, terms, corpus, threshold=0.1):
    # Tokenize and normalize the input article
    processed_article = tokenize_and_normalize(article_html)

    # Debug: Print processed article
    print("Processed article:", processed_article)

    # Manually check if the terms are present in the processed article tokens
    article_tokens = processed_article.split()
    matched_terms = [term for term in terms if term in article_tokens]
    print("Matched terms:", matched_terms)

    # Update corpus with the processed article
    corpus.append(processed_article)

    # Compute TF-IDF
    tfidf_matrix, vectorizer = compute_tfidf(corpus, terms)

    # Extract TF-IDF scores for the last document (the input article)
    tfidf_scores = tfidf_matrix[-1].toarray()[0]

    # Debug: Print TF-IDF scores
    print("TF-IDF scores:", tfidf_scores)

    # Calculate the relevance score as the sum of TF-IDF scores
    relevance_score = np.sum(tfidf_scores)

    # Debug: Print relevance score
    print("Relevance score:", relevance_score)

    # Determine if the article is relevant
    return relevance_score >= threshold

In [29]:
# Example usage
biological_and_taxonomic_terms = [
    "Crocodile",
    "Crocodylidae",
    "Crocodylus",
    "Alligator",
    "Caiman",
    "Gharial",
    "Crocodilian"
]

# Stem the terms
stemmed_terms = stem_terms(biological_and_taxonomic_terms)

# Debug: Print stemmed terms
print("Stemmed terms:", stemmed_terms)

article_html = """
<html>
    <head><title>Crocodile</title></head>
    <body>
        <p>Crocodiles are large <b>aquatic</b> reptiles that live throughout the tropics in Africa, Asia, the Americas, and Australia.</p>
        <p>The family <i>Crocodylidae</i> includes the true crocodiles and their closest relatives, such as Alligators and Gharials.</p>
        <p>There are 14 extant species of crocodiles.</p>
    </body>
</html>
"""

# Corpus of previously processed articles (can be empty initially)
corpus = []

# Measure relevance with TF-IDF
is_relevant = measure_article_relevance_with_tfidf(article_html, stemmed_terms, corpus)
print("Is the article relevant?", is_relevant)

Stemmed terms: ['crocodil', 'crocodylida', 'crocodylu', 'allig', 'caiman', 'gharial', 'crocodilian']
Processed article: crocodil crocodil larg aquat reptil live throughout tropic africa asia america australia famili crocodylida includ true crocodil closest rel allig gharial extant speci crocodil
Matched terms: ['crocodil', 'crocodylida', 'allig', 'gharial']
TF-IDF scores: [0.91766294 0.22941573 0.         0.22941573 0.         0.22941573
 0.        ]
Relevance score: 1.6059101370939324
Is the article relevant? True


In [34]:
# Get all valid Wikipedia article links from a page
def get_links_from_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith('/wiki/') and ':' not in a['href']]
    full_links = [requests.compat.urljoin(url, link) for link in links]
    return full_links

In [35]:
initial_url = 'https://en.wikipedia.org/wiki/Crocodile'
initial_links = get_links_from_page(initial_url)
initial_links

['https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Crocodile',
 'https://en.wikipedia.org/wiki/Crocodile',
 'https://en.wikipedia.org/wiki/Crocodile',
 'https://en.wikipedia.org/wiki/Crocodile_(disambiguation)',
 'https://en.wikipedia.org/wiki/Eocene',
 'https://en.wikipedia.org/wiki/Holocene',
 'https://en.wikipedia.org/wiki/Megaannum',
 'https://en.wikipedia.org/wiki/Precambrian',
 'https://en.wikipedia.org/wiki/Cambrian',
 'https://en.wikipedia.org/wiki/Ordovician',
 'https://en.wikipedia.org/wiki/Silurian',
 'https://en.wikipedia.org/wiki/Devonian',
 'https://en.wikipedia.org/wiki/Carboniferous',
 'https://en.wikipedia.org/wiki/Permian',
 'https://en.wikipedia.org/wiki/Triassic',
 'https://en.wikipedia.org/wiki/Jurassic',
 'https://en.wikipedia.org/wiki/Cretaceous',
 'https://en.wikipedia.org/wiki/Paleogene',
 'https://en.wikipedia.org/wiki/Neogene',
 'https://en.wikipedia.org/wiki/Nile_crocodile',
 'https://en.wi

In [39]:
# Wikipedia crawling function
def crawl_wikipedia(start_url, terms, limit=10):
    relevant_links = []
    queue = deque([start_url])
    visited = set([start_url])
    corpus = []

    while queue and len(relevant_links) < limit:
        current_url = queue.popleft()
        response = requests.get(current_url)
        content = response.content

        if measure_article_relevance_with_tfidf(content, terms, corpus):
            if current_url not in relevant_links:
                relevant_links.append(current_url)
            new_links = get_links_from_page(current_url)
            for link in new_links:
                if link not in visited:
                    visited.add(link)
                    queue.append(link)

    return relevant_links

In [40]:
# Example usage
biological_and_taxonomic_terms = [
    "Crocodile",
    "Crocodylidae",
    "Crocodylus",
    "Alligator",
    "Caiman",
    "Gharial",
    "Crocodilian"
]

# Stem the terms
stemmed_terms = stem_terms(biological_and_taxonomic_terms)

# Start crawling
relevant_links = crawl_wikipedia(initial_url, stemmed_terms, limit=10)
print("Relevant links:", relevant_links)

Processed article: crocodil wikipedia jump content main menu main menu move sidebar hide navig main pagecontentscurr eventsrandom articleabout wikipediacontact usdon contribut helplearn editcommun portalrec changesupload file search search creat account log person tool creat account log page log editor learn contributionstalk content move sidebar hide top etymolog taxonomi phylogeni toggl taxonomi phylogeni subsect phylogeni speci characterist toggl characterist subsect size teeth biolog behaviour toggl biolog behaviour subsect sens vision olfact hear touch hunt diet bite locomot longev social behaviour vocal reproduct cognit relationship human toggl relationship human subsect danger human crocodil product crocodil hunt conserv religion mytholog languag symbol fashion logo see also refer read extern link toggl tabl content crocodil languag achafrikaansngliscaragonsazrbaycancabasa balibanjar bnlmgbrezhonegcatalcebuanoetinachishonacymraegdanskdeutschdin bizaadespaolesperantoeuskarafranai