In [45]:
import requests
from bs4 import BeautifulSoup

In [42]:
def postToCointelegraph(pageLength, offest, slug="markets", timeout=10):
    """POST the ``TagPageQuery`` GraphQL query to Cointelegraph and return the decoded JSON.

    Args:
        pageLength: Sent as the query's ``length`` variable (number of posts requested).
        offest: Sent as the query's ``offset`` variable. The misspelt parameter
            name is kept so existing keyword callers keep working.
        slug: Sent as the query's ``slug`` variable; defaults to ``"markets"``.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        The response body as parsed by ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    payload = {
        "operationName": "TagPageQuery",
        "query": "query TagPageQuery($short: String, $slug: String!, $order: String, $offset: Int!, $length: Int!) {\n  locale(short: $short) {\n    tag(slug: $slug) {\n      id\n      slug\n      avatar\n      createdAt\n      updatedAt\n      redirectRelativeUrl\n      alternates {\n        short\n        domain\n        id\n        code\n        __typename\n      }\n      tagTranslates {\n        id\n        title\n        metaTitle\n        pageTitle\n        description\n        metaDescription\n        keywords\n        __typename\n      }\n      posts(order: $order, offset: $offset, length: $length) {\n        data {\n          id\n          slug\n          views\n          postTranslate {\n            id\n            title\n            avatar\n            published\n            publishedHumanFormat\n            leadText\n            author {\n              id\n              slug\n              authorTranslates {\n                id\n                name\n                __typename\n              }\n              __typename\n            }\n            __typename\n          }\n          category {\n            id\n            slug\n            __typename\n          }\n          author {\n            id\n            slug\n            authorTranslates {\n              id\n              name\n              __typename\n            }\n            __typename\n          }\n          postBadge {\n            id\n            label\n            postBadgeTranslates {\n              id\n              title\n              __typename\n            }\n            __typename\n          }\n          showShares\n          showStats\n          __typename\n        }\n        postsCount\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}",
        "variables": {
            "cacheTimeInMS": 300000,
            "length": pageLength,
            "offset": offest,
            "order": "postPublishedTime",
            "short": "en",
            "slug": slug
        }
    }
    url = "https://conpletus.cointelegraph.com/v1/"
    headers = {
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page as JSON.
    response.raise_for_status()
    return response.json()

In [47]:
# Pagination settings: posts requested per call, and the maximum number of
# article URLs to collect in total.
pageLength = 15
limit = 100

newsBasePath = "https://cointelegraph.com/news"

newsList = []
for i in range(0, limit, pageLength):
    # Cap the final request so the total never exceeds `limit`
    # (a fixed page size of 15 would otherwise yield up to 105 URLs).
    data = postToCointelegraph(min(pageLength, limit - i), i)
    posts = data["data"]["locale"]["tag"]["posts"]["data"]
    # An empty (or null) page means there is nothing further to fetch.
    if not posts:
        break

    # Each post's slug is appended to the base path to form its article URL.
    newsList.extend(f"{newsBasePath}/{post['slug']}" for post in posts)


In [59]:
articles = []

for news in newsList:
    # Download the article page; a timeout keeps one stalled request from
    # blocking the whole crawl, and network failures skip just that URL.
    try:
        response = requests.get(news, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    except requests.RequestException as error:
        print(f"skipping {news}: {error}")
        continue
    if not response.ok:
        print(f"skipping {news}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    article = soup.find("article")

    # The title lives in <h1 class="post__title">; pages missing either the
    # <article> element or that heading are reported and skipped rather than
    # raising AttributeError on None.
    titleTag = article.find("h1", class_="post__title") if article is not None else None
    if titleTag is None:
        print(f"skipping {news}: article body or title not found")
        continue
    title = titleTag.text

    # Body text: every <p> inside the article, one paragraph per line.
    content = "\n".join(p.text for p in article.find_all("p"))

    # Hashtags: link text of each <li class="tags-list__item">; items without
    # an <a> child are ignored.
    hashtags = []
    for tag in article.find_all("li", class_="tags-list__item"):
        link = tag.find("a")
        if link is not None:
            hashtags.append(link.text)

    articles.append({
        "title": title,
        "content": content,
        "hashtags": hashtags
    })

In [63]:
# Persist the scraped articles to articles.json, pretty-printed with a 4-space indent.
import json

with open("articles.json", "w") as out:
    out.write(json.dumps(articles, indent=4))

In [71]:
# Train a Doc2Vec model on the scraped article bodies and save it to disk.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk

nltk.download("punkt_tab")

# One TaggedDocument per article: lower-cased word tokens, tagged "article_<index>"
# so a vector can be mapped back to its position in `articles`.
data = [
    TaggedDocument(words=word_tokenize(article["content"].lower()), tags=[f"article_{i}"])
    for i, article in enumerate(articles)
]

# A fixed seed plus a single worker thread makes training repeatable across
# kernel restarts; multi-threaded training orders updates non-deterministically.
# NOTE(review): gensim also recommends fixing PYTHONHASHSEED for full determinism.
model = Doc2Vec(vector_size=20, window=2, min_count=1, workers=1, epochs=40, seed=42)
model.build_vocab(data)
model.train(data, total_examples=model.corpus_count, epochs=model.epochs)

# save model
model.save("doc2vec.model")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/khachenthammasathidkul/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [75]:
# Reload the trained model from disk.
model = Doc2Vec.load("doc2vec.model")

# For every article, look up the document vectors most similar to its own and
# map each returned tag ("article_<index>") back to the article it came from.
similarArticles = []
for i, article in enumerate(articles):
    # `dv` replaces the deprecated `docvecs` alias in gensim 4.
    similar = model.dv.most_similar(f"article_{i}")
    similarArticles.append({
        "article": article,
        "similar": [articles[int(tag.split("_")[1])] for tag, _score in similar]
    })

print(json.dumps(similarArticles, indent=4))

[
    {
        "article": {
            "title": " Stablecoin market cap surpasses $200B as USDC dominance rises ",
            "content": " The stablecoin market cap reached a new record high above $200 billion. Is it a sign that Bitcoin and altcoins will rise higher? \nThe stablecoin market capitalization reached a record high following a period of consistent growth since mid-2023, according to data from Alphractal.\u00a0\nThe data analysis platform\u00a0reported\u00a0on Jan. 31 that the stablecoin market cap rose 73% from $121.18 billion in August 2023 to reach an all-time high of $211 billion, while USDC (USDC)\u00a0 has been gaining an edge over other stablecoins.\nStablecoin market capitalization. Source: Alphractal\nAnalyzing the market capitalization of other stablecoins, excluding Tether\u2019s USDT (USDT) and USDC, the data reveals relatively average growth since 2024.\u00a0This suggests that USDT and USDC remain the two most preferred stablecoins in the crypto market.\nStab

  similar = model.docvecs.most_similar(f"article_{i}")
