In [5]:
import re
import sys
import json
import traceback
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from datetime import datetime
from queue import Queue
import asyncio
import bson.json_util

import pymongo
from kipp.decorator import timer
from kipp.utils import setup_logger
from pymongo import MongoClient
from tweepy import API, OAuthHandler
import requests
from bs4 import BeautifulSoup


# Logger shared by every cell in this notebook.
logger = setup_logger("telegram")
# Shared thread pool; the upload and fetch functions in later cells submit work to it.
executor = ThreadPoolExecutor(max_workers=10)


# sys.path.append(r'/Users/laisky/repo/laisky/ramjet/ramjet/settings')
# Extend the import path first so that the `import prd` below can resolve.
sys.path.append(r"/opt/configs/ramjet")
# Bare expression: it is not the last expression of the cell, so nothing is displayed.
sys.path
import prd

# Connect with the credentials and host/port read from the `prd` settings module.
# NOTE(review): user and password are interpolated into the URI verbatim; if they can
# contain reserved characters they must be percent-escaped first — confirm against `prd`.
mongo = MongoClient(
    f"mongodb://{prd.MONGO_ADMIN_USER}:{prd.MONGO_ADMIN_PASSWD}@{prd.MONGO_HOST}:{prd.MONGO_PORT}",
)
# Collection `notes` in database `telegram`; every function below reads/writes it.
notes = mongo["telegram"]["notes"]


# add index to db
# Unique ascending index on `post_id`: at most one document per post id.
notes.create_index([("post_id", pymongo.ASCENDING)], unique=True)
notes.create_index([("content", pymongo.TEXT)]) # add text index for field `content`

'content_text'

In [None]:
from random import choice


def upload_akord(filecontent: bytes, timeout: float = 60.0) -> str:
    """Upload a payload to Akord and return the transaction id.

    Args:
        filecontent (bytes): payload sent as the request body. A ``str`` is
            also accepted and is encoded as UTF-8 before sending, because the
            callers in this notebook pass the ``str`` produced by
            ``bson.json_util.dumps``.
        timeout (float): seconds handed to ``requests.post`` as its timeout, so
            a stalled connection cannot block a worker thread forever.

    Returns:
        str: txid, read from ``["tx"]["id"]`` of the JSON response.

    Raises:
        AssertionError: if the API answers with any status other than 200.
        requests.RequestException: on connection errors or timeout.
    """
    if isinstance(filecontent, str):
        filecontent = filecontent.encode("utf-8")

    url = "https://api.akord.com/files"
    # One key is picked at random per call from the configured pool.
    apikey = choice(prd.AKORD_APIKEYs)
    resp = requests.post(
        url,
        data=filecontent,
        headers={
            "Accept": "application/json",
            "Api-Key": apikey,
            "Content-Type": "application/json",
        },
        timeout=timeout,
    )
    # Raised explicitly (not via `assert`) so the check survives `python -O`;
    # the exception type is kept for existing callers.
    if resp.status_code != 200:
        raise AssertionError(f"[{resp.status_code}]{resp.text}")

    return resp.json()["tx"]["id"]


@timer
def _upload_one_post(docu):
    """Upload one note document to Akord and record the returned txid on it.

    Args:
        docu: a document from the ``notes`` collection; its ``_id`` and
            ``post_id`` keys are read here.
    """
    # `upload_akord` is declared to take bytes, while `json_util.dumps`
    # returns str, so encode explicitly instead of relying on the HTTP
    # layer's implicit encoding.
    payload = bson.json_util.dumps(docu).encode("utf-8")
    txid = upload_akord(payload)

    # NOTE(review): this stores the txid under `akord_txid`, whereas
    # `fetch_content` writes `arweave_id` — confirm which field is intended.
    notes.update_one({"_id": docu["_id"]}, {"$set": {"akord_txid": txid}})
    logger.info(f"succeed uploaded {docu['post_id']=} to arweave")


@timer
def upload_all_posts(limit=1):
    """Submit note documents to the thread pool for upload and wait for them.

    Args:
        limit: maximum number of documents to submit. The default of ``1``
            keeps the previous behaviour, where the loop stopped after the
            first document; pass ``None`` to upload every document returned
            by ``notes.find()``.
    """
    from itertools import islice

    # for docu in notes.find({"arweave_id": {"$exists": False}}):
    fs = [
        executor.submit(_upload_one_post, docu)
        for docu in islice(notes.find(), limit)
    ]

    done, _ = wait(fs, return_when=ALL_COMPLETED)

    # `wait` never re-raises worker exceptions, so report them here;
    # otherwise a failed upload would go completely unnoticed.
    for future in done:
        exc = future.exception()
        if exc is not None:
            logger.error(f"failed to upload post: {exc!r}")


# Runs the upload as soon as this cell is executed.
upload_all_posts()

In [None]:
# post_id = 294
# url = f"https://t.me/laiskynotes/{post_id}"
# resp = requests.get(url, headers={
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
#     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
#     "Accept-Language": "en,zh-CN;q=0.9,zh-TW;q=0.8,zh;q=0.7,fr;q=0.6",
# })
# assert resp.status_code == 200

# # extract content
# soup = BeautifulSoup(resp.text, "html.parser")
# content = soup.select_one("head > meta:nth-child(8)").attrs["content"]

# # extract image
# image_urls = []
# images = soup.find_all("body .tgme_widget_message_grouped_layer .tgme_widget_message_photo_wrap")
# images

In [None]:
@timer
def fetch_content(post_id: str):
    """Fetch one channel post, store its text in ``notes`` and upload it to Akord.

    The post page is downloaded, its text is read from a ``<meta>`` tag in the
    page head, the text is upserted into the ``notes`` collection keyed by
    ``post_id``, the stored document is uploaded through ``upload_akord`` and
    the returned txid is written back to the document as ``arweave_id``.

    Args:
        post_id (str): id of the post, appended to the channel URL.
            NOTE(review): ``main`` passes integers from ``range`` — confirm
            which type the stored ``post_id`` values should have.

    Returns:
        None. Returns early (after logging) when the meta tag is missing or
        only carries the channel description instead of post text.

    Raises:
        AssertionError: if the page request does not answer with HTTP 200.
    """
    logger.info(f"fetching {post_id=}")
    url = f"https://t.me/laiskynotes/{post_id}"
    resp = requests.get(
        url,
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
            "Accept-Language": "en,zh-CN;q=0.9,zh-TW;q=0.8,zh;q=0.7,fr;q=0.6",
        },
        # Without a timeout a stalled connection would occupy a pool worker forever.
        timeout=30,
    )
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if resp.status_code != 200:
        raise AssertionError(f"[{resp.status_code}]{url}")

    # extract content
    soup = BeautifulSoup(resp.text, "html.parser")
    # Positional selector: depends on the exact order of tags in <head>.
    ele = soup.select_one("head > meta:nth-child(8)")
    if not ele:
        logger.info(f"cannot find element in {post_id=}")
        return

    content = ele.attrs["content"]
    if (
        not content.strip()
        or "记录和分享有趣的信息。 		Record and share interesting information." in content
    ):
        logger.info(f"cannot find content in {post_id=}")
        return

    # extract image
    #     image_urls = []
    #     images = soup.select("body > div > div.tgme_widget_message_bubble > div.tgme_widget_message_grouped_wrap.js-message_grouped_wrap > div > div a[style]")
    #     for img in images:
    #         image_urls.append(img.attrs["style"])

    # save to db
    # `update_one` only accepts update operators; a plain replacement document
    # raises ValueError. `created_at` is written only when the upsert inserts,
    # and `post_id` is filled in from the filter on insert.
    now = datetime.now()
    notes.update_one(
        {"post_id": post_id},
        {
            "$set": {"content": content, "updated_at": now},
            "$setOnInsert": {"created_at": now},
        },
        upsert=True,
    )

    # upload to akord
    # Re-read the stored document by its unique `post_id`; this works for both
    # a fresh insert and an update of an existing post.
    docu = notes.find_one({"post_id": post_id})
    txid = upload_akord(bson.json_util.dumps(docu).encode("utf-8"))

    # update db with txid
    # Filter on the document's own `_id`: `upserted_id` is None whenever the
    # post already existed, which would make this update match nothing.
    notes.update_one(
        {"_id": docu["_id"]},
        {"$set": {"arweave_id": txid}},
    )


@timer
def main(start=1, stop=298):
    """Fetch a range of posts concurrently and wait until all have finished.

    Args:
        start: first post id to fetch (inclusive).
        stop: post id to stop at (exclusive). The defaults reproduce the
            previously hard-coded ``range(1, 298)``.
    """
    # Keep the post id next to each future so failures can be attributed.
    futures = {
        executor.submit(fetch_content, post_id): post_id
        for post_id in range(start, stop)
    }

    wait(futures, return_when=ALL_COMPLETED)

    # `wait` never re-raises worker exceptions; log them so a failing post
    # does not disappear silently.
    for future, post_id in futures.items():
        exc = future.exception()
        if exc is not None:
            logger.error(f"failed to fetch post_id={post_id}: {exc!r}")


# Runs the full fetch as soon as this cell is executed.
main()