In [None]:
# Entry point: crawl the actress list, then each actress' movies, and save them to the db.
# NOTE: fetch_by_actress_list is defined in a later cell, which must be run first.
fetch_by_actress_list()

In [None]:
import asyncio
import json
import logging
import re
import sys
import time
import traceback
from collections import namedtuple
from concurrent.futures import ALL_COMPLETED, ThreadPoolExecutor, wait
from datetime import datetime
from hashlib import sha256
from io import BytesIO
from queue import Queue
from typing import Generator, List, NamedTuple, Optional

import bson.json_util
import pymongo
import requests
from bs4 import BeautifulSoup
from bson import ObjectId
from kipp.decorator import timer
from kipp.utils import setup_logger
from minio import Minio
from pymongo import MongoClient
from tweepy import API, OAuthHandler

# Module-wide logger (DEBUG level) and the thread pool shared by every
# pipeline worker submitted in fetch_by_actress_list.
logger = setup_logger("jav")
logger.setLevel(logging.DEBUG)
executor = ThreadPoolExecutor(max_workers=100)


# The sys.path entry must be appended before `import prd`; presumably the
# `prd` settings module lives in that directory (verify on the host).
sys.path.append(r"/opt/configs/ramjet")
import prd

# mongodb
# NOTE(review): the credentials are interpolated into the URI verbatim; if
# they can contain reserved characters ("@", ":", "/") they presumably need
# percent-escaping — confirm against the values held in `prd`.
mongo = MongoClient(
    f"mongodb://{prd.MONGO_ADMIN_USER}:{prd.MONGO_ADMIN_PASSWD}@{prd.MONGO_HOST}:{prd.MONGO_PORT}",
)
# Collections written by save_actress / save_movie.
col_actress = mongo["jav"]["actress"]
col_movies = mongo["jav"]["movies"]


# add index to db
# notes.create_index([("post_id", pymongo.ASCENDING)], unique=True)

# minio
# Client used by replace_image_url to upload mirrored images.
s3cli: Minio = Minio(
    endpoint=prd.S3_MINIO_ADDR,
    access_key=prd.S3_KEY,
    secret_key=prd.S3_SECRET,
    secure=True,
)

In [None]:
# Base URL of the actress index listing.
actress_index_url = "https://onejav.com/actress/"
# Page number gen_actress starts from (value of the `?page=` query parameter).
ACTRESS_START_PAGE = 1


class Actress(NamedTuple):
    """One actress as returned by save_actress after it has been stored."""

    # `_id` of the stored MongoDB document
    id: ObjectId
    # name used as the lookup key in the db
    name: str
    # English name parsed from the leading text of the card title, if any
    name_english: Optional[str]
    # listing URL ending in "?page="; gen_movies appends the page number
    url: str


class Movie(NamedTuple):
    """One movie card scraped by gen_movies and persisted by save_movie."""

    # `id` of the Actress whose listing the movie was found on
    actress_id: ObjectId
    # inner HTML of the card's ".title a" element; used as the db lookup key
    name: str
    # inner HTML of the ".has-text-grey-dark" element, or None when absent
    description: Optional[str]
    # image URL as returned by replace_image_url
    img_url: str
    # inner HTML of each ".tags .tag" element
    tags: List[str]


def gen_actress(timeout: float = 30) -> Generator[Actress, None, None]:
    """Crawl the paginated actress index, save every actress and yield it.

    Pages are requested from ``ACTRESS_START_PAGE`` upwards. A page answering
    HTTP 500 is retried up to 3 times (3 seconds apart) and then skipped; any
    other non-200 status ends the generator. Cards without a title or link,
    and cards that ``save_actress`` rejects, are logged and skipped so that
    one bad card does not abort the whole crawl.

    Args:
        timeout: seconds to wait for each HTTP request before giving up.

    Yields:
        The ``Actress`` returned by ``save_actress`` for each parsed card.

    Raises:
        requests.RequestException: when a page request fails or times out.
    """
    max_retry = 3
    n_page = ACTRESS_START_PAGE
    n_retry = 0
    while True:
        url = f"{actress_index_url}?page={n_page}"
        resp = requests.get(url, timeout=timeout)
        if resp.status_code == 500:
            if n_retry < max_retry:
                time.sleep(3)
                logger.warning(f"gen_actress retry {url=}, {n_retry=}")
                n_retry += 1
                continue

            # retries exhausted: give up on this page and try the next one
            n_retry = 0
            n_page += 1
            continue

        if resp.status_code != 200:
            logger.info(f"gen_actress exit for {url=} [{resp.status_code}]{resp.text}")
            return

        # the retry budget is per page, so start afresh after a success
        n_retry = 0

        soup = BeautifulSoup(resp.text, "html.parser")
        for card in soup.select(".container .card"):
            title = card.select_one(".card-header .card-header-title")
            link = card.select_one(".card-header a")
            name = title.decode_contents().strip() if title is not None else ""
            href = link.get("href") if link is not None else None
            if not name or not href:
                # without both a name and a link the actress can neither be
                # stored nor crawled later
                logger.warning(f"gen_actress skip card without name or link {url=}")
                continue

            # kept separate from `url` so log messages still show the page URL
            actress_url = f"https://onejav.com{href}?page="

            logger.debug(f"yield actress {name}")
            try:
                actress = save_actress(name, actress_url)
            except Exception:
                logger.exception(f"save actress error {url=}")
                continue

            yield actress

        n_page += 1


# Splits card-title HTML such as
#   'Jun Suehiro <small class="text-muted ml-1">末広純</small>'
# into group 1 (the leading text) and optional group 2 (the first word inside
# the tag that follows). Written as a raw string because "\w" is an invalid
# escape sequence in a normal string literal.
reg_name = re.compile(r"([\w ]+)(?:<[^>]+>(\w+).*)?")


def save_actress(name: str, url: str, name_english: str = "") -> Actress:
    """Parse an actress card title, upsert the actress into the db and return it.

    Args:
        name: inner HTML of the card title, e.g.
            ``'Jun Suehiro <small class="text-muted ml-1">末広純</small>'``.
            The leading text becomes the English name and the text inside the
            following tag becomes the stored name; when there is no tag the
            English name is used for both.
        url: listing URL of the actress, stored as-is.
        name_english: fallback used only when no English name can be parsed
            from ``name``.

    Returns:
        The ``Actress`` carrying the ``_id`` of the upserted document.

    Raises:
        ValueError: when ``name`` does not parse into exactly one name.
        RuntimeError: when the db returns no document after the upsert.
    """
    logger.info(f"save actress {name}")

    # extract name
    # name = 'Jun Suehiro <small class="text-muted ml-1">末広純</small>'
    matches = reg_name.findall(name)
    if len(matches) != 1:
        raise ValueError(f"can not parse actress name from {name!r}")

    parsed_english, parsed_name = matches[0]
    name_english = parsed_english.strip() or name_english
    name = parsed_name.strip() or name_english
    if not name:
        raise ValueError("can not parse actress name: empty after stripping")

    docu = {"name": name, "url": url}
    if name_english:
        docu["name_english"] = name_english

    # a single atomic upsert that hands back the stored document, instead of
    # an update followed by a separate lookup
    saved = col_actress.find_one_and_update(
        {"name": name},
        {"$set": docu},
        upsert=True,
        return_document=pymongo.ReturnDocument.AFTER,
    )
    if saved is None:
        raise RuntimeError(f"can not find actress {name}")

    return Actress(name=name, url=url, id=saved["_id"], name_english=name_english)


def replace_image_url(img_url, timeout: float = 30) -> str:
    """Download an image, upload it to s3 and return the new url.

    The object key is derived from the sha256 of the image bytes, so the same
    image always maps to the same key; the upload is skipped when a HEAD
    request on the target url already answers 200.

    Args:
        img_url: url of the image to download.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        The url of the image under ``prd.S3_SERVER``.

    Raises:
        RuntimeError: when the download does not answer HTTP 200.
        requests.RequestException: when a request fails or times out.
    """
    resp = requests.get(img_url, timeout=timeout)
    # explicit check rather than `assert`, which is stripped under `python -O`
    if resp.status_code != 200:
        raise RuntimeError(f"download image failed {img_url=} [{resp.status_code}]")

    img_content = resp.content
    digest = sha256(img_content).hexdigest()
    # NOTE(review): key suffix and content type are fixed to png regardless
    # of the downloaded format — confirm this is intended.
    objkey = f"jav/{digest[:2]}/{digest[2:4]}/{digest}.png"

    # check whether image exists
    new_img_url = f"{prd.S3_SERVER}/public/{objkey}"
    if requests.head(new_img_url, timeout=timeout).status_code == 200:
        return new_img_url

    logger.info(f"upload image to s3: {objkey}")
    s3cli.put_object(
        bucket_name="public",
        object_name=objkey,
        data=BytesIO(img_content),
        length=len(img_content),
        content_type="image/png",
    )

    return new_img_url


def gen_movies(actress: Actress, timeout: float = 30) -> Generator[Movie, None, None]:
    """Crawl the paginated movie listing of one actress and yield each movie.

    Pages are requested as ``f"{actress.url}{n_page}"`` starting at page 1.
    A page answering HTTP 500 is retried up to 3 times (3 seconds apart) and
    then skipped; any other non-200 status ends the generator. Each movie
    image is re-hosted through ``replace_image_url``. Cards that cannot be
    parsed, or whose image cannot be re-hosted, are logged and skipped.

    Args:
        actress: the actress whose listing is crawled.
        timeout: seconds to wait for each listing-page request.

    Yields:
        One ``Movie`` per successfully parsed card.

    Raises:
        requests.RequestException: when a page request fails or times out.
    """
    max_retry = 3
    n_page = 1
    n_retry = 0
    while True:
        url = f"{actress.url}{n_page}"
        resp = requests.get(url, timeout=timeout)
        if resp.status_code == 500:
            if n_retry < max_retry:
                time.sleep(3)
                logger.warning(f"gen_movies retry {url=}, {n_retry=}")
                n_retry += 1
                continue

            # retries exhausted: give up on this page and try the next one
            n_retry = 0
            n_page += 1
            continue

        if resp.status_code != 200:
            logger.info(
                f"gen_movies exit since for {url=} [{resp.status_code}]{resp.text}"
            )
            return

        # the retry budget is per page, so start afresh after a success
        n_retry = 0

        soup = BeautifulSoup(resp.text, "html.parser")
        for card in soup.select(".container .card"):
            try:
                img_url = card.select_one(".image").attrs["src"]
                name = card.select_one(".title a").decode_contents().strip()

                desc_ele = card.select_one(".has-text-grey-dark")
                description = None
                if desc_ele is not None:
                    description = desc_ele.decode_contents().strip()

                tags = [
                    ele.decode_contents().strip() for ele in card.select(".tags .tag")
                ]

                movie = Movie(
                    actress_id=actress.id,
                    name=name,
                    description=description,
                    img_url=replace_image_url(img_url),
                    tags=tags,
                )
            except Exception:
                logger.exception(f"parse movie error {url=}")
                continue

            # yielded outside the try so only parsing errors are swallowed
            yield movie

        n_page += 1


def save_movie(movie: Movie):
    """Upsert one movie into the movies collection, keyed by its name."""
    logger.info(f"save movie {movie.name}")

    stored_fields = ("actress_id", "name", "description", "img_url", "tags")
    payload = {field: getattr(movie, field) for field in stored_fields}

    selector = {"name": movie.name}
    col_movies.update_one(selector, {"$set": payload}, upsert=True)

In [None]:
@timer
def _run_gen_actress(q: Queue[Optional[Actress]]):
    """Feed every actress produced by gen_actress into ``q``.

    The end-of-stream marker is not written here; the caller puts it once
    this function has returned.
    """
    enqueue = q.put
    for item in gen_actress():
        enqueue(item)


@timer
def _run_gen_movies(
    id_: str, upstream: Queue[Optional[Actress]], downstream: Queue[Optional[Movie]]
):
    """Worker: turn actresses from ``upstream`` into movies on ``downstream``.

    Runs until it reads ``None`` from ``upstream``; the marker is put back so
    the sibling workers sharing the queue stop as well. A failure while
    crawling one actress is logged and the worker moves on to the next one.
    Without that, the error would sit unnoticed in the worker's Future and
    the worker would be lost for the rest of the run.

    Args:
        id_: label used in log messages.
        upstream: queue of actresses, terminated by ``None``.
        downstream: queue receiving the movies.
    """
    while True:
        actress = upstream.get()
        if actress is None:
            # hand the end marker on to the next worker
            upstream.put(None)
            logger.info(f"_run_gen_movies[{id_}] exit")
            return

        try:
            for movie in gen_movies(actress):
                downstream.put(movie)
        except Exception:
            logger.exception(f"_run_gen_movies[{id_}] failed for {actress.name}")


@timer
def _run_save_movie(id_: str, upstream: Queue[Optional[Movie]]):
    """Worker: save every movie read from ``upstream``.

    Runs until it reads ``None`` from ``upstream``; the marker is put back so
    the sibling workers sharing the queue stop as well. A failed save is
    logged and the worker keeps going. Without that, the error would sit
    unnoticed in the worker's Future and the worker would be lost for the
    rest of the run.

    Args:
        id_: label used in log messages.
        upstream: queue of movies, terminated by ``None``.
    """
    while True:
        movie = upstream.get()
        if movie is None:
            # hand the end marker on to the next worker
            upstream.put(None)
            logger.info(f"_run_save_movie[{id_}] exit")
            return

        try:
            save_movie(movie)
        except Exception:
            logger.exception(f"_run_save_movie[{id_}] failed for {movie.name}")


@timer
def fetch_by_actress_list():
    """Fetch actresses and movies by actress list and save them to the db.

    Runs a three-stage pipeline on the shared ``executor``: one task lists
    the actresses, 20 tasks crawl their movies and 20 tasks save the movies.
    The stages are linked by two bounded queues and are shut down in order by
    putting ``None`` on a queue once the stage feeding it has finished.
    The function returns only after the save stage has finished; any
    exception a worker died with is logged at the end.
    """
    n_workers = 20
    actress_queue = Queue(maxsize=10)
    movie_queue = Queue(maxsize=10)

    fs_load_actress = [executor.submit(_run_gen_actress, actress_queue)]
    fs_load_movies = []
    fs_save_movies = []
    for i in range(n_workers):
        fs_load_movies.append(
            executor.submit(_run_gen_movies, str(i), actress_queue, movie_queue)
        )
        fs_save_movies.append(executor.submit(_run_save_movie, str(i), movie_queue))

    # close each stage only after the stage feeding it is done
    wait(fs_load_actress, return_when=ALL_COMPLETED)
    actress_queue.put(None)
    wait(fs_load_movies, return_when=ALL_COMPLETED)
    movie_queue.put(None)
    # wait for the savers (not the loaders again) so no write is still pending
    wait(fs_save_movies, return_when=ALL_COMPLETED)

    # `wait` never raises worker errors, so surface them here
    for fut in (*fs_load_actress, *fs_load_movies, *fs_save_movies):
        err = fut.exception()
        if err is not None:
            logger.error(f"fetch_by_actress_list worker failed: {err!r}", exc_info=err)