In [None]:
import re
import sys
import traceback
from concurrent.futures import ThreadPoolExecutor, wait
from datetime import datetime
from queue import Queue
from typing import Any, Dict, Generator, Optional

import pymongo
import tweepy
from kipp.decorator import debug_wrapper
from pymongo import MongoClient
from tweepy import API, OAuthHandler

# Shared thread pool; run_delete() submits its producer and worker tasks to it.
executor = ThreadPoolExecutor(max_workers=10)


# sys.path.append(r'/Users/laisky/repo/laisky/ramjet/ramjet/settings')
# Make the deployment settings module importable as `prd`.
sys.path.append(r"/opt/configs/ramjet")
# Bare expression: only the final expression of a cell is displayed, so this has no effect here.
sys.path
import prd

# MongoDB client built from the `prd` settings; `tweets` is the collection
# every cell below reads from and writes to.
mongo = MongoClient(
    f"mongodb://{prd.MONGO_USER}:{prd.MONGO_PASSWD}@{prd.MONGO_HOST}:{prd.MONGO_PORT}/{prd.MONGO_DB}",
)
tweets = mongo["twitter"]["tweets"]

# Twitter API client. The JSON parser is selected explicitly; code below indexes
# responses like plain dicts. wait_on_rate_limit=True is passed so the client
# waits instead of failing when rate limited.
auth = OAuthHandler(prd.CONSUMER_KEY, prd.CONSUMER_SECRET)
auth.set_access_token(prd.ACCESS_TOKEN, prd.ACCESS_TOKEN_SECRET)
api = API(auth, wait_on_rate_limit=True, parser=tweepy.parsers.JSONParser())
# NOTE(review): presumably a credentials sanity check -- confirm.
api.me()


In [None]:
# Exploratory call: fetch one status by ID with tweet_mode='extended'.
api.get_status(1350109300346281984, tweet_mode='extended')

In [None]:
# Exploratory call: fetch the default user timeline with tweet_mode='extended'.
api.user_timeline(tweet_mode='extended')

In [None]:
# Indexes on the tweets collection.

# tweets.create_index([('created_at', pymongo.DESCENDING)])
# tweets.create_index([('id', pymongo.DESCENDING)])
# Descending index on the nested `user.id` field.
tweets.create_index([('user.id', pymongo.DESCENDING)])

In [None]:
# delete tweets

# @debug_wrapper
def is_status_tobe_delete(tweet, cutoff=datetime(2021, 1, 1)):
    """Decide whether a stored tweet should be deleted.

    A tweet is kept (returns False) when any of the following holds:

    * it carries at least one hashtag,
    * it was created after ``cutoff``,
    * it is itself a reply,
    * another stored tweet replies to it.

    Args:
        tweet: tweet document; must contain ``id`` and ``created_at``
            (a ``datetime``), may contain ``entities`` and
            ``in_reply_to_status_id``.
        cutoff: tweets created after this moment are kept. Defaults to
            2021-01-01, the value that was previously hard-coded.

    Returns:
        True when the tweet should be deleted, False otherwise.
    """
    # `entities` / `hashtags` may be missing or explicitly None.
    hashtags = (tweet.get("entities") or {}).get("hashtags") or []
    if hashtags:
        return False

    if tweet["created_at"] > cutoff:
        return False

    if tweet.get("in_reply_to_status_id") is not None:
        return False

    # Only existence matters, so let the server stop at the first match.
    has_reply = (
        tweets.count_documents({"in_reply_to_status_id": tweet["id"]}, limit=1) > 0
    )
    return not has_reply


# Sentinel the producer enqueues after the last tweet so that workers can exit
# instead of blocking on q.get() forever.
_DELETE_QUEUE_DONE = None


@debug_wrapper
def run_delete():
    """Run the delete pipeline: one producer feeding ten delete workers.

    Blocks until the producer and every worker have finished, then reports
    any task that ended with an exception.
    """
    delete_q = Queue(maxsize=50)
    fs = [executor.submit(gen_tweet, delete_q)]
    fs.extend(executor.submit(delete_tweet, delete_q) for _ in range(10))

    wait(fs)

    # wait() does not surface task failures; report them explicitly.
    for fut in fs:
        if fut.cancelled():
            continue
        err = fut.exception()
        if err is not None:
            print(f"delete pipeline task failed: {err!r}")


@debug_wrapper
def gen_tweet(q: Queue):
    """Producer: enqueue every tweet not yet marked deleted, oldest first.

    Always finishes by enqueueing the end-of-stream sentinel, even when the
    query fails part-way, so the workers are guaranteed to terminate.
    """
    try:
        for tweet in tweets.find({"deleted": {"$ne": True}}).sort("created_at", 1):
            q.put(tweet)
    finally:
        q.put(_DELETE_QUEUE_DONE)


@debug_wrapper
def delete_tweet(q: Queue):
    """Worker: delete eligible tweets from Twitter and flag them in MongoDB.

    A tweet is flagged ``deleted: True`` only when Twitter confirmed the
    deletion or reported that the status no longer exists. Any other failure
    is logged and the tweet is left untouched, so a later run retries it.
    """
    while True:
        tweet = q.get()
        if tweet is _DELETE_QUEUE_DONE:
            # Hand the sentinel on so the remaining workers also stop.
            q.put(_DELETE_QUEUE_DONE)
            return

        if not is_status_tobe_delete(tweet):
            print(f"pass tweet {tweet['id']} {tweet['created_at']}")
            continue

        already_gone = False
        try:
            api.destroy_status(tweet["id"])
        except tweepy.error.TweepError as err:
            if "No status found with that ID." not in f"{err}":
                # Deletion failed; do not record the tweet as deleted.
                traceback.print_exc()
                continue
            already_gone = True
        except Exception:
            traceback.print_exc()
            continue

        tweets.update_one(
            {"_id": tweet["_id"]},
            {"$set": {"deleted": True}},
        )

        if already_gone:
            print(f"mark {tweet['id']} {tweet['created_at']} deleted")
        else:
            print("delete", tweet["id"], tweet["created_at"])


#         if n % 1000 == 0:
#             print(f">> scan {n} statuses")
#         if n_del % 100 == 0:
#             print(f">> delete {n_del} statuses")


# Start the pipeline: api.destroy_status is called for every stored tweet that
# passes is_status_tobe_delete.
run_delete()


In [None]:
# download image
from pathlib import Path

import requests

# Destination directory that download_images_for_tweet writes image files into.
dirpath = r"/var/www/uploads/twitter"
# dirpath = r'/Users/laisky/Downloads'


def download_images_for_tweet(tweet):
    """Download every media image of ``tweet`` into ``dirpath``.

    Files are named after the last path segment of ``media_url_https``.
    Images already present on disk are skipped before any network request
    is made. Failed downloads are logged and skipped so that one bad image
    does not abort the rest.

    Args:
        tweet: tweet document containing ``entities.media``.
    """
    for img in tweet["entities"]["media"]:
        fpath = Path(dirpath, img["media_url_https"].split("/")[-1])
        if fpath.is_file():
            # Already downloaded; avoid fetching the image again.
            continue

        try:
            # The timeout keeps a stalled connection from hanging the backfill.
            with requests.get(img["media_url_https"] + ":orig", timeout=30) as r:
                if r.status_code != 200:
                    print(f"download error: [{r.status_code}]{r.content}")
                    continue

                with open(fpath, "wb") as f:
                    f.write(r.content)
        except requests.RequestException as err:
            print(f"download error: {err}")
            continue

        print("tweet img ok", tweet["id"], fpath)


def download_images():
    """Download images for every stored tweet with media, newest ``_id`` first."""
    cursor = tweets.find(
        {"entities.media": {"$exists": 1}}, no_cursor_timeout=True
    ).sort("_id", -1)
    try:
        for tweet in cursor:
            download_images_for_tweet(tweet)
    finally:
        # A no_cursor_timeout cursor is never reaped by the server, so it
        # must be closed explicitly, including when the loop raises.
        cursor.close()


# Run the image backfill over every stored tweet that has media entities.
download_images()


In [None]:
# download related tweets


def get_tweet_text(tweet: Dict[str, Any]) -> Optional[str]:
    """Return the tweet body, preferring ``full_text`` over ``text``.

    Returns None when the document has neither field.
    """
    return tweet.get("full_text") or tweet.get("text")


def twitter_api_parser(tweet: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a tweet document obtained from the Twitter API.

    The document is modified in place and also returned:

    * ``topics``: hashtags found in the text, with ``.`` replaced by ``_``.
    * ``created_at``: converted from the API's string form to ``datetime``;
      a value that is already a ``datetime`` is left unchanged.
    * ``text``: when the tweet has entities, the body with each entity's
      short link replaced by the media URL or the expanded URL.

    Args:
        tweet: tweet document as returned by the API's JSON parser.

    Returns:
        The same ``tweet`` dict, updated.
    """
    # NOTE(review): inside a character class `\b` is a backspace, so a hashtag
    # at the very start of the text is not matched. Left unchanged.
    reg_topic = re.compile(r"[\b|\s]#(\S+)")
    text = get_tweet_text(tweet) or ""
    tweet["topics"] = reg_topic.findall(text.replace(".", "_"))

    # `datetime` is the class (from datetime import datetime), not the module.
    if isinstance(tweet["created_at"], str):
        tweet["created_at"] = datetime.strptime(
            tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y"
        )

    # replace url
    if tweet.get("entities"):
        # parse entities media
        entities = tweet.get("extended_entities") or tweet["entities"]
        for media in entities.get("media") or []:
            short_url = media.get("url")
            media_url = media.get("media_url_https") or media.get("media_url")
            # Guard against empty values: str.replace("", x) would insert x
            # between every character.
            if short_url and media_url:
                text = text.replace(short_url, media_url)

        # parse entities urls
        # Only the short `url` is replaced. `display_url` is often a
        # substring of `expanded_url`, so replacing it as well would corrupt
        # a link that has just been expanded.
        for link in tweet["entities"].get("urls") or []:
            short_url = link.get("url")
            expanded_url = link.get("expanded_url")
            if short_url and expanded_url:
                text = text.replace(short_url, expanded_url)

        tweet["text"] = text

    return tweet


def gen_related_tweets(
    tweetCol: "pymongo.collection.Collection", tweet: Dict[str, Any]
) -> Generator[int, None, None]:
    """Yield IDs of tweets related to ``tweet`` that are not stored yet.

    Related tweets are, in this order: the tweet being replied to, the
    retweeted status and the quoted status. Each candidate is looked up in
    ``tweetCol`` by its ``id`` only when the generator reaches it.

    Args:
        tweetCol: collection queried with ``find_one({"id": ...})``.
        tweet: tweet document to inspect.

    Yields:
        The ID of each related tweet missing from ``tweetCol``.
    """
    candidates = []
    if tweet.get("in_reply_to_status_id"):
        candidates.append(tweet["in_reply_to_status_id"])
    for key in ("retweeted_status", "quoted_status"):
        if tweet.get(key):
            candidates.append(tweet[key]["id"])

    for related_id in candidates:
        if not tweetCol.find_one({"id": related_id}):
            yield related_id


def save_relate_tweets(status):
    """Fetch and store every tweet transitively related to ``status``.

    Related tweets (reply target, retweeted status, quoted status) that are
    not yet in the ``tweets`` collection are loaded from the API, saved, and
    then expanded in turn. Traversal is depth-first and uses an explicit
    stack, so a long reply chain cannot hit the interpreter's recursion limit.

    Args:
        status: tweet document whose related tweets should be stored.
    """
    # Each stack entry is a lazy iterator over one tweet's missing related
    # IDs; the top entry belongs to the tweet currently being expanded.
    pending = [gen_related_tweets(tweets, status)]
    while pending:
        id_ = next(pending[-1], None)
        if id_ is None:
            pending.pop()
            continue

        try:
            docu = api.get_status(id_, tweet_mode="extended")
        except Exception as err:
            print(f"load tweet {id_} got error: {err}")
            continue

        print(f"save tweet [{docu['user']['screen_name']}]{docu['id']}")
        save_tweet(docu)
        pending.append(gen_related_tweets(tweets, docu))


def save_tweet(docu):
    """Parse an API tweet document and upsert it into ``tweets`` keyed by ``id``."""
    parsed = twitter_api_parser(docu)
    selector = {"id": parsed["id"]}
    tweets.update_one(selector, {"$set": parsed}, upsert=True)


def download_relate_tweets():
    """Store the related tweets of every stored tweet, newest ``_id`` first."""
    cursor = tweets.find(no_cursor_timeout=True).sort("_id", -1)
    try:
        for tweet in cursor:
            save_relate_tweets(tweet)
    finally:
        # A no_cursor_timeout cursor is never reaped by the server, so it
        # must be closed explicitly, including when the loop raises.
        cursor.close()


# download_relate_tweets()
