In [1]:
import json
import requests
import time
import os
import urllib.request
import argparse
import pandas as pd
from tqdm import tqdm
from tools.util import get_current_time_format, generate_url_with_xbs, sleep_random
# Read the video publisher's settings from config; update config.py before downloading a new account.
from config import IS_SAVE, SAVE_FOLDER, USER_SEC_UID, IS_WRITE_TO_CSV, LOGIN_COOKIE, CSV_FILE_NAME, ACCOUNT_NAME
import pure_downloader as downloader
import sys


In [2]:
class DouYinUtil(object):

    def __init__(self, sec_uid: str):
        """
        :param sec_uid: 抖音id
        """
        self.sec_uid = sec_uid
        self.is_save = IS_SAVE
        self.save_folder = SAVE_FOLDER
        self.is_write_to_csv = IS_WRITE_TO_CSV
        self.csv_name = CSV_FILE_NAME
        self.video_api_url = ''
        self.account_name = ACCOUNT_NAME
        self.api_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
            'Referer': 'https://www.douyin.com/',
            'Cookie': LOGIN_COOKIE
        }
        self.cursor = 0
        self.videos_list = []  # 视频列表id
        self.video_info_list = []
        self.video_info_dict = {}
        self.stop_flag = False  # 默认不停止

    def get_user_video_info(self, url: str):
        res = requests.get(url, headers=self.api_headers)
        res.encoding = 'utf-8'
        res_text = res.text
        return json.loads(res_text)

    def get_all_videos(self):
        """
        获取所有的视频
        :return:
        """
        while not self.stop_flag:
            self.video_api_url = f'https://www.douyin.com/aweme/v1/web/aweme/post/?aid=6383&sec_user_id={self.sec_uid}&count=35&max_cursor={self.cursor}&cookie_enabled=true&platform=PC&downlink=10'
            xbs = generate_url_with_xbs(self.video_api_url, self.api_headers.get('User-Agent'))
            user_video_url = self.video_api_url + '&X-Bogus=' + xbs
            user_info = self.get_user_video_info(user_video_url)
            aweme_list = user_info['aweme_list']
            for aweme_info in aweme_list:
                self.video_info_list.append(aweme_info)
                self.video_info_dict.setdefault(aweme_info['aweme_id'], aweme_info)
                self.videos_list.append(aweme_info['aweme_id'])
            if int(user_info['has_more']) == 0:
                self.stop_flag = True
            else:
                self.cursor = user_info['max_cursor']
                #self.stop_flag = True
            sleep_random()
        return self.videos_list

    # Use this method to download the videos when the account has more than 200 of them.
    def get_all_videos_withdownload(self,restart,total_left):
        """
        获取所有的视频
        :return:
        """
        auto_increase_key = 0
        pbar = tqdm(total=total_left)
        while not self.stop_flag:
            self.video_api_url = f'https://www.douyin.com/aweme/v1/web/aweme/post/?aid=6383&sec_user_id={self.sec_uid}&count=35&max_cursor={self.cursor}&cookie_enabled=true&platform=PC&downlink=10'
            xbs = generate_url_with_xbs(self.video_api_url, self.api_headers.get('User-Agent'))
            user_video_url = self.video_api_url + '&X-Bogus=' + xbs
            #每次请求前将进度储存在log中方便断续重连
            with open('log.txt', 'w') as file:
                 file.write(f"{auto_increase_key}\n")
            user_info = self.get_user_video_info(user_video_url)
            aweme_list = user_info['aweme_list']
            #aweme_info 每一条视频的信息
            for aweme_info in aweme_list:
                self.video_info_list.append(aweme_info)
                self.video_info_dict.setdefault(aweme_info['aweme_id'], aweme_info)
                self.videos_list.append(aweme_info['aweme_id'])
                #跳过已经收集了的视频
                if auto_increase_key > restart:
                    #断网15分钟自动终止程序
                    self.robust_downloader(id=aweme_info['aweme_id'],account_name=self.account_name)
                    auto_increase_key+=1
                    pbar.update(1)
                else:
                    auto_increase_key+=1
                    pbar.update(1)           
            if int(user_info['has_more']) == 0:
                self.stop_flag = True
            else:
                self.cursor = user_info['max_cursor']
                #self.stop_flag = True
            sleep_random()
        return self.videos_list

    # Robust download: retries a failed video download before giving up.
    def robust_downloader(self,id,account_name,save_folder='D:/save_video'):
        information = self.get_video_detail_info(id)
        link = self.get_video_detail_info(id)['link']
        video = self.get_video_detail_info(id)['is_video']
        if video is True:
            attempts = 0
            #尝试5次，每次间隔3分钟，防止由于间歇性网络问题造成循环终止
            while attempts < 5:
                try:
                    downloader.download_video(account_name=account_name,video_url=link,save_folder=save_folder, file_name=f"{id}.mp4")
                    break
                except Exception as e:
                    print(f"Attempt {attempts + 1} failed: {e}")
                    attempts += 1
                    time.sleep(180)
                if attempts==5:
                    sys.exit()
        if video is False:
            try:
                downloader.download_images(account_name=account_name,image_list=link,save_folder=save_folder, image_dir=id)
                time.sleep(60)
            except Exception as e:
                pass

        
    # Use this method to download the videos when the account has fewer than 200 of them.
    def download_video(self, video_url: str, file_name: str = None):
        """
        下载视频
        :param video_url: 视频地址
        :param file_name: 视频保存文件名: 默认为空
        :return:
        """
        if not self.is_save:
            print("当前不需要保存")
            return
        save_folder = f"{self.save_folder}/{self.account_name}"
        if not os.path.exists(save_folder):
            os.mkdir(save_folder)
        real_file_name = f"{save_folder}/{file_name}"
        #print(f"下载url:{video_url}\n保存文件名:{real_file_name}")
        if os.path.exists(real_file_name):
            os.remove(real_file_name)
        urllib.request.urlretrieve(video_url, real_file_name)
        
    def download_images(self,image_list:list,image_dir:str=None):
        """
        下载图片
        :param image_list: 图片地址
        :param file_name: 图片目录: 默认为空
        :return:
        """
        if not self.is_save:
            print("当前不需要保存")
            return
        
        parent_folder = f"{self.save_folder}/{self.account_name}"
        if not os.path.exists(parent_folder):
            os.mkdir(parent_folder)
        save_folder = f"{self.save_folder}/{self.account_name}/{image_dir}"
        
        print(f"save-dir:{save_folder}")
        
        num=1
        if not os.path.exists(save_folder):
            os.mkdir(save_folder)
        for image_url in image_list:
            num+=1
            print(f"image_url:{image_url} {num}")
            real_file_name = f"{save_folder}/{num}.jpeg"
            print(f"下载url:{image_url}\n保存文件名:{real_file_name}")
            if os.path.exists(real_file_name):
                os.remove(real_file_name)
            urllib.request.urlretrieve(image_url, real_file_name)
        
    def get_video_detail_info(self, video_id: str):
        """
        获取视频详细信息
        :param video_id: 视频id
        :return:
        """
        default_response = {
            'video_id': video_id,  # 视频id
            'link': 'None',  # 视频链接
            'is_video': True,  # 是否为视频
            'title': 'None',  # 标题
            'thumb_up_num': 0,  # 点赞数
            'comment_num': 0,  # 评论数
            'collect_num': 0, # 收藏数
            'share_num': 0, # 分享数
            'cover_url': 'http://www.baidu.com',  # 视频封面
            'publish_time': '',  # 发布日期
            'record_time': '记录日期',  # 更新日期
        }
        res_info = self.video_info_dict.get(video_id, None)
        if res_info is None:
            return default_response
        default_response['title'] = res_info['desc']
        create_time = res_info['create_time']
        local_time = time.localtime(create_time)
        local_time_str = time.strftime("%Y-%m-%d %H:%M:%S", local_time)
        default_response['publish_time'] = local_time_str
        default_response['record_time'] = get_current_time_format()
        if res_info['images'] is None:
            default_response['link'] = res_info["video"]["play_addr"]["url_list"][0]
            default_response['cover_url'] = res_info["video"]["cover"]["url_list"][0]
            default_response['is_video'] = True
        else:
            default_response['link'] = list(map(lambda x: x["url_list"][-1], res_info["images"]))
            default_response['is_video'] = False
        default_response['thumb_up_num'] = res_info['statistics']['digg_count']
        default_response['comment_num'] = res_info['statistics']['comment_count']
        default_response['collect_num'] = res_info['statistics']['collect_count']
        default_response['share_num'] = res_info['statistics']['share_count']
        return default_response

In [None]:
# Fetch the metadata of every post of the account and append it to the CSV file.
# (Author's note, as far as it can be read: the metadata download completes
# within about 10 minutes.)
dy_util = DouYinUtil(sec_uid=USER_SEC_UID)
all_video_list = dy_util.get_all_videos()
csvVideos =[]
for video_id in all_video_list:
    video_info = dy_util.get_video_detail_info(video_id)
    csvVideos.append(video_info)
data = pd.DataFrame(csvVideos)
# Column titles, in the key order of the records returned by get_video_detail_info.
csvHeaders = ["id","视频链接","是否为视频","标题","点赞数","评论数","收藏数","分享数","视频封面","发布日期","更新日期"]
# The file is opened in append mode, so the header row is written only when the
# file is new or empty; otherwise every re-run would insert another header line
# in the middle of the data.
csv_needs_header = not os.path.exists(CSV_FILE_NAME) or os.path.getsize(CSV_FILE_NAME) == 0
try:
    data.to_csv(CSV_FILE_NAME, header=csvHeaders if csv_needs_header else False, index=False, mode='a+', encoding='utf-8')
except UnicodeEncodeError:
    print("编码错误, 该数据无法写到文件中, 直接忽略该数据")

In [3]:
# Download the videos.
# `restart` is the position in the post list from which downloading resumes;
# `total_left` is the total number of videos (used as the progress-bar total).
dy_util = DouYinUtil(sec_uid=USER_SEC_UID)
all_video_list = dy_util.get_all_videos_withdownload(restart=217, total_left=1637)

 15%|███████████▋                                                                | 252/1637 [05:51<16:41:51, 43.40s/it]

Attempt 1 failed: <urlopen error retrieval incomplete: got only 2095479 out of 3259324 bytes>


 16%|████████████▏                                                                | 258/1637 [10:37<9:05:51, 23.75s/it]

Attempt 1 failed: [WinError 10054] 远程主机强迫关闭了一个现有的连接。
Attempt 2 failed: [WinError 10054] 远程主机强迫关闭了一个现有的连接。


 16%|███████████▉                                                               | 260/1637 [19:44<48:47:20, 127.55s/it]

Attempt 1 failed: [WinError 10054] 远程主机强迫关闭了一个现有的连接。


 40%|██████████████████████████████▋                                              | 652/1637 [1:08:50<57:12,  3.48s/it]

Attempt 1 failed: HTTP Error 504: Gateway Time-out


 43%|████████████████████████████████▏                                          | 703/1637 [1:17:50<2:04:23,  7.99s/it]

Attempt 1 failed: [WinError 10054] 远程主机强迫关闭了一个现有的连接。


 58%|████████████████████████████████████████████▎                                | 942/1637 [1:44:12<44:58,  3.88s/it]

Attempt 1 failed: HTTP Error 424: Failed Dependency


 69%|████████████████████████████████████████████████████▋                       | 1134/1637 [2:04:13<44:39,  5.33s/it]

Attempt 1 failed: <urlopen error retrieval incomplete: got only 1046917 out of 1964733 bytes>


 77%|██████████████████████████████████████████████████████████▊                 | 1268/1637 [2:17:11<18:39,  3.03s/it]

Attempt 1 failed: [WinError 10054] 远程主机强迫关闭了一个现有的连接。


100%|███████████████████████████████████████████████████████████████████████████▉| 1635/1637 [2:47:50<00:12,  6.16s/it]
