In [24]:
import selenium
import requests
import os
import time
import threading
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException,NoSuchElementException,ElementNotVisibleException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import ui
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
import uuid
import shutil
import regex
import re
import pickle
 
def generate_uid(input_str):
    """Return a deterministic identifier string for ``input_str``.

    The identifier is the canonical text form of a version-5 UUID built
    from ``input_str`` inside the DNS namespace, so the same input always
    produces the same identifier.
    """
    derived = uuid.uuid5(uuid.NAMESPACE_DNS, input_str)
    return f"{derived}"

def find_file_move(model_name: str, uid: str, destination_folder='D://脚痛大学//basics//大三上//机器学习//models', download_address=r"C://Users//夏欣媛//Downloads", max_attempts=1000, delay=3):
    '''
    Move the first ``.glb`` file found in the download directory to
    ``destination_folder`` and rename it to ``{uid}.glb``.

    WARNING: after a successful move, every regular file left in
    ``download_address`` is deleted so that the next call cannot pick up a
    stale download. Point ``download_address`` at a directory used only for
    these downloads.

    model_name: original model name (currently unused; the file is located
        by its ``.glb`` extension, not by name)
    uid: unique identifier used as the new file name
    destination_folder: directory the model is moved into (created if missing)
    download_address: Chrome's download directory
    max_attempts: maximum number of polls/retries, counting both "file not
        there yet" and "file still locked" attempts
    delay: seconds to sleep between attempts

    Returns True when the file was moved, False otherwise.
    '''
    destination_path = os.path.join(destination_folder, f"{uid}.glb")

    try:
        os.makedirs(destination_folder, exist_ok=True)
    except OSError as e:
        print(f"An unexpected error occurred: {e}")
        return False

    for attempt in range(max_attempts):
        try:
            glb_names = [f for f in os.listdir(download_address) if f.endswith('.glb')]
            if not glb_names:
                # Nothing downloaded yet; this poll counts as one attempt so
                # the wait is bounded instead of looping forever.
                time.sleep(delay)
                continue

            source_path = os.path.join(download_address, glb_names[0])
            shutil.move(source_path, destination_path)
        except PermissionError:
            # Typically raised while the browser still holds the file open.
            print(f"Attempt {attempt + 1}: File is still in use. Retrying in {delay} seconds...")
            time.sleep(delay)
            continue
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return False

        print(f"Moved to {destination_path}")
        time.sleep(delay)

        # Best-effort cleanup: the move already succeeded, so a file that
        # cannot be removed must not turn the result into a failure.
        try:
            leftovers = os.listdir(download_address)
        except OSError as e:
            print(f"Could not clean download directory: {e}")
            leftovers = []
        for file_name in leftovers:
            file_path = os.path.join(download_address, file_name)
            if os.path.isfile(file_path):
                try:
                    os.unlink(file_path)
                except OSError as e:
                    print(f"Could not remove {file_path}: {e}")

        return True

    print(f"Failed to move file after {max_attempts} attempts.")
    return False

def login(wait,browser,account):
    """
    Fill in and submit the login form currently shown in ``browser``.

    wait: WebDriverWait bound to ``browser``; used to wait for each form element
    browser: Selenium WebDriver showing (or about to show) the login form
    account: dict with "username" and "password" keys

    The form is probed up to three times, one second apart. If the e-mail
    field never shows up, "no logging page" is printed and the function
    still falls through to ``wait.until``, which raises TimeoutException
    once the wait's timeout expires.
    """
    email_xpath = "//input[@type='email']"
    for trying in range(3):
        if browser.find_elements(By.XPATH, email_xpath):
            break
        if trying == 2:
            print("no logging page")
        else:
            # Give the page a moment to render before probing again;
            # without a pause the three probes run back to back.
            time.sleep(1)

    username_input = wait.until(EC.presence_of_element_located((By.XPATH, email_xpath)))
    username_input.send_keys(account["username"])
    time.sleep(1)
    password_input = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@type='password']")))
    password_input.send_keys(account["password"])
    time.sleep(1)
    # Locate the submit button and click it through JavaScript.
    login_button = wait.until(EC.presence_of_element_located((By.XPATH, "//button[@data-selenium='submit-button']")))
    browser.execute_script("arguments[0].click();",login_button)
    time.sleep(3)

def get_download_links_by_keyword(
                    key_word:str,
                    load_page:int=5,
                    test:bool=False):
    '''
    Search Sketchfab for downloadable models matching ``key_word`` and
    pickle the collected model links.

    key_word: search keyword; inserted into the search URL as-is
    load_page: maximum number of times the "next" button is clicked to load
        more results
    test: debugging switch; when True, stop after the first "next" click

    Returns the name of the pickle file, ``f"{key_word}_model_urls.pkl"``,
    written to the current working directory.
    '''
    # Search URL restricted to downloadable models.
    url=f"https://sketchfab.com/search?features=downloadable&q={key_word}&type=models"

    chrome_options = Options()
    # Browser preferences.
    prefs = {
        "download.prompt_for_download":False, # download without showing a prompt
        "download.default_directory":"./models", # target directory for downloads
        "profile.default_content_settings.popups":0 # disable pop-up windows
    }
    chrome_options.add_experimental_option('prefs',prefs)
    # chrome_options.add_argument('--headless')
    chrome_options.page_load_strategy = 'eager' # return once the DOM is ready, without waiting for images etc.
    chrome_options.add_argument('--disable-gpu') # disable GPU acceleration (intended to improve stability)
    # chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument("--disable-blink-features=AutomationControlled") # intended to make Selenium harder to detect
    chrome_options.add_experimental_option('excludeSwitches',['enable-automation']) # hide the "controlled by automation" banner
    browser = webdriver.Chrome(options=chrome_options)

    # Always shut the browser down, even when scraping fails, so repeated
    # calls do not pile up Chrome processes.
    try:
        browser.get(url)

        print("loading page")
        for _ in range(load_page):
            time.sleep(5)
            button = browser.find_elements(By.XPATH,"//div[@class='c-grid__button --next']//button")
            if not button:
                # No "next" button left: every result page is loaded.
                break
            browser.execute_script("arguments[0].click();",button[0])
            print("loading")
            if test:
                break

        print("all pages loaded")
        # Collect the link of every model card on the page.
        links=browser.find_elements(By.XPATH, "//div[@class='card__main__corner --top-right']/a")
        model_urls=[link.get_attribute('href') for link in links]
    finally:
        browser.quit()

    print(f"total {len(model_urls)} models available")
    with open(f'{key_word}_model_urls.pkl', 'wb') as f:
        pickle.dump(model_urls, f)
    print(f"model_urls 已被保存到 {key_word}_model_urls.pkl 文件中。")

    return f"{key_word}_model_urls.pkl"

def get_download_links(
                    urls:list,
                    test:bool=False):
    '''
    Collect the model links listed on one or more Sketchfab profile pages
    and pickle them into a single file.

    input:
        urls: non-empty list of profile URLs to crawl; "/models" is
            appended to each one
        test: debugging switch; when True, stop after the first "next" click
    output:
        name of the pickle file, ``f"{key_word}_model_urls.pkl"``, where
        ``key_word`` is the path segment just before "/models" of the LAST
        URL in ``urls``. The file holds the links gathered from every URL.

    Raises ValueError when ``urls`` is empty.
    '''
    if not urls:
        # Without at least one URL there is nothing to name the output file after.
        raise ValueError("urls must contain at least one profile URL")

    model_urls=[]
    for url in urls:
        url=url+"/models"

        chrome_options = Options()
        # Browser preferences.
        prefs = {
            "download.prompt_for_download":False, # download without showing a prompt
            "download.default_directory":r".\models", # target directory for downloads
            "profile.default_content_settings.popups":0 # disable pop-up windows
        }
        chrome_options.add_experimental_option('prefs',prefs)
        # chrome_options.add_argument('--headless')
        chrome_options.page_load_strategy = 'eager' # return once the DOM is ready, without waiting for images etc.
        chrome_options.add_argument('--disable-gpu') # disable GPU acceleration (intended to improve stability)
        # chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument("--disable-blink-features=AutomationControlled") # intended to make Selenium harder to detect
        chrome_options.add_experimental_option('excludeSwitches',['enable-automation']) # hide the "controlled by automation" banner
        browser = webdriver.Chrome(options=chrome_options)

        # One browser is started per URL; make sure each one is closed again.
        try:
            browser.get(url)
            wait = WebDriverWait(browser,180)
            # The profile's "models" tab link carries a counter in a <span>.
            extracted_part = url.split("sketchfab.com")[-1]
            total_links=wait.until(EC.presence_of_element_located((By.XPATH, f"//a[@href='{extracted_part}']/span")))
            # NOTE(review): assumes the counter text is a plain integer - confirm for large profiles.
            total_links=int(total_links.text)

            print("loading page")
            while True:
                button = browser.find_elements(By.XPATH,"//div[@class='c-grid__button --next']//button")
                if not button:
                    # No "next" button left: stop loading.
                    break
                browser.execute_script("arguments[0].click();",button[0])
                print("loading")
                if test:
                    break
                time.sleep(10)

            print("all pages loaded")
            # Collect the link of every model card on the page.
            links=browser.find_elements(By.XPATH, "//div[@class='card__main__corner --top-right']/a")
            print(f"total {len(links)}/{total_links} models in {url}")

            model_urls.extend(link.get_attribute('href') for link in links)
        finally:
            browser.quit()

    # Name the output after the last crawled profile.
    key_word=url.split("/")[-2]
    with open(f'{key_word}_model_urls.pkl', 'wb') as f:
        pickle.dump(model_urls, f)
    print(f"model_urls 已被保存到 {key_word}_model_urls.pkl 文件中。")

    return f"{key_word}_model_urls.pkl"
    
def download_models(account:dict,urls_file_name:str,from_url:str = None,download_time:int=8,test:bool=False):
    '''
    Visit every model URL stored in ``urls_file_name``, trigger its
    download, move the downloaded file with ``find_file_move`` and append
    the model's metadata to ``models.json``.

    account: credentials, {"username":"xxx","password":"xxx"}
    urls_file_name: pickle file holding the list of model URLs (the return
        value of get_download_links / get_download_links_by_keyword)
    from_url: to resume an interrupted run, the last URL that was handled;
        downloading restarts with the URL after it. Raises ValueError if it
        is not in the list.
    download_time: seconds to wait after clicking the download button
        before looking for the downloaded file
    test: debugging switch; when True, stop after the first handled model

    Returns the last URL that was visited, or ``from_url`` when there was
    nothing left to visit.
    '''
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # were produced by this notebook.
    with open(urls_file_name,'rb') as file:
        model_urls=pickle.load(file)

    # Resolve the resume position before starting Chrome so that an unknown
    # from_url fails without leaving a browser behind.
    if from_url:
        from_idx=model_urls.index(from_url)+1
    else:
        from_idx=0
    cnt=from_idx
    # Defined up front so the return value exists even when no URL is left.
    model_url=from_url

    chrome_options = Options()
    # Browser preferences.
    prefs = {
        "download.prompt_for_download":False, # download without showing a prompt
        "download.default_directory":r".\models", # target directory for downloads
        "profile.default_content_settings.popups":0 # disable pop-up windows
    }
    chrome_options.add_experimental_option('prefs',prefs)
    # chrome_options.add_argument('--headless')
    chrome_options.page_load_strategy = 'eager' # return once the DOM is ready, without waiting for images etc.
    chrome_options.add_argument('--disable-gpu') # disable GPU acceleration (intended to improve stability)
    # chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument("--disable-blink-features=AutomationControlled") # intended to make Selenium harder to detect
    chrome_options.add_experimental_option('excludeSwitches',['enable-automation']) # hide the "controlled by automation" banner
    browser = webdriver.Chrome(options=chrome_options)
    wait = WebDriverWait(browser,180)

    # Close the browser on every exit path; callers retry this function in
    # a loop and would otherwise accumulate Chrome processes.
    try:
        for model_url in model_urls[from_idx:]:
            browser.get(model_url)
            model_names=browser.find_elements(By.XPATH,"//div[@class='c-download__title-text']//span")
            if model_names==[]:
                # The download dialog is not there; log in first.
                login(wait,browser,account)

            time.sleep(2)
            model_name=wait.until(EC.presence_of_element_located((By.XPATH,"//div[@class='c-download__title-text']//span"))).text
            model_description=wait.until(EC.presence_of_element_located((By.XPATH,"//div[@id='descriptionContent']"))).text
            uid=generate_uid(model_url)
            print("model name:",model_name)
            print("download link:",model_url)
            print("uid:",uid)
            model_data={
                "model name":model_name,
                "description":model_description,
                "download_link":model_url,
                "uid":uid
            }
            # Models whose page shows more than one download-links container are skipped.
            # NOTE(review): the reason for skipping is not visible here - confirm.
            download_area=wait.until(EC.presence_of_all_elements_located((By.XPATH,"//div[@class='c-download__links']")))
            if len(download_area)>1:
                continue
            # Click the button inside the last entry of the download list.
            # Presumably that entry is the GLB format, since find_file_move
            # looks for a .glb file - verify against the page.
            buttons=wait.until(EC.presence_of_all_elements_located(((By.XPATH,"//div[@class='c-download__links']/div"))))
            length=len(buttons)
            button=wait.until(EC.presence_of_element_located((By.XPATH,f"//div[@class='c-download__links']/div[{length}]//button")))
            browser.execute_script("arguments[0].click();",button)
            time.sleep(download_time)
            is_moved=find_file_move(model_name=model_name,uid=uid)
            if is_moved:
                # Metadata is appended as one pretty-printed JSON object per model.
                with open('models.json','a',encoding='utf-8') as json_file:
                    json.dump(model_data,json_file,ensure_ascii=False,indent=4)
                    json_file.write('\n')
                cnt+=1
                print(f"got {cnt} models")
            if test:
                break
    finally:
        browser.quit()
    return model_url



# Example usage

In [None]:
# Step 1: collect the model links of one profile; the call returns the
# name of the pickle file it wrote.
urls=["https://sketchfab.com/hmane"]
urls_file_name=get_download_links(urls=urls,
                                test=False)

# Alternative to step 1: collect links from a keyword search instead.
# urls_file_name=get_download_links_by_keyword(key_word="bronze",
#                                              load_page=50,)

# Step 2: download every collected model. "xxx" are placeholders that
# must be replaced with a real account before running.
account={"username":"xxx","password":"xxx"}
download_models(account=account,
                urls_file_name=urls_file_name
                )

In [2]:
# Credentials are read from the environment so that no secret is stored in
# the notebook. Set SKETCHFAB_USERNAME and SKETCHFAB_PASSWORD before running.
account={"username":os.environ["SKETCHFAB_USERNAME"],"password":os.environ["SKETCHFAB_PASSWORD"]}
# Resume after the given URL. The pickle file must exist in the current
# working directory, otherwise open() raises FileNotFoundError.
download_models(account=account,
                urls_file_name="dynasty_model_urls.pkl",
                from_url="https://sketchfab.com/3d-models/indra-with-thunderbolt-guarding-buddha-0932f9a762894ac7b6491d79e606b0dd#download")


FileNotFoundError: [Errno 2] No such file or directory: 'dynasty_model_urls.pkl'

In [4]:
from selenium.common.exceptions import NoSuchWindowException
import numpy as np
# Accounts are read from the environment so that no secret is stored in the
# notebook. SKETCHFAB_ACCOUNTS must hold a JSON list such as
# [{"username": "...", "password": "..."}, ...].
account_list = json.loads(os.environ["SKETCHFAB_ACCOUNTS"])
if not account_list:
    raise ValueError("SKETCHFAB_ACCOUNTS must contain at least one account")
last_model_url = "https://sketchfab.com/3d-models/pottery-small-jug-painted-and-glazed-a904204e287f43238db1f3c895dbdd9a#download"

# Rotate through the accounts until one of them gets through the whole URL
# list. A run that ends with NoSuchWindowException is treated as "too many
# requests" and the next account is tried.
# NOTE: a failed run raises before returning, so last_model_url is not
# advanced and the next try resumes from the same URL.
finished = False
while not finished:
    for account in account_list:
        try:
            last_model_url = download_models(account=account,
                    urls_file_name="D://脚痛大学//basics//大三上//机器学习//sketchfab//pottery_model_urls.pkl",
                    from_url=last_model_url)
        except NoSuchWindowException:
            print("too many requests")
            continue
        # One complete run covers every remaining URL; calling again from the
        # last URL would leave nothing to download, so stop here.
        finished = True
        break


model name: Neolithic Butmir pottery, Sarajevo, Bosnia
download link: https://sketchfab.com/3d-models/neolithic-butmir-pottery-sarajevo-bosnia-1ddf9ad6f171457fa82b43099fab313d#download
uid: f0e06673-5bd3-5cd2-bf56-fda732b20191
Moved to D://脚痛大学//basics//大三上//机器学习//models\f0e06673-5bd3-5cd2-bf56-fda732b20191.glb
got 278 models
model name: Visigoth Pottery, Castilla La Mancha, Spain
download link: https://sketchfab.com/3d-models/visigoth-pottery-castilla-la-mancha-spain-c8a2e2263bef4a7dbf050e82d72133ec#download
uid: 0a7f69b2-2cfe-5bbc-a1e2-dfb5e3123255
Moved to D://脚痛大学//basics//大三上//机器学习//models\0a7f69b2-2cfe-5bbc-a1e2-dfb5e3123255.glb
got 279 models


In [27]:
# Credentials are read from the environment so that no secret is stored in
# the notebook. Set SKETCHFAB_USERNAME and SKETCHFAB_PASSWORD before running.
account = {"username":os.environ["SKETCHFAB_USERNAME"],"password":os.environ["SKETCHFAB_PASSWORD"]}
# Resume after this URL; the name is then rebound to the last URL visited
# by this run so the cell's result can seed the next resume.
last_model_url = 'https://sketchfab.com/3d-models/tomb-bhs-68-jebel-al-buhais-sharjah-uae-2f74620996e24fb79d0745359c9f02b1#download'
last_model_url = download_models(account=account,
                    urls_file_name="D://脚痛大学//basics//大三上//机器学习//sketchfab//pottery_model_urls.pkl",
                    from_url=last_model_url)

model name: Pottery sherd from Debelo Brdo near Sarajevo
download link: https://sketchfab.com/3d-models/pottery-sherd-from-debelo-brdo-near-sarajevo-9009d352c1e343e19a7434ce94d4cfe1#download
uid: 470bbcdb-9ef6-55d0-b65c-7c51cbb984ae
Moved to D://脚痛大学//basics//大三上//机器学习//models\470bbcdb-9ef6-55d0-b65c-7c51cbb984ae.glb
got 809 models
no logging page


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF78EABB125+29573]
	(No symbol) [0x00007FF78EA2FF50]
	(No symbol) [0x00007FF78E8EB6EA]
	(No symbol) [0x00007FF78E93F815]
	(No symbol) [0x00007FF78E93FA6C]
	(No symbol) [0x00007FF78E98B917]
	(No symbol) [0x00007FF78E96733F]
	(No symbol) [0x00007FF78E9886BC]
	(No symbol) [0x00007FF78E9670A3]
	(No symbol) [0x00007FF78E9312DF]
	(No symbol) [0x00007FF78E932441]
	GetHandleVerifier [0x00007FF78EDEC76D+3377613]
	GetHandleVerifier [0x00007FF78EE37B67+3685831]
	GetHandleVerifier [0x00007FF78EE2CF8B+3641835]
	GetHandleVerifier [0x00007FF78EB7B2A6+816390]
	(No symbol) [0x00007FF78EA3B25F]
	(No symbol) [0x00007FF78EA37084]
	(No symbol) [0x00007FF78EA37220]
	(No symbol) [0x00007FF78EA2607F]
	BaseThreadInitThunk [0x00007FFB96E3257D+29]
	RtlUserThreadStart [0x00007FFB98CEAF08+40]


In [26]:
import json
import re

def read_models(file_path):
    """Read a file of concatenated JSON objects and return them as a list.

    The file is expected to contain JSON objects written one after another
    (for example pretty-printed with ``json.dump(..., indent=4)`` and
    separated by newlines). Each object is parsed with a real JSON decoder,
    so braces inside string values and nested objects are handled.

    file_path: path of the UTF-8 encoded file to read

    Returns a list with one parsed value per top-level object. Text that
    cannot be decoded is reported with ``print`` and skipped.
    """
    decoder = json.JSONDecoder()
    models = []
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    position = 0
    while True:
        # Jump to the next candidate object, ignoring whatever separates them.
        start = content.find('{', position)
        if start == -1:
            break
        try:
            obj, position = decoder.raw_decode(content, start)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}\nObject: {content[start:start + 200]}")
            # Step past the offending brace and keep scanning.
            position = start + 1
            continue
        models.append(obj)
    return models

# Usage example: load every stored model record and print its name.
# NOTE(review): download_models appends to 'models.json', while this cell
# reads 'models.jsonl' - confirm which file name is intended.
file_path = 'models.jsonl'
models = read_models(file_path)
for model in models:
    print(model['model name'])
    

Roman pottery (Samian ware) and mould
Roman pottery (Samian ware) and mould
Chinese pottery vase
Pottery
Egyptian pottery
Ancient Egyptian Pottery
Painted female figurines of Tang pottery
Another pot
Ancient Pottery Workshop
Stylized Pottery Asset Pack
Pottery PBR
Pottery - Bazaar
Ancient Pottery Vase
DAE Islands: Mayan Pottery
Fantasy Pottery
Roman Pottery workshop - free download
Unstan Neolithic Tomb, Stenness, Orkney
Pottery
Pottery - Trim Sheet Texture
PBR Greek Pottery
Ancient Chinese Pottery | DAE Villages
Leather Flask Pottery Mesh
Pottery Flask [3D Printable Proxy]
DAE Villages - Egyptian Pottery
Indian Pottery
Decorative Pottery Collection
Neolithic pottery piece
Hanging Ancient Pottery Set
Mud Masters - Albion Online Inspired Pottery
Greek Pottery house DAE
Cantil | Pottery Flask
Pottery Jug
Delfts Blue Pottery - 500th Followers GiveAway
Pottery Beach Cross-Section, Brownsea Island, UK
Pottery Pots
FishSoup_Pot
The Jericho Skull
Medieval Pottery Storage Jar
Thai water buffal

In [3]:
# Load the pickled list of model URLs for inspection in the cells below.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook.
urls_file_name="D://脚痛大学//basics//大三上//机器学习//sketchfab//pottery_model_urls.pkl"
# The with-block closes the file handle as soon as the list is loaded.
with open(urls_file_name,'rb') as file:
    model_urls=pickle.load(file)


In [7]:
# Zero-based position of this URL in the loaded list (raises ValueError if absent).
model_urls.index("https://sketchfab.com/3d-models/visigoth-pottery-castilla-la-mancha-spain-c8a2e2263bef4a7dbf050e82d72133ec#download")

278

In [10]:
# URL stored at zero-based position 279 of the loaded list.
model_urls[279]

'https://sketchfab.com/3d-models/medieval-persian-pottery-fragment-d13e94c9c5cb4f639f37edea03513a47#download'