In [1]:
import json
import os
import random
import re
import time
from urllib.parse import urljoin, urlparse

import feedparser
import requests
from bs4 import BeautifulSoup

def sanitize_filename(filename):
    """Return `filename` without the characters that are stripped for file names.

    The removed characters are: slash, backslash, percent sign, asterisk,
    pipe and double quote. Every other character is kept as-is.
    """
    # Characters that must not appear in the resulting file name.
    forbidden = frozenset('/\\%*|"')

    # Keep only the characters that are not in the forbidden set.
    return ''.join(ch for ch in filename if ch not in forbidden)

def download_mp3(url, folder_path, filename, timeout=30):
    """Download the file at `url` and save it as `filename` inside `folder_path`.

    Failures raised by `requests` are reported with a printed message instead
    of being propagated, so a batch of downloads can continue.

    Args:
        url: Address of the audio file; HTTP redirects are followed.
        folder_path: Existing directory the file is written into.
        filename: Name of the file to create inside `folder_path`.
        timeout: Value passed to `requests` as the request timeout (seconds).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    filepath = os.path.join(folder_path, filename)
    started_writing = False

    try:
        # One streamed GET is enough: redirects are already followed, so there
        # is no need to request the final URL a second time. The context
        # manager guarantees the connection is released.
        with requests.get(url, headers=headers, stream=True,
                          allow_redirects=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(filepath, 'wb') as file:
                started_writing = True
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
        print(f"MP3 檔案已下載並儲存為 {filename}")
    except requests.RequestException as e:
        # Do not leave a truncated file behind that looks like a finished download.
        if started_writing and os.path.exists(filepath):
            os.remove(filepath)
        print(f"下載失敗: {e}")

def get_rss_file(newsurl, test_n, parent_folder_path=None):
    """Download episodes of the podcast feed at `newsurl` as MP3 files.

    A folder named after the feed's <title> is created (inside
    `parent_folder_path` when given) and every selected episode is saved
    there as "<episode title>.mp3" via `download_mp3`.

    Args:
        newsurl: URL of the podcast RSS feed.
        test_n: Number of episodes to download, or "all". Feeds with two or
            fewer episodes are always downloaded completely.
        parent_folder_path: Optional directory in which the show folder is
            created; when omitted the folder path is just the show title.
    """
    def fetch_title_and_soup(url):
        """Fetch `url`; return (sanitized <title> text or "Unknown Title", parsed page)."""
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find("title") if response.status_code == 200 else None
        page_title = sanitize_filename(title_tag.text) if title_tag is not None else "Unknown Title"
        # An empty title would make the folder path empty and os.makedirs fail.
        return page_title or "Unknown Title", soup

    def episode_limit(total):
        """Return how many of the `total` available episodes should be downloaded."""
        if total <= 2 or test_n == "all":
            return total
        return min(int(test_n), total)

    def save_episode(name, mp3_url):
        """Download one episode into the show folder under its sanitized title."""
        download_mp3(mp3_url, folder_path, sanitize_filename(name) + ".mp3")

    if not newsurl:
        print("無法獲取網頁內容")
        return

    # Detect the hosting service from the hostname. Fixed string slices are
    # fragile: the former `newsurl[14:23] == "buzzsprout"` compared a
    # 9-character slice with a 10-character word and could never match.
    host = urlparse(newsurl).hostname or ""
    is_firstory = "firstory" in host
    is_buzzsprout = "buzzsprout" in host

    # Show title -> target folder (title falls back to "Unknown Title" when
    # the page cannot be fetched, so it is always defined).
    title, soup = fetch_title_and_soup(newsurl)
    folder_path = os.path.join(parent_folder_path, title) if parent_folder_path else title
    os.makedirs(folder_path, exist_ok=True)

    if is_firstory:
        # Firstory feeds are read straight from the already parsed markup.
        items = soup.find_all("item")
        n = len(items)
        print(title, "一共", n, "集")

        for i in range(episode_limit(n)):
            # Random pause between episodes to avoid hammering the server.
            time.sleep(random.randrange(1, 5))
            item = items[i]
            enclosure = item.find('enclosure')
            mp3_url = enclosure.get('url') if enclosure is not None else None
            title_tag = item.find('itunes:title')
            name = title_tag.text if title_tag is not None else f"Episode {i+1}"
            if mp3_url:
                save_episode(name, mp3_url)
            else:
                print(f"無法找到第 {i+1} 集的 MP3 URL")
        return

    feed = feedparser.parse(newsurl)
    n = len(feed.entries)
    # Buzzsprout feeds are announced with the feed's own title, all other
    # feeds with the title used for the folder.
    print(feed.feed.get('title', title) if is_buzzsprout else title, "一共", n, "集")

    for i in range(episode_limit(n)):
        time.sleep(random.randrange(1, 5))
        entry = feed.entries[i]
        name = entry.get('title') or f"Episode {i+1}"

        if is_buzzsprout:
            # Buzzsprout: take the first enclosure of the entry.
            enclosures = entry.get('enclosures') or []
            mp3_url = enclosures[0].get('href') if enclosures else None
            if not mp3_url:
                print(f"無法找到第 {i+1} 集的 MP3 URL")
                continue
        else:
            # Generic feeds: take the first link whose MIME type is audio/*.
            mp3_url = next(
                (link.get('href') for link in entry.get('links', [])
                 if (link.get('type') or '').startswith('audio/')),
                None,
            )
            if not mp3_url:
                print(f"Error fetching MP3 URL for episode {i+1}: No audio link found")
                continue

        save_episode(name, mp3_url)
                    
            
            
        
        
# Check whether a character is Chinese
def is_chinese(char):
    """Return True when `char` lies between U+4E00 and U+9FFF (inclusive), else False.

    The check is a plain string comparison against the two range bounds.
    """
    return '\u4e00' <= char <= '\u9fff'
    
def get_all_category():
    """Return the list of category slugs, starting with 'all'.

    Each display label is turned into a slug by replacing " / ", " & " and any
    remaining space with "-" (in that order) and lower-casing the result,
    e.g. 'Arts / Fashion & Beauty' becomes 'arts-fashion-beauty'.
    """
    display_labels = (
        'all',
        'Arts',
        'Arts / Books',
        'Arts / Design',
        'Arts / Fashion & Beauty',
        'Arts / Food',
        'Arts / Performing Arts',
        'Arts / Visual Arts',
        'Business',
        'Business / Careers',
        'Business / Entrepreneurship',
        'Business / Investing',
        'Business / Management',
        'Business / Marketing',
        'Business / Non-Profit',
        'Comedy',
        'Comedy / Comedy Interviews',
        'Comedy / Improv',
        'Comedy / Stand-Up',
        'Education',
        'Education / Courses',
        'Education / How To',
        'Education / Language Learning',
        'Education / Self-Improvement',
        'Fiction',
        'Fiction / Comedy Fiction',
        'Fiction / Drama',
        'Fiction / Science Fiction',
        'Government',
        'Health & Fitness',
        'Health & Fitness / Alternative Health',
        'Health & Fitness / Fitness',
        'Health & Fitness / Medicine',
        'Health & Fitness / Mental Health',
        'Health & Fitness / Nutrition',
        'Health & Fitness / Sexuality',
        'History',
        'Kids & Family',
        'Kids & Family / Education for Kids',
        'Kids & Family / Parenting',
        'Kids & Family / Pets & Animals',
        'Kids & Family / Stories for Kids',
        'Leisure',
        'Leisure / Animation & Manga',
        'Leisure / Automotive',
        'Leisure / Aviation',
        'Leisure / Crafts',
        'Leisure / Games',
        'Leisure / Hobbies',
        'Leisure / Home & Garden',
        'Leisure / Video Games',
        'Music',
        'Music / Music Commentary',
        'Music / Music History',
        'Music / Music Interviews',
        'News',
        'News / Business News',
        'News / Daily News',
        'News / Entertainment News',
        'News / News Commentary',
        'News / Politics',
        'News / Sports News',
        'News / Tech News',
        'Religion & Spirituality',
        'Religion & Spirituality / Buddhism',
        'Religion & Spirituality / Christianity',
        'Religion & Spirituality / Hinduism',
        'Religion & Spirituality / Islam',
        'Religion & Spirituality / Judaism',
        'Religion & Spirituality / Religion',
        'Religion & Spirituality / Spirituality',
        'Science',
        'Science / Astronomy',
        'Science / Chemistry',
        'Science / Earth Sciences',
        'Science / Life Sciences',
        'Science / Mathematics',
        'Science / Natural Sciences',
        'Science / Nature',
        'Science / Physics',
        'Science / Social Sciences',
        'Society & Culture',
        'Society & Culture / Documentary',
        'Society & Culture / Personal Journals',
        'Society & Culture / Philosophy',
        'Society & Culture / Places & Travel',
        'Society & Culture / Relationships',
        'Sports',
        'Sports / Baseball',
        'Sports / Basketball',
        'Sports / Cricket',
        'Sports / Fantasy Sports',
        'Sports / Football',
        'Sports / Golf',
        'Sports / Hockey',
        'Sports / Rugby',
        'Sports / Running',
        'Sports / Soccer',
        'Sports / Swimming',
        'Sports / Tennis',
        'Sports / Volleyball',
        'Sports / Wilderness',
        'Sports / Wrestling',
        'TV & Film',
        'TV & Film / After Shows',
        'TV & Film / Film History',
        'TV & Film / Film Interviews',
        'TV & Film / Film Reviews',
        'TV & Film / TV Reviews',
        'Technology',
        'True Crime',
    )

    def to_slug(label):
        """Turn one display label into its dash-separated, lower-case slug."""
        # Order matters: the multi-character separators must be handled
        # before the single remaining spaces.
        for separator in (" / ", " & ", " "):
            label = label.replace(separator, "-")
        return label.lower()

    return [to_slug(label) for label in display_labels]


def get_name_href(category_name):
    """Collect link URLs and link texts from a rephonic.com `apple/tw` chart page.

    Only anchors that have an href, a non-empty text and whose text passes
    `is_chinese` are kept.

    Args:
        category_name: Category slug appended to the chart URL.

    Returns:
        A tuple `(rank_list_herf, rank_list_name)` of two parallel lists:
        absolute URLs and the corresponding link texts. Both lists are empty
        when the page cannot be fetched.
    """
    rank_list_herf = []
    rank_list_name = []

    url = "https://rephonic.com/charts/apple/tw/" + category_name
    response = requests.get(url)

    if response.status_code != 200:
        print("無法獲取網頁內容")
        return rank_list_herf, rank_list_name

    soup = BeautifulSoup(response.content, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        text = link.get_text()
        # Anchors without an href cannot be followed (and used to raise a
        # TypeError on concatenation); empty or rejected texts are skipped.
        if not href or not text or not is_chinese(text):
            continue
        # urljoin prefixes site-relative links and leaves absolute ones intact.
        rank_list_herf.append(urljoin("https://rephonic.com", href))
        rank_list_name.append(text)

    return rank_list_herf, rank_list_name



def get_rss(url):
    """Extract the RSS URL of a podcast from the JSON metadata embedded in its page.

    The page is searched for text nodes containing "@context"; the first one
    that parses as a JSON object with an "identifier" key provides the result,
    which is also printed.

    Args:
        url: Address of the podcast page,
            e.g. "https://rephonic.com/podcasts/li-jing-lei-de-chen-jing-shi-jian".

    Returns:
        The value stored under "identifier", or None when the page cannot be
        fetched or no usable metadata is found (a message is printed).
    """
    response = requests.get(url)

    if response.status_code != 200:
        print("無法獲取網頁內容")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Try every candidate node instead of blindly trusting the first match,
    # so a missing or malformed block no longer crashes the crawl.
    for candidate in soup.find_all(string=re.compile("@context")):
        try:
            data = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(data, dict) and "identifier" in data:
            identifier_text = data["identifier"]
            print(identifier_text)
            return identifier_text

    print(f"找不到 RSS 網址: {url}")
    return None

#### 選擇要爬蟲的節目 (RSS)

In [None]:
# Directory in which the downloaded MP3 files are stored
folder_path = r"/media/starklab/BACKUP/Podcast_project/mp3//"

# Paste an RSS URL into get_rss_file(...) to download a show; a new folder is created for it.
# Show: 百靈果 (roughly the latest 100 episodes)
# get_rss_file("https://feeds.buzzsprout.com/1974862.rss","all")
# Show: 史塔克實驗室
# get_rss_file("https://feeds.soundon.fm/podcasts/e4f101be-289a-4101-bb11-59fc61e5c88b.xml","all")
# Show: 跳脫Do式圈
# get_rss_file("https://feeds.soundon.fm/podcasts/22505944-fec2-4417-b277-649ce5d3a491.xml","all")
# Show: 週報時光機 (not done yet)
# get_rss_file("https://feed.firstory.me/rss/user/ckcnhs4x0yuqw0918kkui2pjw","all",folder_path)
# Show: 寧可當吃貨
# get_rss_file("https://feed.firstory.me/rss/user/cklase4t37jae0872lm9x1xmv","all",folder_path)
# Show: 科技浪
# get_rss_file("https://feeds.soundon.fm/podcasts/03f4a53e-80cf-4a20-ad2c-bdb31a76c7b3.xml","all",folder_path)
# Show: 老高
# get_rss_file("https://anchor.fm/s/3ba51528/podcast/rss","all")
# Show: Joe & Jet 未過濾的 with Jason
# get_rss_file("https://feeds.soundon.fm/podcasts/78a91a6a-5c6b-43cc-aaac-7918f792e5ae.xml","all")


#### 沒有用到

In [None]:
def main(shows_per_category=5, episodes_per_show=3):
    """Crawl the chart categories and download a few episodes of each listed show.

    The user is asked how many categories to process (a number, or "all").
    For every processed category the chart links are collected, the RSS URL of
    each selected show is looked up and its episodes are downloaded.

    Args:
        shows_per_category: Number of shows to process per category, or "all".
        episodes_per_show: Number of episodes to download per show, or "all"
            (passed to `get_rss_file` as `test_n`).
    """
    # All category slugs, in chart order.
    category_name = get_all_category()

    # Ask how many categories should be processed.
    category_n = input("你想要跑多少種分類？ 輸入數字，或者all抓取全部").strip()
    if category_n == "all":
        category_n = len(category_name)
    else:
        category_n = int(category_n)
    # Clamp so an oversized or negative answer cannot index past the list.
    category_n = max(0, min(category_n, len(category_name)))

    for i in range(category_n):      # one iteration per category
        # Names and URLs of all shows listed for this category.
        rank_list_herf, rank_list_name = get_name_href(category_name[i])

        if shows_per_category == "all":
            rank_list_n = len(rank_list_herf)
        else:
            rank_list_n = int(shows_per_category)
        # A chart page may list fewer shows than requested (or none at all).
        rank_list_n = max(0, min(rank_list_n, len(rank_list_herf)))

        for j in range(rank_list_n):     # one iteration per show
            # Look up the RSS URL of the show.
            rss_url = get_rss(rank_list_herf[j])
            if not rss_url:
                # No RSS URL could be determined for this show; skip it.
                continue
            # Download the requested number of episodes ("all" for every episode).
            get_rss_file(rss_url, episodes_per_show)
# NOTE(review): the heading of this cell marks it as unused, yet main() still
# runs whenever the cell is executed — confirm this call is intended.
main()