In [61]:
from bs4 import BeautifulSoup
import requests, os, time, csv, random

In [62]:
class TranslationPair:
    """One vocabulary entry: original Dutch text, English translation, processed form.

    Attributes (all stored exactly as passed to the constructor):
        dutch_word          -- the Dutch word/phrase
        english_translation -- the English translation
        processed_vocab     -- the processed form of the Dutch word/phrase
    """

    def __init__(self, dutch_word, english_translation, processed_vocab):
        # No validation or normalisation happens here; values are kept unchanged.
        (
            self.dutch_word,
            self.english_translation,
            self.processed_vocab,
        ) = (dutch_word, english_translation, processed_vocab)

In [63]:
def get_wait_time():
    """Return a random float drawn uniformly between 1 and 7 (used as a delay for time.sleep)."""
    shortest, longest = 1, 7
    return random.uniform(shortest, longest)

In [67]:
# Read csv file (assumes Column 1 = Dutch word/phrase & Column 2 = English translation)
# remove article if noun; remove preposition if present; replace spaces with _ if multiple words

def process_vocab(vocab_file_path):
    """Read a two-column vocabulary CSV and derive a lookup word for every row.

    The file is read as UTF-8 (a leading BOM is tolerated). Column 1 is the
    Dutch word/phrase and column 2 the English translation.

    Args:
        vocab_file_path: Path of the CSV file to read.

    Returns:
        A list of TranslationPair objects, one per non-empty row, holding the
        original Dutch phrase, the English translation ('' if the row has no
        second column) and the processed lookup word.

    Raises:
        OSError: If the file cannot be opened.
    """
    processed_vocab_list = []

    with open(vocab_file_path, newline='', encoding='utf-8-sig') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            # csv.reader yields [] for blank lines; rows without a Dutch entry
            # carry nothing to look up, so skip both instead of crashing.
            if not row or not row[0].strip():
                continue

            dutch_phrase = row[0]
            english_phrase = row[1] if len(row) > 1 else ''

            processed_vocab_list.append(
                TranslationPair(dutch_phrase, english_phrase, _dutch_lookup_word(dutch_phrase))
            )

    return processed_vocab_list


def _dutch_lookup_word(dutch_phrase, leading_words=('de', 'het', 'zich')):
    """Reduce a Dutch phrase to the single token used for lookup.

    Steps:
      1. Drop a leading article/reflexive ('de', 'het', 'zich') when something follows it.
      2. Cut the phrase at the first parenthesised token, e.g. a '(van)' preposition hint.
      3. Join the remaining words with underscores.
    """
    tokens = dutch_phrase.split()

    # Only strip the article when it is not the whole phrase ("de" alone stays "de").
    if len(tokens) > 1 and tokens[0] in leading_words:
        tokens = tokens[1:]

    kept = []
    for token in tokens:
        if token.startswith('('):
            break
        kept.append(token)

    # Nothing usable before the parenthesis: fall back to the full phrase.
    if not kept:
        kept = dutch_phrase.split()

    return '_'.join(kept)
    
    

In [65]:
# Fetch audio files for each word from wiktionary page
# If multiple audio files are listed, the user is informed & the first is used.

def fetch_audio(session, word, save_folder, max_attempts=5, retry_delay=30):
    """Download the first pronunciation audio listed for ``word`` on nl.wiktionary.org.

    The word's page is fetched and searched for
    ``<span class="IPA unicode audiolink">`` elements. The first span that
    contains an ``<a class="internal">`` with an ``href`` is downloaded and
    saved as ``Nl-<word>.ogg`` in ``save_folder``. Progress and failures are
    reported with ``print``.

    Args:
        session: Object with a requests-style ``get(url)`` method.
        word: Page title to look up (also used in the saved file name).
        save_folder: Existing directory the audio file is written to.
        max_attempts: How many times the audio download is tried.
        retry_delay: Seconds to wait between failed download attempts.

    Returns:
        The saved file name on success, otherwise ``None``.
    """
    from urllib.parse import urljoin  # local import keeps this definition self-contained

    url = f"http://nl.wiktionary.org/wiki/{word}"
    response = session.get(url)

    if response.status_code != 200:
        print(f"Failed to fetch page for {word} (Status Code: {response.status_code})")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    spans = soup.find_all('span', class_="IPA unicode audiolink")
    if not spans:
        print(f"No audio links found for {word}")
        return None

    if len(spans) > 1:
        print(f'{word} has multiple sound files; downloading only the first. \n Check word listing for other files: {url}')

    file_name = f"Nl-{word}.ogg"  # matches default naming convention for manual file download
    file_path = os.path.join(save_folder, file_name)

    for span in spans:
        link = span.find('a', class_="internal")
        # .get avoids a KeyError when the anchor carries no href attribute.
        href = link.get('href') if link else None
        if not href:
            print(f'No href link found for {word}')
            continue

        # Resolves protocol-relative ("//host/..."), relative and absolute hrefs alike.
        audio_url = urljoin(url, href)

        # Wait between requests to hopefully prevent a 403 error.
        time.sleep(get_wait_time())

        for attempt in range(max_attempts):
            audio_response = session.get(audio_url)
            if audio_response.status_code == 200:
                # Open the file only once there is data, so a failed download
                # never leaves an empty .ogg behind.
                with open(file_path, 'wb') as f:
                    f.write(audio_response.content)
                print(f"Downloaded: {file_name} from {url}")
                return file_name

            print(f"Failed to download: {file_name} from    {url} dt {audio_response.status_code}; attempt = {attempt}")
            if attempt + 1 < max_attempts:
                time.sleep(retry_delay)  # no point waiting after the final attempt

        # Retries exhausted for the first usable link; later spans are not tried.
        return None

    print(f"No valid audio link found for {word}")
    return None

In [None]:
# hardcoded test file
vocab_file_path = r"C:\Users\wisery\Desktop\Words that need sound.csv"

# destination of the vocabulary list with sound tags added
output_file_path = r'C:\Users\wisery\Data Science Projects\Anki Deck\audio_added_vocab.csv'

# avoid bot flagging: reuse one session that sends a browser-like User-Agent
session = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
session.headers.update(headers)
time.sleep(get_wait_time())  # short random pause before the first request

# prepare save location (default Anki media storage location)
save_folder = r'C:\Users\wisery\AppData\Roaming\Anki2\User 1\collection.media'
# alternative folder for test runs:
# r"C:\Users\wisery\Data Science Projects\Anki Deck\test sound files"
os.makedirs(save_folder, exist_ok=True)

# prepare words: a list of TranslationPair objects (original Dutch, English, processed word)
words = process_vocab(vocab_file_path)
print('All vocab processed.')

# get the audio files for each word
# NOTE: keyed by the finished Dutch field, so a Dutch entry that occurs twice
# keeps only the translation seen last.
write_data = {}
for word in words:
    audio_file_name = fetch_audio(session, word.processed_vocab, save_folder)

    # append a sound tag only when the download succeeded
    if audio_file_name:
        dutch_complete = f'{word.dutch_word} \n[sound:{audio_file_name}]'
    else:
        dutch_complete = word.dutch_word

    write_data[dutch_complete] = f'{word.english_translation}'

print('Sound files collected; writing to file.')

# The input is read as UTF-8, so write UTF-8 explicitly; the platform default
# encoding may be unable to represent every character.
with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(write_data.items())

print('File complete.')

