In [None]:
"""
Python script for getting the definitions of 53,999 Irish words in order to make flash cards out of them. This code is intended to show
how this was done, rather than to be replicated.
"""

In [None]:
#import libraries
from bs4 import BeautifulSoup
import pandas as pd
import random
import requests
from string import ascii_lowercase
import time

In [None]:
#root url for the alphabetical index pages; a letter is appended to it to get that letter's page
#note: the name `root` is rebound to the bare site address in a later cell, so the index urls must be built before that cell runs
root = "https://www.teanglann.ie/en/fgb/_"

In [None]:
#build one index-page url per lowercase ascii letter by appending the letter to the root
alphabet = [root+letter for letter in ascii_lowercase]

In [None]:
#as each letter has its own page, every index url must be fetched in turn to obtain all the words
#soup_kitchen ends up with one entry per index page: the list of <span class="abcItem"> tags found on that page
soup_kitchen = []
for letter_url in alphabet:
    response = requests.get(letter_url, timeout=30) #a stalled connection raises instead of hanging forever
    response.raise_for_status() #fail loudly rather than silently losing a whole letter's words
    soup_kitchen.append(BeautifulSoup(response.text,'html.parser').find_all("span",{"class": "abcItem"}))

In [None]:
#flatten the per-letter results into a single list with one entry per word item;
#each entry is the list of <a> tags found inside that item (not the definitions themselves)
#iterating over soup_kitchen directly avoids repeating the number of index pages here
all_links = [
    element.find_all('a')
    for letter_items in soup_kitchen
    for element in letter_items
]

In [None]:
#the word is the display text of the first anchor of every word item
all_text = [anchors[0].text for anchors in all_links]

In [None]:
#new root for definitions pages: the value taken from each index anchor is appended to this site address
#note: this rebinds `root`, so re-running the index-url cell after this one would build different urls
root = "https://www.teanglann.ie"

In [None]:
#create a list of all definition urls: site root + the href of every anchor
#the href is read by name rather than as "the first attribute", so an extra attribute on the tag cannot be picked up by mistake
#presumably each href is a path starting with '/', since root has no trailing slash - TODO confirm
#NOTE(review): one url is produced per anchor, whereas the word list takes only the first anchor of each item;
#the two lists only line up if every item holds exactly one anchor
full_urls = [root+bit['href'] for link in all_links for bit in link]

In [None]:
#loop through all urls and scrape each url's word, grammatical features and first definition
#every url contributes exactly one element to final_results, in full_urls order: a list holding a single
#(title, grammar, definition) tuple, or an empty list when the page lacks any of the three
final_results = []
for link in full_urls:
    data = requests.get(link, timeout=30) #a stalled connection raises instead of hanging the whole run
    soup = BeautifulSoup(data.text, 'html.parser')
    titles = []
    grammars = []
    translations = []
    #flashcards cannot be too long, so only the first entry on the page is used; looking no further also
    #prevents a later entry's grammar/definition from being paired with the first entry's title
    first_title = soup.select_one("body div#envelope div#invelope .listings .fgb.entry .fgb.title")
    if first_title is not None:
        titles.append(first_title.text+' (1)')
        grammars = [grammar.text for grammar in first_title.parent.select(".fgb.g")]
        translations = [definition.text for definition in first_title.parent.select(".fgb.trans")]

    #zip stops at the shortest list, so at most one tuple (first grammar, first definition) is kept
    final_results.append(list(zip(titles,grammars,translations)))
    time.sleep(random.randint(1,5)*random.random()) #delay for politeness

In [None]:
#make a dataframe of the results with the scraped tuple in 'definition' and the index-page word in 'word'
#the original word is kept so definitions can be joined to audio files when the first definition's title does not perfectly match the word text
definitions = pd.DataFrame(final_results, columns=['definition']).assign(word=all_text)

In [None]:
#remove empty definitions: keep only the rows whose 'definition' value is present
#the column is named 'definition' when the frame is built; filtering on any other name raises a KeyError
defineds = definitions.loc[definitions['definition'].notna()]

In [None]:
#write the filtered table to an excel workbook: one sheet, no index column, blanks for missing values
defineds.to_excel(
    'definitions.xlsx',
    sheet_name='Sheet1',
    na_rep='',
    index=False,
)