In [12]:
"""
Context: in Irish there is an oft-repeated spelling rule that any 
consonant or consonant cluster flanked by a broad vowel (a,o,u) must
have a broad vowel on the other side, and likewise with slender vowels
(e,i). Mixing broad and slender is forbidden, but how forbidden is it
*really*? This script checks for violations of this rule in a list of
almost 54000 Irish words pulled from an online Irish dictionary.
"""

'\nContext: in Irish there is an oft-repeated spelling rule that any \nconsonant or consonant cluster flanked by a broad vowel (a,o,u) must\nhave a broad vowel on the other side, and likewise with slender vowels\n(e,i). Mixing broad and slender is forbidden, but how forbidden is it\n*really*? This script checks for violations of this rule in a list of\nalmost 54000 Irish words pulled from an online Irish dictionary.\n'

In [36]:
#import libraries
from bs4 import BeautifulSoup
from itertools import compress
import os
import requests
import re
import time
from string import ascii_lowercase

In [33]:
#root url
root = "https://www.teanglann.ie/en/fgb/_"

In [34]:
#get a list of all the letter links on the dictionary website
alphabet = []
for c in ascii_lowercase:
    alphabet.append(root+c)

In [37]:
#cycles through the letter links and scrapes their html into a list
soup_kitchen = []
for element in range(0,26):
    soup_kitchen.append(BeautifulSoup(requests.get(alphabet[element]).text,'html.parser').find_all("span",{"class": "abcItem"}))
    time.sleep(2)

In [40]:
#initialise the list of links
all_links = []

In [41]:
#fill the list of links with the word definitions
for i in range(0,26):
    for element in soup_kitchen[i]:
        all_links.append(element.find_all('a'))
print(all_links[0:5])

[[<a href="/en/fgb/A">A</a>], [<a href="/en/fgb/%c3%a1">á</a>], [<a href="/en/fgb/ab">ab</a>], [<a href="/en/fgb/aba">aba</a>], [<a href="/en/fgb/abab%c3%ba">ababú</a>]]


In [42]:
#get the word from each link
all_text = []
for link in all_links:
    all_text.append(link[0].text)

In [57]:
#create a list of words turned into simplified strings
simplified_words = []

In [58]:
#replaces all the letters in a word with stand-in letters that represent the three kinds of letters
for word in all_text:
    word = word.translate({ord(ch):'c' for ch in 'bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ'})
    word = word.translate({ord(ch):'b' for ch in 'aouáóúAOUÁÓÚ'})
    word = word.translate({ord(ch):'s' for ch in 'ieíéÍÉ'})
    simplified_words.append(word)

In [52]:
#remove repeat letters, as these are not relevant to determining if the rule applies
class Solution:
   def solve(self, s):
      seen = s[0]
      ans = s[0]
      for i in s[1:]:
         if i != seen:
            ans += i
            seen = i
      return ans
repeat_kill = Solution()

In [59]:
#remove repeated character types
unduped_list = []
for word in simplified_words:
    word = repeat_kill.solve(word)
    unduped_list.append(word)

['b', 'b', 'bc', 'bcb', 'bcbcb', 'bcbcbc', 'bcbcc', 'bcbccc', 'bcbcc', 'bcbsc', 'bcbscc', 'bcbscc', 'bcbsc', 'bcbsc', 'bcbscc', 'bcbsccsbcc', 'bcbccb', 'bcbccbccc', 'bcbc', 'bcbcbcc', 'bcbccbcc', 'bcbcccb', 'bcbcccbccc', 'bcccbscs', 'bccbcccbcc', 'bccb', 'bccb', 'bccbc', 'bccbcc', 'bccbccc', 'bccbcccbcc', 'bccbsc', 'bccbscs', 'bccbscc', 'bccbsccsbcc', 'bccbsccs', 'bccbscc', 'bccbscsc', 'bccbscsc', 'bccbc', 'bccbcccssc', 'bccbcc', 'bccbccccbcc', 'bccbcccbc', 'bccbcc', 'bccbcc', 'bccbccbcc', 'bccbccb', 'bccbcccbcc', 'bccbc', 'bccbcbccbc', 'bccbcbs', 'bccbcbsbcc', 'bccbccbsc', 'bccbcccb', 'bccbcccbccc', 'bcccbcc', 'bcccbscs', 'bcccbscsbccc', 'bcccbsc', 'bcccb', 'bcccbcc', 'bcccbccb', 'bcccbcc', 'bcccbsc', 'bcccbscsbccc', 'bcccbcc', 'bcccb', 'bccb', 'bccbc', 'bccbcbcc', 'bccbcbccc', 'bccbcbs', 'bccbscsbc', 'bcccb', 'bcccbscc', 'bcccbsbccc', 'bcccbscsbcc', 'bcccbc', 'bcccbc', 'bcccb', 'bccbc', 'bccbcc', 'bccbsc', 'bccbcbs', 'bcbcbscc', 'bccbccb', 'bccbccc', 'bccbcc', 'bccbsbcc', 'bccbsbcc',

In [60]:
#now check for violations of the rule and create a boolean list of whether the rule has been violated
bool_list = []
for word in range(0,len(unduped_list)):
    bool_list.append(bool(re.search((r".bcs|scb."),unduped_list[word])))

In [61]:
#to find out the total number that violate rule
print("the total number of words that violate 'slender with slender and broad with broad' is: "+str(sum(bool_list)))

the total number of words that violate 'slender with slender and broad with broad' is: 4000


In [62]:
#using the boolean index to subset the original list
violators = list(compress(all_text, bool_list))

In [63]:
#save a list of all the words that violate the rule
with open(os.getcwd()+r'\violators.doc', 'w') as f:
    for item in violators:
        f.write(item+', ')