<a href="https://colab.research.google.com/github/drew-chien/dictionary-crawler/blob/main/dictionary_crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install required packages for one-time use

In [None]:
# Install python-docx, the package imported as `docx` by the data-processing
# cell to write the results to a Word document.
!pip install python-docx
# Print the installed package's metadata to confirm the install succeeded.
!pip show python-docx

# Install required packages permanently

In [None]:
import os, sys
from google.colab import drive
# Create a symbolic link with no whitespace to the Colab notebooks, and put that in PATH.
drive.mount('/content/drive')
softlink = '/content/notebooks_link'
os.symlink('/content/drive/My Drive/Colab Notebooks', softlink)
sys.path.insert(0,softlink)

!pip install --target=$softlink python-docx
!pip show python-docx

# Enter the words to look up

In [9]:
# Words to look up; the data-processing cell iterates over this list in order.
words = [
  'fad',
  'deficient',
  'oesophagus',
  'equator',
]
print(words)

['fad', 'deficient', 'oesophagus', 'equator']


# Data processing

In [10]:
import requests
from bs4 import BeautifulSoup
import re
import docx
from google.colab import drive

class Definition():
  """One sense of a word, extracted from a parsed dictionary page.

  Attributes:
    meaning: Text of the definition header element given to the constructor.
    chinese: Text of the first ``<span lang="zh-Hant">`` found in the element
      that follows the header, or '' when there is no such span.
    examples: Text of every ``<div class="examp">`` in that following element,
      each with a single trailing newline removed. Empty when the header has
      no following element.
  """

  def __init__(self, meaning):
    """Read the definition text, its translation and its example sentences.

    Progress is printed as each piece is extracted.

    Args:
      meaning: Parsed element holding the definition text. It must expose
        ``.text`` and ``.next_sibling``; the sibling is searched with
        ``find`` / ``find_all``.
    """
    self.meaning = meaning.text
    print('meaning: ' + self.meaning)

    # The translation and the examples live in the element after the header.
    # Text nodes are str subclasses in BeautifulSoup and cannot be searched
    # like tags, so step over them; the header may also have no sibling.
    body = meaning.next_sibling
    while isinstance(body, str):
      body = getattr(body, 'next_sibling', None)

    # chinese
    translation = None
    if body is not None:
      translation = body.find('span', {'lang': 'zh-Hant'})
    # Not every sense carries a translation; fall back to an empty string
    # instead of failing on None.
    self.chinese = translation.text if translation is not None else ''
    print('chinese: ' + self.chinese)

    # examples
    self.examples = []
    if body is not None:
      self.examples = [
          # Drop only the final newline; inner newlines separate the English
          # sentence from its translation and are kept.
          re.sub(r'\n$', '', example.text)
          for example in body.find_all('div', {'class': 'examp'})
      ]
    print(self.examples)


class WordData():
  """Dictionary entry for one word, scraped from the Cambridge
  English-Chinese (Traditional) dictionary.

  Attributes:
    title: Page title with everything from the first ' |' onwards removed.
    pronunciations: Text of every ``<span class="pron">`` on the page, in
      page order (duplicates are kept).
    definitions: One ``Definition`` per ``<div class="ddef_h">`` on the page.
  """

  def __init__(self, word):
    """Download and parse the dictionary page for ``word``.

    Progress is printed as each piece is extracted.

    Args:
      word: The word to look up; it is appended to the dictionary URL as is.

    Raises:
      requests.exceptions.RequestException: If the page cannot be fetched,
        including when the request times out.
    """
    # Make the request to a url
    r = requests.get(
        'https://dictionary.cambridge.org/dictionary/english-chinese-traditional/' + word,
        headers={'user-agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0'},
        # Without a timeout a stalled connection blocks the notebook forever.
        timeout=30)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever one
    # is installed, warns about it, and may parse differently per machine.
    soup = BeautifulSoup(r.content, 'html.parser')

    # get the title
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else ''
    print('raw title: ' + title)
    # Keep only the part before ' |' (the site appends a description there).
    self.title = re.sub(r' \|.*', '', title)
    print('clean title: ' + self.title)

    # get the pronunciation
    self.pronunciations = []
    for pron in soup.find_all('span', {'class': 'pron'}):
      print('pron: ' + pron.text)
      self.pronunciations.append(pron.text)
    print(self.pronunciations)

    # get the meaning
    self.definitions = [
        Definition(meaning)
        for meaning in soup.find_all('div', {'class': 'ddef_h'})
    ]

  def ToDoc(self, doc):
    """Append this entry to ``doc`` as a nested bullet list.

    The title is a 'List Bullet' paragraph followed, on a new line, by the
    comma-separated pronunciations. Each definition is a 'List Bullet 2'
    paragraph followed, on a new line, by its translation. Each example is
    its own 'List Bullet 3' paragraph.

    Args:
      doc: Document object providing ``add_paragraph(text, style=...)`` whose
        result provides ``add_run(text)``.
    """
    p = doc.add_paragraph(self.title, style='List Bullet')
    # Only break the line when something follows, so an entry without
    # pronunciations does not end in an empty line.
    if self.pronunciations:
      p.add_run('\n')
    for index, pron in enumerate(self.pronunciations):
      if index:
        p.add_run(', ')
      p.add_run(pron)

    for definition in self.definitions:
      p = doc.add_paragraph(definition.meaning, style='List Bullet 2')
      # Same rule for the translation line.
      if definition.chinese:
        p.add_run('\n')
        p.add_run(definition.chinese)
      for example in definition.examples:
        doc.add_paragraph(example, style='List Bullet 3')

# processing: look up every entry of `words` and collect the results in a
# single Word document
doc = docx.Document()
for word in words:
  # Constructing WordData downloads and parses the dictionary page for `word`.
  wordData = WordData(word)
  # Append the title, pronunciations, definitions and examples to the document.
  wordData.ToDoc(doc)

# mount the drive and then output the file to the drive
drive.mount('/content/drive')
doc.save('/content/drive/MyDrive/helloWorld.docx')

raw title: fad | translate to Traditional Chinese: Cambridge Dictionary
clean title: fad
pron: /fæd/
pron: /fæd/
['/fæd/', '/fæd/']
meaning:  a style, activity, or interest that is very popular for a short period of time 
chinese: 一時的風尚；短暫的狂熱
[' the latest health fad\n最新健康時尚', 'UK There was a fad for wearing ripped jeans a few years ago.\n幾年前有一陣子流行穿有洞的牛仔褲。']
raw title: deficient | translate to Traditional Chinese: Cambridge Dictionary
clean title: deficient
pron: /dɪˈfɪʃ.ənt/
pron: /dɪˈfɪʃ.ənt/
['/dɪˈfɪʃ.ənt/', '/dɪˈfɪʃ.ənt/']
meaning:  not having enough of 
chinese: 缺乏的，缺少的
[' A diet deficient in vitamin D may cause the disease rickets.\n飲食中缺乏維他命D可導致佝僂病。']
meaning:  not good enough 
chinese: 不夠的，不足的
[' His theory is deficient in several respects.\n他的理論有幾方面的不足。']
raw title: oesophagus | translate to Traditional Chinese: Cambridge Dictionary
clean title: oesophagus
pron: /ɪˈsɒf.ə.ɡəs/
pron: /ɪˈsɑː.fə.ɡəs/
['/ɪˈsɒf.ə.ɡəs/', '/ɪˈsɑː.fə.ɡəs/']
meaning:  the tube in the body that takes food