In [1]:
from bs4 import BeautifulSoup
import os
import pandas as pd
import requests
import re
import bs4
import unicodedata

In [2]:
#make directory if it doesn't exist
folder_name = 'jlpt_vocab'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

### Testing on JLPT N3 part 1 vocab

creating beautiful soup object

In [3]:
url = 'https://www.kanshudo.com/collections/wikipedia_jlpt/WPJLPT-N3-1'
response = requests.get(url)
response.raise_for_status()

HTTPError: 403 Client Error: Forbidden for url: https://www.kanshudo.com/collections/wikipedia_jlpt/WPJLPT-N3-1

#### bypassing the 403 error
1. Go to the webpage
2. bring up the developer view
3. Go to network tab
4. Make a request by clicking on an object in the webpage
5. Select 1 of the requests
6. Go to headers
7. Copy the information in Request Headers under User-Agent
8. Create a dictionary with information as seen below

In [4]:
headers = {'User-Agent':
           'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
          }

Trying again

In [5]:
url = 'https://www.kanshudo.com/collections/wikipedia_jlpt/WPJLPT-N3-1'
response = requests.get(url, headers=headers)
response.raise_for_status()

Since there is no error, the header param worked

In [6]:
soup = BeautifulSoup(response.content, 'lxml')
soup

<!DOCTYPE html>
<html lang="ja" xml:lang="ja" xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Wikipedia JLPT Vocab - Kanshudo</title>
<meta charset="utf-8"/>
<script>window.NREUM||(NREUM={});NREUM.info={"beacon":"bam-cell.nr-data.net","errorBeacon":"bam-cell.nr-data.net","licenseKey":"23e93f0784","applicationID":"5958078","transactionName":"dlZaRhEKW10AR0sFC1lVUVEXDFhfFhoTDw9cSVFWCgRoWwlFEA==","queueTime":5,"applicationTime":498,"agent":""}</script>
<script>(window.NREUM||(NREUM={})).loader_config={licenseKey:"23e93f0784",applicationID:"5958078"};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var i=e[n]={exports:{}};t[n][0].call(i.exports,function(e){var i=t[n][1][e];return r(i||e)},i,i.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<n.length;i++)r(n[i]);return r}({1:[function(t,e,n){function r(){}function i(t,e,n){return function(){return o(t,[u.now()].concat(f(arguments)),e?null:this,n),e?void 0:this

### Finding the HTML elements for words

the html element for the entire section for each work looks like:  
`<div id="jukugo_119971" class="jukugorow first last">...</div>`
where the numerals after "jukugo_" are unique to each word

the kanji and furigana for the word is nested in the above element  
`<span class="f_container noflip" id="jk_jk_119971_fc">  
    <div class="furigana" id="jk_jk_119971_f">にんき</div>  
    <div class="f_kanji" id="jk_jk_119971_fk">人気</div></span>`  

# Output

- 2 __tsv__ files
    - vocab list
    - example sentence list
    - vocab list of words that don't have example sentences

### Dataframe 1 - vocab list

For each group of 100 words per JLPT level

TSV has 3 columns:
- [vocab word column] vocab words in kanji
- [definition column] furigana enclosed by japanese brackets 「」+ colon + 1-3 definitions (separated by semi-colon)
- [tag column] tag that specifies the jlpt level (ex. n4_vocab)

#### defining the vocab list dataframe structure

In [7]:
vocab_list_structure = []
vocab_list_structure.append({'vocab_word':vocab_word,
                             'definition':definition,
                              'tag':tag})
vocab_list_df = pd.DataFrame(vocab_list_structure, columns = ['vocab_word', 'definition', 'tag'])

NameError: name 'vocab_word' is not defined

#### creating a list of all jukugo words/ids
this list will be the basis of the for loop that gathers all the individual components

In [8]:
word_id_list = []
for i in soup.find_all('div',id=re.compile('^jukugo_')):
    id_string = i.get('id')
    word_id_list.append(id_string)

In [9]:
word_id_list

['jukugo_119971',
 'jukugo_125556',
 'jukugo_134546',
 'jukugo_140632',
 'jukugo_261135',
 'jukugo_118940',
 'jukugo_103899',
 'jukugo_106019',
 'jukugo_105810',
 'jukugo_109942',
 'jukugo_132830',
 'jukugo_102904',
 'jukugo_128935',
 'jukugo_114053',
 'jukugo_105620',
 'jukugo_102750',
 'jukugo_118065',
 'jukugo_135887',
 'jukugo_125194',
 'jukugo_110048',
 'jukugo_126617',
 'jukugo_105145',
 'jukugo_104347',
 'jukugo_100476',
 'jukugo_105752',
 'jukugo_109216',
 'jukugo_124321',
 'jukugo_274529',
 'jukugo_131104',
 'jukugo_268405',
 'jukugo_110721',
 'jukugo_127768',
 'jukugo_261298',
 'jukugo_117044',
 'jukugo_113250',
 'jukugo_116421',
 'jukugo_115217',
 'jukugo_109070',
 'jukugo_278504',
 'jukugo_118954',
 'jukugo_131405',
 'jukugo_124359',
 'jukugo_104886',
 'jukugo_104587',
 'jukugo_104884',
 'jukugo_112853',
 'jukugo_117808',
 'jukugo_133411',
 'jukugo_126193',
 'jukugo_132035',
 'jukugo_125259',
 'jukugo_125518',
 'jukugo_117855',
 'jukugo_101214',
 'jukugo_264115',
 'jukugo_1

#### within an id element, isolate kanji vocab word

In [10]:
soup.find(id=word_id_list[0]).find('div', class_='f_kanji').contents[0]

'人気'

#### within an id element, isolate hiragana

In [11]:
soup.find(id=word_id_list[0]).find('div', class_='furigana').contents[0]

'にんき'

modify furigana to include 「」and :

In [12]:
furi = soup.find(id=word_id_list[0]).find('div', class_='furigana').contents[0]
furi_brak = '「'+furi+'」'+':'+' '
furi_brak

'「にんき」: '

#### grabbing definitions

###### determining if a word hase 1 definition only or 1+ definitions

one definition word
id = jukugo_119971

1+ definition word
id = jukugo_125556

__definitions are found under this element__

\<div class="jr_details" id='jk_details_119991' style='display: block;">

defintions themselves are in \<div class="vm">

###### one definition word

In [13]:
soup.find(id=word_id_list[0]).find('div', class_='vm').contents[-1]

'popularity; popular feeling; business conditions'

In [14]:
num_defs = len(soup.find(id=word_id_list[0]).find_all('div', class_='vm'))

In [15]:
soup.find(id=word_id_list[0]).find_all('div', class_='vm')

[<div class="vm"><div><span>noun, <a href="/grammar/id/699">'no' adjective</a>, <a href="/grammar/id/451">'na' adjective</a></span></div>popularity; popular feeling; business conditions</div>]

###### 1+ defintion word

In [16]:
soup.find(id='jukugo_125556').find_all('div',class_='vm')

[<div class="vm"><div><span>noun, <a href="/grammar/id/699">'no' adjective</a></span></div><span class="vm_id">1. </span>center; centre; middle; heart; core; focus; pivot; emphasis; balance</div>,
 <div class="vm"><div><span>suffix</span></div><span class="vm_id">2. </span>-centered; -centred; -focussed; -oriented; centered on; focussed on</div>]

In [17]:
for i in soup.find(id='jukugo_125556').find_all('div',class_='vm'):
    print(i.contents[-1])

center; centre; middle; heart; core; focus; pivot; emphasis; balance
-centered; -centred; -focussed; -oriented; centered on; focussed on


##### checking to see if loop works for both 1 only and 1+ def words

In [18]:
# 1 def word
for i in soup.find(id=word_id_list[0]).find_all('div',class_='vm'):
    print(i.contents[-1])

popularity; popular feeling; business conditions


#### handling definitions

In [19]:
#creating a list with defs
def_list = []
for i in soup.find(id='jukugo_125556').find_all('div',class_='vm'):
    definition = i.contents[-1]
    def_list.append(definition)

In [20]:
def_list

['center; centre; middle; heart; core; focus; pivot; emphasis; balance',
 '-centered; -centred; -focussed; -oriented; centered on; focussed on']

In [21]:
#joining def list as one string
'; '.join(def_list)

all_def_string = '; '.join(def_list)
all_def_string

'center; centre; middle; heart; core; focus; pivot; emphasis; balance; -centered; -centred; -focussed; -oriented; centered on; focussed on'

In [22]:
all_def_string.split(';')

['center',
 ' centre',
 ' middle',
 ' heart',
 ' core',
 ' focus',
 ' pivot',
 ' emphasis',
 ' balance',
 ' -centered',
 ' -centred',
 ' -focussed',
 ' -oriented',
 ' centered on',
 ' focussed on']

In [23]:
split_defs = all_def_string.split(';')
only_three = split_defs[:3]
only_three

['center', ' centre', ' middle']

In [24]:
';'.join(only_three)

'center; centre; middle'

In [25]:
definition_string = ';'.join(only_three)

#joining furigana and def string
furi_brak + definition_string

'「にんき」: center; centre; middle'

#### constructing tag

In [26]:
re.findall(r'N\d',url)

['N3']

In [27]:
tag = re.findall(r'N\d',url)[0].lower()+'_vocab'
tag

'n3_vocab'

## Constructing vocab list dataframe - putting it all together

In [29]:
#getting list of ids

#building dataframe  
vocab_list_structure = []
for count, word in enumerate(word_id_list):
    print(word)  ###########for debugging################
    print(count)   ###########for debugging################
    #determining if kanji or kana word
    if soup.find(id=word).find('div', class_='f_kanji'):
        if len(soup.find('a', id='jk'+word[word.find('_'):]).contents) > 1:#kanji and kana
            kanji = soup.find(id=word).find('div', class_='f_kanji').contents[0]
            kana = [i for i in soup.find('a', id='jk'+word[word.find('_'):]).contents if type(i) is bs4.element.NavigableString]
            kana = ''.join(kana)
            vocab_word = kanji+kana
            print(vocab_word)###########for debugging################
            furi = soup.find(id=word).find('div', class_='furigana').contents[0]
            furi_brak = '「'+furi+kana+'」'+':'+' '
        else:#kanji only
            vocab_word = soup.find(id=word).find('div', class_='f_kanji').contents[0]
            print(vocab_word)###########for debugging################
            #furigana
            furi = soup.find(id=word).find('div', class_='furigana').contents[0]
            furi_brak = '「'+furi+'」'+':'+' '
    else:#kana only
        vocab_word = soup.find('a', id='jk'+word[word.find('_'):]).contents[0]
        print(vocab_word)###########for debugging################
    '''
    if soup.find(id=word).find('div', class_='f_kanji'): #kanji
        vocab_word = soup.find(id=word).find('div', class_='f_kanji').contents[0]
        print(vocab_word) ###########for debugging################
        #furigana
        furi = soup.find(id=word).find('div', class_='furigana').contents[0]
        furi_brak = '「'+furi+'」'+':'+' '
        print(furi_brak) ###########for debugging###################
    else: #kana
        vocab_word = soup.find('a', id='jk'+word[word.find('_'):]).contents[0]
        print(vocab_word) ###########for debugging################
    '''
    #getting definition
    #creates list of definitions
    def_list = []
    #checks the 
    for i in soup.find(id=word).find_all('div', class_='vm'):
        for obj in i:
            if type(obj) is bs4.element.NavigableString:
                def_list.append(obj)
                all_def_string = '; '.join(def_list)
                split_defs = all_def_string.split(';')
                only_three = split_defs[:3]
                definition_string = ';'.join(only_three)
                if soup.find(id=word).find('div', class_='f_kanji'):
                    definition = furi_brak+definition_string
                else:
                    definition = definition_string
    definition=unicodedata.normalize('NFKD',definition).strip()
    print(definition) ###########for debugging###################
                           
    #getting tag
    tag = re.findall(r'N\d',url)[0].lower()+'_vocab'                 

    #creating dataframe structure
    vocab_list_structure.append({'vocab_word':vocab_word,
                                 'definition':definition,
                                  'tag':tag})
#building dataframe                           
vocab_list_df = pd.DataFrame(vocab_list_structure, columns = ['vocab_word', 'definition', 'tag'])

jukugo_119971
0
人気
「にんき」: popularity; popular feeling; business conditions
jukugo_125556
1
中心
「ちゅうしん」: center; centre; middle
jukugo_134546
2
方法
「ほうほう」: method; process; manner
jukugo_140632
3
全国
「ぜんこく」: countrywide; nationwide; whole country
jukugo_261135
4
関連
「かんれん」: relation; connection; relevance
jukugo_118940
5
情報
「じょうほう」: information; news; intelligence
jukugo_103899
6
海外
「かいがい」: foreign; abroad; overseas
jukugo_106019
7
記事
「きじ」: article; news story; report
jukugo_105810
8
機能
「きのう」: function; facility; faculty
jukugo_109942
9
現在
「げんざい」: now; current; present
jukugo_132830
10
部分
「ぶぶん」: portion; section; part
jukugo_102904
11
可能
「かのう」: potential; possible; practicable
jukugo_128935
12
内容
「ないよう」: subject; contents; matter
jukugo_114053
13
使用
「しよう」: use; application; employment
jukugo_105620
14
基本
「きほん」: foundation; basis; standard
jukugo_102750
15
価格
「かかく」: price; value; cost
jukugo_118065
16
商品
「しょうひん」: commodity; article of commerce; goods
jukugo_135887
17
無料
「むりょう」: free;

### breaking at 無料
無料 is jukugo_135887

__need to develop a way to account for words that break__
use try: except: statements to throw broken words and errors into a separate dataframe, make sure to id the part that broke and 

##### 無料 broke at the definition --> investigations

In [None]:
### checking definition list
debug_def_list = []
for i in soup.find(id='jukugo_135887').find_all('div',class_='vm'):
    definition = i.contents[-1]
    debug_def_list.append(definition)
debug_def_list

*unexpected behavior*
grab all under div element, 'vm' class

In [None]:
soup.find(id='jukugo_135887').find_all('div',class_='vm')

In [None]:
for i in soup.find(id='jukugo_135887').find_all('div',class_='vm'):
    print(i'\n')

### different types of definition storages

###### "buried" definition - only 1

In [None]:
import bs4
for i in soup.find(id='jukugo_135887').find('div',class_='vm').contents:
    print(type(i), i, type(i) is bs4.element.NavigableString)

In [None]:
soup.find(id='jukugo_135887').find_all('div',class_='vm')

In [None]:
len(soup.find(id='jukugo_135887').find_all('div',class_='vm'))

In [None]:
len(soup.find(id='jukugo_135887').find('div',class_='vm').contents)

###### "not buried" - multiple definitions 

In [None]:
soup.find(id='jukugo_125556').find_all('div',class_='vm')

In [None]:
t = soup.find(id='jukugo_125556').find_all('div',class_='vm')

In [None]:
len(soup.find(id='jukugo_125556').find_all('div',class_='vm'))

In [None]:
definition = t.contents[-1]
definition

###### "not buried" - only 1

In [None]:
soup.find(id='jukugo_119971').find_all('div',class_='vm')

In [None]:
len(soup.find(id='jukugo_119971').find_all('div',class_='vm'))

In [None]:
len(soup.find(id='jukugo_119971').find('div',class_='vm').contents)

In [None]:
soup.find(id='jukugo_119971').find('div',class_='vm').contents

In [None]:
for i in soup.find(id='jukugo_119971').find('div',class_='vm').contents:
    print(type(i), i, type(i) is bs4.element.NavigableString)

~# NOTES FOR NEXT SESSION - 07/11/2021~
# NOTES FOR NEXT SESSION - 07/12/2021
1. need to find a way to deal with the different definition storage types above  

  ~NEED TO TEST THE CODE WRITTEN IN SUBSEQUENT CELL TO SEE IF ACCOUNTS FOR ABOVE DEF TYPES~
 
     ~LOOKS LIKE IT WORKS try rerunning code~
     
     ~BREAK at 一般 「いっぱん」:~
   
    
  ~fixed break for definitions, new break for dealing with non-kanji words
        - remember to account for if kanji, if hiragana, if katakana~
        
  ~- found issue with 考え because it is kanji+ kana
        - need to debug jukugo_111613~
        
      
    07/19/2021 fixed kanji+kana word, need to now create a tsv file from dataframe
  
2. need to add to the main loop a "break" protocol
    - use try-excepts to account for unexpected breakages
        - if a part of the word breaks, send to a data frame, include the following
            - the word
            - the jukugo id
            - the part that broke

In [None]:
##psuedo code for 1.

#creates list of definitions
def_list = []
#checks the 
if len(soup.find(id=word).find_all('div',class_='vm')) > 1:
    for i in soup.find(id=word).find_all('div',class_='vm'):
        definition = i.contents[-1]
        def_list.append(definition)
        all_def_string = '; '.join(def_list)
        split_defs = all_def_string.split(';')
        only_three = split_defs[:3]
        definition_string = ';'.join(only_three)
        definition = furi_brak+definition_string
else:
    for el in soup.find(id=word).find('div',class_='vm').contents:
        if type(el) is bs4.element.NavigableString:
            def_list.append(el)
            all_def_string = '; '.join(def_list)
            split_defs = all_def_string.split(';')
            only_three = split_defs[:3]
            definition_string = ';'.join(only_three)
            definition = furi_brak+definition_string

##### checking situation 1 - "buried" definition - only 1 (jukugo_135887)

In [None]:
#checking situation 1 - "buried" definition - only 1 (jukugo_135887)
word = 'jukugo_135887'
#creates list of definitions
def_list = []
#checks the 
if len(soup.find(id=word).find_all('div',class_='vm')) > 1:
    for i in soup.find(id=word).find_all('div',class_='vm'):
        definition = i.contents[-1]
        def_list.append(definition)
        all_def_string = '; '.join(def_list)
        split_defs = all_def_string.split(';')
        only_three = split_defs[:3]
        definition_string = ';'.join(only_three)
        definition = furi_brak+definition_string
else:
    for el in soup.find(id=word).find('div',class_='vm').contents:
        if type(el) is bs4.element.NavigableString:
            def_list.append(el)
            all_def_string = '; '.join(def_list)
            split_defs = all_def_string.split(';')
            only_three = split_defs[:3]
            definition_string = ';'.join(only_three)
            definition = furi_brak+definition_string
definition

In [None]:
#normalizing unicode
unicodedata.normalize('NFKD',definition).strip()

###### checking situation 2 - "not buried" - multiple definitions ('jukugo_125556')

In [None]:
# checking situation 2 - "not buried" - multiple definitions ('jukugo_125556')
word = 'jukugo_125556'
#creates list of definitions
def_list = []
#checks the 
if len(soup.find(id=word).find_all('div',class_='vm')) > 1:
    for i in soup.find(id=word).find_all('div',class_='vm'):
        definition = i.contents[-1]
        def_list.append(definition)
        all_def_string = '; '.join(def_list)
        split_defs = all_def_string.split(';')
        only_three = split_defs[:3]
        definition_string = ';'.join(only_three)
        definition = furi_brak+definition_string
else:
    for el in soup.find(id=word).find('div',class_='vm').contents:
        if type(el) is bs4.element.NavigableString:
            def_list.append(el)
            all_def_string = '; '.join(def_list)
            split_defs = all_def_string.split(';')
            only_three = split_defs[:3]
            definition_string = ';'.join(only_three)
            definition = furi_brak+definition_string
definition
definition=unicodedata.normalize('NFKD',definition).strip()
definition

In [None]:
def_list

###### checking situation 3 - "not buried" - only 1 ('jukugo_119971')

In [None]:
#checking situation 3 - "not buried" - only 1 ('jukugo_119971')
word = 'jukugo_119971'
#creates list of definitions
def_list = []
#checks the 
if len(soup.find(id=word).find_all('div',class_='vm')) > 1:
    for i in soup.find(id=word).find_all('div',class_='vm'):
        definition = i.contents[-1]
        def_list.append(definition)
        all_def_string = '; '.join(def_list)
        split_defs = all_def_string.split(';')
        only_three = split_defs[:3]
        definition_string = ';'.join(only_three)
        definition = furi_brak+definition_string
else:
    for el in soup.find(id=word).find('div',class_='vm').contents:
        if type(el) is bs4.element.NavigableString:
            def_list.append(el)
            all_def_string = '; '.join(def_list)
            split_defs = all_def_string.split(';')
            only_three = split_defs[:3]
            definition_string = ';'.join(only_three)
            definition = furi_brak+definition_string
definition

## dealing with new break 一般 jukugo_100476

In [None]:
word = 'jukugo_100476'
soup.find(id=word).find_all('div',class_='vm')

In [None]:
len(soup.find(id=word).find_all('div',class_='vm'))

In [None]:
soup.find(id=word).find_all('div',class_='vm')

##### checking situation 1 - "buried" definition - only 1 (jukugo_135887)
###### checking situation 2 - "not buried" - multiple definitions ('jukugo_125556')
###### checking situation 3 - "not buried" - only 1 ('jukugo_119971')
###### checking situation 4 - buried - multiple ('jukugo_100476')

Might be able to use the "buried" method for all of them

In [None]:
#situation 4
word = 'jukugo_100476'
soup.find(id=word).find_all('div',class_='vm')

In [None]:
####works for situation 4######
def_list = []
for i in soup.find(id=word).find_all('div', class_='vm'):
    for obj in i:
        if type(obj) is bs4.element.NavigableString:
            def_list.append(obj)
            all_def_string = '; '.join(def_list)
            split_defs = all_def_string.split(';')
            only_three = split_defs[:3]
            definition_string = ';'.join(only_three)
print(definition_string)

In [None]:
#situation 1
word = 'jukugo_135887'
soup.find(id=word).find_all('div',class_='vm')

In [None]:
####works for situation 1
def_list = []
for i in soup.find(id=word).find_all('div', class_='vm'):
    for obj in i:
        if type(obj) is bs4.element.NavigableString:
            def_list.append(obj)
            all_def_string = '; '.join(def_list)
            split_defs = all_def_string.split(';')
            only_three = split_defs[:3]
            definition_string = ';'.join(only_three)
print(definition_string)

In [None]:
#####situation 2
word = 'jukugo_125556'
soup.find(id=word).find_all('div',class_='vm')

In [None]:
####works for situation 2
def_list = []
for i in soup.find(id=word).find_all('div', class_='vm'):
    for obj in i:
        if type(obj) is bs4.element.NavigableString:
            def_list.append(obj)
            all_def_string = '; '.join(def_list)
            split_defs = all_def_string.split(';')
            only_three = split_defs[:3]
            definition_string = ';'.join(only_three)
print(definition_string)

In [None]:
#testing situation 3
word = 'jukugo_119971'
soup.find(id=word).find_all('div',class_='vm')

In [None]:
######works for situation 3
def_list = []
for i in soup.find(id=word).find_all('div', class_='vm'):
    for obj in i:
        if type(obj) is bs4.element.NavigableString:
            def_list.append(obj)
            all_def_string = '; '.join(def_list)
            split_defs = all_def_string.split(';')
            only_three = split_defs[:3]
            definition_string = ';'.join(only_three)
print(definition_string)

# Dealing with all word types: Kanji, Hiragana, Katakana

In [None]:
word = 
vocab_word = soup.find(id=word).find('div', class_='f_kanji').contents[0]

if kanji.contents have value:
    vocab_word = soup.find(id=word).find('div', class_='f_kanji').contents[0]
        print(vocab_word) ###########for debugging################
        #getting definition
        #furigana
        furi = soup.find(id=word).find('div', class_='furigana').contents[0]
        furi_brak = '「'+furi+'」'+':'+' '
        print(furi_brak)
elif hiragana/katakana (set up the same:
        

### hiragana word なお jukugo_326587

In [None]:
word = 'jukugo_326587'
for el in soup.find(id=word).find('div', class_='jukugo'):
    print(el,type(el))

In [None]:
#using string parsing to conduct find
soup.find('a', id='jk'+word[word.find('_'):]).contents[0]

In [None]:
word = 'jukugo_326587'
if soup.find(id=word).find('div', class_='f_kanji'):
    vocab_word = soup.find(id=word).find('div', class_='f_kanji').contents[0]
    print(vocab_word)
else:
    vocab_word = soup.find('a', id='jk'+word[word.find('_'):]).contents[0]
    print(vocab_word)

In [None]:
word = 'jukugo_119971'
if soup.find(id=word).find('div', class_='f_kanji'):
    vocab_word = soup.find(id=word).find('div', class_='f_kanji').contents[0]
    print(vocab_word)
else:
    vocab_word = soup.find('a', id='jk'+word[word.find('_'):]).contents[0]
    print(vocab_word)

### dealing with 考え jukugo_111613

In [None]:
word = 'jukugo_111613'
soup.find(id=word).find('div', class_='f_kanji').contents

In [None]:
vocab_word = soup.find('a', id='jk'+word[word.find('_'):]).contents[0]
print(vocab_word)

In [None]:
vocab_word = soup.find_all('a', id='jk'+word[word.find('_'):])
print(vocab_word)

In [None]:
for i in soup.find('a', id='jk'+word[word.find('_'):]).contents:
    print(i,type(i))


In [None]:
word = 'jukugo_119971'
if soup.find(id=word).find('div', class_='f_kanji'):
    if len(soup.find(id=word).find('div', class_='f_kanji')) > 1:
        kanji = soup.find(id=word).find('div', class_='f_kanji').contents[0]
        kana = [i for i in soup.find('a', id='jk'+word[word.find('_'):]).contents if type(i) is bs4.element.NavigableString]
        kana = ''.join(kana)
        vocab_word = kanji+kana
        print(vocab_word)
    else:
        vocab_word = soup.find(id=word).find('div', class_='f_kanji').contents[0]
        print(vocab_word)
else:
    vocab_word = soup.find('a', id='jk'+word[word.find('_'):]).contents[0]
    print(vocab_word)

In [None]:
len(soup.find(id=word).find('div', class_='f_kanji'))

In [None]:
len(soup.find('a', id='jk'+word[word.find('_'):]).contents)

In [None]:
word = 'jukugo_111613'
if soup.find(id=word).find('div', class_='f_kanji'):
    if len(soup.find('a', id='jk'+word[word.find('_'):]).contents) > 1:
        kanji = soup.find(id=word).find('div', class_='f_kanji').contents[0]
        kana = [i for i in soup.find('a', id='jk'+word[word.find('_'):]).contents if type(i) is bs4.element.NavigableString]
        kana = ''.join(kana)
        vocab_word = kanji+kana
        print(vocab_word)
    else:
        vocab_word = soup.find(id=word).find('div', class_='f_kanji').contents[0]
        print(vocab_word)
else:
    vocab_word = soup.find('a', id='jk'+word[word.find('_'):]).contents[0]
    print(vocab_word)

In [None]:
vocab_list_df

In [None]:
soup.find(id='jukugo_135887').find_all('div',class_='vm').contents

In [None]:
vocab_list_structure

In [None]:
test_list = ['whatever']
len(test_list)

In [None]:
test_list[0].split(';')

test

In [None]:
vocab_list_structure = []
word = word_id_list[3]
#getting kanji vocab word
vocab_word = soup.find(id=word).find('div', class_='f_kanji').contents[0]

#getting definition
#furigana
furi = soup.find(id=word).find('div', class_='furigana').contents[0]
furi_brak = '「'+furi+'」'+':'+' '
##creating a list with defs
def_list = []
for i in soup.find(id=word).find_all('div',class_='vm'):
    definition = i.contents[-1]
    def_list.append(definition)
if len(def_list) > 1:
    all_def_string = '; '.join(def_list)
    split_defs = all_def_string.split(';')
    only_three = split_defs[:3]
    definition_string = ';'.join(only_three)
    definition = furi_brak+definition_string
else:
    split_defs = def_list[0].split(';')
    only_three = split_defs[:3]
    definition_string = ';'.join(only_three)
    definition = furi_brak+definition_string

#getting tag
tag = re.findall(r'N\d',url)[0].lower()+'_vocab'                 

#creating dataframe structure
vocab_list_structure.append({'vocab_word':vocab_word,
                             'definition':definition,
                              'tag':tag})
#building dataframe                           
vocab_list_df = pd.DataFrame(vocab_list_structure, columns = ['vocab_word', 'definition', 'tag'])
vocab_list_df

In [None]:
for word in word_id_list:
    vocab_word = soup.find(id=word).find('div', class_='f_kanji').contents[0]
    print(vocab_word)

In [None]:
def_list = []
for i in soup.find(id=word_id_list[0]).find_all('div',class_='vm'):
    definition = i.contents[-1]
    def_list.append(definition)
if len(def_list) > 1:
    all_def_string = '; '.join(def_list)
    split_defs = all_def_string.split(';')
    only_three = split_defs[:3]
    definition_string = ';'.join(only_three)
    definition = furi_brak+definition_string
else:
    split_defs = def_list[0].split(';')
    only_three = split_defs[:3]
    definition_string = ';'.join(only_three)
    definition = furi_brak+definition_string


In [None]:
def_list

In [None]:
definition

In [None]:
vocab_list_structure

In [None]:
test_str = 'test'
t_list = test_str.split(';')[:3]
';'.join(t_list)

### troubleshooting errors

### Dataframe 2 - example sentence list

For each group of 100 words per JLPT level

TSV has 3 columns:
- [example sentence column] example sentence using the vocab word
    - some words don't have example sentences, export these words to a separate file
- [vocab column] vocab word in kanji + furigana enclosed by japanese brackets 「」+ colon + 1-3 definitions (separated by semi-colons
    - __vocab tsv__ [vocab word column] + ' ' + [definition column]
- [tag column] tag that specifies the jlpt level (ex. n4_vocab)
    - use the same tag column from __vocab list tsv__

### Dataframe 3 - vocab list of words that do not have example sentences
While creating example sentence list, if a word does not have an example sentence take the word and stick it into a list.  The list will be opened in Google sheets and vocab will be uploaded manually

TSV has 3 columns
-[expected example sentence column] empty string
-[vocab column] use vocab column from example sentence list: kanji + furigana+ 1-3 definitions
-[tag column] tag that specifies the jlpt level (ex. n3_vocab)