# Dependency Parsing for the CUB and Flower Datasets

In [1]:
# pytorch == 1.1.0
from glob import glob
import os
import pprint
import stanfordnlp


In [2]:
# Fetch the English ('en') models for stanfordnlp.
# NOTE: as the recorded output below shows, this call prompts interactively
# (download confirmation, then target directory) and downloads a 235M archive,
# so this cell cannot run unattended as written.
stanfordnlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)


 y



Default download directory: C:\Users\dwkim\stanfordnlp_resources
Hit enter to continue or type an alternate directory.


 



Downloading models for: en_ewt
Download location: C:\Users\dwkim\stanfordnlp_resources\en_ewt_models.zip


100%|███████████████████████████████████████████████████████████████████████████████| 235M/235M [01:05<00:00, 3.59MB/s]



Download complete.  Models saved to: C:\Users\dwkim\stanfordnlp_resources\en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


## 1. Setting up the paths

In [92]:
# Origin path
# Source files are laid out as <root>/<sub-directory>/<file>.txt.
# Paths are assembled with os.path.join so the notebook is not tied to the
# Windows "\\" separator; on Windows the resulting strings are unchanged.
orig_dir_path = os.path.join(".", "text_c10")
orig_text_path = os.path.join(orig_dir_path, "*", "*.txt")

# Neither os.listdir nor glob promises an ordering, so sort both listings to
# make the processing order reproducible across machines and runs.
orig_dir_list = sorted(os.listdir(orig_dir_path))
orig_text_list = sorted(glob(orig_text_path))

# Preview the first few entries only.
pprint.pprint(orig_dir_list[:5])
pprint.pprint(orig_text_list[:5])

['001.Black_footed_Albatross',
 '002.Laysan_Albatross',
 '003.Sooty_Albatross',
 '004.Groove_billed_Ani',
 '005.Crested_Auklet']
['.\\text_c10\\001.Black_footed_Albatross\\Black_Footed_Albatross_0001_796111.txt',
 '.\\text_c10\\001.Black_footed_Albatross\\Black_Footed_Albatross_0002_55.txt',
 '.\\text_c10\\001.Black_footed_Albatross\\Black_Footed_Albatross_0003_796136.txt',
 '.\\text_c10\\001.Black_footed_Albatross\\Black_Footed_Albatross_0005_796090.txt',
 '.\\text_c10\\001.Black_footed_Albatross\\Black_Footed_Albatross_0006_796065.txt']


In [109]:
# Converted file path
# The output tree mirrors the source tree under convert_dir_path.
convert_dir_path = os.path.join(".", "text_c10_convert")

# One output sub-directory per entry found in the source root.
convert_dir_list = [os.path.join(convert_dir_path, name) for name in orig_dir_list]

# Re-root every source file under convert_dir_path, keeping its path relative
# to orig_dir_path. This replaces positional indexing into p.split(os.sep),
# which only worked for paths with exactly four components and rebuilt the
# output root independently of convert_dir_path.
convert_text_list = [
    os.path.join(convert_dir_path, os.path.relpath(src, orig_dir_path))
    for src in orig_text_list
]

# Preview the first few entries only.
pprint.pprint(convert_dir_list[:5])
pprint.pprint(convert_text_list[:5])

['.\\text_c10_convert\\001.Black_footed_Albatross',
 '.\\text_c10_convert\\002.Laysan_Albatross',
 '.\\text_c10_convert\\003.Sooty_Albatross',
 '.\\text_c10_convert\\004.Groove_billed_Ani',
 '.\\text_c10_convert\\005.Crested_Auklet']
['.\\text_c10_convert\\001.Black_footed_Albatross\\Black_Footed_Albatross_0001_796111.txt',
 '.\\text_c10_convert\\001.Black_footed_Albatross\\Black_Footed_Albatross_0002_55.txt',
 '.\\text_c10_convert\\001.Black_footed_Albatross\\Black_Footed_Albatross_0003_796136.txt',
 '.\\text_c10_convert\\001.Black_footed_Albatross\\Black_Footed_Albatross_0005_796090.txt',
 '.\\text_c10_convert\\001.Black_footed_Albatross\\Black_Footed_Albatross_0006_796065.txt']


In [None]:
# Create converted file folders

In [87]:
def create_folder(dir_path):
    """Create ``dir_path`` (and any missing parent directories) if needed.

    This is best-effort: an ``OSError`` raised while creating the directory is
    reported on stdout and not re-raised, so callers can keep going.

    Args:
        dir_path: Path of the directory to create.

    Returns:
        None.
    """
    try:
        # exist_ok=True replaces the separate os.path.exists() check, which was
        # racy and also silently accepted a regular file sitting at dir_path;
        # that case now surfaces through the error message below.
        os.makedirs(dir_path, exist_ok=True)
    except OSError:
        print( "Error : Creating directory. {}".format(dir_path))

In [110]:
# Make sure every output sub-directory exists before any file is written.
for convert_dir in convert_dir_list:
    create_folder(convert_dir)

## 2. Prepare Tokenizer and Dependency Parser

In [None]:
# Tokenize
import torch
from transformers import BertTokenizer

In [128]:
# BERT tokenizer loaded from the pretrained 'bert-base-uncased' vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Settings for the stanfordnlp pipeline: run the tokenize, pos and depparse
# processors, treat the input as already tokenized, and use a POS batch size
# of 1000.
config = dict(
    processors='tokenize,pos,depparse',
    tokenize_pretokenized=True,
    pos_batch_size=1000,
)
nlp = stanfordnlp.Pipeline(**config)

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\dwkim\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tokenizer.pt', 'pretokenized': True, 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': 'C:\\Users\\dwkim\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tagger.pt', 'pretrain_path': 'C:\\Users\\dwkim\\stanfordnlp_resources\\en_ewt_models\\en_ewt.pretrain.pt', 'batch_size': 1000, 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: depparse
With settings: 
{'model_path': 'C:\\Users\\dwkim\\stanfordnlp_resources\\en_ewt_models\\en_ewt_parser.pt', 'pretrain_path': 'C:\\Users\\dwkim\\stanfordnlp_resources\\en_ewt_models\\en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


## 3. Converting files (Tokenizing and Dependency Parsing) 

In [None]:
# Convert every source text file: tokenize each line, run the dependency
# parser on the token lists, and write one output line per parsed sentence.
# The loop variables are named src_path/dst_path so they do not overwrite the
# module-level glob pattern `orig_text_path` defined in the path-setup cell.
for idx, (src_path, dst_path) in enumerate(zip(orig_text_list, convert_text_list)):
    # Report progress every 500 files.
    if idx % 500 == 0:
        print('{} th file is converted from {} to {}'.format(idx, src_path, dst_path))

    # Load text file.
    # NOTE(review): files are opened with the platform default encoding;
    # pass encoding= explicitly once the encoding of the captions is confirmed.
    with open(src_path, 'r') as f:
        txt = f.readlines()

    # Tokenize the text. Lines that yield no tokens (e.g. blank lines) are
    # dropped so that no empty sentence is handed to the parser.
    tokenized_texts = [tokens for tokens in map(tokenizer.tokenize, txt) if tokens]

    # Dependency parsing. Iterate over the sentences the parser actually
    # returned instead of indexing doc.sentences by the input count, which
    # would raise IndexError if the two ever disagreed.
    if tokenized_texts:
        doc = nlp(tokenized_texts)
        doc_dep = "\n".join(
            # Collapse each sentence's multi-line dependencies_string() onto a
            # single '|'-separated line.
            sentence.dependencies_string().replace("\n", '|')
            for sentence in doc.sentences
        )
    else:
        # Nothing to parse: still produce an (empty) output file.
        doc_dep = ""

    # Save
    with open(dst_path, 'w') as f:
        f.write(doc_dep)

0 th file is converted from .\text_c10\001.Black_footed_Albatross\Black_Footed_Albatross_0001_796111.txt to .\text_c10_convert\001.Black_footed_Albatross\Black_Footed_Albatross_0001_796111.txt
500 th file is converted from .\text_c10\010.Red_winged_Blackbird\Red_Winged_Blackbird_0027_4123.txt to .\text_c10_convert\010.Red_winged_Blackbird\Red_Winged_Blackbird_0027_4123.txt
1000 th file is converted from .\text_c10\019.Gray_Catbird\Gray_Catbird_0007_20186.txt to .\text_c10_convert\019.Gray_Catbird\Gray_Catbird_0007_20186.txt
1500 th file is converted from .\text_c10\027.Shiny_Cowbird\Shiny_Cowbird_0059_24421.txt to .\text_c10_convert\027.Shiny_Cowbird\Shiny_Cowbird_0059_24421.txt
2000 th file is converted from .\text_c10\036.Northern_Flicker\Northern_Flicker_0034_28740.txt to .\text_c10_convert\036.Northern_Flicker\Northern_Flicker_0034_28740.txt
2500 th file is converted from .\text_c10\044.Frigatebird\Frigatebird_0068_42795.txt to .\text_c10_convert\044.Frigatebird\Frigatebird_0068_42