# Dependency Parsing for the CUB and Flower Datasets

In [1]:
# pytorch == 1.1.0
from glob import glob
import os
import pickle
import tqdm
import numpy as np
import pprint
import stanfordnlp

In [2]:
# Download the English ("en") models used by the dependency parser below.
# NOTE: as the captured output shows, this call prompts interactively
# (download confirmation and target directory), so it blocks an unattended run.
stanfordnlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)


 Y



Default download directory: C:\Users\dwkim\Documents\GitHub\stanfordnlp_resources
Hit enter to continue or type an alternate directory.


 



Downloading models for: en_ewt
Download location: C:\Users\dwkim\Documents\GitHub\stanfordnlp_resources\en_ewt_models.zip


100%|███████████████████████████████████████████████████████████████████████████████| 235M/235M [01:36<00:00, 2.44MB/s]



Download complete.  Models saved to: C:\Users\dwkim\Documents\GitHub\stanfordnlp_resources\en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


## 1. Setting up the path

In [74]:
# Origin path
# Build the paths with os.path.join so the separators match the host OS
# instead of being hard-coded to Windows backslashes (on Windows the
# resulting strings are identical to the previous literals).
orig_dir_path = os.path.join(".", "text_c10")
# Matches every .txt file exactly one directory level below orig_dir_path.
orig_text_path = os.path.join(orig_dir_path, "*", "*.txt")
# os.listdir and glob return entries in arbitrary order; sort them so the
# listing is reproducible across runs and machines.
orig_dir_list = sorted(os.listdir(orig_dir_path))
orig_text_list = sorted(glob(orig_text_path))
pprint.pprint(orig_dir_list[:5])
pprint.pprint(orig_text_list[:5])

['001.Black_footed_Albatross',
 '002.Laysan_Albatross',
 '003.Sooty_Albatross',
 '004.Groove_billed_Ani',
 '005.Crested_Auklet']
['.\\text_c10\\001.Black_footed_Albatross\\Black_Footed_Albatross_0001_796111.txt',
 '.\\text_c10\\001.Black_footed_Albatross\\Black_Footed_Albatross_0002_55.txt',
 '.\\text_c10\\001.Black_footed_Albatross\\Black_Footed_Albatross_0003_796136.txt',
 '.\\text_c10\\001.Black_footed_Albatross\\Black_Footed_Albatross_0005_796090.txt',
 '.\\text_c10\\001.Black_footed_Albatross\\Black_Footed_Albatross_0006_796065.txt']


In [75]:
# Converted file path
convert_dir_path = os.path.join(".", "text_c10_convert_idx")

# One output directory per entry found in the origin directory.
convert_dir_list = [os.path.join(convert_dir_path, p) for p in orig_dir_list]
# Mirror each source file below convert_dir_path by re-rooting its path
# relative to orig_dir_path. This avoids depending on a fixed number of path
# components (the previous split(os.sep)[0..3] indexing) and keeps the output
# root defined in exactly one place.
convert_text_list = [
    os.path.join(convert_dir_path, os.path.relpath(p, orig_dir_path))
    for p in orig_text_list
]
pprint.pprint(convert_dir_list[:5])
pprint.pprint(convert_text_list[:5])

['.\\text_c10_convert_idx\\001.Black_footed_Albatross',
 '.\\text_c10_convert_idx\\002.Laysan_Albatross',
 '.\\text_c10_convert_idx\\003.Sooty_Albatross',
 '.\\text_c10_convert_idx\\004.Groove_billed_Ani',
 '.\\text_c10_convert_idx\\005.Crested_Auklet']
['.\\text_c10_convert_idx\\001.Black_footed_Albatross\\Black_Footed_Albatross_0001_796111.txt',
 '.\\text_c10_convert_idx\\001.Black_footed_Albatross\\Black_Footed_Albatross_0002_55.txt',
 '.\\text_c10_convert_idx\\001.Black_footed_Albatross\\Black_Footed_Albatross_0003_796136.txt',
 '.\\text_c10_convert_idx\\001.Black_footed_Albatross\\Black_Footed_Albatross_0005_796090.txt',
 '.\\text_c10_convert_idx\\001.Black_footed_Albatross\\Black_Footed_Albatross_0006_796065.txt']


In [82]:
# Replace each converted path's final extension with ".bin".
# os.path.splitext only touches the last extension, so a ".txt" occurring
# elsewhere in the path is preserved, and a file whose extension is not a
# lowercase ".txt" (e.g. ".TXT") still keeps its directory and base name.
convert_bin_list = [os.path.splitext(p)[0] + ".bin" for p in convert_text_list]

# Create converted file folders

In [77]:
def create_folder(dir_path):
    """Create ``dir_path`` (including missing parents) if it does not exist.

    This is best effort: an ``OSError`` raised while creating the directory
    is reported with ``print`` and is not re-raised.

    Args:
        dir_path: Path of the directory to create.

    Returns:
        None.
    """
    try:
        # exist_ok=True makes the call idempotent without a separate
        # os.path.exists() check, which could race with another process and
        # would silently accept a path that exists as a regular file.
        os.makedirs(dir_path, exist_ok=True)
    except OSError:
        print( "Error : Creating directory. {}".format(dir_path))

In [78]:
# Make sure every output directory exists before any file is written.
for folder_path in convert_dir_list:
    create_folder(folder_path)

## 2. Prepare Tokenizer and Dependency Parser

In [10]:
# Tokenize
import torch
from transformers import BertTokenizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [11]:
# Create Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create Dependency Parser
# tokenize_pretokenized=True: the pipeline is later called with lists of
# tokens produced by `tokenizer`, not with raw strings.
config = dict(
    processors='tokenize,pos,depparse',
    tokenize_pretokenized=True,
    pos_batch_size=1000,
)
nlp = stanfordnlp.Pipeline(**config)

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\dwkim\\Documents\\GitHub\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tokenizer.pt', 'pretokenized': True, 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': 'C:\\Users\\dwkim\\Documents\\GitHub\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tagger.pt', 'pretrain_path': 'C:\\Users\\dwkim\\Documents\\GitHub\\stanfordnlp_resources\\en_ewt_models\\en_ewt.pretrain.pt', 'batch_size': 1000, 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: depparse
With settings: 
{'model_path': 'C:\\Users\\dwkim\\Documents\\GitHub\\stanfordnlp_resources\\en_ewt_models\\en_ewt_parser.pt', 'pretrain_path': 'C:\\Users\\dwkim\\Documents\\GitHub\\stanfordnlp_resources\\en_ewt_models\\en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


## 3. Converting files (Tokenizing and Dependency Parsing) 

In [86]:
# Sort by source path and carry each output path along with its source.
# Sorting the two lists independently can pair a text file with the wrong
# .bin path, because the differing ".txt"/".bin" suffixes take part in the
# comparison (e.g. "a.p.txt" < "a.txt" but "a.bin" < "a.p.bin").
# This relies on the two lists being index-aligned beforehand, which holds
# when convert_bin_list was derived element-by-element from orig_text_list.
if len(orig_text_list) != len(convert_bin_list):
    raise ValueError(
        "orig_text_list and convert_bin_list differ in length: {} vs {}".format(
            len(orig_text_list), len(convert_bin_list)))
paired_paths = sorted(zip(orig_text_list, convert_bin_list))
# Slice-assign so the existing list objects are updated in place, as .sort() did.
orig_text_list[:] = [src for src, _ in paired_paths]
convert_bin_list[:] = [dst for _, dst in paired_paths]

In [91]:
# Load text files
# tqdm wraps the zip (not the enumerate) and receives total= so that it can
# render an actual progress bar instead of a bare iteration counter.
path_pairs = zip(orig_text_list, convert_bin_list)
n_pairs = min(len(orig_text_list), len(convert_bin_list))
# The loop variable is deliberately NOT called `orig_text_path`: that name
# holds the glob pattern defined in the path-setup cell and must not be
# overwritten by the last processed file path.
for idx, (src_text_path, convert_bin_path) in enumerate(tqdm.tqdm(path_pairs, total=n_pairs)):
    if idx % 500 == 0:
        print('{} th file is converted from {} to {}'.format(idx, src_text_path, convert_bin_path))

    with open(src_text_path, 'r') as f:
        txt = f.readlines()

    # Tokenize the text & Dependency parsing
    tokenized_texts = [tokenizer.tokenize(s) for s in txt]
    doc = nlp(tokenized_texts)

    # One array per sentence; each row is [int(index), governor] taken from
    # the third element of every dependency tuple.
    per_sentence = [
        np.array([[int(dep[2].index), dep[2].governor] for dep in sentence.dependencies])
        for sentence in doc.sentences
    ]
    # Sentences generally differ in length, so the outer container is ragged.
    # Build the object array explicitly: implicit ragged np.array(...) is
    # rejected by newer NumPy releases, and would silently become a 3-D
    # integer array whenever all sentences happen to have the same length.
    # Indexing sample[k] yields the same 2-D array in either case.
    sample = np.empty(len(per_sentence), dtype=object)
    for k, pairs in enumerate(per_sentence):
        sample[k] = pairs

    # Save
    with open(convert_bin_path, 'wb') as f:
        pickle.dump(sample, f, pickle.HIGHEST_PROTOCOL)

1it [00:00,  8.29it/s]

0 th file is converted from .\text_c10\001.Black_footed_Albatross\Black_Footed_Albatross_0001_796111.txt to .\text_c10_convert_idx\001.Black_footed_Albatross\Black_Footed_Albatross_0001_796111.bin


501it [00:48, 10.64it/s]

500 th file is converted from .\text_c10\010.Red_winged_Blackbird\Red_Winged_Blackbird_0027_4123.txt to .\text_c10_convert_idx\010.Red_winged_Blackbird\Red_Winged_Blackbird_0027_4123.bin


1002it [01:38, 10.27it/s]

1000 th file is converted from .\text_c10\019.Gray_Catbird\Gray_Catbird_0007_20186.txt to .\text_c10_convert_idx\019.Gray_Catbird\Gray_Catbird_0007_20186.bin


1502it [02:26, 10.58it/s]

1500 th file is converted from .\text_c10\027.Shiny_Cowbird\Shiny_Cowbird_0059_24421.txt to .\text_c10_convert_idx\027.Shiny_Cowbird\Shiny_Cowbird_0059_24421.bin


2000it [03:14,  9.85it/s]

2000 th file is converted from .\text_c10\036.Northern_Flicker\Northern_Flicker_0034_28740.txt to .\text_c10_convert_idx\036.Northern_Flicker\Northern_Flicker_0034_28740.bin


2501it [04:04, 10.05it/s]

2500 th file is converted from .\text_c10\044.Frigatebird\Frigatebird_0068_42795.txt to .\text_c10_convert_idx\044.Frigatebird\Frigatebird_0068_42795.bin


3001it [04:54, 10.08it/s]

3000 th file is converted from .\text_c10\052.Pied_billed_Grebe\Pied_Billed_Grebe_0106_35418.txt to .\text_c10_convert_idx\052.Pied_billed_Grebe\Pied_Billed_Grebe_0106_35418.bin


3500it [05:43, 10.11it/s]

3500 th file is converted from .\text_c10\061.Heermann_Gull\Heermann_Gull_0034_45693.txt to .\text_c10_convert_idx\061.Heermann_Gull\Heermann_Gull_0034_45693.bin


4001it [06:33,  9.04it/s]

4000 th file is converted from .\text_c10\069.Rufous_Hummingbird\Rufous_Hummingbird_0109_60021.txt to .\text_c10_convert_idx\069.Rufous_Hummingbird\Rufous_Hummingbird_0109_60021.bin


4502it [07:25, 10.27it/s]

4500 th file is converted from .\text_c10\078.Gray_Kingbird\Gray_Kingbird_0010_70057.txt to .\text_c10_convert_idx\078.Gray_Kingbird\Gray_Kingbird_0010_70057.bin


5001it [08:14, 10.10it/s]

5000 th file is converted from .\text_c10\086.Pacific_Loon\Pacific_Loon_0040_75414.txt to .\text_c10_convert_idx\086.Pacific_Loon\Pacific_Loon_0040_75414.bin


5502it [09:03, 10.26it/s]

5500 th file is converted from .\text_c10\094.White_breasted_Nuthatch\White_Breasted_Nuthatch_0129_86761.txt to .\text_c10_convert_idx\094.White_breasted_Nuthatch\White_Breasted_Nuthatch_0129_86761.bin


6000it [09:51, 10.72it/s]

6000 th file is converted from .\text_c10\103.Sayornis\Sayornis_0066_98309.txt to .\text_c10_convert_idx\103.Sayornis\Sayornis_0066_98309.bin


6501it [10:41, 10.08it/s]

6500 th file is converted from .\text_c10\111.Loggerhead_Shrike\Loggerhead_Shrike_0128_105238.txt to .\text_c10_convert_idx\111.Loggerhead_Shrike\Loggerhead_Shrike_0128_105238.bin


7001it [11:30, 10.51it/s]

7000 th file is converted from .\text_c10\120.Fox_Sparrow\Fox_Sparrow_0078_114582.txt to .\text_c10_convert_idx\120.Fox_Sparrow\Fox_Sparrow_0078_114582.bin


7501it [12:18, 10.14it/s]

7500 th file is converted from .\text_c10\128.Seaside_Sparrow\Seaside_Sparrow_0066_120791.txt to .\text_c10_convert_idx\128.Seaside_Sparrow\Seaside_Sparrow_0066_120791.bin


8001it [13:08, 10.49it/s]

8000 th file is converted from .\text_c10\137.Cliff_Swallow\Cliff_Swallow_0035_133097.txt to .\text_c10_convert_idx\137.Cliff_Swallow\Cliff_Swallow_0035_133097.bin


8502it [13:57,  9.75it/s]

8500 th file is converted from .\text_c10\145.Elegant_Tern\Elegant_Tern_0072_150911.txt to .\text_c10_convert_idx\145.Elegant_Tern\Elegant_Tern_0072_150911.bin


9001it [14:47, 10.07it/s]

9000 th file is converted from .\text_c10\154.Red_eyed_Vireo\Red_Eyed_Vireo_0023_156800.txt to .\text_c10_convert_idx\154.Red_eyed_Vireo\Red_Eyed_Vireo_0023_156800.bin


9502it [15:36, 10.39it/s]

9500 th file is converted from .\text_c10\162.Canada_Warbler\Canada_Warbler_0064_162417.txt to .\text_c10_convert_idx\162.Canada_Warbler\Canada_Warbler_0064_162417.bin


10001it [16:26, 10.48it/s]

10000 th file is converted from .\text_c10\170.Mourning_Warbler\Mourning_Warbler_0074_795367.txt to .\text_c10_convert_idx\170.Mourning_Warbler\Mourning_Warbler_0074_795367.bin


10502it [17:14, 10.47it/s]

10500 th file is converted from .\text_c10\179.Tennessee_Warbler\Tennessee_Warbler_0031_174802.txt to .\text_c10_convert_idx\179.Tennessee_Warbler\Tennessee_Warbler_0031_174802.bin


11002it [18:04, 10.54it/s]

11000 th file is converted from .\text_c10\187.American_Three_toed_Woodpecker\American_Three_Toed_Woodpecker_0040_796180.txt to .\text_c10_convert_idx\187.American_Three_toed_Woodpecker\American_Three_Toed_Woodpecker_0040_796180.bin


11501it [18:54, 10.16it/s]

11500 th file is converted from .\text_c10\196.House_Wren\House_Wren_0035_187708.txt to .\text_c10_convert_idx\196.House_Wren\House_Wren_0035_187708.bin


11788it [19:22, 10.14it/s]


## 4. Save it as a single file

In [96]:
# Accumulator for the per-file data loaded from the .bin files.
all_data = list()

In [97]:
# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every file to an already populated all_data.
all_data = []
for path in convert_bin_list:
    with open(path, 'rb') as f:
        # NOTE: pickle.load can execute arbitrary code; only load .bin files
        # that were produced by this notebook.
        data = pickle.load(f)
    all_data.append(data)

In [101]:
# Spot-check: element 0 of the data loaded from the 11th path in
# convert_bin_list. Each row is an [index, governor] pair as built in the
# conversion loop.
all_data[10][0]

array([[ 1,  3],
       [ 2,  3],
       [ 3,  4],
       [ 4,  0],
       [ 5,  7],
       [ 6,  7],
       [ 7,  4],
       [ 8, 10],
       [ 9, 10],
       [10,  7],
       [11, 10],
       [12, 13],
       [13, 11],
       [14, 13]])

In [112]:
# Map each .bin file name (last component of its path) to the data loaded
# from that file; the two lists are paired up by position.
pathndata = {
    bin_path.split(os.sep)[-1]: loaded
    for bin_path, loaded in zip(convert_bin_list, all_data)
}

In [110]:
# Sanity check: number of loaded files.
len(all_data)

11788

In [121]:
# Write the whole file-name -> data dictionary to a single pickle file.
with open('dp_index_dict.bin', 'wb') as out_file:
    pickle.dump(pathndata, out_file)