# All necessary imports

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt

In [3]:
import seaborn as sns
sns.set(color_codes=True)

In [5]:
from tqdm.autonotebook import tqdm

In [6]:
%matplotlib inline

# Auxiliary functions

In [18]:
def filter_by_subcorpus(data_root_folder, subcorpus_name):
    """Select the document folders whose metadata mentions ``subcorpus_name``.

    The corpus is read as ``<data_root_folder>/data/<part>/<document>/en.met``.
    A document folder is kept when ``subcorpus_name`` occurs in the text of
    its ``en.met`` file (newlines are removed before the substring search).

    Parameters
    ----------
    data_root_folder : str
        Root folder of the corpus; it must contain a ``data`` sub-folder.
    subcorpus_name : str
        Text to look for inside each ``en.met`` file.

    Returns
    -------
    numpy.ndarray
        1-D string array with the paths of the matching document folders,
        ordered by part name and then by document name.

    Raises
    ------
    FileNotFoundError
        If the ``data`` folder or the ``en.met`` file of a document folder
        does not exist.
    """
    data_dir = os.path.join(data_root_folder, 'data')

    # os.listdir yields entries in arbitrary order; sorting makes the result
    # reproducible, which matters because the extraction caches built from
    # this list are positional.
    data_folders = []
    for part in tqdm(sorted(os.listdir(data_dir))):
        part_dir = os.path.join(data_dir, part)
        if not os.path.isdir(part_dir):
            # Stray files next to the part folders are not documents.
            continue
        for document in sorted(os.listdir(part_dir)):
            document_dir = os.path.join(part_dir, document)
            if os.path.isdir(document_dir):
                data_folders.append(document_dir)

    mask = []
    for folder in tqdm(data_folders):
        with open(os.path.join(folder, 'en.met'), 'r', encoding='utf-8') as metadata_file:
            mask.append(subcorpus_name in metadata_file.read().replace('\n', ''))

    # Explicit dtypes keep the empty case well-defined (a string array that is
    # boolean-indexed) instead of relying on NumPy's handling of empty lists.
    return np.array(data_folders, dtype=str)[np.array(mask, dtype=bool)]

In [9]:
def get_raw_texts(folders, cache_folder):
    """Read the ``en.raw`` file of every folder, caching the result on disk.

    Each text is read as UTF-8 and has its newline characters removed.  The
    texts are cached in ``<cache_folder>/en.raw.extracted.npy``; when that
    file exists it is returned as-is and ``folders`` is not consulted, so
    delete the cache after changing the folder selection.

    Parameters
    ----------
    folders : iterable of str
        Document folders, each containing an ``en.raw`` file.
    cache_folder : str
        Existing folder in which the cache file is stored.

    Returns
    -------
    list of str
        One text per folder, in the order of ``folders``.
    """
    # np.save appends '.npy' to file names that lack it, so the existence
    # check has to use the full name or the cache would never be found.
    cache_path = os.path.join(cache_folder, 'en.raw.extracted.npy')
    if os.path.exists(cache_path):
        # tolist() keeps the return type identical on cache hit and miss.
        return np.load(cache_path).tolist()

    texts = []
    for folder in tqdm(folders):
        with open(os.path.join(folder, 'en.raw'), 'r', encoding='utf-8') as raw_file:
            texts.append(raw_file.read().replace('\n', ''))
    np.save(cache_path, texts)
    return texts

In [10]:
def get_tagged_texts(folders, cache_folder):
    """Parse the ``en.tags`` file of every folder, caching the result on disk.

    Every ``en.tags`` file is read as a tab-separated table of strings.  The
    first four columns are exposed as ``'word'``, ``'POS-tag'``, ``'lemma'``
    and ``'TARGET'``; for ``'TARGET'`` only the part of the tag before the
    first ``'-'`` is kept.

    The parsed documents are cached in
    ``<cache_folder>/en.tags.extracted.npy``; when that file exists it is
    returned as-is and ``folders`` is not consulted.

    Parameters
    ----------
    folders : iterable of str
        Document folders, each containing an ``en.tags`` file.
    cache_folder : str
        Existing folder in which the cache file is stored.

    Returns
    -------
    list of dict
        One dict per folder with the keys ``'word'``, ``'POS-tag'``,
        ``'lemma'`` (1-D string arrays) and ``'TARGET'`` (list of str).
    """
    # np.save appends '.npy' to file names that lack it, so the existence
    # check has to use the full name or the cache would never be found.
    cache_path = os.path.join(cache_folder, 'en.tags.extracted.npy')
    if os.path.exists(cache_path):
        # The cache is an object array, which NumPy stores with pickle.
        # SECURITY: unpickling can execute arbitrary code -- only load cache
        # files that were written by this function.
        return np.load(cache_path, allow_pickle=True).tolist()

    tagged_texts = []
    for folder in tqdm(folders):
        tagged_text = np.loadtxt(
            os.path.join(folder, 'en.tags'),
            dtype=str,
            delimiter='\t',
            encoding='utf-8',
            comments=None,  # '#' can be a token; it must not start a comment
            ndmin=2,        # keep single-token documents two-dimensional
        )
        tagged_texts.append({
            'word': tagged_text[:, 0],
            'POS-tag': tagged_text[:, 1],
            'lemma': tagged_text[:, 2],
            'TARGET': [target_tag.split('-')[0] for target_tag in tagged_text[:, 3]],
        })
    np.save(cache_path, np.array(tagged_texts, dtype=object))
    return tagged_texts

In [8]:
def get_tagged_texts_as_pd(folders, cache_folder):
    """Parse every ``en.tags`` file into one DataFrame row, with a disk cache.

    Every ``en.tags`` file is read as a tab-separated table of strings.  The
    resulting frame has one row per folder and the columns ``'words'``,
    ``'pos_tags'``, ``'lemmas'`` (1-D string arrays taken from table columns
    0-2) and ``'targets'`` (list with the part of column 3 before the first
    ``'-'``).

    The frame is cached in ``<cache_folder>/en.tags.pd.extracted.pkl``; when
    that file exists it is returned as-is and ``folders`` is not consulted.
    Pickle is used instead of CSV because the cells hold whole sequences and
    CSV would only store their text representation.

    Parameters
    ----------
    folders : iterable of str
        Document folders, each containing an ``en.tags`` file.
    cache_folder : str
        Existing folder in which the cache file is stored.

    Returns
    -------
    pandas.DataFrame
        One row per folder, in the order of ``folders``.
    """
    # The same path is used for the existence check, the write and the read,
    # and it differs from the former CSV cache name so that a stale CSV file
    # is never fed to the unpickler.
    cache_path = os.path.join(cache_folder, 'en.tags.pd.extracted.pkl')
    if os.path.exists(cache_path):
        # SECURITY: unpickling can execute arbitrary code -- only load cache
        # files that were written by this function.
        return pd.read_pickle(cache_path)

    words = []
    pos_tags = []
    lemmas = []
    targets = []
    for folder in tqdm(folders):
        tagged_text = np.loadtxt(
            os.path.join(folder, 'en.tags'),
            dtype=str,
            delimiter='\t',
            encoding='utf-8',
            comments=None,  # '#' can be a token; it must not start a comment
            ndmin=2,        # keep single-token documents two-dimensional
        )
        words.append(tagged_text[:, 0])
        pos_tags.append(tagged_text[:, 1])
        lemmas.append(tagged_text[:, 2])
        targets.append([target_tag.split('-')[0] for target_tag in tagged_text[:, 3]])

    df = pd.DataFrame({'words': words, 'pos_tags': pos_tags, 'lemmas': lemmas, 'targets': targets})
    df.to_pickle(cache_path)
    return df

# Read the data

In [11]:
! pwd

/home/bender/PycharmProjects/NLP/notebooks


In [12]:
! ls -alth ../data/datasets/gmb-2.2.0/

итого 36K
drwxrwxr-x.   3 bender bender 4,0K сен 26 22:02 ..
-rw-r-----.   1 bender bender 7,4K июл  4  2014 README
-rw-rw-r--.   1 bender bender 2,0K июл  4  2014 NEWS
drwxr-x---.   4 bender bender 4,0K июл  2  2014 .
-rw-r-----.   1 bender bender  497 июл  2  2014 THANKS
drwxr-x---. 102 bender bender 4,0K июл  2  2014 data
-rw-r-----.   1 bender bender  279 июн 23  2014 LICENSE
drwxr-x---.   2 bender bender 4,0K авг 21  2013 doc


In [19]:
# Keep only the documents whose en.met metadata names the Voice of America subcorpus.
target_subcorpus_folders = filter_by_subcorpus(
    data_root_folder='../data/datasets/gmb-2.2.0/',
    subcorpus_name='subcorpus: Voice of America',
)

HBox(children=(IntProgress(value=0), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

In [20]:
# Raw text of every selected document; the corpus root doubles as the cache folder.
raw_texts = get_raw_texts(
    folders=target_subcorpus_folders,
    cache_folder='../data/datasets/gmb-2.2.0/',
)

HBox(children=(IntProgress(value=0, max=9167), HTML(value='')))

In [None]:
# Token-level annotations (word, POS tag, lemma, target tag) of every selected document.
tagged_texts = get_tagged_texts(
    folders=target_subcorpus_folders,
    cache_folder='../data/datasets/gmb-2.2.0/',
)

HBox(children=(IntProgress(value=0, max=9167), HTML(value='')))

In [None]:
# NOTE(review): this cell repeats the call of the previous cell verbatim --
# it looks like an accidental duplicate; confirm and consider removing it.
tagged_texts = get_tagged_texts(
    folders=target_subcorpus_folders,
    cache_folder='../data/datasets/gmb-2.2.0/',
)

HBox(children=(IntProgress(value=0, max=9167), HTML(value='')))

# Gather some statistics and draw a few plots

In [None]:
# Length, in characters, of every raw text.
character_counts = [len(raw_text) for raw_text in tqdm(raw_texts)]

In [None]:
# Number of entries in the 'lemma' column of every tagged text.
word_counts = [len(document['lemma']) for document in tqdm(tagged_texts)]

In [None]:
# Length of every entry of every 'lemma' column, flattened over all documents.
# NOTE(review): lemma lengths are measured, not surface-word lengths --
# confirm that this is what "word size" should mean.
word_sizes = list(map(
    len,
    (lemma for document in tqdm(tagged_texts) for lemma in document['lemma']),
))