# All necessary imports

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt

In [3]:
import seaborn as sns
sns.set(color_codes=True)

In [4]:
from collections import Counter as counter

In [6]:
from tqdm.autonotebook import tqdm

In [7]:
%matplotlib inline

# Auxiliary functions

In [8]:
def filter_by_subcorpus(data_root_folder, subcorpus_name):
    """Select the document folders whose metadata mentions a given subcorpus.

    The data is expected to be laid out two levels deep:
    ``<data_root_folder>/<part>/<document>/en.met``.

    Parameters
    ----------
    data_root_folder : str
        Directory whose sub-directories each contain document folders.
    subcorpus_name : str
        Text searched for in every document's ``en.met`` file, after newline
        characters have been removed from the file content.

    Returns
    -------
    numpy.ndarray
        1-D string array with the paths of the matching document folders, in
        sorted order. Empty when nothing matches.

    Raises
    ------
    OSError
        If ``data_root_folder`` cannot be listed, or a document folder has no
        readable ``en.met`` file.
    """
    data_folders = []
    # ``os.listdir`` returns entries in arbitrary order; sorting keeps the
    # selection (and everything derived from it) reproducible across runs.
    for folder in tqdm(sorted(os.listdir(data_root_folder))):
        part_path = os.path.join(data_root_folder, folder)
        # Stray files next to the part folders (README, .DS_Store, ...) would
        # make the nested ``os.listdir`` call fail, so they are skipped.
        if not os.path.isdir(part_path):
            continue
        for subfolder in sorted(os.listdir(part_path)):
            document_path = os.path.join(part_path, subfolder)
            if os.path.isdir(document_path):
                data_folders.append(document_path)

    mask = []
    for folder in tqdm(data_folders):
        with open(os.path.join(folder, 'en.met'), 'r', encoding='utf-8') as myfile:
            # Newlines are dropped before the substring test, exactly as the
            # search text is expected to appear on a single metadata line.
            mask.append(subcorpus_name in myfile.read().replace('\n', ''))

    # Explicit dtypes keep the result a string array even when no folder was
    # found (an empty list would otherwise become a float array).
    return np.array(data_folders, dtype=str)[np.array(mask, dtype=bool)]

In [9]:
def get_raw_texts(folders):
    """Read the ``en.raw`` file of every folder as one newline-free string.

    Parameters
    ----------
    folders : iterable of str
        Document folders, each containing an ``en.raw`` file.

    Returns
    -------
    list of str
        File contents in the order of ``folders``, with every newline
        character removed.
    """
    def _read_without_newlines(path):
        # Read the whole UTF-8 file and join its lines with nothing in between.
        with open(path, 'r', encoding='utf-8') as handle:
            content = handle.read()
        return content.replace('\n', '')

    return [
        _read_without_newlines(os.path.join(document_folder, 'en.raw'))
        for document_folder in tqdm(folders)
    ]

In [11]:
def get_tagged_texts(folders):
    """Load the tab-separated annotations stored in each folder's ``en.tags``.

    Parameters
    ----------
    folders : iterable of str
        Document folders, each containing an ``en.tags`` file with at least
        four tab-separated columns per line.

    Returns
    -------
    list of dict
        One dictionary per folder, in input order, with the keys

        * ``'word'``    - column 0 (numpy string array),
        * ``'POS-tag'`` - column 1 (numpy string array),
        * ``'lemma'``   - column 2 (numpy string array),
        * ``'TARGET'``  - column 3 truncated before its first ``'-'``
          (plain list of str).

    Raises
    ------
    OSError
        If an ``en.tags`` file cannot be opened.
    ValueError
        If the rows of a file do not all have the same number of columns.
    IndexError
        If a file has fewer than four columns.
    """
    tagged_texts = []
    for folder in tqdm(folders):
        tagged_text = np.loadtxt(
            os.path.join(folder, 'en.tags'),
            dtype=str,
            delimiter='\t',
            encoding='utf-8',
            # By default ``loadtxt`` treats '#' as a comment marker, which
            # silently drops or truncates rows whose token contains '#'.
            comments=None,
            # A file with a single row would otherwise load as a 1-D array
            # and break the column slicing below.
            ndmin=2,
        )
        tagged_texts.append({
            'word': tagged_text[:, 0],
            'POS-tag': tagged_text[:, 1],
            'lemma': tagged_text[:, 2],
            # Keep only the part of the tag before the first '-'.
            'TARGET': [target_tag.split('-')[0] for target_tag in tagged_text[:, 3]],
        })
    return tagged_texts

# Read the data

In [12]:
# Keep only the documents whose en.met file contains the Voice of America marker.
target_subcorpus_folders = filter_by_subcorpus(
    data_root_folder='../data/datasets/gmb-2.2.0/data/',
    subcorpus_name='subcorpus: Voice of America',
)

HBox(children=(IntProgress(value=0), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [13]:
# One newline-free string per selected document, read from its en.raw file.
raw_texts = get_raw_texts(folders=target_subcorpus_folders)

HBox(children=(IntProgress(value=0, max=9167), HTML(value='')))




In [14]:
# One dict of word / POS-tag / lemma / TARGET columns per selected document.
tagged_texts = get_tagged_texts(folders=target_subcorpus_folders)

HBox(children=(IntProgress(value=0, max=9167), HTML(value='')))




# Gather some statistics and draw a few plots

In [None]:
# Length of every raw document, measured in characters.
character_counts = [len(raw_text) for raw_text in tqdm(raw_texts)]

In [None]:
# Number of annotated rows per document (the 'lemma' column has one entry per row).
word_counts = [len(document['lemma']) for document in tqdm(tagged_texts)]

In [None]:
# Character length of every lemma, pooled over all documents.
# NOTE(review): sizes are taken from the 'lemma' column, not from 'word' -
# confirm that lemma length is the intended statistic.
word_sizes = [
    len(lemma) for document in tqdm(tagged_texts) for lemma in document['lemma']
]

In [None]:
# Flat list of every TARGET tag, pooled over all documents.
target_tags = [
    target for document in tqdm(tagged_texts) for target in document['TARGET']
]

In [None]:
# Tally how often each TARGET tag occurs. The name `target_tags` is rebound
# from the flat tag list to the resulting Counter (`counter` is the alias
# under which collections.Counter was imported).
target_tags = counter(target_tags)

In [None]:
# Display every (tag, count) pair, most frequent first.
target_tags.most_common()

In [None]:
# Histogram of document lengths in characters. Title and axis labels are set
# so that the figure can be understood without reading the code.
fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(character_counts, bins=50)
ax.set(
    title='Document length distribution (characters)',
    xlabel='Characters per document',
    ylabel='Number of documents',
)
plt.show()

In [None]:
# Histogram of document lengths in tokens. Title and axis labels are set so
# that the figure can be understood without reading the code.
fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(word_counts, bins=50)
ax.set(
    title='Document length distribution (tokens)',
    xlabel='Tokens per document',
    ylabel='Number of documents',
)
plt.show()

In [None]:
# Histogram of lemma lengths in characters. Title and axis labels are set so
# that the figure can be understood without reading the code.
fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(word_sizes, bins=50)
ax.set(
    title='Lemma length distribution',
    xlabel='Characters per lemma',
    ylabel='Number of tokens',
)
plt.show()