# Overview

## Objective

### Guiding Questions:
1. How are sequence lengths distributed, and what padding length does that distribution suggest?

### Key findings
1. _TODO: summarise the findings once the plots below have been run._

#### Imports and setup

In [2]:
# must go first
%matplotlib inline
%config InlineBackend.figure_format='retina'

# plotting
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

# seaborn baseline: library defaults first, then the poster context and the
# plain white style on top of them.
sns.set()
sns.set_context(context='poster', font_scale=1.3)
sns.set_style(style='white')

# Matplotlib overrides. They are applied after the seaborn calls above, so
# for any key both touch, the value written here is the one left in rcParams.
mpl_update = {
    # figure geometry
    'figure.figsize': [12.0, 8.0],

    # font sizes
    'font.size': 16,
    'axes.titlesize': 20,
    'axes.labelsize': 20,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,

    # colours
    'text.color': '#677385',
    'axes.labelcolor': '#677385',
    'lines.color': '#0055A7',

    # line weight
    'lines.linewidth': 3,

    # optional font override, currently disabled:
    # 'font.family': 'sans-serif',
    # 'font.sans-serif': 'Tahoma'
}
mpl.rcParams.update(mpl_update)

print('Imports 1/2 done!')

Imports 1/2 done!


In [3]:
import os
import pickle
import time
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from typing import List, Any

# basic wrangling
import numpy as np

# TensorFlow
import tensorflow as tf

# babil
from babil.data.preprocessing import Dataset, ConllData
from babil.utils.config import PathTracker, set_global_seed

print('Imports 2/2 done!')



Imports 2/2 done!


#### Setting the global seed

In [4]:
# Seed through babil's helper; it is called with no arguments, so whatever
# default that helper defines is what applies.
# NOTE(review): which RNGs (python / numpy / TensorFlow) it seeds is not
# visible from this notebook — confirm in babil.utils.config.
set_global_seed()
print('Global seed set!')

Global seed set!


#### Load up train and dev data

In [5]:
print('Reading data from conll...')

# The local config lives one directory above the notebook's working directory.
config_file = os.path.join(os.pardir, 'local_config.json')
path_to = PathTracker.from_json(config_file)

# One ConllData object per split, each built from the path named in the config.
train = ConllData(path_to.train)
dev = ConllData(path_to.dev)

print('Done!')

Reading data from conll...
Done!


#### Read vocab from pickled file

In [6]:
print('Loading pickled vocab...')
vocab = None  # stays None if opening or unpickling the file fails
vocab_path = os.path.join(path_to.data, 'Vocab.pickle')
with open(vocab_path, 'rb') as vocab_file:
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file, so only point this at a vocab file produced by this project.
    vocab = pickle.load(vocab_file)

print('Done!')

Loading pickled vocab...
Done!


#### Convert from ConllData to Dataset

In [7]:
print('Converting to Dataset...')
# Both splits are wrapped with the same `vocab` object.
# NOTE(review): presumably this keeps token ids consistent between train and
# dev — confirm against babil.data.preprocessing.Dataset.
train_ds = Dataset(train, vocab)
dev_ds = Dataset(dev, vocab)

print('Done!')

Converting to Dataset...
Done!


#### Define a simple plotting function

In [8]:
def plot_histogram(sequences: List[List[Any]], bins=10):
    """Plot, save and display a histogram of sequence lengths.

    Args:
        sequences: Collection of sequences; only ``len()`` of each item
            is used.
        bins: Bin specification forwarded to ``plt.hist`` (default 10).

    Side effects:
        Writes ``<mmdd_HH-MM-SS>_seq_len_hist.png`` into the ``figures``
        directory under ``path_to.project_root`` (created if missing) and
        then shows the figure.
    """
    timestamp = time.strftime('%m%d_%H-%M-%S')
    name = f'{timestamp}_seq_len_hist.png'
    sizes = [len(seq) for seq in sequences]

    # plt.hist bins the raw values itself. Feeding it the (counts, edges)
    # tuple from np.histogram would plot a histogram of those two arrays
    # rather than of the sequence lengths.
    plt.hist(sizes, bins=bins)
    plt.title('Sequence length histogram')
    plt.xlabel('Sequence length')
    plt.ylabel('Number of sequences')
    plt.tight_layout()

    figures_dir = os.path.join(path_to.project_root, 'figures')
    os.makedirs(figures_dir, exist_ok=True)

    # Save before show: with the inline backend plt.show() finalises the
    # current figure, so a savefig issued afterwards writes an empty canvas.
    plt.savefig(os.path.join(figures_dir, name))
    plt.show()


For the training dataset, plot a histogram with 5 bins.

In [None]:
print('Plotting histogram...')
# bins=5 overrides plot_histogram's default of 10.
plot_histogram(train_ds.X, bins=5)

Hmmm. No.

In [None]:
# Length of every sequence in train_ds.X, in ascending order.
sizes = sorted(len(seq) for seq in train_ds.X)

In [None]:
# np.histogram returns a (counts, bin_edges) tuple; with no `bins` argument
# numpy falls back to 10 equal-width bins.
hist_array = np.histogram(sizes)

In [None]:
# Relies on `sizes` already being sorted ascending: index 0 is read as the
# minimum and index -1 as the maximum.
print(f'Num sentences: {len(sizes)}\n'
      f'Shortest sentence: {sizes[0]}\n'
      f'Longest sentence: {sizes[-1]}\n')

In [None]:
def plot_freqs(counter):
    """Plot a bar chart of sentence-length frequencies, most frequent first.

    Args:
        counter: Mapping from sentence length to the number of sentences
            with that length (e.g. a ``collections.Counter``).

    Raises:
        ValueError: If ``counter`` is empty, since there is nothing to plot.
    """
    if not counter:
        raise ValueError('counter is empty; nothing to plot')

    len_, freq = zip(*counter.items())

    # np.argsort yields the indices that sort ascending; reversing them
    # orders both arrays from the most to the least frequent length.
    ind_sort = np.argsort(freq)[::-1]
    len_ = np.array(len_)[ind_sort]
    freq = np.array(freq)[ind_sort]

    indexes = np.arange(len(len_))

    plt.bar(indexes, freq)

    # Bars sit at their frequency rank, so the ticks must carry the actual
    # sentence length or the 'Sentence length' axis label would be wrong.
    # Only about 20 evenly spaced ticks are labelled to keep them legible.
    step = max(1, len(indexes) // 20)
    tick_labels = [str(length) for length in len_[::step]]
    plt.xticks(indexes[::step], tick_labels, rotation=90)

    plt.title('Sentence length by frequency')
    plt.xlabel('Sentence length')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Map each sentence length to the number of sentences that have it.
c = Counter(sizes)

In [None]:
# Bar chart of the length counts, ordered from most to least frequent.
plot_freqs(c)
