# All necessary imports

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import sys

In [None]:
sys.path.append('..')

In [None]:
from source.code.utils.utils import filter_by_subcorpus
from source.code.utils.utils import get_tagged_texts_as_pd

In [None]:
from source.code.utils.preprocessing import additional_features
from source.code.utils.preprocessing import filtrations

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
from tqdm.autonotebook import tqdm

In [None]:
# Apply seaborn's theme; `color_codes=True` remaps matplotlib's shorthand
# color codes ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)

In [None]:
# Double the font size of all plot text elements.
# NOTE(review): every `sns.set` call re-applies the whole theme, using the
# defaults for any argument that is not passed here.
sns.set(font_scale=2)

In [None]:
%matplotlib inline

# Read the data

## Where are we at the moment

In [None]:
# Shell escape: print the working directory that the relative paths used in
# this notebook are resolved against.
! pwd

In [None]:
# Shell escape: list the dataset directory (all entries, long format,
# newest first, human-readable sizes) to confirm it is where we expect.
! ls -alth ../data/datasets/gmb-2.2.0/

## Get the subset of folders for target subcorpus (Voice of America)

In [None]:
# Ask the project helper for the corpus folders that match the
# "Voice of America" subcorpus marker (the matching rule itself lives in
# `filter_by_subcorpus`).
target_subcorpus_folders = filter_by_subcorpus(
    '../data/datasets/gmb-2.2.0/',
    'subcorpus: Voice of America',
)

## Read subcorpus

In [None]:
# Load the tagged texts of the selected folders through the project helper.
# The result is used as a pandas DataFrame by the cells further down.
tagged_texts_as_pd = get_tagged_texts_as_pd(
    target_subcorpus_folders,
    '../data/datasets/gmb-2.2.0/',
)

## Do a bit of preprocessing

In [None]:
# Run the project's `filtrations` preprocessing step on the raw table.
# NOTE(review): which rows/columns it filters is defined in
# source.code.utils.preprocessing — not visible from this notebook.
tagged_texts_as_pd_f = filtrations(tagged_texts_as_pd)

## Add new features describing words

In [None]:
# Run the project's `additional_features` step on the filtered table.
# NOTE(review): presumably this adds the derived columns plotted below
# (e.g. `word_len`, `is_title`, `contains_digits`) — confirm in
# source.code.utils.preprocessing.
tagged_texts_as_pd_f_add_f = additional_features(tagged_texts_as_pd_f)

## Take a quick look at the data

In [None]:
# Show the first rows transposed, so that each feature is a row and the
# wide table stays readable.
tagged_texts_as_pd_f_add_f.head().transpose()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the table.
tagged_texts_as_pd_f_add_f.info()

In [None]:
# Number of rows for each distinct value of the `ner_tag` column.
tagged_texts_as_pd_f_add_f['ner_tag'].value_counts()

# Word distributions (over all NER-tags combined)

## Words distribution across NER-tags

In [None]:
# Bar chart with the number of rows for each `ner_tag` value.
plt.figure(figsize=(15, 5))
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name collides with `data=` and
# raises TypeError. The keyword form works on every seaborn version.
ax = sns.countplot(x='ner_tag', data=tagged_texts_as_pd_f_add_f)
# Rotate the tick labels so the tag names do not overlap. `tick_params`
# does not depend on the label texts being populated before drawing.
ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Exact numbers behind the bar chart: rows per distinct `ner_tag` value.
tagged_texts_as_pd_f_add_f['ner_tag'].value_counts()

## Words distribution across sense numbers

In [None]:
# Bar chart with the number of rows for each `word_net_sense_number` value.
plt.figure(figsize=(15, 5))
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
sns.countplot(x='word_net_sense_number', data=tagged_texts_as_pd_f_add_f)
plt.show()

## Words distribution across POS-tags

In [None]:
# Bar chart with the number of rows for each `pos_tag` value.
plt.figure(figsize=(15, 5))
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
# No `orient` is passed: countplot derives the bar orientation from whether
# `x` or `y` is given, so with `x` the bars are always vertical.
ax = sns.countplot(x='pos_tag', data=tagged_texts_as_pd_f_add_f)
# Rotate the tick labels so the tag names do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Words distribution across semantic relation prepositions

In [None]:
# Bar chart with the number of rows for each `semantic_relation` value.
plt.figure(figsize=(15, 5))
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
# No `orient` is passed: countplot derives the bar orientation from whether
# `x` or `y` is given, so with `x` the bars are always vertical.
ax = sns.countplot(x='semantic_relation', data=tagged_texts_as_pd_f_add_f)
# Rotate the tick labels so the relation names do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Words distribution across animacy-tags

In [None]:
# Bar chart with the number of rows for each `animacy_tag` value.
plt.figure(figsize=(15, 5))
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
# No `orient` is passed: countplot derives the bar orientation from whether
# `x` or `y` is given, so with `x` the bars are always vertical.
ax = sns.countplot(x='animacy_tag', data=tagged_texts_as_pd_f_add_f)
# Rotate the tick labels so the tag names do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

# Word distributions (broken down by NER-tag)

## Words distribution across sense numbers

In [None]:
# One count plot of `word_net_sense_number` per `ner_tag` value, stacked
# vertically (col_wrap=1), each facet with its own axis ranges.
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
# No `orient` is passed: for kind="count" the orientation follows from
# whether `x` or `y` is given.
sns.catplot(
    x='word_net_sense_number',
    col='ner_tag',
    data=tagged_texts_as_pd_f_add_f,
    kind="count",
    col_wrap=1,
    sharex=False,
    sharey=False,
    aspect=3,
)
plt.show()

## Words distribution across POS-tags

In [None]:
# One count plot of `pos_tag` per `ner_tag` value, stacked vertically
# (col_wrap=1), each facet with its own axis ranges.
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
# No `orient` is passed: for kind="count" the orientation follows from
# whether `x` or `y` is given.
g = sns.catplot(
    x='pos_tag',
    col='ner_tag',
    data=tagged_texts_as_pd_f_add_f,
    kind="count",
    col_wrap=1,
    sharex=False,
    sharey=False,
    aspect=3,
)
# `.flat` visits every facet whether `g.axes` is a 1-D or a 2-D array.
for ax in g.axes.flat:
    # Rotate the tick labels so the tag names do not overlap.
    ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Words distribution across semantic relation prepositions

In [None]:
# One count plot of `semantic_relation` per `ner_tag` value, stacked
# vertically (col_wrap=1), each facet with its own axis ranges.
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
# No `orient` is passed: for kind="count" the orientation follows from
# whether `x` or `y` is given.
g = sns.catplot(
    x='semantic_relation',
    col='ner_tag',
    data=tagged_texts_as_pd_f_add_f,
    kind="count",
    col_wrap=1,
    sharex=False,
    sharey=False,
    aspect=3,
)
# `.flat` visits every facet whether `g.axes` is a 1-D or a 2-D array.
for ax in g.axes.flat:
    # Rotate the tick labels so the relation names do not overlap.
    ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Words distribution across animacy-tags

In [None]:
# One count plot of `animacy_tag` per `ner_tag` value, stacked vertically
# (col_wrap=1), each facet with its own axis ranges.
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
# No `orient` is passed: for kind="count" the orientation follows from
# whether `x` or `y` is given.
g = sns.catplot(
    x='animacy_tag',
    col='ner_tag',
    data=tagged_texts_as_pd_f_add_f,
    kind="count",
    col_wrap=1,
    sharex=False,
    sharey=False,
    aspect=3,
)
# `.flat` visits every facet whether `g.axes` is a 1-D or a 2-D array.
for ax in g.axes.flat:
    # Rotate the tick labels so the tag names do not overlap.
    ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Words distribution across word lengths

In [None]:
# One count plot of `word_len` per `ner_tag` value, stacked vertically
# (col_wrap=1), each facet with its own axis ranges.
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
# No `orient` is passed: for kind="count" the orientation follows from
# whether `x` or `y` is given.
sns.catplot(
    x='word_len',
    col='ner_tag',
    data=tagged_texts_as_pd_f_add_f,
    kind="count",
    col_wrap=1,
    sharex=False,
    sharey=False,
    aspect=3,
)
plt.show()

## Words distribution across semantic relation existence

In [None]:
# One count plot of `ner_tag` per `semantic_relation_tagged` value, stacked
# vertically (col_wrap=1), each facet with its own axis ranges.
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
# No `orient` is passed: for kind="count" the orientation follows from
# whether `x` or `y` is given.
g = sns.catplot(
    x='ner_tag',
    col='semantic_relation_tagged',
    data=tagged_texts_as_pd_f_add_f,
    kind="count",
    col_wrap=1,
    sharex=False,
    sharey=False,
    aspect=3,
)
# `.flat` visits every facet whether `g.axes` is a 1-D or a 2-D array.
for ax in g.axes.flat:
    # Rotate the tick labels so the tag names do not overlap.
    ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Words distribution across animacy tag existence

In [None]:
# One count plot of `ner_tag` per `animacy_tagged` value, stacked
# vertically (col_wrap=1), each facet with its own axis ranges.
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
# No `orient` is passed: for kind="count" the orientation follows from
# whether `x` or `y` is given.
g = sns.catplot(
    x='ner_tag',
    col='animacy_tagged',
    data=tagged_texts_as_pd_f_add_f,
    kind="count",
    col_wrap=1,
    sharex=False,
    sharey=False,
    aspect=3,
)
# `.flat` visits every facet whether `g.axes` is a 1-D or a 2-D array.
for ax in g.axes.flat:
    # Rotate the tick labels so the tag names do not overlap.
    ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Words distribution across word sense existence

In [None]:
# One count plot of `ner_tag` per `word_sense_exists` value, stacked
# vertically (col_wrap=1), each facet with its own axis ranges.
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
g = sns.catplot(
    x='ner_tag',
    col='word_sense_exists',
    data=tagged_texts_as_pd_f_add_f,
    kind="count",
    col_wrap=1,
    sharex=False,
    sharey=False,
    aspect=3,
)
# `.flat` visits every facet whether `g.axes` is a 1-D or a 2-D array.
for ax in g.axes.flat:
    # Rotate the tick labels so the tag names do not overlap.
    ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Words distribution across NER-tags (for titled and not titled words)

In [None]:
# One count plot of `ner_tag` per `is_title` value, stacked vertically
# (col_wrap=1), each facet with its own axis ranges.
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
g = sns.catplot(
    x='ner_tag',
    col='is_title',
    data=tagged_texts_as_pd_f_add_f,
    kind="count",
    col_wrap=1,
    sharex=False,
    sharey=False,
    aspect=3,
)
# `.flat` visits every facet whether `g.axes` is a 1-D or a 2-D array.
for ax in g.axes.flat:
    # Rotate the tick labels so the tag names do not overlap.
    ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Words distribution across NER-tags (for words with and without digits)

In [None]:
# One count plot of `ner_tag` per `contains_digits` value, stacked
# vertically (col_wrap=1), each facet with its own axis ranges.
# The column is passed as keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name raises TypeError there.
g = sns.catplot(
    x='ner_tag',
    col='contains_digits',
    data=tagged_texts_as_pd_f_add_f,
    kind="count",
    col_wrap=1,
    sharex=False,
    sharey=False,
    aspect=3,
)
# `.flat` visits every facet whether `g.axes` is a 1-D or a 2-D array.
for ax in g.axes.flat:
    # Rotate the tick labels so the tag names do not overlap.
    ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Distribution of lambda-DSR mean lengths across NER-tags

In [None]:
# Bar chart of `lambda_dsr_len` aggregated per `ner_tag` value (barplot's
# default estimator is the mean, drawn with an error bar).
plt.figure(figsize=(15, 5))
# Both columns are passed as keywords: seaborn >= 0.12 accepts only `data`
# positionally, so positional `x`/`y` raise TypeError there.
ax = sns.barplot(
    x="ner_tag",
    y="lambda_dsr_len",
    data=tagged_texts_as_pd_f_add_f,
)
# Rotate the tick labels so the tag names do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

## Distribution of word mean lengths across NER-tags

In [None]:
# Bar chart of `word_len` aggregated per `ner_tag` value (barplot's default
# estimator is the mean, drawn with an error bar).
plt.figure(figsize=(15, 5))
# Both columns are passed as keywords: seaborn >= 0.12 accepts only `data`
# positionally, so positional `x`/`y` raise TypeError there.
ax = sns.barplot(
    x="ner_tag",
    y="word_len",
    data=tagged_texts_as_pd_f_add_f,
)
# Rotate the tick labels so the tag names do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()

# Conclusion