# Notebook to investigate getting stats for puddin files

In [136]:
import pandas as pd
from pathlib import Path
import statistics as st
import pyconll
from collections import namedtuple

DATA_GRP = 'val'
DATA_DIR = Path('data/puddin')


In [137]:
data_code = f'Pcc{DATA_GRP[:2].capitalize()}'
group_dir = DATA_DIR.joinpath(f'{data_code}.conll')
# for group_info_file in DATA_DIR.joinpath('info/validation_by_group/status-overview/'
#                                          ).glob('*status-info*pkl*'):
#     # for testing
#     if not group_info_file.stem.startswith(data_code):
#         continue
#     else:
#         group_info = pd.read_pickle(group_info_file)
#         break
# group_info


I had been using `egrep` to more quickly generate counts for different units. However, I don't think that is the most effective way to go about getting counts for anything below the conllu file level; i.e. per document or per sentence stats. Those will need to employ `pyconll` and actually parse the conllu formatting. 

In [138]:
def count_contents(conllu_path):
    reader = pyconll.iter_from_file(conllu_path)
    conllu_df = pd.DataFrame(reader)
    return conllu_df


In [139]:
if not group_dir.is_dir():
    print('CANNOT FIND CONLLU DIR FOR', data_code)
else:
    stem_col = group_info.conllu_stem
    for stem in stem_col.unique():
        conllu_path = group_dir.joinpath(f'{stem}.conllu')
        if conllu_path.is_file():# and conllu_path.stat().st_size > 0:

            print(f'conllu: {conllu_path}')
        # else:
        #     print(f'{conllu_path} does not exist, or is not a file.')
        #> this should go within loop, but only doing one for devel so pulling it out
        # d_counts = count_contents(conllu_path)
    print(f'Counting data in {conllu_path}...')
    # d_counts = count_contents(conllu_path)

# > developing method for counting with pyconll object with only last path of loop
# conllu_reader = pyconll.iter_from_file(conllu_path)


seeing if pandas can do anything with the pyconll object...

In [141]:
# def gen_sentences(conll_df):

#     for i in range(len(conll_df)):
#         sentence = []
#         for x in range(len(conll_df.columns)):
#             word = conll_df.iat[i,x]
#             if word:
#                 sentence.append(word)
#         # print(*[w.form for w in sentence])
#         yield sentence


# sentences_gen = gen_sentences(s_df)
# for s in sentences_gen:
#     print(*[w.form for w in s])
# # df = pd.DataFrame(pyconll.iter_from_file(conllu_path))
# # sentence_gen = gen_sences(df)


This is no better--worse even probably--than just iterating through the pyconll object.

In [142]:
def print_col_info(df):
    width = max(len(c) for c in df.columns)
    for c in df.columns:
        print(
            f'{c.rjust(width)} : {str(df[c].dtype).ljust(8)}{type(df[c][0])}')
    mem_use = df.memory_usage('deep')
    print('>> memory usage <<\n',mem_use.to_string(), sep='')


In [143]:
sent_tuple = namedtuple('sent_counts',
                        ['s_id', 's_txt', 's_wrd', 's_chr', 's_wlen_list', 's_md_wlen', 's_mn_wlen', 's_cpw'])


In [144]:
def gen_sentence_info(conllu_path):

    for sentence in pyconll.iter_from_file(conllu_path):
        word_lengths = [len(w.form)
                        for w in sentence._tokens if w.deprel != 'punct']
        # print(word_lengths)
        word_count = len(word_lengths)
        md_word_len = st.median(word_lengths)
        mn_word_len = st.mean(word_lengths)
        char_count = sum(word_lengths)
        char_per_word = char_count/word_count
        yield sent_tuple(sentence.id, sentence.text, word_count, char_count, word_lengths,
                         md_word_len, mn_word_len, char_per_word)


# conllu_counts_df = pd.DataFrame(gen_sentence_info(conllu_path))
# conllu_counts_df.head()
s_df = pd.DataFrame(gen_sentence_info(
    'data/puddin/PccSa1.conll/pcc_eng_sample-1-01.conllu'))



In [145]:
str_cols = s_df.columns.str.endswith(('id', 'xt'))
s_df.loc[:, str_cols] = s_df.loc[:, str_cols].astype('string')

In [146]:
sent_id_split = s_df.s_id.str.rsplit('_', 1)

s_df = s_df.assign(d_id=sent_id_split.str.get(0),
                   #  s_ix = sent_id_split.str.get(1),
                   ).convert_dtypes()

## Compare Dtypes

In [147]:
count_cols = s_df.columns.str.endswith(('chr', 'wrd'))
print('Initial')
print_col_info(s_df.loc[:, count_cols])

# downcast unsigned
print('\nUNsigned Downcast')
uint_df = s_df.copy()
uint_df.loc[:, count_cols] = uint_df.loc[:, count_cols].apply(
    lambda c: pd.to_numeric(c, downcast='unsigned'))
print_col_info(uint_df.loc[:, count_cols])

#downcast signed
print('\nsigned downcast')
int_df = s_df.copy()
int_df.loc[:, count_cols] = int_df.loc[:, count_cols].apply(
    lambda c: pd.to_numeric(c, downcast='signed'))
print_col_info(int_df.loc[:, count_cols])

Initial
s_wrd : Int64   <class 'numpy.int64'>
s_chr : Int64   <class 'numpy.int64'>
>> memory usage <<
Index     128
s_wrd    1521
s_chr    1521

UNsigned Downcast
s_wrd : UInt8   <class 'numpy.uint8'>
s_chr : UInt8   <class 'numpy.uint8'>
>> memory usage <<
Index    128
s_wrd    338
s_chr    338

signed downcast
s_wrd : Int8    <class 'numpy.int8'>
s_chr : Int16   <class 'numpy.int16'>
>> memory usage <<
Index    128
s_wrd    338
s_chr    507


## Group by document and add stats at document level

In [156]:
def get_doc_stats(sentence_df):
    
    d_dicts = []
    d_tups = []
    d_sers = []
    print(f'\n### input data type: {sentence_df.s_chr.dtype}')
    for doc, gdf in sentence_df.groupby('d_id'):
        # print(doc)
        # pprint(gdf.s_txt.to_list())
        s_chr = gdf.s_chr
        # print(s_chr.dtype)

        d_chr = s_chr.sum()
        d_mn_s_chr = s_chr.mean()
        d_md_s_chr = s_chr.median()
        d_min_s_chr = s_chr.min()
        # d_max_s_chr =
        # print(f'least characters in a sentence: {d_min_s_chr}',f'most characters in a sentence: {d_max_s_chr}', sep='\n')

        s_wrd = gdf.s_wrd
        # print(s_wrd.dtype)
        # pprint(s_wrd.to_list())
        d_wrd = s_wrd.sum()
        d_mn_s_wrd = s_wrd.mean()
        d_md_s_wrd = s_wrd.median()
        d_min_s_wrd = s_wrd.min()
        d_max_s_wrd = s_wrd.max()
        # print(f'least words in a sentence: {d_min_s_wrd}',f'most words in a sentence: {d_max_s_wrd}', sep='\n',end='\n\n')

        d_snt = len(gdf)

        d_mn_s_cpw = gdf.s_cpw.mean()
        d_mn_s_mn_wlen = gdf.s_mn_wlen.mean()  # this should be same as above
        d_mn_s_md_wlen = gdf.s_md_wlen.mean()

        d_md_s_md_wlen = gdf.s_md_wlen.median()

        d_mo_s_md_wlen = gdf.s_md_wlen.mode()
        ddict = {
            'd_id': doc,        # document id
            # total characters in doc (~ doc length in characters)
            'd_chr': d_chr,
            'd_mn_s_chr': d_mn_s_chr,  # mean sent length in characters
            'd_md_s_chr': d_md_s_chr,  # median sent length in characters
            # total words in doc (~ doc length in words)
            'd_wrd': d_wrd,
            'd_mn_s_wrd': d_mn_s_wrd,  # mean sent length in words
            'd_md_s_wrd': d_md_s_wrd,  # median sent length in words
            # total sentences in doc (~ doc length in sentences)
            'd_snt': d_snt,
            # total char in doc / total words in doc # * same as mean(d_chr/d_wrd)?
            'd_cpw': d_chr/d_wrd,
            # total words in doc / total sentences in doc # * same as mean(d_wrd/d_snt)?
            'd_wps': d_wrd/d_snt,
            # mean of char/word value for all sentences in doc ~ mean word length
            'd_mn_s_cpw': d_mn_s_cpw,

            'd_mn_s_mn_wlen': d_mn_s_mn_wlen,  # doc mean of sent mean word length
            'd_mn_s_md_wlen': d_mn_s_md_wlen,  # doc mean of sent median word length
            'd_md_s_md_wlen': d_md_s_md_wlen,  # doc median of sent median word length

            #! causes issues because not always a single value
            # 'd_mo_s_md_wlen': d_mo_s_md_wlen,  # doc mode of sent median word length

            'd_min_s_chr': d_min_s_chr,
            'd_max_s_chr': s_chr.max(),

            'd_min_s_wrd': d_min_s_wrd,
            'd_max_s_wrd': d_max_s_wrd,
        }
        # print(ddict['d_max_s_chr'], end='\n\n')

        d_dicts.append(ddict)
    return d_dicts


In [163]:
for df in [s_df, uint_df, int_df]:
    doc_dicts = get_doc_stats(df)
    print('\n# Dict Values and Types:')
    for chr_val in (d['d_max_s_chr'] for d in doc_dicts): 
        print(chr_val, type(chr_val), sep='\t')
    d_stats = pd.DataFrame(doc_dicts)
    print('\n# Results when converted to dataframe:')
    print(f'dtype: {d_stats.d_max_s_chr.dtype}')
    d_stats.d_max_s_chr
    
    actual_max_chr = pd.Series(d['d_max_s_chr'] for d in doc_dicts)
    mod_dstats = d_stats.assign(given_max_char=d_stats.d_max_s_chr,
                  accurate_max_char_int=pd.to_numeric(actual_max_chr, downcast='signed'))
    print(mod_dstats
          .assign(accurate_max_char_uint=pd.to_numeric(
              d_stats.accurate_max_char_int, downcast='unsigned'))
          .sort_values('accurate_max_char')
          .loc[:, ['given_max_char','accurate_max_char']]
          .to_string())



### input data type: Int64

# Dict Values and Types:
108	<class 'numpy.int64'>
90	<class 'numpy.int64'>
67	<class 'numpy.int64'>
57	<class 'numpy.int64'>
83	<class 'numpy.int64'>
164	<class 'numpy.int64'>
111	<class 'numpy.int64'>
197	<class 'numpy.int64'>
185	<class 'numpy.int64'>
249	<class 'numpy.int64'>
119	<class 'numpy.int64'>
138	<class 'numpy.int64'>

# Results when converted to dataframe:
dtype: int64


AttributeError: 'DataFrame' object has no attribute 'accurate_max_char_int'

In [None]:
# pylint: disable=missing-module-docstring
trans_d_df = d_stats.transpose()
trans_d_df


In [None]:
same_vals = trans_d_df.duplicated(keep=False)
equiv_metric = trans_d_df.loc[same_vals, :]
equiv_metric


In [None]:
round_transdf = d_stats.set_index('d_id').transpose().round()
round_transdf


In [None]:
almost_same = round_transdf.duplicated(keep=False)
similar_metric = round_transdf.loc[almost_same, :]
similar_metric.loc[~similar_metric.index.isin(equiv_metric.index), :]


In [None]:

# by_d_df = d_stats.loc[:,
#     # columns to perpetuate
#     d_stats.columns.str.startswith('doc')
# ].round(3)
# by_d_df


Seeing if it comes out the same to average on top of averages or just do the calculation directly

In [None]:
# any(by_d_df.d_mean_s_avg_word_len != by_d_df.d_char_per_word)
