# Notebook to investigate getting stats for puddin files

In [1]:
# coding=utf-8
#> imports
import pandas as pd
# import statistics as st
import pyconll

from collections import namedtuple
from pathlib import Path
from pprint import pprint

#> constants
# DATA_GRP = 'val'
# DATA_DIR = Path('data/puddin')
# Sample CoNLL-U file that the rest of this notebook parses.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
#   runs where this exact location exists -- consider deriving it from a
#   configurable base directory (e.g. the commented-out `DATA_DIR` above).
CONLLU_SAMPLE_PATH = Path('/home/arh234/projects/puddin/demo/data/puddin/PccSa1.conll/pcc_eng_sample-1-01.conllu')


In [2]:
# def print_col_info(df):
    # width = max(len(c) for c in df.columns)
    # for c in df.columns:
    #     print(
    #         f'{c.rjust(width)} : {str(df[c].dtype).ljust(8)}{type(df[c][0])}')
    # print('>> memory usage <<\n', df.memory_usage('deep').to_string(), sep='')

I had been using `egrep` to generate counts for different units more quickly. However, I don't think that is the most effective way to get counts for anything below the conllu file level, i.e., per-document or per-sentence stats. Those will need to employ `pyconll` and actually parse the conllu formatting.

In [2]:
def gen_sentence_info(conllu_path):
    """Yield one record of word/character statistics per sentence of a CoNLL-U file.

    Tokens whose dependency relation is ``punct`` are excluded from every
    list and count below.

    Args:
        conllu_path: path of the CoNLL-U file, handed to
            ``pyconll.iter_from_file``.

    Yields:
        ``sent_counts`` namedtuple with the fields:
            sid       -- sentence id
            did       -- id of the most recent ``newdoc id`` seen (None if none yet)
            txt       -- sentence text
            lmm_list  -- lemmas of the kept tokens
            wrd_list  -- forms of the kept tokens
            wlen_list -- length of each kept form
            wrd_count -- number of kept tokens
            chr_count -- sum of ``wlen_list``
            wlen_mean -- ``chr_count / wrd_count``; NaN when no token is kept

    Side effects:
        Prints each new document id, and a warning when no document id has
        been seen yet or when a sentence id does not start with the current
        document id.
    """
    sent_tuple = namedtuple(
        'sent_counts',
        ['sid', 'did', 'txt',
         'lmm_list', 'wrd_list',
         #? is `wlen_list` truly needed?
         'wlen_list',
         'wrd_count', 'chr_count',
         'wlen_mean'])

    doc = None
    for sentence in pyconll.iter_from_file(conllu_path):

        # `doc` carries over from the sentence that opened the document
        if sentence.meta_present('newdoc id'):
            doc = sentence.meta_value('newdoc id')
            print(doc)
        elif not doc:
            print('! WARNING: doc info not found!')
        elif not sentence.id.startswith(doc):
            print('~!~ WARNING: doc and sentence ids do not match!')

        #* NOTE: this excludes all punctuation symbols!
        # Iterate the sentence itself instead of reaching into the private
        # `_tokens` attribute.
        # NOTE(review): multiword-token range lines, if the data contains any,
        #   would presumably pass this filter as well -- confirm whether they
        #   should be skipped.
        tok_objects = [tok for tok in sentence if tok.deprel != 'punct']
        lemmas = [tok.lemma for tok in tok_objects]
        words = [tok.form for tok in tok_objects]
        word_lengths = [len(word) for word in words]
        word_count = len(word_lengths)
        char_count = sum(word_lengths)
        # A punctuation-only sentence keeps no tokens; report NaN rather than
        # aborting the whole generator with a ZeroDivisionError.
        if word_count:
            char_per_word = char_count / word_count
        else:
            char_per_word = float('nan')

        yield sent_tuple(sentence.id, doc, sentence.text,
                         lemmas, words, word_lengths,
                         word_count, char_count,
                         char_per_word)



In [3]:
# Stream the sample file through the sentence generator and collect one row
# per sentence, keyed by sentence id.
s_conll_iter = gen_sentence_info(CONLLU_SAMPLE_PATH)
sdf = pd.DataFrame(s_conll_iter)
sdf = sdf.set_index('sid')
# Spot-check three randomly drawn sentences, shown in id order.
sdf.sample(n=3).sort_index()

pcc_eng_sample-1_1.01_x01
pcc_eng_sample-1_1.02_x02
pcc_eng_sample-1_1.03_x04
pcc_eng_sample-1_1.04_x07
pcc_eng_sample-1_1.05_x08
pcc_eng_sample-1_1.06_x09
pcc_eng_sample-1_1.07_x11
pcc_eng_sample-1_1.08_x12
pcc_eng_sample-1_1.09_x13
pcc_eng_sample-1_1.10_x16
pcc_eng_sample-1_1.11_x18
pcc_eng_sample-1_1.12_x19


Unnamed: 0_level_0,did,txt,lmm_list,wrd_list,wlen_list,wrd_count,chr_count,wlen_mean
sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
pcc_eng_sample-1_1.03_x04_6,pcc_eng_sample-1_1.03_x04,The bag is made after a template from Stinne K...,"[the, bag, be, make, after, a, template, from,...","[The, bag, is, made, after, a, template, from,...","[3, 3, 2, 4, 5, 1, 8, 4, 6, 5]",10,41,4.1
pcc_eng_sample-1_1.07_x11_05,pcc_eng_sample-1_1.07_x11,YOUR WEBSITE.,"[you, website]","[YOUR, WEBSITE]","[4, 7]",2,11,5.5
pcc_eng_sample-1_1.12_x19_8,pcc_eng_sample-1_1.12_x19,The UN health agency says virus samples from t...,"[the, UN, health, agency, say, virus, sample, ...","[The, UN, health, agency, says, virus, samples...","[3, 2, 6, 6, 4, 5, 7, 4, 3, 7, 4, 6, 9, 2, 5, ...",27,120,4.444444


In [4]:
# Summarise the document-id column first (sentence rows, distinct docs, most
# frequent doc), then the numeric per-sentence columns.
did_summary = sdf['did'].describe()
print(did_summary)
sdf.describe().round(decimals=2)

count                           169
unique                           12
top       pcc_eng_sample-1_1.07_x11
freq                             41
Name: did, dtype: object


Unnamed: 0,wrd_count,chr_count,wlen_mean
count,169.0,169.0,169.0
mean,14.04,65.23,4.85
std,9.42,44.25,1.11
min,1.0,5.0,2.75
25%,5.0,26.0,4.1
50%,14.0,67.0,4.71
75%,20.0,90.0,5.5
max,45.0,249.0,8.5


## Group by document and add stats at document level

In [5]:
# Column names available for the document-level grouping below.
list(sdf.columns)

['did',
 'txt',
 'lmm_list',
 'wrd_list',
 'wlen_list',
 'wrd_count',
 'chr_count',
 'wlen_mean']

In [6]:
def describe_counts(df, prefix: str='s'):
    """Flatten descriptive statistics of every ``*count`` column into one dict.

    Every column of ``df`` whose name ends with ``count`` is selected and
    relabelled by dropping that trailing ``count`` (plus any surrounding
    underscores), e.g. ``wrd_count`` -> ``wrd``. For each such column the
    ``describe()`` statistics except its leading ``count`` row are kept
    (``mean``, ``std``, ``min``, ``25%``, ``50%``, ``75%``, ``max``) and a
    ``total`` (column sum) is added.

    Args:
        df: dataframe holding one or more columns whose name ends in ``count``.
        prefix: string put in front of every key of the result.

    Returns:
        dict mapping ``f'{prefix}{label}_{stat}'`` to the statistic, e.g.
        ``'swrd_mean'`` or ``'schr_total'``.
    """
    suffix = 'count'
    is_count_col = df.columns.str.endswith(suffix)
    # Strip only the trailing suffix: a blanket replace would also eat a
    # "count" inside the name (e.g. `discount_count` -> `dis`). `rename`
    # returns a new frame, so the caller's `df` is never touched.
    counts = df.loc[:, is_count_col].rename(
        columns=lambda name: name[:-len(suffix)].strip('_'))

    # first descriptor is "count" ~ do not need, so drop it here
    counts_desc = counts.describe().iloc[1:, :]
    # one row per count column; "assign" the column sums as a `total` column
    #! median is not added: it is identical to the `50%` value of `describe()`
    t_counts_desc = counts_desc.transpose().assign(total=counts.sum())

    # flatten: one key per (count column, statistic) pair
    doc_dict = {}
    for label, row in t_counts_desc.iterrows():
        for stat, value in row.to_dict().items():
            doc_dict[f'{prefix}{label}_{stat}'] = value
    return doc_dict

#### Loop through by-sentence dataframe

def describe_doc(doc, gdf, prefix='S'):
    """Collect the document-level statistics for one document.

    Args:
        doc: document id (the group key of the by-sentence dataframe).
        gdf: the by-sentence rows belonging to `doc`; needs the `*_count`
            columns plus `lmm_list` and `wlen_list`.
        prefix: prefix handed to `describe_counts`; marks sentence-level
            metrics inside the resulting `D_...` keys.

    Returns:
        dict with, in order: the `describe_counts` output with every key
        prefixed by `D_`; `D_id`; `D_snt_count`; `D_wlen_mean`; `D_lemmas`;
        `D_wlens`; and the `describe()` output of the document's lemmas under
        `D_lmm_*` keys.
    """
    doc_dict = {f'D_{k}': v for k, v in describe_counts(gdf, prefix).items()}

    # flatten the per-sentence lists into one Series for the whole document
    doc_lemmas = pd.Series(lm for lm_list in gdf.lmm_list for lm in lm_list)
    doc_wlens = pd.Series(wl for wl_list in gdf.wlen_list for wl in wl_list)

    chr_total = doc_dict[f'D_{prefix}chr_total']
    wrd_total = doc_dict[f'D_{prefix}wrd_total']

    #NOTE: decided best to calculate average word length from the raw totals
    #   for the entire doc, rather than as the mean of the sentence-specific
    #   mean word lengths. Taking the mean of all `wlen_list` elements would
    #   return the identical result, so that is unnecessary.
    #! mode was dropped: it is problematic because it does not always return
    #   a single value
    #^ median word length was judged unnecessary
    doc_dict.update({
        # document id
        'D_id': doc,

        # total sentences in doc (~ sentences/per doc)
        # (synonymous with the `count` descriptors dropped in `describe_counts`)
        'D_snt_count': len(gdf),

        # total char in doc / total words in doc
        # (NaN instead of a ZeroDivisionError for a doc without any words)
        'D_wlen_mean': chr_total / wrd_total if wrd_total else float('nan'),

        # NOTE: each of these cells holds a whole Series object
        'D_lemmas': doc_lemmas,
        'D_wlens': doc_wlens
    })

    doc_lemmas_desc = doc_lemmas.describe()
    doc_lemmas_desc.index = 'D_lmm_' + doc_lemmas_desc.index
    doc_dict.update(doc_lemmas_desc.to_dict())
    return doc_dict


d_dicts = []
prefix = 'S'
for doc, gdf in sdf.groupby('did'):
    d_dicts.append(describe_doc(doc, gdf, prefix))
d_stats = pd.DataFrame(d_dicts).convert_dtypes()
d_stats = d_stats.assign(D_id=d_stats.D_id.astype('string')).set_index('D_id')
d_stats.head(5)

Unnamed: 0_level_0,D_Swrd_mean,D_Swrd_std,D_Swrd_min,D_Swrd_25%,D_Swrd_50%,D_Swrd_75%,D_Swrd_max,D_Swrd_total,D_Schr_mean,D_Schr_std,...,D_Schr_max,D_Schr_total,D_snt_count,D_wlen_mean,D_lemmas,D_wlens,D_lmm_count,D_lmm_unique,D_lmm_top,D_lmm_freq
D_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pcc_eng_sample-1_1.01_x01,9.666667,7.28011,3,4.0,9.0,13.0,24,87,42.444444,33.537707,...,108,382,9,4.390805,0 what 1 if 2 we 3 ...,0 4 1 2 2 2 3 4 4 3  ....,87,60,we,6
pcc_eng_sample-1_1.02_x02,12.0,7.641989,5,6.0,9.5,18.25,22,72,48.0,29.509321,...,90,288,6,4.0,0 I 1 be 2 get 3 ...,0 1 1 2 2 7 3 5 4 3  ....,72,53,be,5
pcc_eng_sample-1_1.03_x04,7.833333,6.853223,1,1.75,7.0,13.0,17,47,32.5,26.994444,...,67,195,6,4.148936,0 page 1 Sunday 2 Jul...,0 5 1 6 2 4 3 2 4 4 5...,47,39,be,3
pcc_eng_sample-1_1.04_x07,11.25,2.629956,9,9.75,10.5,12.0,15,45,46.5,11.269428,...,57,186,4,4.133333,0 the 1 banknote 2 ...,0 3 1 9 2 5 3 2 4 4 5...,45,30,the,4
pcc_eng_sample-1_1.05_x08,14.0,8.717798,4,11.0,18.0,19.0,20,42,60.666667,33.62043,...,83,182,3,4.333333,0 Redstone 1 Way 2 ...,0 8 1 3 2 3 3 8 4 3 5...,42,30,have,3


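🚩 Apparently converting dtypes makes the columns slightly *larger* in memory... this seems to coincide with the strange capitalization difference in the dtype names (presumably the nullable `Int64`/`Float64` vs. `int64`/`float64`)... 🤔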

In [22]:
# print('# dtypes after auto dtype conversion:')
# print_col_info(d_stats.convert_dtypes())

In [31]:
# Drop the object-dtype columns so every remaining cell can be hashed.
non_object_cols = d_stats.dtypes != 'object'
hashable_stats = d_stats.loc[:, non_object_cols]
# One row per metric: metrics with identical values across all documents
# show up as duplicated rows.
trans_d_df = hashable_stats.T
same_vals = trans_d_df.duplicated(keep=False)
equivalent_metrics = trans_d_df[same_vals]
if not equivalent_metrics.empty:
    metric_lines = ''.join('\n' + name for name in equivalent_metrics.index)
    print('These metrics are redundant:', metric_lines)
else:
    print('No metrics are identical/redundant.')
# Sorting by the first document's values puts equal metrics next to each other.
equivalent_metrics.sort_values(by=equivalent_metrics.columns[0])
    

These metrics are redundant: 
D_Swrd_50%
D_Swrd_median
D_Swrd_total
D_Schr_50%
D_Schr_median
D_lmm_count


D_id,pcc_eng_sample-1_1.01_x01,pcc_eng_sample-1_1.02_x02,pcc_eng_sample-1_1.03_x04,pcc_eng_sample-1_1.04_x07,pcc_eng_sample-1_1.05_x08,pcc_eng_sample-1_1.06_x09,pcc_eng_sample-1_1.07_x11,pcc_eng_sample-1_1.08_x12,pcc_eng_sample-1_1.09_x13,pcc_eng_sample-1_1.10_x16,pcc_eng_sample-1_1.11_x18,pcc_eng_sample-1_1.12_x19
D_Swrd_50%,9.0,9.5,7.0,10.5,18.0,17.5,9.0,18.0,17.0,12.5,4.0,19.0
D_Swrd_median,9.0,9.5,7.0,10.5,18.0,17.5,9.0,18.0,17.0,12.5,4.0,19.0
D_Schr_50%,31.0,37.5,28.5,47.5,77.0,77.5,27.0,83.0,106.0,67.0,15.0,90.0
D_Schr_median,31.0,37.5,28.5,47.5,77.0,77.5,27.0,83.0,106.0,67.0,15.0,90.0
D_Swrd_total,87.0,72.0,47.0,45.0,42.0,467.0,385.0,667.0,131.0,210.0,83.0,137.0
D_lmm_count,87.0,72.0,47.0,45.0,42.0,467.0,385.0,667.0,131.0,210.0,83.0,137.0


👆 Apparently `describe()`'s `50%` output is the same thing as `median()` (which makes sense, but I had not realized they were the same metric).

In [33]:
# Round every metric to whole numbers, then look for metrics that become
# identical across all documents only after rounding.
round_transdf = hashable_stats.transpose().round()
almost_same = round_transdf.duplicated(keep=False)
similar_metrics = round_transdf.loc[almost_same, :]
# leave out the metrics already reported as exactly identical
similar_metrics = similar_metrics.loc[~similar_metrics.index.isin(
    equivalent_metrics.index), :]
if similar_metrics.empty:
    print('No other metrics are similar enough to collapse entirely.')
else:
    print('These metrics could be collapsed:')
# A bare expression nested inside `if`/`else` is never rendered by the
# notebook; only the last top-level expression of a cell is displayed.
similar_metrics


No other metrics are similar enough to collapse entirely.


In [None]:

# by_d_df = d_stats.loc[:,
#     # columns to perpetuate
#     d_stats.columns.str.startswith('doc')
# ].round(3)
# by_d_df


Checking whether averaging the per-sentence averages gives the same result as doing the calculation directly.

In [None]:
# any(by_d_df.d_mean_s_avg_word_len != by_d_df.d_char_per_word)
