# Notebook to investigate getting stats for puddin files

In [1]:
# coding=utf-8
#> imports
import pandas as pd
import pyconll
import sys

from collections import namedtuple
from itertools import islice
from pathlib import Path
from pprint import pprint

#> constants
DATA_GRP = 'val'
DATA_DIR = Path('data/puddin')
CAP = 2000


### Select dataset and load info dataframe

In [2]:
data_code = f'Pcc{DATA_GRP[:2].capitalize()}'
group_dir = DATA_DIR.joinpath(f'{data_code}.conll')

info_path = DATA_DIR.joinpath('info/validation_by_group/status-overview/')

for group_info_file in info_path.glob('*status-info*pkl*'):
    # for testing
    if not group_info_file.stem.startswith(data_code):
        continue
    else:
        group_info = pd.read_pickle(group_info_file)
        break
group_info.sample(3)


Unnamed: 0_level_0,conll_id,data_group,conllu_stem,docs_in_conllu,excl_type,known_fail,success,slice,text_altered,missing,slice_code
raw_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
pcc_val_12931,pcc_eng_val_1.7954_x12931,val,pcc_eng_val-01,9999,,False,True,pcc_eng_val_1,,False,1.0
pcc_val_15927,pcc_eng_val_1.9820_x15927,val,pcc_eng_val-01,9999,,False,True,pcc_eng_val_1,,False,1.0
pcc_val_16966,,val,,0,a0wrd,False,False,,True,False,


### Get conllu path from info dataframe

In [3]:
if not group_dir.is_dir():
    sys.exist('ERROR: cannot find conllu dir for', data_code, '\n>> Program Terminated.')
else:
    stem_col = group_info.conllu_stem
    for stem in stem_col.unique():
        conllu_path = group_dir.joinpath(f'{stem}.conllu')
        if conllu_path.is_file():# and conllu_path.stat().st_size > 0:
            print(f'conllu: {conllu_path}')
            #TODO: temp for debugging! REMOVE
            break
        else:
            print(f'{conllu_path} does not exist, or is not a file.')
    #! this should go within loop, but only doing one for devel so pulling it out
    print(f'Counting data in {conllu_path}...')

conllu: data/puddin/PccVa.conll/pcc_eng_val-01.conllu
Counting data in data/puddin/PccVa.conll/pcc_eng_val-01.conllu...


### Iterate over conllu file and collect word & character counts for each sentence

In [4]:
def read_sentence(sentence):
    sent_tuple = namedtuple(
        'sent_counts',
        ['Sid', 'Did', 'txt',
         'lemmas', 'words', 
         #? is `wlens` (word lengths) truly needed? 
         #  ^(mean wlen can be calculated by `chr_count/wrd_count` at any level)
         'wlens',
         'wrd_count', 'chr_count',
         'wlen_mean'])
    #* NOTE: this excludes all punctuation symbols!
    tok_objects = tuple(
        tok for tok in sentence._tokens if tok.deprel != 'punct')
    # lemmas = pd.Series(tok.lemma for tok in tok_objects)
    words = pd.Series(tok.form for tok in tok_objects)
    word_lengths = pd.Series(len(word) for word in words)
    word_count = len(words)
    char_count = sum(word_lengths)
    doc_id = sentence.id.rsplit('_',1)[0]
    yield sent_tuple(sentence.id, doc_id, sentence.text,
                    pd.Series(tok.lemma for tok in tok_objects), 
                    words, word_lengths,
                    word_count, char_count,
                    char_count/word_count)

def gen_sentence_info(conllu_path, sentence_cap: int=None):

    # doc_id = None
    file_iter = pyconll.iter_from_file(conllu_path)
    if sentence_cap:
        print(f'IN DEVELOPMENT: only reading first {sentence_cap} sentences')
        file_iter = islice(file_iter, sentence_cap)
        
    for sentence in file_iter:
        # if sentence.meta_present('newdoc id'):# and sentence.meta_value('newdoc_id') != doc_id:
        #     doc_id = sentence.meta_value('newdoc id')
        #     print(doc_id)
        # elif not doc_id:
        #     print(f'! WARNING: doc {doc_id} info not found!')
        # elif not sentence.id.startswith(doc_id):
        #     print(f'~!~ WARNING: doc {doc_id} and sentence ids do not match!')
        # # print(sentence.text)
        yield from read_sentence(sentence)
        
conll_iter = gen_sentence_info(conllu_path, sentence_cap=CAP)
sdf = pd.DataFrame(conll_iter)
sdf = sdf.set_index('Sid')
sdf.sample(3).sort_index()  
print(sdf.Did.describe())
sdf.describe().round(2)

IN DEVELOPMENT: only reading first 2000 sentences
count                          2000
unique                           91
top       pcc_eng_val_1.0016_x00024
freq                            133
Name: Did, dtype: object


Unnamed: 0,wrd_count,chr_count,wlen_mean
count,2000.0,2000.0,2000.0
mean,17.3,82.0,4.79
std,12.25,59.76,0.98
min,1.0,1.0,1.0
25%,8.0,37.0,4.19
50%,15.0,70.0,4.7
75%,24.0,114.0,5.3
max,76.0,392.0,12.0


## Group by document and add stats at document level

In [5]:
def describe_counts(df, prefix: str='s'):
    
    doc_dict = {}
    counts = df.loc[:, df.columns.str.endswith('count')]
    #> pull out only the distinguishing str
    counts.columns = counts.columns.str.replace('count','').str.strip('_')
    
    # first descriptor is "count" ~ do not need, so drop it here
    counts_desc = counts.describe().iloc[1:, :]
    # add median and sum rows, but "assign" as columns
    t_counts_desc = counts_desc.transpose().assign(
        #! turns out median is identical with `50%` value already in `counts_desc`
        #   and `wrd_total` is redundant with lmm_count but leaving that
        # median=counts.median(),
        total=counts.sum())

    # for each row of combined descriptive stats automated df:
    #   1. pull out row as its *own* dataframe
    #   2. rename cols to indicate row/data (i.e. 'wrd_' or 'chr_')
    for row_ix in t_counts_desc.index:
        row_df = t_counts_desc.loc[[row_ix], :]
        #// print('generalized `describe` metrics:',row_df.columns.to_list(),sep='\n')
        row_df.columns = prefix+row_ix+'_'+row_df.columns
        #// print('--become-->')
        #// print('individualized `describe` metrics:', row_df.columns.to_list(), sep='\n')
        row = row_df.iloc[0, :]
        row_dict = row.to_dict()
        doc_dict.update(row_dict)
    # pprint(doc_dict)
    return doc_dict



### Loop through by-sentence dataframe


In [6]:
def describe_word_level_series(wrd_lvl_ser: pd.Series, metric_prefix: str):
    
    if not metric_prefix.endswith('_'): 
        metric_prefix += '_'
    ser_desc = wrd_lvl_ser.describe()
    ser_desc = ser_desc.loc[ser_desc.index != 'count']
    ser_desc.index = metric_prefix + ser_desc.index
    doc_dict.update(ser_desc.to_dict())
    
    return doc_dict

d_dicts = []
prefix = 'S'
# TODO: make this a method/function
for doc, gdf in sdf.groupby('Did'):
    # print(doc)
    doc_dict = describe_counts(gdf,prefix)
    doc_dict = {f'D_{k}':v for k,v in doc_dict.items()}

    #TODO: 👉 use `s_wlen_list` col to calculate mean doc wlen (or not??)
    
    doc_lemmas = pd.Series(lm for lm_list in gdf.lemmas for lm in lm_list)
    doc_wlens = pd.Series(wl for wl_list in gdf.wlens for wl in wl_list)
    doc_add = {

        # document id
        'D_id': doc,  
              
        # total sentences in doc (~ sentences/per doc)
        # (synonymous with the `count` descriptors dropped above)
        'D_snt_count': len(gdf),
        
        'D_lemmas': doc_lemmas, 
        'D_lemmas_concat': ' '.join(doc_lemmas),
        
        # REPLACE ALL OF THIS MANUAL ENTRY WITH `describe()`
        # 'D_wlens': doc_wlens,
        # # total char in doc / total words in doc
        # # 'D_wlen_mean.0': doc_dict[f'D_{prefix}chr_total']/doc_dict[f'D_{prefix}wrd_total'],
        # # NOTE: this 👆 could also be done by getting the mean of all 
        # #   the wlen elements for each value/cell of `s_wlen_list` col, 
        # #   but that is unnecessary -> returns identical result:
        # # ^ But, if creating series of all word lengths _anyway_, this is cleaner:
        # 'D_wlen_mean': doc_wlens.mean(),
        
        # 'D_wlen_median': doc_wlens.median(),
        
        #? maybe don't need to keep the literal word lengths for stats above doc level?
        'D_wlens': doc_wlens

    }
    doc_dict.update(doc_add)
    doc_dict = describe_word_level_series(doc_lemmas, 'D_lmm')
    doc_dict = describe_word_level_series(doc_wlens, 'D_wlen')
    
    d_dicts.append(doc_dict)
    
d_stats = pd.DataFrame(d_dicts).convert_dtypes()
d_stats = d_stats.assign(D_id=d_stats.D_id.astype('string')).set_index('D_id')
d_stats.columns = d_stats.columns.str.replace('50%', 'median').str.replace('freq', 'topfreq')
d_stats.head(5)

Unnamed: 0_level_0,D_Swrd_mean,D_Swrd_std,D_Swrd_min,D_Swrd_25%,D_Swrd_median,D_Swrd_75%,D_Swrd_max,D_Swrd_total,D_Schr_mean,D_Schr_std,...,D_lmm_unique,D_lmm_top,D_lmm_topfreq,D_wlen_mean,D_wlen_std,D_wlen_min,D_wlen_25%,D_wlen_median,D_wlen_75%,D_wlen_max
D_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pcc_eng_val_1.0001_x00001,7.75,6.670832,2,3.75,6.0,9.0,23,62,34.625,24.142065,...,53,a,3,4.467742,2.244978,1,3.0,4.0,6.0,9
pcc_eng_val_1.0002_x00002,13.444444,8.719391,2,8.0,12.0,20.0,29,121,62.888889,42.609988,...,78,a,8,4.677686,2.523935,1,3.0,4.0,7.0,11
pcc_eng_val_1.0003_x00005,13.92,7.915807,1,8.0,14.0,18.0,35,348,62.24,32.156233,...,153,you,19,4.471264,2.284336,1,3.0,4.0,6.0,13
pcc_eng_val_1.0004_x00006,21.076923,14.068186,5,10.25,20.5,26.5,72,548,89.846154,58.064579,...,227,the,40,4.262774,2.323145,1,2.0,4.0,6.0,12
pcc_eng_val_1.0005_x00007,10.0,9.433981,1,1.0,9.0,17.0,22,50,50.2,48.215143,...,35,the,6,5.02,2.90313,1,3.0,4.5,7.0,12


In [7]:
def print_col_info(df):
    width = max(len(c) for c in df.columns)
    for c in df.columns:
        print(
            f'{c.rjust(width)} : {str(df[c].dtype).ljust(8)}{type(df[c][0])}')
    print('>> memory usage <<\n', df.memory_usage('deep').to_string(), sep='')
print_col_info(d_stats.loc[:,d_stats.dtypes.astype('string').str.startswith(('obj', 'str'))])

       D_lemmas : object  <class 'pandas.core.series.Series'>
D_lemmas_concat : string  <class 'str'>
        D_wlens : object  <class 'pandas.core.series.Series'>
      D_lmm_top : string  <class 'str'>
>> memory usage <<
Index              728
D_lemmas           728
D_lemmas_concat    728
D_wlens            728
D_lmm_top          728


In [8]:
# print_col_info(d_stats.convert_dtypes())
d_stats.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, pcc_eng_val_1.0001_x00001 to pcc_eng_val_1.0091_x00145
Data columns (total 30 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   D_Swrd_mean      91 non-null     Float64
 1   D_Swrd_std       89 non-null     Float64
 2   D_Swrd_min       91 non-null     Int64  
 3   D_Swrd_25%       91 non-null     Float64
 4   D_Swrd_median    91 non-null     Float64
 5   D_Swrd_75%       91 non-null     Float64
 6   D_Swrd_max       91 non-null     Int64  
 7   D_Swrd_total     91 non-null     Int64  
 8   D_Schr_mean      91 non-null     Float64
 9   D_Schr_std       89 non-null     Float64
 10  D_Schr_min       91 non-null     Int64  
 11  D_Schr_25%       91 non-null     Float64
 12  D_Schr_median    91 non-null     Float64
 13  D_Schr_75%       91 non-null     Float64
 14  D_Schr_max       91 non-null     Int64  
 15  D_Schr_total     91 non-null     Int64  
 16  D_snt_count      91 no

In [9]:

# * FLOATS
is_float = d_stats.dtypes.astype('string').str.startswith(('float', 'Float'))

# print('\nfloats unchanged')
# print_col_info(d_stats.loc[:,is_float])

#> downcast float
# print('\nfloat Downcast')
dwncst_df = d_stats.copy()
dwncst_df.loc[:, is_float] = dwncst_df.loc[:, is_float].apply(
    lambda c: pd.to_numeric(c, downcast='float'))
# print_col_info(dwncst_df.loc[:, is_float])
# 
# # * INTEGERS
is_int = d_stats.dtypes.astype('string').str.startswith(('int','Int'))

# print('\nintegers unchanged')
# print_col_info(d_stats.loc[:,is_int])
# 
# #> downcast unsigned
# print('\nUNsigned Downcast')
# dwncst_uint_df = dwncst_df.copy()
# dwncst_uint_df.loc[:, is_int] = dwncst_uint_df.loc[:, is_int].apply(
#     lambda c: pd.to_numeric(c, downcast='unsigned'))
# print_col_info(dwncst_uint_df.loc[:, is_int])
# 
# #> downcast signed
# print('\nsigned downcast')
# dwncst_int_df = dwncst_df.copy()
# dwncst_int_df.loc[:, is_int] = dwncst_int_df.loc[:, is_int].apply(
#     lambda c: pd.to_numeric(c, downcast='signed'))
# print_col_info(dwncst_int_df.loc[:, is_int])

# float_d_stats = d_stats.loc[:,d_stats.dtypes.str.startswith(('float','Float'))]
# float_downcast = float_d_stats.apply(lambda c: pd.to_numeric(c, downcast='float'))
# int_d_stats = d_stats.loc[:,d_stats.dtypes.str.startswith(('int','Int'))]
# int_downcast = int_d_stats

dwncst_df.loc[:, is_int] = dwncst_df.loc[:, is_int].apply(
    lambda c: pd.to_numeric(c, downcast='integer'))

dwncst_df.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, pcc_eng_val_1.0001_x00001 to pcc_eng_val_1.0091_x00145
Data columns (total 30 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   D_Swrd_mean      91 non-null     Float32
 1   D_Swrd_std       89 non-null     Float32
 2   D_Swrd_min       91 non-null     Int8   
 3   D_Swrd_25%       91 non-null     Float32
 4   D_Swrd_median    91 non-null     Float32
 5   D_Swrd_75%       91 non-null     Float32
 6   D_Swrd_max       91 non-null     Int8   
 7   D_Swrd_total     91 non-null     Int16  
 8   D_Schr_mean      91 non-null     Float32
 9   D_Schr_std       89 non-null     Float32
 10  D_Schr_min       91 non-null     Int16  
 11  D_Schr_25%       91 non-null     Float32
 12  D_Schr_median    91 non-null     Float32
 13  D_Schr_75%       91 non-null     Float32
 14  D_Schr_max       91 non-null     Int16  
 15  D_Schr_total     91 non-null     Int16  
 16  D_snt_count      91 no

In [10]:
d_stats = dwncst_df

In [11]:
hashable_stats = d_stats.loc[:, d_stats.dtypes != 'object']
trans_d_df = hashable_stats.transpose()
same_vals = trans_d_df.duplicated(keep=False)
equivalent_metrics = trans_d_df.loc[same_vals, :]
if equivalent_metrics.empty: 
    print('No metrics are identical/redundant.')
else: 
    print('These metrics are redundant:', ''.join('\n'+i for i in equivalent_metrics.index))
equivalent_metrics.sort_values(equivalent_metrics.columns[0])
    

No metrics are identical/redundant.


D_id,pcc_eng_val_1.0001_x00001,pcc_eng_val_1.0002_x00002,pcc_eng_val_1.0003_x00005,pcc_eng_val_1.0004_x00006,pcc_eng_val_1.0005_x00007,pcc_eng_val_1.0006_x00008,pcc_eng_val_1.0007_x00009,pcc_eng_val_1.0008_x00010,pcc_eng_val_1.0009_x00011,pcc_eng_val_1.0010_x00012,...,pcc_eng_val_1.0082_x00128,pcc_eng_val_1.0083_x00130,pcc_eng_val_1.0084_x00131,pcc_eng_val_1.0085_x00132,pcc_eng_val_1.0086_x00136,pcc_eng_val_1.0087_x00137,pcc_eng_val_1.0088_x00140,pcc_eng_val_1.0089_x00141,pcc_eng_val_1.0090_x00144,pcc_eng_val_1.0091_x00145


In [12]:
round_transdf = hashable_stats.transpose().round()
almost_same = round_transdf.duplicated(keep=False)
similar_metrics = round_transdf.loc[almost_same, :]
similar_metrics = similar_metrics.loc[~similar_metrics.index.isin(
    equivalent_metrics.index), :]
if similar_metrics.empty:
    print('No other metrics are similar enough to collapse entirely.')
else:
    print('These metrics could be collapsed:')
    similar_metrics


No other metrics are similar enough to collapse entirely.
