# Notebook to investigate getting stats for puddin files

In [None]:
import pandas as pd
from pathlib import Path
# Data group to inspect; its first two letters are used to build `data_code` (e.g. 'val' -> 'PccVa').
DATA_GRP = 'val'
# Base directory of the puddin data; the `info/` and `*.conll` directories are resolved relative to it.
DATA_DIR = Path('data/puddin')

In [None]:
# Short code identifying the data group, e.g. DATA_GRP 'val' -> 'PccVa'.
data_code = f'Pcc{DATA_GRP[:2].capitalize()}'
status_dir = DATA_DIR.joinpath('info/validation_by_group/status-overview/')

# sorted() makes the selection reproducible: glob order depends on the filesystem.
matching_info_files = sorted(
    info_file for info_file in status_dir.glob('*status-info*pkl*')
    if info_file.stem.startswith(data_code))

# Fail with a clear message instead of a NameError (or stale kernel state)
# on `group_info` when nothing matches.
if not matching_info_files:
    raise FileNotFoundError(
        f'No status-info pickle for {data_code} found in {status_dir}')

# for testing: only the first matching file is loaded
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
group_info_file = matching_info_files[0]
group_info = pd.read_pickle(group_info_file)
group_info

I had been using `egrep` to generate counts for different units more quickly. However, I don't think that is the most effective way to get counts for anything below the conllu file level, i.e. per-document or per-sentence stats. Those will need to employ `pyconll` and actually parse the conllu formatting.

In [None]:
import pyconll
from collections import namedtuple

# Directory expected to hold this group's `<stem>.conllu` files, named '<data_code>.conll' under DATA_DIR.
group_dir = DATA_DIR.joinpath(f'{data_code}.conll')

In [None]:
def count_contents(conllu_path):
    """Load a conllu file into a DataFrame built straight from pyconll's sentence iterator.

    Args:
        conllu_path: path of the conllu file, passed to `pyconll.iter_from_file`.

    Returns:
        pd.DataFrame: whatever `pd.DataFrame` constructs from the iterator
        (presumably one row per sentence, one column per token position -- TODO confirm).
    """
    return pd.DataFrame(pyconll.iter_from_file(conllu_path))


In [None]:
if not group_dir.is_dir():
    print('CANNOT FIND CONLLU DIR FOR', data_code)
else:
    # Keep only paths that actually exist, so `conllu_path` never points at a
    # missing file and is always bound (None when nothing was found).
    conllu_path = None
    for stem in group_info.conllu_stem.unique():
        candidate_path = group_dir.joinpath(f'{stem}.conllu')
        if candidate_path.is_file():  # and candidate_path.stat().st_size > 0:
            conllu_path = candidate_path
            print(f'conllu: {conllu_path}')
        # else:
        #     print(f'{candidate_path} does not exist, or is not a file.')

    #> counting should go within the loop, but only doing one (the last
    #> existing file) for devel, so it is pulled out
    if conllu_path is None:
        print('NO CONLLU FILES FOUND IN', group_dir)
    else:
        print(f'Counting data in {conllu_path}...')
        # doc_counts = count_contents(conllu_path)

In [None]:
#> developing method for counting with pyconll object with only last path of loop
# conllu_reader = pyconll.iter_from_file(conllu_path)

Seeing if pandas can do anything with the pyconll object...

In [None]:
# def gen_sentences(conll_df): 

#     for i in range(len(conll_df)): 
#         sentence = []
#         for x in range(len(conll_df.columns)): 
#             word = conll_df.iat[i,x]
#             if word:
#                 sentence.append(word)
#         # print(*[w.form for w in sentence])
#         yield sentence

    
# sentences_gen = gen_sentences(sample_df)
# for s in sentences_gen: 
#     print(*[w.form for w in s])
# # df = pd.DataFrame(pyconll.iter_from_file(conllu_path))
# # sentence_gen = gen_sentences(df)

This is no better (and probably even worse) than just iterating through the pyconll object.

In [26]:
# Record type holding the per-sentence counts yielded by `gen_sentence_info`.
sent_tuple = namedtuple(
    typename='sent_counts',
    field_names=('sent_id', 'text', 'word_count', 'char_count'),
)

In [27]:
def gen_sentence_info(conllu_path):
    """Yield one `sent_tuple` of counts for every sentence in a conllu file.

    Args:
        conllu_path: path of the conllu file, passed to `pyconll.iter_from_file`.

    Yields:
        sent_tuple: (sent_id, text, word_count, char_count), where `word_count`
        is the number of tokens whose `deprel` is not 'punct' and `char_count`
        is the summed length of those tokens' forms.
    """
    for sentence in pyconll.iter_from_file(conllu_path):
        # TODO: attach a doc_id to each sentence; earlier attempt kept for reference:
        # if sentence.meta_present('newdoc_id'):
        #     doc_id = sentence.newdoc_id
        # sentence.set_meta(key='doc_id', value=doc_id)

        # Iterate the sentence itself rather than its private `_tokens` list.
        # Tokens with no form (None) are skipped so len() cannot fail on them.
        word_lengths = [len(token.form) for token in sentence
                        if token.deprel != 'punct' and token.form is not None]
        yield sent_tuple(sentence.id, sentence.text,
                         len(word_lengths), sum(word_lengths))

In [None]:
# conllu_counts_df = pd.DataFrame(gen_sentence_info(conllu_path))
# conllu_counts_df.head()

In [30]:
# Per-sentence counts for one sample conllu file, located relative to DATA_DIR
# so the notebook keeps working if the data directory is moved.
sample_conllu_path = DATA_DIR.joinpath('PccSa1.conll', 'pcc_eng_sample-1-01.conllu')
sample_df = pd.DataFrame(gen_sentence_info(sample_conllu_path))
sample_df

Unnamed: 0,sent_id,text,word_count,char_count
0,pcc_eng_sample-1_1.01_x01_1,What If We Weren't so Focused on Price?,9,31
1,pcc_eng_sample-1_1.01_x01_2,"January 1, 2011",3,12
2,pcc_eng_sample-1_1.01_x01_3,Would quality get better?,4,21
3,pcc_eng_sample-1_1.01_x01_4,Would service get better?,4,21
4,pcc_eng_sample-1_1.01_x01_5,Would there be jobs in the manufacturing secto...,13,57
...,...,...,...,...
164,pcc_eng_sample-1_1.12_x19_5,"In 2003, Sars killed hundreds of people, mostl...",14,66
165,pcc_eng_sample-1_1.12_x19_6,Britain's Health Protection,3,25
166,pcc_eng_sample-1_1.12_x19_7,Agency said in a statement late on Sunday that...,20,90
167,pcc_eng_sample-1_1.12_x19_8,The UN health agency says virus samples from t...,27,120


In [32]:
# Add a document id and the mean word length (characters per word) for each sentence.
# NOTE(review): assumes sent_id has the form '<doc_id>_<sentence number>', as in the
# rows displayed above (e.g. 'pcc_eng_sample-1_1.01_x01_1') -- confirm.
sample_df = sample_df.assign(
    doc_id=sample_df.sent_id.str.rsplit('_', n=1).str[0],
    char_per_word=sample_df.char_count / sample_df.word_count)
sample_df.describe()

Unnamed: 0,word_count,char_count,char_per_word
count,169.0,169.0,169.0
mean,14.04142,65.230769,4.845452
std,9.417787,44.251715,1.107977
min,1.0,5.0,2.75
25%,5.0,26.0,4.1
50%,14.0,67.0,4.714286
75%,20.0,90.0,5.5
max,45.0,249.0,8.5
