# Full Processing Stats from the Cluster

This notebook loads the processing info for data slices completely processed by `script/parse_pile.py`.

In [24]:
import pandas as pd
from pathlib import Path
from datetime import datetime
# Short alias for building a `datetime` from a POSIX timestamp.
tstamp = datetime.fromtimestamp
# Root data directory; the file names and relative paths used below are joined onto it.
DATA_DIR = Path('/share/compling/data/puddin')

Load the data:

In [25]:
# Read the meta index covering all completed slices into a dataframe.
meta_index_csv = DATA_DIR / 'all-completed-slices_meta-index.csv'
info_full = pd.read_csv(meta_index_csv)
# Disabled column subset, retained for reference:
#// process_cols = ['slice_name', 'total_texts', 'data_origin_group', 'slice_number',
#//                 'started_at', 'finished_at', 'parsing_time',
#//                 'final_df_path', 'final_slice_path', 'conllu_path']
#// info = info_full.loc[:, process_cols].convert_dtypes()

Adjust data types, create a `record` column from the input line number, and set it as the index.

In [26]:
# Let pandas pick the best nullable dtype for every column.
info = info_full.convert_dtypes()

# Zero-pad the slice number to 3 characters (stored as a separate string column)
# and the data origin group to 2 characters.
info = info.assign(slice_number_str = info.slice_number.astype('string').apply(lambda s: s.zfill(3)),
                   data_origin_group = info.data_origin_group.apply(lambda s: s.zfill(2)))

# Parse every column whose name ends in 'at' as datetimes.
# The parsed columns are assigned by column label instead of through
# `info.loc[:, mask] = ...`: newer pandas versions make that form write into
# the existing columns in place, which does not reliably change their dtype.
time_cols = info.columns.str.endswith('at')
time_info = info.loc[:, time_cols].apply(pd.to_datetime)
info[list(time_info.columns)] = time_info

# Split `parsing_time` on spaces once: the first token becomes `days`,
# the last token is parsed as a timedelta into `time`.
parsing_parts = info.parsing_time.str.split(' ')
info = info.assign(days=pd.to_numeric(parsing_parts.str.get(0)),
                   time=parsing_parts.str.get(-1).apply(pd.to_timedelta),
                   slice_number=info.slice_number.apply(lambda n: pd.to_numeric(n, downcast='unsigned')))

# Length of `time` in whole seconds (the separate `days` column is not added in).
info = info.assign(seconds=info.time.apply(lambda td: pd.to_numeric(round(td.total_seconds()), downcast='unsigned')))

# Build the `record` id: first two characters of the upper-cased origin group,
# the zero-padded slice number, and the zero-padded dataframe index value, joined by '-'.
# The pad width is computed once, and `ix` carries `info`'s index so the string
# concatenation below aligns row-for-row.
ix_width = len(str(info.index.max()))
ix = pd.Series(info.index.astype('string'), index=info.index).apply(lambda i: i.zfill(ix_width))
info = info.assign(record=info.data_origin_group.str.upper().apply(lambda s: s[:2]) + '-' + info.slice_number_str + '-' + ix)
info = info.set_index('record')

# Convert every column whose name ends in 'ath' to `Path` objects
# (assigned by label so the column can take the new object dtype).
is_path = info.columns.str.endswith('ath')
for path_col in info.columns[is_path]:
    info[path_col] = info[path_col].apply(Path)
    

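Add the last-modification time and file size for each record's files: the kept dataframe, the exclusions dataframe, the temporary slice dataframe, and the final conllu file.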

In [27]:
def _mtime_of(rel_path):
    """Return the last-modification time of `DATA_DIR`/`rel_path` as a `pd.Timestamp`."""
    file_stat = DATA_DIR.joinpath(rel_path).stat()
    return pd.Timestamp.fromtimestamp(file_stat.st_mtime)


# New column name -> path column whose files are stat'ed.
mtime_sources = {
    'kept_df_mtime': 'final_df_path',
    'excl_df_mtime': 'exclusions_path',
    'slice_df_mtime': 'tmp_slice_path',
    'conllu_mtime': 'conllu_path',
}
info = info.assign(**{
    mtime_col: info[path_col].apply(_mtime_of)
    for mtime_col, path_col in mtime_sources.items()
})


In [28]:

def _size_in_mb(rel_path):
    """Return the size of `DATA_DIR`/`rel_path` in units of 1048576 bytes."""
    n_bytes = DATA_DIR.joinpath(rel_path).stat().st_size
    return n_bytes / (1024 ** 2)


# New column name -> path column whose files are measured.
size_sources = {
    'kept_df_gzMB': 'final_df_path',
    'excl_df_gzMB': 'exclusions_path',
    'slice_df_gzMB': 'tmp_slice_path',
    'conllu_MB': 'conllu_path',
}
info = info.assign(**{
    size_col: info[path_col].apply(_size_in_mb)
    for size_col, path_col in size_sources.items()
})


In [29]:
# Round every column whose name ends in 'MB' to one decimal place.
is_size_col = info.columns.str.endswith('MB')
size_cols = info.columns[is_size_col]
info.loc[:, size_cols] = info.loc[:, size_cols].round(decimals=1)


In [30]:
# Round all datetime columns to the nearest minute.
dtcols = info.select_dtypes(include='datetime')
rounded_dtcols = dtcols.apply(lambda col: col.dt.round(freq="min"))
info.loc[:, dtcols.columns] = rounded_dtcols
# Keep a handle on the fully loaded table so later cells can restore it.
info_load_full = info

In [31]:
# info = info_load_full

In [32]:
# info = info_load_full
# Preview five random records, limited to the timing, size and mtime columns.
preview_suffixes = ('seconds', 'B', 'mtime', 'finished_at')
preview_cols = [col for col in info.columns if col.endswith(preview_suffixes)]
info.sample(n=5).loc[:, preview_cols]

Unnamed: 0_level_0,finished_at,seconds,kept_df_mtime,excl_df_mtime,slice_df_mtime,conllu_mtime,kept_df_gzMB,excl_df_gzMB,slice_df_gzMB,conllu_MB
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
22-040-1945,2022-05-02 19:12:00,125,2022-04-28 00:11:00,2022-05-27 21:50:00,2022-04-28 00:13:00,2022-05-26 04:54:00,1013.1,2242.6,9.6,391.8
05-099-1329,2022-04-22 03:11:00,2043,2022-04-13 03:00:00,2022-04-22 08:54:00,2022-04-13 03:06:00,2022-04-22 03:11:00,1012.5,1758.6,9.5,383.8
13-060-1028,2022-04-21 07:41:00,2551,2022-04-13 03:38:00,2022-04-22 18:21:00,2022-04-13 03:41:00,2022-04-21 07:41:00,1015.6,1755.2,9.4,382.3
22-019-1790,2022-04-28 16:54:00,3118,2022-04-28 00:11:00,2022-05-27 21:50:00,2022-04-28 00:12:00,2022-05-25 08:34:00,1013.1,2242.6,9.4,379.3
22-103-3138,2022-05-27 21:05:00,2374,2022-04-28 00:11:00,2022-05-27 21:50:00,2022-04-28 00:16:00,2022-05-27 21:05:00,1013.1,2242.6,9.4,381.2


In [33]:
# was_replaced = info.duplicated(keep=False, subset=['data_origin_group', 'slice_number', 'final_slice_path', 'conllu_path'])
# info.loc[was_replaced, :].sort_values('record')
# Flag every record that shares origin group, slice number and conllu path with
# a later record; only the last occurrence of each combination is left unflagged.
slice_identity_cols = ['data_origin_group', 'slice_number', 'conllu_path']
was_replaced = info.duplicated(subset=slice_identity_cols, keep='last')
replaced = info[was_replaced].sort_values(by='record')

In [34]:
# Replaced records whose conllu modification time differs from the logged finish
# time, showing only the name/time columns, ordered by conllu modification time.
mtime_differs = replaced.conllu_mtime.ne(replaced.finished_at)
name_and_time_cols = replaced.columns.str.endswith(('name','time','_at'))
replaced.loc[mtime_differs, name_and_time_cols].sort_values(by='conllu_mtime')

Unnamed: 0_level_0,slice_name,started_at,finished_at,parsing_time,time,kept_df_mtime,excl_df_mtime,slice_df_mtime,conllu_mtime
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
29-001-0006,29_001,2022-03-27 02:17:00,2022-03-27 03:03:00,0 days 00:46:13,0 days 00:46:13,2022-03-27 02:12:00,2022-04-30 03:31:00,2022-04-12 21:17:00,2022-04-12 22:13:00
29-002-0007,29_002,2022-03-27 03:03:00,2022-03-27 03:51:00,0 days 00:48:11,0 days 00:48:11,2022-03-27 02:12:00,2022-04-30 03:31:00,2022-04-12 21:17:00,2022-04-12 23:06:00
29-003-0008,29_003,2022-03-27 03:51:00,2022-03-27 04:36:00,0 days 00:44:57,0 days 00:44:57,2022-03-27 02:12:00,2022-04-30 03:31:00,2022-04-12 21:17:00,2022-04-12 23:55:00
29-004-0009,29_004,2022-03-27 04:36:00,2022-03-27 05:23:00,0 days 00:46:38,0 days 00:46:38,2022-03-27 02:12:00,2022-04-30 03:31:00,2022-04-12 21:17:00,2022-04-13 00:47:00
29-005-0010,29_005,2022-03-27 05:23:00,2022-03-27 06:08:00,0 days 00:45:22,0 days 00:45:22,2022-03-27 02:12:00,2022-04-30 03:31:00,2022-04-12 21:17:00,2022-04-13 01:37:00
...,...,...,...,...,...,...,...,...,...
24-100-2766,Pcc24_100,2022-05-05 07:22:00,2022-05-05 07:49:00,0 days 00:27:09,0 days 00:27:09,2022-04-20 01:41:00,2022-06-28 17:23:00,2022-04-20 01:46:00,2022-06-28 17:15:00
01-061-2416,Pcc01_061,2022-05-04 04:09:00,2022-05-04 04:47:00,0 days 00:38:05,0 days 00:38:05,2022-04-13 03:31:00,2022-06-28 23:06:00,2022-04-13 03:33:00,2022-06-28 22:00:00
02-080-2902,Pcc02_080,2022-05-25 20:06:00,2022-05-25 20:40:00,0 days 00:34:23,0 days 00:34:23,2022-04-13 03:39:00,2022-06-28 22:15:00,2022-04-13 03:43:00,2022-06-28 22:06:00
01-064-2447,Pcc01_064,2022-05-04 06:02:00,2022-05-04 06:41:00,0 days 00:38:29,0 days 00:38:29,2022-04-13 03:31:00,2022-06-28 23:06:00,2022-04-13 03:34:00,2022-06-28 22:33:00


Save the processing records that have been replaced to a separate file, `replaced_slice-index-records.csv`, and remove them from the main `info` dataframe:

In [35]:
# Write the replaced records out as both CSV and pickle.
replaced_csv = DATA_DIR / 'replaced_slice-index-records.csv'
replaced_pkl = DATA_DIR / 'replaced_slice-index-records.pkl'
replaced.to_csv(replaced_csv)
replaced.to_pickle(replaced_pkl)

# Keep only the records that were not replaced, and remember that state so
# later cells can restore it.
info = info[~was_replaced]
info_load = info

In [36]:
# info = info_load

In [37]:
# Minutes between the logged finish time and the conllu file's modification time,
# as a float. Dividing by a one-minute Timedelta replaces
# `.astype('timedelta64[m]')`, which newer pandas versions no longer accept
# (only s/ms/us/ns resolutions are supported as timedelta dtypes).
one_minute = pd.Timedelta(minutes=1)
info = info.assign(end_timedelta=(info.conllu_mtime - info.finished_at) / one_minute)
info.end_timedelta.describe().round(2)

count    3246.00
mean       -0.04
std         0.19
min        -1.00
25%         0.00
50%         0.00
75%         0.00
max         0.00
Name: end_timedelta, dtype: float64

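The selection below (records whose conllu file was modified more than 2 minutes after the logged `finished_at` time) should contain only slices that are currently running or were interrupted.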

In [38]:
# Records whose conllu file was modified more than 2 minutes after `finished_at`.
late_view_cols = ['slice_name','final_df_path','conllu_path','kept_df_mtime','conllu_mtime','finished_at','end_timedelta']
modified_late = info.end_timedelta.gt(2)
info.loc[modified_late, late_view_cols].sort_values(by='conllu_mtime')

Unnamed: 0_level_0,slice_name,final_df_path,conllu_path,kept_df_mtime,conllu_mtime,finished_at,end_timedelta
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [39]:
# info = info_load

In [40]:
# Longest-running records first; ties are ordered by text count, also descending.
info.sort_values(by=['seconds', 'total_texts'], ascending=[False, False])

Unnamed: 0_level_0,slice_name,total_texts,first_text_id,last_text_id,tmp_slice_path,final_slice_path,conllu_path,origin_filepath,data_origin_group,final_df_path,...,seconds,kept_df_mtime,excl_df_mtime,slice_df_mtime,conllu_mtime,kept_df_gzMB,excl_df_gzMB,slice_df_gzMB,conllu_MB,end_timedelta
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24-040-2011,Pcc24_040,9999,pcc_eng_24_040.0001_x0630363,pcc_eng_24_040.9999_x0646500,pile_tables/slices/Pcc24/tmp/pile_24-040_Pile-...,pile_tables/slices/Pcc24/pile_24-040_Pile-CC_d...,Pcc24.conll/pcc_eng_24-040.conllu,/share/compling/data/pile/train/24.jsonl,24,pile_tables/pile_24_Pile-CC_df.pkl.gz,...,5427,2022-04-20 01:41:00,2022-06-28 17:23:00,2022-04-20 01:43:00,2022-05-02 23:21:00,1013.4,1751.0,9.7,397.7,0.0
18-091-2816,Pcc18_091,9999,pcc_eng_18_091.0001_x1457489,pcc_eng_18_091.9999_x1473649,pile_tables/slices/Pcc18/tmp/pile_18-091_Pile-...,pile_tables/slices/Pcc18/pile_18-091_Pile-CC_d...,Pcc18.conll/pcc_eng_18-091.conllu,/share/compling/data/pile/train/18.jsonl,18,pile_tables/pile_18_Pile-CC_df.pkl.gz,...,4903,2022-04-20 01:52:00,2022-06-28 15:25:00,2022-04-20 01:56:00,2022-05-24 22:04:00,1010.5,1758.8,9.3,379.3,0.0
10-024-1558,Pcc10_024,9999,pcc_eng_10_024.0001_x0371544,pcc_eng_10_024.9999_x0387720,pile_tables/slices/Pcc10/tmp/pile_10-024_Pile-...,pile_tables/slices/Pcc10/pile_10-024_Pile-CC_d...,Pcc10.conll/pcc_eng_10-024.conllu,/share/compling/data/pile/train/10.jsonl,10,pile_tables/pile_10_Pile-CC_df.pkl.gz,...,4855,2022-04-13 02:54:00,2022-06-28 15:49:00,2022-04-13 02:55:00,2022-04-26 22:02:00,1015.6,1740.5,9.7,391.7,0.0
10-048-1630,Pcc10_048,9999,pcc_eng_10_048.0001_x0760323,pcc_eng_10_048.9999_x0776333,pile_tables/slices/Pcc10/tmp/pile_10-048_Pile-...,pile_tables/slices/Pcc10/pile_10-048_Pile-CC_d...,Pcc10.conll/pcc_eng_10-048.conllu,/share/compling/data/pile/train/10.jsonl,10,pile_tables/pile_10_Pile-CC_df.pkl.gz,...,4276,2022-04-13 02:54:00,2022-06-28 15:49:00,2022-04-13 02:56:00,2022-04-27 21:34:00,1015.6,1740.5,9.5,389.6,0.0
10-026-1573,Pcc10_026,9999,pcc_eng_10_026.0001_x0403864,pcc_eng_10_026.9999_x0420194,pile_tables/slices/Pcc10/tmp/pile_10-026_Pile-...,pile_tables/slices/Pcc10/pile_10-026_Pile-CC_d...,Pcc10.conll/pcc_eng_10-026.conllu,/share/compling/data/pile/train/10.jsonl,10,pile_tables/pile_10_Pile-CC_df.pkl.gz,...,4244,2022-04-13 02:54:00,2022-06-28 15:49:00,2022-04-13 02:55:00,2022-04-27 00:22:00,1015.6,1740.5,9.5,384.3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12-051-2564,Pcc12_051,9999,pcc_eng_12_051.0001_x0808148,pcc_eng_12_051.9999_x0824308,pile_tables/slices/Pcc12/tmp/pile_12-051_Pile-...,pile_tables/slices/Pcc12/pile_12-051_Pile-CC_d...,Pcc12.conll/pcc_eng_12-051.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1526,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 14:14:00,1012.8,1753.8,9.3,379.2,0.0
10-072-3372,Pcc10_072,9999,pcc_eng_10_072.0001_x1147792,pcc_eng_10_072.9999_x1163925,pile_tables/slices/Pcc10/tmp/pile_10-072_Pile-...,pile_tables/slices/Pcc10/pile_10-072_Pile-CC_d...,Pcc10.conll/pcc_eng_10-072.conllu,/share/compling/data/pile/train/10.jsonl,10,pile_tables/pile_10_Pile-CC_df.pkl.gz,...,1525,2022-04-13 02:54:00,2022-06-28 15:49:00,2022-04-13 02:57:00,2022-06-28 15:08:00,1015.6,1740.5,9.3,374.9,0.0
12-047-2535,Pcc12_047,9999,pcc_eng_12_047.0001_x0743541,pcc_eng_12_047.9999_x0759569,pile_tables/slices/Pcc12/tmp/pile_12-047_Pile-...,pile_tables/slices/Pcc12/pile_12-047_Pile-CC_d...,Pcc12.conll/pcc_eng_12-047.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1524,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 12:16:00,1012.8,1753.8,9.4,380.7,0.0
12-050-2556,Pcc12_050,9999,pcc_eng_12_050.0001_x0792018,pcc_eng_12_050.9999_x0808147,pile_tables/slices/Pcc12/tmp/pile_12-050_Pile-...,pile_tables/slices/Pcc12/pile_12-050_Pile-CC_d...,Pcc12.conll/pcc_eng_12-050.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1523,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 13:42:00,1012.8,1753.8,9.4,381.8,0.0


In [41]:
sec_dstats = info.seconds.describe()

# Quartiles are looked up by their `describe()` labels; integer keys on a
# label-indexed Series are deprecated as positional lookups.
q1, median = sec_dstats['25%'], sec_dstats['50%']
# Lower cut-off: Q1 - 1.5 * (median - Q1).
# NOTE(review): this uses the Q1-to-median distance rather than the full
# Q3 - Q1 interquartile range -- confirm that is intended.
lower = q1 + (q1 - median) * 1.5
fast = info.loc[info.seconds < lower, :]
fast.sort_values('seconds')

Unnamed: 0_level_0,slice_name,total_texts,first_text_id,last_text_id,tmp_slice_path,final_slice_path,conllu_path,origin_filepath,data_origin_group,final_df_path,...,seconds,kept_df_mtime,excl_df_mtime,slice_df_mtime,conllu_mtime,kept_df_gzMB,excl_df_gzMB,slice_df_gzMB,conllu_MB,end_timedelta
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12-045-2523,Pcc12_045,9999,pcc_eng_12_045.0001_x0711367,pcc_eng_12_045.9999_x0727554,pile_tables/slices/Pcc12/tmp/pile_12-045_Pile-...,pile_tables/slices/Pcc12/pile_12-045_Pile-CC_d...,Pcc12.conll/pcc_eng_12-045.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1512,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 11:25:00,1012.8,1753.8,9.3,378.6,0.0
12-050-2556,Pcc12_050,9999,pcc_eng_12_050.0001_x0792018,pcc_eng_12_050.9999_x0808147,pile_tables/slices/Pcc12/tmp/pile_12-050_Pile-...,pile_tables/slices/Pcc12/pile_12-050_Pile-CC_d...,Pcc12.conll/pcc_eng_12-050.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1523,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 13:42:00,1012.8,1753.8,9.4,381.8,0.0
12-047-2535,Pcc12_047,9999,pcc_eng_12_047.0001_x0743541,pcc_eng_12_047.9999_x0759569,pile_tables/slices/Pcc12/tmp/pile_12-047_Pile-...,pile_tables/slices/Pcc12/pile_12-047_Pile-CC_d...,Pcc12.conll/pcc_eng_12-047.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1524,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 12:16:00,1012.8,1753.8,9.4,380.7,0.0
10-072-3372,Pcc10_072,9999,pcc_eng_10_072.0001_x1147792,pcc_eng_10_072.9999_x1163925,pile_tables/slices/Pcc10/tmp/pile_10-072_Pile-...,pile_tables/slices/Pcc10/pile_10-072_Pile-CC_d...,Pcc10.conll/pcc_eng_10-072.conllu,/share/compling/data/pile/train/10.jsonl,10,pile_tables/pile_10_Pile-CC_df.pkl.gz,...,1525,2022-04-13 02:54:00,2022-06-28 15:49:00,2022-04-13 02:57:00,2022-06-28 15:08:00,1015.6,1740.5,9.3,374.9,0.0
12-051-2564,Pcc12_051,9999,pcc_eng_12_051.0001_x0808148,pcc_eng_12_051.9999_x0824308,pile_tables/slices/Pcc12/tmp/pile_12-051_Pile-...,pile_tables/slices/Pcc12/pile_12-051_Pile-CC_d...,Pcc12.conll/pcc_eng_12-051.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1526,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 14:14:00,1012.8,1753.8,9.3,379.2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12-031-2420,Pcc12_031,9999,pcc_eng_12_031.0001_x0485727,pcc_eng_12_031.9999_x0501937,pile_tables/slices/Pcc12/tmp/pile_12-031_Pile-...,pile_tables/slices/Pcc12/pile_12-031_Pile-CC_d...,Pcc12.conll/pcc_eng_12-031.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1676,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:33:00,2022-05-04 04:57:00,1012.8,1753.8,9.5,383.3,0.0
12-035-2450,Pcc12_035,9999,pcc_eng_12_035.0001_x0550035,pcc_eng_12_035.9999_x0566224,pile_tables/slices/Pcc12/tmp/pile_12-035_Pile-...,pile_tables/slices/Pcc12/pile_12-035_Pile-CC_d...,Pcc12.conll/pcc_eng_12-035.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1677,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:33:00,2022-05-04 06:53:00,1012.8,1753.8,9.5,384.6,0.0
02-090-2939,Pcc02_090,9999,pcc_eng_02_090.0001_x1438898,pcc_eng_02_090.9999_x1455094,pile_tables/slices/Pcc02/tmp/pile_02-090_Pile-...,pile_tables/slices/Pcc02/pile_02-090_Pile-CC_d...,Pcc02.conll/pcc_eng_02-090.conllu,/share/compling/data/pile/train/02.jsonl,02,pile_tables/pile_02_Pile-CC_df.pkl.gz,...,1683,2022-04-13 03:39:00,2022-06-28 22:15:00,2022-04-13 03:43:00,2022-05-26 02:12:00,1015.8,1753.6,9.3,378.6,0.0
02-093-2948,Pcc02_093,9999,pcc_eng_02_093.0001_x1487434,pcc_eng_02_093.9999_x1503696,pile_tables/slices/Pcc02/tmp/pile_02-093_Pile-...,pile_tables/slices/Pcc02/pile_02-093_Pile-CC_d...,Pcc02.conll/pcc_eng_02-093.conllu,/share/compling/data/pile/train/02.jsonl,02,pile_tables/pile_02_Pile-CC_df.pkl.gz,...,1683,2022-04-13 03:39:00,2022-06-28 22:15:00,2022-04-13 03:43:00,2022-05-26 03:44:00,1015.8,1753.6,9.3,377.4,0.0


In [42]:
# Quartiles are looked up by their `describe()` labels; integer keys on a
# label-indexed Series are deprecated as positional lookups.
q3, median = sec_dstats['75%'], sec_dstats['50%']
# Upper cut-off: Q3 + 1.5 * (Q3 - median).
# NOTE(review): this uses the median-to-Q3 distance rather than the full
# Q3 - Q1 interquartile range -- confirm that is intended.
upper = q3 + (q3 - median) * 1.5
slow = info.loc[info.seconds > upper, :]
slow.sort_values('seconds', ascending=False)[['slice_name', 'time', 'total_texts', 'conllu_path', 'conllu_MB', 'conllu_mtime', 'finished_at']]

Unnamed: 0_level_0,slice_name,time,total_texts,conllu_path,conllu_MB,conllu_mtime,finished_at
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
24-040-2011,Pcc24_040,0 days 01:30:27,9999,Pcc24.conll/pcc_eng_24-040.conllu,397.7,2022-05-02 23:21:00,2022-05-02 23:21:00
18-091-2816,Pcc18_091,0 days 01:21:43,9999,Pcc18.conll/pcc_eng_18-091.conllu,379.3,2022-05-24 22:04:00,2022-05-24 22:04:00
10-024-1558,Pcc10_024,0 days 01:20:55,9999,Pcc10.conll/pcc_eng_10-024.conllu,391.7,2022-04-26 22:02:00,2022-04-26 22:02:00
10-048-1630,Pcc10_048,0 days 01:11:16,9999,Pcc10.conll/pcc_eng_10-048.conllu,389.6,2022-04-27 21:34:00,2022-04-27 21:34:00
10-026-1573,Pcc10_026,0 days 01:10:44,9999,Pcc10.conll/pcc_eng_10-026.conllu,384.3,2022-04-27 00:22:00,2022-04-27 00:22:00
...,...,...,...,...,...,...,...
16-004-0106,Pcc16_4,0 days 00:48:53,9999,Pcc16.conll/pcc_eng_16-004.conllu,364.5,2022-04-13 07:27:00,2022-04-13 07:27:00
20-096-1538,Pcc20_096,0 days 00:48:52,9999,Pcc20.conll/pcc_eng_20-096.conllu,380.2,2022-04-22 21:53:00,2022-04-22 21:53:00
10-039-1615,Pcc10_039,0 days 00:48:52,9999,Pcc10.conll/pcc_eng_10-039.conllu,391.2,2022-04-27 11:35:00,2022-04-27 11:35:00
13-004-0098,Pcc13_4,0 days 00:48:51,9999,Pcc13.conll/pcc_eng_13-004.conllu,378.6,2022-04-13 07:07:00,2022-04-13 07:07:00


In [43]:
# Summary statistics for the slow records, rounded to two decimals.
slow_summary_cols = ['slice_name', 'time', 'total_texts', 'conllu_path', 'conllu_MB', 'conllu_mtime', 'finished_at']
slow_by_duration = slow.sort_values(by='seconds', ascending=False)
slow_by_duration[slow_summary_cols].describe().round(2)

Unnamed: 0,time,total_texts,conllu_MB
count,211,211.0,211.0
mean,0 days 00:53:17.658767772,10008.63,382.51
std,0 days 00:05:57.817813653,98.63,7.97
min,0 days 00:48:44,9999.0,364.5
25%,0 days 00:50:08.500000,9999.0,377.6
50%,0 days 00:51:32,9999.0,381.2
75%,0 days 00:53:48.500000,9999.0,386.0
max,0 days 01:30:27,11015.0,425.4


In [44]:
# The ten records with the fewest texts.
texts_and_time = info[['finished_at', 'total_texts', 'time']]
texts_and_time.sort_values(by='total_texts').head(n=10)


Unnamed: 0_level_0,finished_at,total_texts,time
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
29-108-1917,2022-04-30 03:57:00,6829,0 days 00:26:00
29-109-1919,2022-04-30 04:24:00,6830,0 days 00:27:27
08-108-2650,2022-05-04 20:27:00,9274,0 days 00:29:22
08-107-2643,2022-05-04 19:58:00,9274,0 days 00:29:56
11-108-1457,2022-04-22 12:10:00,9515,0 days 00:42:19
11-107-1447,2022-04-22 11:28:00,9515,0 days 00:41:11
25-107-1531,2022-04-22 20:15:00,9873,0 days 00:35:29
25-108-1534,2022-04-22 20:51:00,9873,0 days 00:35:38
18-107-2864,2022-05-25 07:59:00,9882,0 days 00:33:45
18-108-2867,2022-05-25 08:33:00,9883,0 days 00:33:41


In [45]:
# The ten records with the most texts.
texts_and_time = info[['finished_at', 'total_texts', 'time']]
texts_and_time.sort_values(by='total_texts').tail(n=10)

Unnamed: 0_level_0,finished_at,total_texts,time
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15-107-1458,2022-04-22 12:12:00,11062,0 days 00:43:59
15-108-1466,2022-04-22 12:57:00,11063,0 days 00:45:03
21-107-2539,2022-05-04 12:26:00,11149,0 days 00:38:17
21-108-2548,2022-05-04 13:05:00,11150,0 days 00:38:12
VA-002-3367,2022-06-22 14:35:00,11306,0 days 00:43:24
09-107-1490,2022-04-22 15:04:00,11306,0 days 00:44:08
VA-003-3369,2022-06-22 15:17:00,11307,0 days 00:42:00
09-108-1499,2022-04-22 15:48:00,11307,0 days 00:44:23
TE-002-3366,2022-06-22 14:33:00,11349,0 days 00:40:53
TE-003-3368,2022-06-22 15:14:00,11350,0 days 00:41:02


In [46]:
# Save the cleaned meta index as both CSV and pickle under the same base name.
# The pickle file name previously read 'comleted-...' (missing 'p'), which did
# not match the CSV; anything still loading the misspelled file must be updated.
info.to_csv(DATA_DIR.joinpath('completed-puddin_meta-index.csv'))
info.to_pickle(DATA_DIR.joinpath('completed-puddin_meta-index.pkl'))