# Full Processing Stats from the Cluster

This notebook loads the processing info for data slices completely processed by `script/parse_pile.py`.

In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime
# Short alias for `datetime.fromtimestamp` (POSIX timestamp -> datetime).
# NOTE(review): not referenced by any cell visible in this notebook — confirm before removing.
tstamp = datetime.fromtimestamp
# Root directory that the input index, the relative file paths in the index,
# and the `info/` output files are all resolved against.
DATA_DIR = Path('/share/compling/data/puddin')

## Load and process meta info for puddin processing

In [2]:
# Read the raw meta index covering every completed slice.
meta_index_csv = DATA_DIR / 'all-completed-slices_meta-index.csv'
info_full = pd.read_csv(meta_index_csv)
# Earlier column-subset approach, kept for reference:
#// process_cols = ['slice_name', 'total_texts', 'data_origin_group', 'slice_number',
#//                 'started_at', 'finished_at', 'parsing_time',
#//                 'final_df_path', 'final_slice_path', 'conllu_path']
#// info = info_full.loc[:, process_cols].convert_dtypes()

Adjust data types, create a `record` label for each row (origin group, slice number, and input line number), and set it as the index.

In [3]:
# Let pandas pick the best available (nullable/extension) dtype per column.
info = info_full.convert_dtypes()

# Zero-padded string forms used below to build the `record` label.
# `astype('string')` keeps the `.str` accessor available even if a column
# was inferred as numeric.
info = info.assign(
    slice_number_str=info.slice_number.astype('string').str.zfill(3),
    data_origin_group=info.data_origin_group.astype('string').str.zfill(2))

# Convert every column whose name ends in 'at' to datetime.
# The columns are replaced wholesale (via `assign`) rather than written with
# `info.loc[:, cols] = ...`: recent pandas versions treat the `.loc` form as an
# in-place write that tries to keep the existing dtype, so the conversion
# would not reliably stick.
time_cols = info.columns.str.endswith('at')
time_info = info.loc[:, time_cols].apply(pd.to_datetime)
info = info.assign(**{col: time_info[col] for col in time_info.columns})

# `parsing_time` is split on spaces: first token -> `days`, last token -> `time`.
# NOTE(review): `time` (and therefore `seconds`) ignores the day count, so a
# run longer than 24h would be under-reported — confirm none exist.
parsing_parts = info.parsing_time.str.split(' ')
info = info.assign(
    days=pd.to_numeric(parsing_parts.str.get(0)),
    time=parsing_parts.str.get(-1).apply(pd.to_timedelta),
    slice_number=info.slice_number.apply(lambda n: pd.to_numeric(n, downcast='unsigned')))

# Parsing duration as whole seconds.
info = info.assign(
    seconds=info.time.apply(
        lambda td: pd.to_numeric(round(td.total_seconds()), downcast='unsigned')))

# Row number of each entry in the input file, zero-padded to a common width.
pad_width = len(str(info.index.max()))
ix = pd.Series(info.index.astype('string')).str.zfill(pad_width)

# record = <first 2 chars of origin group, upper-cased>-<slice number>-<row number>
info = info.assign(
    record=info.data_origin_group.str.upper().str[:2] + '-' + info.slice_number_str + '-' + ix)
info = info.set_index('record')

# Turn every column whose name ends in 'ath' into `Path` objects.
# Whole-column replacement again, so the object dtype is guaranteed.
is_path = info.columns.str.endswith('ath')
for path_col in info.columns[is_path]:
    info[path_col] = info[path_col].apply(Path)
    

Add file size (in MB) and last modification time for the kept dataframe, exclusions, temporary slice, and final conllu files. Round size columns to 1 decimal.

In [4]:
BYTES_PER_MB = 1048576  # 2**20 bytes

# (output column prefix, source path column, suffix of the size column)
STAT_SOURCES = [
    ('kept_df', 'final_df_path', 'gzMB'),
    ('excl_df', 'exclusions_path', 'gzMB'),
    ('slice_df', 'tmp_slice_path', 'gzMB'),
    ('conllu', 'conllu_path', 'MB'),
]


def _file_stats(rel_paths):
    """Return ``(mtimes, sizes_mb)`` for a Series of paths relative to DATA_DIR.

    Each distinct path is stat'ed exactly once, and both values come from the
    same ``stat()`` call. Both returned Series share the index of `rel_paths`.

    Raises whatever ``Path.stat()`` raises (e.g. ``FileNotFoundError``) if a
    file is missing.
    """
    stat_by_path = {p: DATA_DIR.joinpath(p).stat() for p in set(rel_paths)}
    mtimes = pd.Series(
        [pd.Timestamp.fromtimestamp(stat_by_path[p].st_mtime) for p in rel_paths],
        index=rel_paths.index)
    sizes_mb = pd.Series(
        [stat_by_path[p].st_size / BYTES_PER_MB for p in rel_paths],
        index=rel_paths.index)
    return mtimes, sizes_mb


# Collect the new columns: all `*_mtime` columns first, then all size columns.
mtime_cols, mb_cols = {}, {}
for prefix, source_col, size_suffix in STAT_SOURCES:
    mtimes, sizes_mb = _file_stats(info[source_col])
    mtime_cols[f'{prefix}_mtime'] = mtimes
    mb_cols[f'{prefix}_{size_suffix}'] = sizes_mb

info = info.assign(**mtime_cols, **mb_cols)

# Round every size column (name ending in 'MB') to one decimal place.
size_cols = info.columns[info.columns.str.endswith('MB')]
info.loc[:, size_cols] = info[size_cols].round(1)


Select the columns of type `datetime` and round their values to the nearest minute.

In [5]:
# Round every datetime-typed column to the nearest minute.
dtcols = info.select_dtypes(include='datetime')
rounded_dt = dtcols.apply(lambda stamps: stamps.dt.round("min"))
info.loc[:, dtcols.columns] = rounded_dt

In [6]:
# Preview five random rows, restricted to the timing, size, and mtime columns.
preview_suffixes = ('seconds', 'B', 'mtime', 'finished_at')
preview_cols = [col for col in info.columns if col.endswith(preview_suffixes)]
info.sample(5)[preview_cols]

Unnamed: 0_level_0,finished_at,seconds,kept_df_mtime,excl_df_mtime,slice_df_mtime,conllu_mtime,kept_df_gzMB,excl_df_gzMB,slice_df_gzMB,conllu_MB
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21-025-1821,2022-04-28 21:47:00,3042,2022-04-28 00:16:00,2022-06-28 16:28:00,2022-04-28 00:18:00,2022-04-28 21:47:00,1014.8,1748.7,9.5,382.6
28-059-2501,2022-05-04 09:59:00,2174,2022-04-02 06:13:00,2022-06-28 15:20:00,2022-04-12 21:29:00,2022-05-04 09:59:00,1014.1,1750.6,9.7,394.1
09-022-0302,2022-04-20 00:45:00,2457,2022-04-13 03:04:00,2022-04-22 12:24:00,2022-04-13 03:06:00,2022-04-20 00:45:00,1013.6,1764.9,9.2,372.8
VA-003-0018,2022-04-12 23:14:00,2956,2022-03-23 21:41:00,2022-03-23 21:41:00,2022-04-12 20:49:00,2022-06-22 15:17:00,30.5,53.1,10.5,424.3
22-056-3012,2022-05-26 15:15:00,2097,2022-04-28 00:11:00,2022-05-27 21:50:00,2022-04-28 00:14:00,2022-05-26 15:15:00,1013.1,2242.6,9.4,382.2


## Pull records for data that was replaced/reprocessed

Save processing records that have been replaced to a separate file, `replaced_slice-index-records.csv`, and remove them from the main `info` dataframe.

In [7]:
# A record counts as replaced when a later row has the same origin group,
# slice number, and conllu path; only the last such row is kept.
slice_key = ['data_origin_group', 'slice_number', 'conllu_path']
was_replaced = info.duplicated(subset=slice_key, keep='last')
replaced = info.loc[was_replaced].sort_values('record')

# Time between the superseded run's finish and the final conllu file's modification.
time_before_final = replaced.conllu_mtime - replaced.finished_at
replaced[['finished_at', 'conllu_mtime']].assign(time_before_final=time_before_final)

Unnamed: 0_level_0,finished_at,conllu_mtime,time_before_final
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00-047-2903,2022-05-25 20:44:00,2022-06-28 16:04:00,33 days 19:20:00
00-065-2971,2022-05-26 07:13:00,2022-06-28 16:44:00,33 days 09:31:00
01-034-1930,2022-05-02 17:50:00,2022-06-28 15:19:00,56 days 21:29:00
01-061-2416,2022-05-04 04:47:00,2022-06-28 22:00:00,55 days 17:13:00
01-064-2447,2022-05-04 06:41:00,2022-06-28 22:33:00,55 days 15:52:00
...,...,...,...
VA-002-0015,2022-04-12 22:24:00,2022-06-22 14:35:00,70 days 16:11:00
VA-002-3360,2022-06-21 22:57:00,2022-06-22 14:35:00,0 days 15:38:00
VA-003-0005,2022-03-24 00:35:00,2022-06-22 15:17:00,90 days 14:42:00
VA-003-0018,2022-04-12 23:14:00,2022-06-22 15:17:00,70 days 16:03:00


In [8]:
# Save the superseded records as CSV and pickle, then drop them from `info`.
info_dir = DATA_DIR / 'info'
replaced.to_csv(info_dir / 'replaced_slice-index-records.csv')
replaced.to_pickle(info_dir / 'replaced_slice-index-records.pkl')
info = info.loc[~was_replaced]
# Keep a handle on the filtered frame so later cells can reset `info` to it.
info_load = info

In [9]:
# info = info_load

In [10]:
# Minutes between the logged finish time and the conllu file's last modification.
# Dividing by a one-minute Timedelta yields float minutes on every pandas
# version; `.astype('timedelta64[m]')` only did so on older releases and is
# not a supported conversion in pandas 2.x.
info = info.assign(
    end_timedelta=(info.conllu_mtime - info.finished_at) / pd.Timedelta(minutes=1))
info.end_timedelta.describe().round(2)

count    3246.00
mean       -0.04
std         0.19
min        -1.00
25%         0.00
50%         0.00
75%         0.00
max         0.00
Name: end_timedelta, dtype: float64

_Slices whose conllu file was modified more than 2 minutes after the logged finish time. This should only be slices that are currently running or were interrupted:_

In [11]:
# Rows whose conllu file changed more than 2 minutes after the logged finish.
late_cols = ['slice_name', 'final_df_path', 'conllu_path', 'kept_df_mtime',
             'conllu_mtime', 'finished_at', 'end_timedelta']
is_late = info.end_timedelta > 2
info.loc[is_late, late_cols].sort_values('conllu_mtime')

Unnamed: 0_level_0,slice_name,final_df_path,conllu_path,kept_df_mtime,conllu_mtime,finished_at,end_timedelta
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


## Inspect processing times

Sorted by average parsing rate of texts per second, slowest to fastest:

In [12]:
# Average parsing rate per slice (texts per second, one decimal).
rate_cols = ['slice_name', 'conllu_MB', 'slice_df_gzMB', 'total_texts', 'seconds', 'avg_texts_per_s']
texts_per_second = (info.total_texts / info.seconds).round(1)
rate_info = info.assign(avg_texts_per_s=texts_per_second)[rate_cols]
rate_info.describe().round(1)

Unnamed: 0,conllu_MB,slice_df_gzMB,total_texts,seconds,avg_texts_per_s
count,3246.0,3246.0,3246.0,3246.0,3246.0
mean,382.0,9.4,10006.0,2386.1,4.3
std,7.8,0.2,125.7,363.5,0.6
min,260.3,6.4,6829.0,1512.0,1.8
25%,377.6,9.3,9999.0,2106.2,3.8
50%,381.5,9.4,9999.0,2383.0,4.2
75%,385.6,9.5,9999.0,2599.0,4.8
max,437.5,10.8,11350.0,5427.0,6.6


In [13]:
# Slowest slices first.
rate_info.sort_values(by='avg_texts_per_s', ascending=True)

Unnamed: 0_level_0,slice_name,conllu_MB,slice_df_gzMB,total_texts,seconds,avg_texts_per_s
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
24-040-2011,Pcc24_040,397.7,9.7,9999,5427,1.8
18-091-2816,Pcc18_091,379.3,9.3,9999,4903,2.0
10-024-1558,Pcc10_024,391.7,9.7,9999,4855,2.1
10-048-1630,Pcc10_048,389.6,9.5,9999,4276,2.3
10-044-1620,Pcc10_044,383.5,9.5,9999,4151,2.4
...,...,...,...,...,...,...
12-051-2564,Pcc12_051,379.2,9.3,9999,1526,6.6
12-045-2523,Pcc12_045,378.6,9.3,9999,1512,6.6
12-050-2556,Pcc12_050,381.8,9.4,9999,1523,6.6
12-047-2535,Pcc12_047,380.7,9.4,9999,1524,6.6


In [14]:
# Summary statistics of per-slice parsing time in seconds (rounded to 1 decimal).
sec_dstats = info['seconds'].describe().round(1)


In [15]:

# Lower cutoff: first quartile minus 1.5x the distance from first quartile to median.
# Use the `describe()` labels rather than integer positions; positional
# `series[4]` on a label-indexed Series is deprecated in recent pandas.
first_quartile, median_seconds = sec_dstats['25%'], sec_dstats['50%']
lower = first_quartile + (first_quartile - median_seconds) * 1.5
fast = info.loc[info.seconds < lower, :]
fast.sort_values('seconds')

Unnamed: 0_level_0,slice_name,total_texts,first_text_id,last_text_id,tmp_slice_path,final_slice_path,conllu_path,origin_filepath,data_origin_group,final_df_path,...,seconds,kept_df_mtime,excl_df_mtime,slice_df_mtime,conllu_mtime,kept_df_gzMB,excl_df_gzMB,slice_df_gzMB,conllu_MB,end_timedelta
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12-045-2523,Pcc12_045,9999,pcc_eng_12_045.0001_x0711367,pcc_eng_12_045.9999_x0727554,pile_tables/slices/Pcc12/tmp/pile_12-045_Pile-...,pile_tables/slices/Pcc12/pile_12-045_Pile-CC_d...,Pcc12.conll/pcc_eng_12-045.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1512,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 11:25:00,1012.8,1753.8,9.3,378.6,0.0
12-050-2556,Pcc12_050,9999,pcc_eng_12_050.0001_x0792018,pcc_eng_12_050.9999_x0808147,pile_tables/slices/Pcc12/tmp/pile_12-050_Pile-...,pile_tables/slices/Pcc12/pile_12-050_Pile-CC_d...,Pcc12.conll/pcc_eng_12-050.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1523,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 13:42:00,1012.8,1753.8,9.4,381.8,0.0
12-047-2535,Pcc12_047,9999,pcc_eng_12_047.0001_x0743541,pcc_eng_12_047.9999_x0759569,pile_tables/slices/Pcc12/tmp/pile_12-047_Pile-...,pile_tables/slices/Pcc12/pile_12-047_Pile-CC_d...,Pcc12.conll/pcc_eng_12-047.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1524,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 12:16:00,1012.8,1753.8,9.4,380.7,0.0
10-072-3372,Pcc10_072,9999,pcc_eng_10_072.0001_x1147792,pcc_eng_10_072.9999_x1163925,pile_tables/slices/Pcc10/tmp/pile_10-072_Pile-...,pile_tables/slices/Pcc10/pile_10-072_Pile-CC_d...,Pcc10.conll/pcc_eng_10-072.conllu,/share/compling/data/pile/train/10.jsonl,10,pile_tables/pile_10_Pile-CC_df.pkl.gz,...,1525,2022-04-13 02:54:00,2022-06-28 15:49:00,2022-04-13 02:57:00,2022-06-28 15:08:00,1015.6,1740.5,9.3,374.9,0.0
12-051-2564,Pcc12_051,9999,pcc_eng_12_051.0001_x0808148,pcc_eng_12_051.9999_x0824308,pile_tables/slices/Pcc12/tmp/pile_12-051_Pile-...,pile_tables/slices/Pcc12/pile_12-051_Pile-CC_d...,Pcc12.conll/pcc_eng_12-051.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1526,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:34:00,2022-05-04 14:14:00,1012.8,1753.8,9.3,379.2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12-031-2420,Pcc12_031,9999,pcc_eng_12_031.0001_x0485727,pcc_eng_12_031.9999_x0501937,pile_tables/slices/Pcc12/tmp/pile_12-031_Pile-...,pile_tables/slices/Pcc12/pile_12-031_Pile-CC_d...,Pcc12.conll/pcc_eng_12-031.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1676,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:33:00,2022-05-04 04:57:00,1012.8,1753.8,9.5,383.3,0.0
12-035-2450,Pcc12_035,9999,pcc_eng_12_035.0001_x0550035,pcc_eng_12_035.9999_x0566224,pile_tables/slices/Pcc12/tmp/pile_12-035_Pile-...,pile_tables/slices/Pcc12/pile_12-035_Pile-CC_d...,Pcc12.conll/pcc_eng_12-035.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,1677,2022-04-13 03:31:00,2022-06-28 15:27:00,2022-04-13 03:33:00,2022-05-04 06:53:00,1012.8,1753.8,9.5,384.6,0.0
02-090-2939,Pcc02_090,9999,pcc_eng_02_090.0001_x1438898,pcc_eng_02_090.9999_x1455094,pile_tables/slices/Pcc02/tmp/pile_02-090_Pile-...,pile_tables/slices/Pcc02/pile_02-090_Pile-CC_d...,Pcc02.conll/pcc_eng_02-090.conllu,/share/compling/data/pile/train/02.jsonl,02,pile_tables/pile_02_Pile-CC_df.pkl.gz,...,1683,2022-04-13 03:39:00,2022-06-28 22:15:00,2022-04-13 03:43:00,2022-05-26 02:12:00,1015.8,1753.6,9.3,378.6,0.0
02-093-2948,Pcc02_093,9999,pcc_eng_02_093.0001_x1487434,pcc_eng_02_093.9999_x1503696,pile_tables/slices/Pcc02/tmp/pile_02-093_Pile-...,pile_tables/slices/Pcc02/pile_02-093_Pile-CC_d...,Pcc02.conll/pcc_eng_02-093.conllu,/share/compling/data/pile/train/02.jsonl,02,pile_tables/pile_02_Pile-CC_df.pkl.gz,...,1683,2022-04-13 03:39:00,2022-06-28 22:15:00,2022-04-13 03:43:00,2022-05-26 03:44:00,1015.8,1753.6,9.3,377.4,0.0


In [16]:
# Upper cutoff: third quartile plus 1.5x the distance from median to third quartile.
# Use the `describe()` labels rather than integer positions; positional
# `series[6]` on a label-indexed Series is deprecated in recent pandas.
third_quartile, median_seconds = sec_dstats['75%'], sec_dstats['50%']
upper = third_quartile + (third_quartile - median_seconds) * 1.5
slow = info.loc[info.seconds > upper, :]
slow.sort_values('seconds', ascending=False)[['slice_name', 'time', 'total_texts', 'conllu_path', 'conllu_MB', 'conllu_mtime', 'finished_at']]

Unnamed: 0_level_0,slice_name,time,total_texts,conllu_path,conllu_MB,conllu_mtime,finished_at
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
24-040-2011,Pcc24_040,0 days 01:30:27,9999,Pcc24.conll/pcc_eng_24-040.conllu,397.7,2022-05-02 23:21:00,2022-05-02 23:21:00
18-091-2816,Pcc18_091,0 days 01:21:43,9999,Pcc18.conll/pcc_eng_18-091.conllu,379.3,2022-05-24 22:04:00,2022-05-24 22:04:00
10-024-1558,Pcc10_024,0 days 01:20:55,9999,Pcc10.conll/pcc_eng_10-024.conllu,391.7,2022-04-26 22:02:00,2022-04-26 22:02:00
10-048-1630,Pcc10_048,0 days 01:11:16,9999,Pcc10.conll/pcc_eng_10-048.conllu,389.6,2022-04-27 21:34:00,2022-04-27 21:34:00
10-026-1573,Pcc10_026,0 days 01:10:44,9999,Pcc10.conll/pcc_eng_10-026.conllu,384.3,2022-04-27 00:22:00,2022-04-27 00:22:00
...,...,...,...,...,...,...,...
16-004-0106,Pcc16_4,0 days 00:48:53,9999,Pcc16.conll/pcc_eng_16-004.conllu,364.5,2022-04-13 07:27:00,2022-04-13 07:27:00
20-096-1538,Pcc20_096,0 days 00:48:52,9999,Pcc20.conll/pcc_eng_20-096.conllu,380.2,2022-04-22 21:53:00,2022-04-22 21:53:00
10-039-1615,Pcc10_039,0 days 00:48:52,9999,Pcc10.conll/pcc_eng_10-039.conllu,391.2,2022-04-27 11:35:00,2022-04-27 11:35:00
13-004-0098,Pcc13_4,0 days 00:48:51,9999,Pcc13.conll/pcc_eng_13-004.conllu,378.6,2022-04-13 07:07:00,2022-04-13 07:07:00


In [17]:
# Summary statistics for the slow slices.
slow_view_cols = ['slice_name', 'time', 'total_texts', 'conllu_path', 'conllu_MB', 'conllu_mtime', 'finished_at']
slow_sorted = slow.sort_values('seconds', ascending=False)
slow_sorted[slow_view_cols].describe().round(2)

Unnamed: 0,time,total_texts,conllu_MB
count,211,211.0,211.0
mean,0 days 00:53:17.658767772,10008.63,382.51
std,0 days 00:05:57.817813653,98.63,7.97
min,0 days 00:48:44,9999.0,364.5
25%,0 days 00:50:08.500000,9999.0,377.6
50%,0 days 00:51:32,9999.0,381.2
75%,0 days 00:53:48.500000,9999.0,386.0
max,0 days 01:30:27,11015.0,425.4


Slices with the least documents:

In [18]:
# Ten slices with the fewest texts.
text_count_cols = ['finished_at', 'total_texts', 'time']
info[text_count_cols].sort_values('total_texts').head(10)


Unnamed: 0_level_0,finished_at,total_texts,time
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
29-108-1917,2022-04-30 03:57:00,6829,0 days 00:26:00
29-109-1919,2022-04-30 04:24:00,6830,0 days 00:27:27
08-108-2650,2022-05-04 20:27:00,9274,0 days 00:29:22
08-107-2643,2022-05-04 19:58:00,9274,0 days 00:29:56
11-108-1457,2022-04-22 12:10:00,9515,0 days 00:42:19
11-107-1447,2022-04-22 11:28:00,9515,0 days 00:41:11
25-107-1531,2022-04-22 20:15:00,9873,0 days 00:35:29
25-108-1534,2022-04-22 20:51:00,9873,0 days 00:35:38
18-107-2864,2022-05-25 07:59:00,9882,0 days 00:33:45
18-108-2867,2022-05-25 08:33:00,9883,0 days 00:33:41


Slices with the most documents:

In [19]:
# Ten slices with the most texts (ascending, so the largest is last).
text_count_cols = ['finished_at', 'total_texts', 'time']
info[text_count_cols].sort_values('total_texts').tail(10)

Unnamed: 0_level_0,finished_at,total_texts,time
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15-107-1458,2022-04-22 12:12:00,11062,0 days 00:43:59
15-108-1466,2022-04-22 12:57:00,11063,0 days 00:45:03
21-107-2539,2022-05-04 12:26:00,11149,0 days 00:38:17
21-108-2548,2022-05-04 13:05:00,11150,0 days 00:38:12
VA-002-3367,2022-06-22 14:35:00,11306,0 days 00:43:24
09-107-1490,2022-04-22 15:04:00,11306,0 days 00:44:08
VA-003-3369,2022-06-22 15:17:00,11307,0 days 00:42:00
09-108-1499,2022-04-22 15:48:00,11307,0 days 00:44:23
TE-002-3366,2022-06-22 14:33:00,11349,0 days 00:40:53
TE-003-3368,2022-06-22 15:14:00,11350,0 days 00:41:02


In [20]:
# Write the final (de-duplicated) meta index as both CSV and pickle.
final_index_dir = DATA_DIR / 'info'
info.to_csv(final_index_dir / 'completed-puddin_meta-index.csv')
info.to_pickle(final_index_dir / 'completed-puddin_meta-index.pkl')