# Full Processing Stats from the Cluster

This notebook loads the processing info for data slices completely processed by `script/parse_pile.py`.

In [46]:
import pandas as pd
from pathlib import Path
from datetime import datetime
# Shorthand for turning a POSIX timestamp into a `datetime`
# (NOTE(review): not referenced by any cell visible in this notebook — confirm before removing).
tstamp = datetime.fromtimestamp
# Root directory against which the path columns of the slice index are resolved.
DATA_DIR = Path('/share/compling/data/puddin')

In [47]:
# Load the meta-index of completed slices and normalise it into `info`,
# indexed by a unique `record` label of the form GG-SSS-NNNN
# (2-char origin group, zero-padded slice number, zero-padded CSV row number).
info_full = pd.read_csv('data/all-completed-slices_meta-index.csv')
# Alternative kept for reference: restrict to the processing columns only.
# process_cols = ['slice_name', 'total_texts', 'data_origin_group', 'slice_number',
#                 'started_at', 'finished_at', 'parsing_time',
#                 'final_df_path', 'final_slice_path', 'conllu_path']
# info = info_full.loc[:, process_cols].convert_dtypes()
info = info_full.convert_dtypes()

# Zero-padded string forms: slice number to 3 characters, origin group to 2.
# Casting the group to 'string' first keeps this working even if the CSV
# happens to contain only numeric group labels.
info = info.assign(
    slice_number_str=info.slice_number.astype('string').str.zfill(3),
    data_origin_group=info.data_origin_group.astype('string').str.zfill(2))

# Parse every column whose name ends in 'at' as a datetime.
# The parsed columns are re-attached with `assign` (whole-column replacement)
# rather than `info.loc[:, mask] = ...`, because newer pandas versions try to
# write `.loc` assignments into the existing (string) columns in place, which
# would leave them non-datetime and invisible to `select_dtypes('datetime')`.
time_cols = info.columns.str.endswith('at')
time_info = info.loc[:, time_cols].apply(pd.to_datetime)
info = info.assign(**{col: time_info[col] for col in time_info.columns})

# `parsing_time` is split on spaces: the first token becomes `days`, the last
# token becomes the `time` timedelta.
# NOTE(review): `time` (and therefore `seconds` below) is built from the last
# token only, so any non-zero `days` part is not included — confirm that no
# slice took longer than a day.
parsing_parts = info.parsing_time.str.split(' ')
info = info.assign(days=pd.to_numeric(parsing_parts.str.get(0)),
                   time=parsing_parts.str.get(-1).apply(pd.to_timedelta),
                   slice_number=info.slice_number.apply(lambda n: pd.to_numeric(n, downcast='unsigned')))

# Parsing duration as a whole number of seconds.
info = info.assign(seconds=info.time.apply(lambda td: pd.to_numeric(round(td.total_seconds()),downcast='unsigned')))

# Zero-pad the row number to the width of the largest index value
# (the width is computed once instead of once per row).
pad_width = len(str(info.index.max()))
ix = pd.Series(info.index.astype('string')).str.zfill(pad_width)
# `ix` carries a default 0..n-1 index, so this concatenation lines up row by row
# with `info` as long as `info` still has the default index from `read_csv`.
info = info.assign(record=info.data_origin_group.str.upper().str[:2] +'-'+info.slice_number_str+'-'+ix)
info = info.set_index('record')
# Checkpoint of the fully loaded frame so later cells can restart from it.
info_load_full = info


Add the size (in MB) and last-modification time of the files referenced by each record's `final_df_path`, `exclusions_path`, `tmp_slice_path`, and `conllu_path`.

In [48]:
# Restart point: rebind `info` to the frame saved at the end of the loading cell.
# This is a plain rebinding, not a copy — both names refer to the same DataFrame.
info = info_load_full

In [49]:
def _file_stats(rel_paths):
    """Return ``(mtimes, sizes_mb)`` for the files named in ``rel_paths``.

    Every distinct path is resolved against ``DATA_DIR`` and stat'ed exactly
    once, even when many rows point at the same file.

    Parameters
    ----------
    rel_paths : pd.Series
        File paths, joined onto ``DATA_DIR`` before being stat'ed.

    Returns
    -------
    tuple of (pd.Series, pd.Series)
        ``mtimes``: last-modification time of each row's file, built with
        ``pd.Timestamp.fromtimestamp`` (i.e. local time).
        ``sizes_mb``: file size in bytes divided by 1048576, rounded to 2 decimals.

    Raises
    ------
    FileNotFoundError
        If any of the files does not exist.
    """
    stat_by_path = {p: DATA_DIR.joinpath(Path(p)).stat() for p in rel_paths.unique()}
    mtimes = rel_paths.apply(
        lambda p: pd.Timestamp.fromtimestamp(stat_by_path[p].st_mtime))
    sizes_mb = rel_paths.apply(
        lambda p: stat_by_path[p].st_size / 1048576).round(2)
    return mtimes, sizes_mb


# (output column prefix, source path column, suffix of the size column)
stat_sources = [
    ('kept_df', 'final_df_path', 'gzMB'),
    ('excl_df', 'exclusions_path', 'gzMB'),
    ('slice_df', 'tmp_slice_path', 'gzMB'),
    ('conllu', 'conllu_path', 'MB'),
]

mtime_cols, size_cols = {}, {}
for prefix, path_col, size_suffix in stat_sources:
    file_mtimes, file_sizes = _file_stats(info[path_col])
    mtime_cols[f'{prefix}_mtime'] = file_mtimes
    size_cols[f'{prefix}_{size_suffix}'] = file_sizes

# Append all four *_mtime columns first, then the four size columns.
info = info.assign(**mtime_cols, **size_cols)


In [50]:
# Round every datetime column of `info` to the nearest minute and write the
# rounded values back into the same columns.
dtcols = info.select_dtypes(include='datetime')
rounded_to_minute = dtcols.apply(lambda col: col.dt.round("min"))
info.loc[:, rounded_to_minute.columns] = rounded_to_minute


In [51]:
# info = info_load_full
# Peek at five random rows, limited to the size columns (names ending in 'B'),
# the *mtime columns, and `finished_at`.
info.sample(5).loc[
    :, [col for col in info.columns if col.endswith(('B', 'mtime', 'finished_at'))]
]

Unnamed: 0_level_0,finished_at,kept_df_mtime,excl_df_mtime,slice_df_mtime,conllu_mtime,kept_df_gzMB,excl_df_gzMB,slice_df_gzMB,conllu_MB
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
19-057-3180,2022-05-31 00:25:00,2022-04-20 02:15:00,2022-06-01 07:50:00,2022-04-20 02:18:00,2022-05-31 00:25:00,1012.36,1754.32,9.45,384.65
22-009-1722,2022-04-28 07:44:00,2022-04-28 00:11:00,2022-05-27 21:50:00,2022-04-28 00:12:00,2022-05-25 02:15:00,1013.06,2242.64,9.36,380.02
27-098-2729,2022-05-05 04:18:00,2022-04-28 01:25:00,2022-05-05 09:31:00,2022-04-28 01:30:00,2022-05-05 04:18:00,1014.82,1749.68,9.47,384.21
04-053-3302,2022-06-01 21:11:00,2022-04-13 03:36:00,2022-06-03 07:10:00,2022-04-13 03:39:00,2022-06-01 21:11:00,1013.64,1742.39,9.35,378.25
12-066-2817,2022-05-24 22:18:00,2022-04-13 03:31:00,2022-05-24 21:42:00,2022-04-13 03:35:00,2022-05-24 22:18:00,1012.83,1753.77,9.41,381.2


In [52]:
# Earlier variant, kept for reference (flagged every member of a duplicate group
# and also compared `final_slice_path`):
# was_replaced = info.duplicated(keep=False, subset=['data_origin_group', 'slice_number', 'final_slice_path', 'conllu_path'])
# info.loc[was_replaced, :].sort_values('record')

# A record counts as "replaced" when a later row has the same origin group,
# slice number and conllu path; only the last row of each such group is kept unflagged.
was_replaced = info.duplicated(
    subset=['data_origin_group', 'slice_number', 'conllu_path'],
    keep='last',
)
replaced = info.loc[was_replaced].sort_values(by='record')

Save the processing records that have been replaced to a separate file, `data/replaced_slice-index-records.csv`, and remove them from the main `info` dataframe:

In [53]:
# Archive the superseded records on disk ...
replaced.to_csv('data/replaced_slice-index-records.csv')
# ... then keep only the rows that were not flagged, and remember this
# de-duplicated frame under `info_load` as a restart point.
info_load = info = info.loc[~was_replaced]

In [54]:
# info = info_load

In [55]:
# Slowest slices first; ties on `seconds` are ordered by descending `total_texts`.
info.sort_values(by=['seconds', 'total_texts'], ascending=[False, False])

Unnamed: 0_level_0,slice_name,total_texts,first_text_id,last_text_id,tmp_slice_path,final_slice_path,conllu_path,origin_filepath,data_origin_group,final_df_path,...,time,seconds,kept_df_mtime,excl_df_mtime,slice_df_mtime,conllu_mtime,kept_df_gzMB,excl_df_gzMB,slice_df_gzMB,conllu_MB
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24-040-2011,Pcc24_040,9999,pcc_eng_24_040.0001_x0630363,pcc_eng_24_040.9999_x0646500,pile_tables/slices/Pcc24/tmp/pile_24-040_Pile-...,pile_tables/slices/Pcc24/pile_24-040_Pile-CC_d...,Pcc24.conll/pcc_eng_24-040.conllu,/share/compling/data/pile/train/24.jsonl,24,pile_tables/pile_24_Pile-CC_df.pkl.gz,...,0 days 01:30:27,5427,2022-04-20 01:41:00,2022-05-05 11:51:00,2022-04-20 01:43:00,2022-05-02 23:21:00,1013.42,1750.89,9.74,397.72
18-091-2816,Pcc18_091,9999,pcc_eng_18_091.0001_x1457489,pcc_eng_18_091.9999_x1473649,pile_tables/slices/Pcc18/tmp/pile_18-091_Pile-...,pile_tables/slices/Pcc18/pile_18-091_Pile-CC_d...,Pcc18.conll/pcc_eng_18-091.conllu,/share/compling/data/pile/train/18.jsonl,18,pile_tables/pile_18_Pile-CC_df.pkl.gz,...,0 days 01:21:43,4903,2022-04-20 01:52:00,2022-05-25 06:52:00,2022-04-20 01:56:00,2022-05-24 22:04:00,1010.50,1758.81,9.34,379.33
10-024-1558,Pcc10_024,9999,pcc_eng_10_024.0001_x0371544,pcc_eng_10_024.9999_x0387720,pile_tables/slices/Pcc10/tmp/pile_10-024_Pile-...,pile_tables/slices/Pcc10/pile_10-024_Pile-CC_d...,Pcc10.conll/pcc_eng_10-024.conllu,/share/compling/data/pile/train/10.jsonl,10,pile_tables/pile_10_Pile-CC_df.pkl.gz,...,0 days 01:20:55,4855,2022-04-13 02:54:00,2022-05-03 03:45:00,2022-04-13 02:55:00,2022-04-26 22:02:00,1015.61,1740.48,9.66,391.74
10-048-1630,Pcc10_048,9999,pcc_eng_10_048.0001_x0760323,pcc_eng_10_048.9999_x0776333,pile_tables/slices/Pcc10/tmp/pile_10-048_Pile-...,pile_tables/slices/Pcc10/pile_10-048_Pile-CC_d...,Pcc10.conll/pcc_eng_10-048.conllu,/share/compling/data/pile/train/10.jsonl,10,pile_tables/pile_10_Pile-CC_df.pkl.gz,...,0 days 01:11:16,4276,2022-04-13 02:54:00,2022-05-03 03:45:00,2022-04-13 02:56:00,2022-04-27 21:34:00,1015.61,1740.48,9.54,389.63
10-026-1573,Pcc10_026,9999,pcc_eng_10_026.0001_x0403864,pcc_eng_10_026.9999_x0420194,pile_tables/slices/Pcc10/tmp/pile_10-026_Pile-...,pile_tables/slices/Pcc10/pile_10-026_Pile-CC_d...,Pcc10.conll/pcc_eng_10-026.conllu,/share/compling/data/pile/train/10.jsonl,10,pile_tables/pile_10_Pile-CC_df.pkl.gz,...,0 days 01:10:44,4244,2022-04-13 02:54:00,2022-05-03 03:45:00,2022-04-13 02:55:00,2022-04-27 00:22:00,1015.61,1740.48,9.48,384.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12-051-2564,Pcc12_051,9999,pcc_eng_12_051.0001_x0808148,pcc_eng_12_051.9999_x0824308,pile_tables/slices/Pcc12/tmp/pile_12-051_Pile-...,pile_tables/slices/Pcc12/pile_12-051_Pile-CC_d...,Pcc12.conll/pcc_eng_12-051.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,0 days 00:25:26,1526,2022-04-13 03:31:00,2022-05-24 21:42:00,2022-04-13 03:34:00,2022-05-04 14:14:00,1012.83,1753.77,9.31,379.22
12-047-2535,Pcc12_047,9999,pcc_eng_12_047.0001_x0743541,pcc_eng_12_047.9999_x0759569,pile_tables/slices/Pcc12/tmp/pile_12-047_Pile-...,pile_tables/slices/Pcc12/pile_12-047_Pile-CC_d...,Pcc12.conll/pcc_eng_12-047.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,0 days 00:25:24,1524,2022-04-13 03:31:00,2022-05-24 21:42:00,2022-04-13 03:34:00,2022-05-04 12:16:00,1012.83,1753.77,9.37,380.66
12-050-2556,Pcc12_050,9999,pcc_eng_12_050.0001_x0792018,pcc_eng_12_050.9999_x0808147,pile_tables/slices/Pcc12/tmp/pile_12-050_Pile-...,pile_tables/slices/Pcc12/pile_12-050_Pile-CC_d...,Pcc12.conll/pcc_eng_12-050.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,0 days 00:25:23,1523,2022-04-13 03:31:00,2022-05-24 21:42:00,2022-04-13 03:34:00,2022-05-04 13:42:00,1012.83,1753.77,9.44,381.85
12-045-2523,Pcc12_045,9999,pcc_eng_12_045.0001_x0711367,pcc_eng_12_045.9999_x0727554,pile_tables/slices/Pcc12/tmp/pile_12-045_Pile-...,pile_tables/slices/Pcc12/pile_12-045_Pile-CC_d...,Pcc12.conll/pcc_eng_12-045.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,0 days 00:25:12,1512,2022-04-13 03:31:00,2022-05-24 21:42:00,2022-04-13 03:34:00,2022-05-04 11:25:00,1012.83,1753.77,9.30,378.61


In [56]:
# Summary statistics of the per-slice parsing time in seconds.
sec_dstats = (info.seconds).describe()

# Lower fence = Q1 - 1.5 * (median - Q1).
# The quartiles are read by label ('25%', '50%') instead of by integer position:
# integer keys on a string-labelled Series rely on a deprecated positional fallback.
# NOTE(review): this fence uses the Q1-to-median distance, not the full
# interquartile range (Q3 - Q1) of the usual Tukey rule — confirm this is intended.
lower = sec_dstats['25%'] + (sec_dstats['25%'] - sec_dstats['50%']) *1.5
fast = info.loc[info.seconds < lower, :]
fast.sort_values('seconds')

Unnamed: 0_level_0,slice_name,total_texts,first_text_id,last_text_id,tmp_slice_path,final_slice_path,conllu_path,origin_filepath,data_origin_group,final_df_path,...,time,seconds,kept_df_mtime,excl_df_mtime,slice_df_mtime,conllu_mtime,kept_df_gzMB,excl_df_gzMB,slice_df_gzMB,conllu_MB
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
29-009-3370,Pcc29_009,9999,pcc_eng_29_009.0001_x0129300,pcc_eng_29_009.9999_x0145503,pile_tables/slices/Pcc29/tmp/pile_29-009_Pile-...,pile_tables/slices/Pcc29/pile_29-009_Pile-CC_d...,Pcc29.conll/pcc_eng_29-009.conllu,/share/compling/data/pile/train/29.jsonl,29,pile_tables/pile_29_Pile-CC_df.pkl.gz,...,0 days 00:01:40,100,2022-03-27 02:12:00,2022-04-30 03:31:00,2022-04-12 21:17:00,2022-06-22 18:19:00,1015.20,0.09,9.18,371.70
12-045-2523,Pcc12_045,9999,pcc_eng_12_045.0001_x0711367,pcc_eng_12_045.9999_x0727554,pile_tables/slices/Pcc12/tmp/pile_12-045_Pile-...,pile_tables/slices/Pcc12/pile_12-045_Pile-CC_d...,Pcc12.conll/pcc_eng_12-045.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,0 days 00:25:12,1512,2022-04-13 03:31:00,2022-05-24 21:42:00,2022-04-13 03:34:00,2022-05-04 11:25:00,1012.83,1753.77,9.30,378.61
12-050-2556,Pcc12_050,9999,pcc_eng_12_050.0001_x0792018,pcc_eng_12_050.9999_x0808147,pile_tables/slices/Pcc12/tmp/pile_12-050_Pile-...,pile_tables/slices/Pcc12/pile_12-050_Pile-CC_d...,Pcc12.conll/pcc_eng_12-050.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,0 days 00:25:23,1523,2022-04-13 03:31:00,2022-05-24 21:42:00,2022-04-13 03:34:00,2022-05-04 13:42:00,1012.83,1753.77,9.44,381.85
12-047-2535,Pcc12_047,9999,pcc_eng_12_047.0001_x0743541,pcc_eng_12_047.9999_x0759569,pile_tables/slices/Pcc12/tmp/pile_12-047_Pile-...,pile_tables/slices/Pcc12/pile_12-047_Pile-CC_d...,Pcc12.conll/pcc_eng_12-047.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,0 days 00:25:24,1524,2022-04-13 03:31:00,2022-05-24 21:42:00,2022-04-13 03:34:00,2022-05-04 12:16:00,1012.83,1753.77,9.37,380.66
12-051-2564,Pcc12_051,9999,pcc_eng_12_051.0001_x0808148,pcc_eng_12_051.9999_x0824308,pile_tables/slices/Pcc12/tmp/pile_12-051_Pile-...,pile_tables/slices/Pcc12/pile_12-051_Pile-CC_d...,Pcc12.conll/pcc_eng_12-051.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,0 days 00:25:26,1526,2022-04-13 03:31:00,2022-05-24 21:42:00,2022-04-13 03:34:00,2022-05-04 14:14:00,1012.83,1753.77,9.31,379.22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12-059-2620,Pcc12_059,9999,pcc_eng_12_059.0001_x0937734,pcc_eng_12_059.9999_x0953936,pile_tables/slices/Pcc12/tmp/pile_12-059_Pile-...,pile_tables/slices/Pcc12/pile_12-059_Pile-CC_d...,Pcc12.conll/pcc_eng_12-059.conllu,/share/compling/data/pile/train/12.jsonl,12,pile_tables/pile_12_Pile-CC_df.pkl.gz,...,0 days 00:28:07,1687,2022-04-13 03:31:00,2022-05-24 21:42:00,2022-04-13 03:34:00,2022-05-04 18:12:00,1012.83,1753.77,9.39,380.56
02-104-2982,Pcc02_104,9999,pcc_eng_02_104.0001_x1665566,pcc_eng_02_104.9999_x1681747,pile_tables/slices/Pcc02/tmp/pile_02-104_Pile-...,pile_tables/slices/Pcc02/pile_02-104_Pile-CC_d...,Pcc02.conll/pcc_eng_02-104.conllu,/share/compling/data/pile/train/02.jsonl,02,pile_tables/pile_02_Pile-CC_df.pkl.gz,...,0 days 00:28:12,1692,2022-04-13 03:39:00,2022-05-26 10:38:00,2022-04-13 03:44:00,2022-05-26 09:05:00,1015.80,1753.59,9.30,377.82
02-091-2941,Pcc02_091,9999,pcc_eng_02_091.0001_x1455096,pcc_eng_02_091.9999_x1471268,pile_tables/slices/Pcc02/tmp/pile_02-091_Pile-...,pile_tables/slices/Pcc02/pile_02-091_Pile-CC_d...,Pcc02.conll/pcc_eng_02-091.conllu,/share/compling/data/pile/train/02.jsonl,02,pile_tables/pile_02_Pile-CC_df.pkl.gz,...,0 days 00:28:13,1693,2022-04-13 03:39:00,2022-05-26 10:38:00,2022-04-13 03:43:00,2022-05-26 02:40:00,1015.80,1753.59,9.37,379.79
02-097-2961,Pcc02_097,9999,pcc_eng_02_097.0001_x1552144,pcc_eng_02_097.9999_x1568272,pile_tables/slices/Pcc02/tmp/pile_02-097_Pile-...,pile_tables/slices/Pcc02/pile_02-097_Pile-CC_d...,Pcc02.conll/pcc_eng_02-097.conllu,/share/compling/data/pile/train/02.jsonl,02,pile_tables/pile_02_Pile-CC_df.pkl.gz,...,0 days 00:28:13,1693,2022-04-13 03:39:00,2022-05-26 10:38:00,2022-04-13 03:43:00,2022-05-26 05:45:00,1015.80,1753.59,9.36,380.30


In [57]:
# Upper fence = Q3 + 1.5 * (Q3 - median), computed from the `describe()` summary
# of `info.seconds` held in `sec_dstats`.
# The quartiles are read by label ('75%', '50%') instead of by integer position:
# integer keys on a string-labelled Series rely on a deprecated positional fallback.
# NOTE(review): this fence uses the median-to-Q3 distance, not the full
# interquartile range (Q3 - Q1) of the usual Tukey rule — confirm this is intended.
upper = sec_dstats['75%'] + (sec_dstats['75%'] - sec_dstats['50%']) * 1.5
slow = info.loc[info.seconds > upper, :]
slow.sort_values('seconds', ascending=False)[['slice_name', 'time', 'total_texts', 'conllu_path', 'conllu_MB', 'conllu_mtime', 'finished_at']]

Unnamed: 0_level_0,slice_name,time,total_texts,conllu_path,conllu_MB,conllu_mtime,finished_at
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
24-040-2011,Pcc24_040,0 days 01:30:27,9999,Pcc24.conll/pcc_eng_24-040.conllu,397.72,2022-05-02 23:21:00,2022-05-02 23:21:00
18-091-2816,Pcc18_091,0 days 01:21:43,9999,Pcc18.conll/pcc_eng_18-091.conllu,379.33,2022-05-24 22:04:00,2022-05-24 22:04:00
10-024-1558,Pcc10_024,0 days 01:20:55,9999,Pcc10.conll/pcc_eng_10-024.conllu,391.74,2022-04-26 22:02:00,2022-04-26 22:02:00
10-048-1630,Pcc10_048,0 days 01:11:16,9999,Pcc10.conll/pcc_eng_10-048.conllu,389.63,2022-04-27 21:34:00,2022-04-27 21:34:00
10-026-1573,Pcc10_026,0 days 01:10:44,9999,Pcc10.conll/pcc_eng_10-026.conllu,384.32,2022-04-27 00:22:00,2022-04-27 00:22:00
...,...,...,...,...,...,...,...
13-004-0098,Pcc13_4,0 days 00:48:51,9999,Pcc13.conll/pcc_eng_13-004.conllu,378.65,2022-04-13 07:07:00,2022-04-13 07:07:00
14-103-1600,Pcc14_103,0 days 00:48:44,9999,Pcc14.conll/pcc_eng_14-103.conllu,390.02,2022-04-27 05:16:00,2022-04-27 05:16:00
12-003-0079,Pcc12_3,0 days 00:48:43,9999,Pcc12.conll/pcc_eng_12-003.conllu,371.03,2022-04-13 06:13:00,2022-04-13 06:13:00
29-048-1752,Pcc29_048,0 days 00:48:43,9999,Pcc29.conll/pcc_eng_29-048.conllu,384.12,2022-04-28 11:24:00,2022-04-28 11:25:00


In [58]:
# Summary statistics for the slow slices, restricted to the columns shown in the
# listing of slow slices.
(
    slow
    .sort_values(by='seconds', ascending=False)
    .loc[:, ['slice_name', 'time', 'total_texts', 'conllu_path', 'conllu_MB', 'conllu_mtime', 'finished_at']]
    .describe()
)

Unnamed: 0,time,total_texts,conllu_MB
count,216,216.0,216.0
mean,0 days 00:53:12.342592592,10008.402778,382.512407
std,0 days 00:05:55.432462744,97.489032,7.984703
min,0 days 00:48:42,9999.0,364.48
25%,0 days 00:50:05.500000,9999.0,377.525
50%,0 days 00:51:30.500000,9999.0,381.245
75%,0 days 00:53:47,9999.0,386.0125
max,0 days 01:30:27,11015.0,425.39


In [59]:
# for g, d in info.groupby('data_origin_group'): 
#     print('\n',g)
#     print(list(d.conllu_path.unique()))

In [60]:
# The ten slices with the fewest texts, with their finish time and parsing duration.
info[['finished_at', 'total_texts', 'time']].sort_values(by='total_texts').head(10)


Unnamed: 0_level_0,finished_at,total_texts,time
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
29-108-1917,2022-04-30 03:57:00,6829,0 days 00:26:00
29-109-1919,2022-04-30 04:24:00,6830,0 days 00:27:27
08-107-2643,2022-05-04 19:58:00,9274,0 days 00:29:56
08-108-2650,2022-05-04 20:27:00,9274,0 days 00:29:22
11-107-1447,2022-04-22 11:28:00,9515,0 days 00:41:11
11-108-1457,2022-04-22 12:10:00,9515,0 days 00:42:19
25-107-1531,2022-04-22 20:15:00,9873,0 days 00:35:29
25-108-1534,2022-04-22 20:51:00,9873,0 days 00:35:38
18-107-2864,2022-05-25 07:59:00,9882,0 days 00:33:45
18-108-2867,2022-05-25 08:33:00,9883,0 days 00:33:41


In [61]:
# The ten slices with the most texts, with their finish time and parsing duration.
info[['finished_at', 'total_texts', 'time']].sort_values(by='total_texts').tail(10)

Unnamed: 0_level_0,finished_at,total_texts,time
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15-107-1458,2022-04-22 12:12:00,11062,0 days 00:43:59
15-108-1466,2022-04-22 12:57:00,11063,0 days 00:45:03
21-107-2539,2022-05-04 12:26:00,11149,0 days 00:38:17
21-108-2548,2022-05-04 13:05:00,11150,0 days 00:38:12
09-107-1490,2022-04-22 15:04:00,11306,0 days 00:44:08
VA-002-3367,2022-06-22 14:35:00,11306,0 days 00:43:24
09-108-1499,2022-04-22 15:48:00,11307,0 days 00:44:23
VA-003-3369,2022-06-22 15:17:00,11307,0 days 00:42:00
TE-002-3366,2022-06-22 14:33:00,11349,0 days 00:40:53
TE-003-3368,2022-06-22 15:14:00,11350,0 days 00:41:02


In [63]:
# Write the cleaned, de-duplicated index next to the data it describes.
# The target is built from DATA_DIR (resolving to the same
# /share/compling/data/puddin/completed-puddin_meta-index.csv file) so the data
# root is configured in one place only.
info.to_csv(DATA_DIR.joinpath('completed-puddin_meta-index.csv'))