In [1]:
import pandas as pd
import json

## Initialize data from the Parquet source

In [None]:
# Path of the Parquet file that is loaded into `cnn_data` below.
source_data_path = "data/summarization/cnn_dailymail/cnn_dailymail_test.parquet"

In [43]:
# Load the source frame and reshape it in a single chain:
#   1. add `word_count` = number of whitespace-separated tokens in `article`
#   2. rename `article` -> `text` and `highlights` -> `golden`
#   3. drop `id` and keep only `text`, `word_count`, `golden` (in that order)
cnn_data = (
    pd.read_parquet(source_data_path)
    .assign(word_count=lambda frame: frame['article'].str.split().apply(len).values)
    .rename(columns={'article': 'text', 'highlights': 'golden'})
    .drop(columns='id')
    .loc[:, ['text', 'word_count', 'golden']]
)
# Preview the first three rows.
cnn_data.head(3)

Unnamed: 0,text,word_count,golden
0,(CNN)The Palestinian Authority officially beca...,567,Membership gives the ICC jurisdiction over all...
1,(CNN)Never mind cats having nine lives. A stra...,411,"Theia, a bully breed mix, was apparently hit b..."
2,"(CNN)If you've been following the news lately,...",704,Mohammad Javad Zarif has spent more time with ...


In [133]:
# cnn_data.word_count.hist()

In [45]:
# Sample three articles from each word-count bucket, using a fixed
# random_state so the draw is repeatable for the same input frame.
# All bounds are strict: short is < 200, medium is 200 < n < 500, long is > 750.
# Articles with exactly 200 words, or with 500-750 words, fall in no bucket.
word_counts = cnn_data.word_count
is_short = word_counts < 200
is_medium = (word_counts > 200) & (word_counts < 500)
is_long = word_counts > 750

cnn_data_sampled_short_input = cnn_data[is_short].sample(n=3, random_state=42)
cnn_data_sampled_medium_input = cnn_data[is_medium].sample(n=3, random_state=42)
cnn_data_sampled_long_input = cnn_data[is_long].sample(n=3, random_state=42)

In [46]:
# Stack the short, medium and long samples (in that order) and renumber
# the rows from 0 so the original article indices are discarded.
sampled_buckets = [
    cnn_data_sampled_short_input,
    cnn_data_sampled_medium_input,
    cnn_data_sampled_long_input,
]
cnn_data_sampled = pd.concat(sampled_buckets, ignore_index=True)

In [47]:
# Display the combined sample frame built from the three word-count buckets.
cnn_data_sampled

Unnamed: 0,text,word_count,golden
0,(CNN)A mammoth fire broke out Friday morning i...,143,Fire breaks out at the General Electric Applia...
1,"Hull, Leicester and Swansea City are following...",147,Hull's chief scout Stan Ternent has watched M...
2,Sportsmail have teamed up with Golfbidder to o...,123,Sportsmail have teamed up with Golfbidder for ...
3,The time is right for Manchester United target...,260,Mats Hummels admits he is considering his Boru...
4,A road rage incident caught on camera shows a ...,422,A man was caught on video savagely beating an ...
5,Usually parents get to work teaching children ...,213,American YouTube user KentuckyFriedIdiot filme...
6,Jose Mourinho insists Manchester United's inju...,1093,"United are without Marcos Rojo, Phil Jones, Mi..."
7,Real Madrid closed the gap on Barcelona to jus...,1226,Sergio Ramos opened the scoring for Real Madri...
8,A new model for HIV progression shows that it ...,820,New model accurately predicted patients' progr...


In [48]:
# Add the reference columns, one per name in `cols_to_add`, each filled
# with an empty string for every row.
cols_to_add = ['notion', 'grammarly']
for reference_col in cols_to_add:
    cnn_data_sampled[reference_col] = ""
# Show the frame with the new (blank) columns appended.
cnn_data_sampled

Unnamed: 0,text,word_count,golden,notion,grammarly
0,(CNN)A mammoth fire broke out Friday morning i...,143,Fire breaks out at the General Electric Applia...,,
1,"Hull, Leicester and Swansea City are following...",147,Hull's chief scout Stan Ternent has watched M...,,
2,Sportsmail have teamed up with Golfbidder to o...,123,Sportsmail have teamed up with Golfbidder for ...,,
3,The time is right for Manchester United target...,260,Mats Hummels admits he is considering his Boru...,,
4,A road rage incident caught on camera shows a ...,422,A man was caught on video savagely beating an ...,,
5,Usually parents get to work teaching children ...,213,American YouTube user KentuckyFriedIdiot filme...,,
6,Jose Mourinho insists Manchester United's inju...,1093,"United are without Marcos Rojo, Phil Jones, Mi...",,
7,Real Madrid closed the gap on Barcelona to jus...,1226,Sergio Ramos opened the scoring for Real Madri...,,
8,A new model for HIV progression shows that it ...,820,New model accurately predicted patients' progr...,,


In [68]:
# Write the sampled frame to disk as indented JSON keyed by row index
# (orient='index'), with non-ASCII characters escaped (force_ascii=True).
json_path = 'data/summarization/text_files'
data_file_name = 'txt_dataset_summarizer.json'
data_json_file = f'{json_path}/{data_file_name}'
cnn_data_sampled.to_json(
    data_json_file,
    orient='index',
    indent=4,
    force_ascii=True,
)

## Generate config JSON

In [27]:
# File name of the summarizer config JSON that is written below.
conf_file_name = "txt_dataset_summarizer_config.json"

In [31]:
# Settings shared by both summarizer entries. `prompt_template` contains an
# `{input_text}` placeholder that is stored verbatim (it is not formatted here).
summarizer_defaults = {
    'model_name': 'text-davinci-003',
    'api_version': '2023-05-15',
    'temperature': 0.0,
    'max_tokens': 200,
    'prompt_template': 'summarize the following text: {input_text}',
}

# Each summarizer gets its own copy of the defaults, so editing one entry
# later does not silently change the other.
config_dict = {
    summarizer_name: dict(summarizer_defaults)
    for summarizer_name in ("current_doc_summarizer", "current_update_summarizer")
}

In [32]:
# Save the config next to the dataset JSON as indented UTF-8 text;
# ensure_ascii=False keeps any non-ASCII characters unescaped.
config_json_file = f'{json_path}/{conf_file_name}'
with open(config_json_file, mode='w', encoding='utf-8') as config_out:
    json.dump(config_dict, fp=config_out, ensure_ascii=False, indent=4)