In [42]:
import json
from pathlib import Path

import pandas as pd

## Initialize data from the Parquet source

In [43]:
# Load the CNN/DailyMail test split and reshape it to the three columns used
# downstream: the article text, its whitespace-token count, and the reference summary.
cnn_data = (
    pd.read_parquet('data/summarization/cnn_dailymail/cnn_dailymail_test.parquet')
    # word_count = number of whitespace-separated tokens in each article
    .assign(word_count=lambda frame: frame['article'].str.split().apply(len).to_numpy())
    .rename(columns={'article': 'text', 'highlights': 'golden'})
    .drop(columns='id')
    .loc[:, ['text', 'word_count', 'golden']]
)
cnn_data.head(3)

Unnamed: 0,text,word_count,golden
0,(CNN)The Palestinian Authority officially beca...,567,Membership gives the ICC jurisdiction over all...
1,(CNN)Never mind cats having nine lives. A stra...,411,"Theia, a bully breed mix, was apparently hit b..."
2,"(CNN)If you've been following the news lately,...",704,Mohammad Javad Zarif has spent more time with ...


In [133]:
# cnn_data.word_count.hist()

In [45]:
# Draw three articles per length bucket with a fixed seed so the sample is repeatable.
# The bounds are strict: an article with exactly 200 words matches neither the short
# nor the medium bucket, and articles in the 500-750 word range are never sampled.
cnn_data_sampled_short_input = (
    cnn_data.query("word_count < 200").sample(3, random_state=42)
)
cnn_data_sampled_medium_input = (
    cnn_data.query("word_count > 200 and word_count < 500").sample(3, random_state=42)
)
cnn_data_sampled_long_input = (
    cnn_data.query("word_count > 750").sample(3, random_state=42)
)

In [46]:
# Stack the short, medium and long samples (in that order) under a fresh 0..n-1 index.
cnn_data_sampled = pd.concat(
    [
        cnn_data_sampled_short_input,
        cnn_data_sampled_medium_input,
        cnn_data_sampled_long_input,
    ],
    ignore_index=True,
)

In [47]:
cnn_data_sampled

Unnamed: 0,text,word_count,golden
0,(CNN)A mammoth fire broke out Friday morning i...,143,Fire breaks out at the General Electric Applia...
1,"Hull, Leicester and Swansea City are following...",147,Hull's chief scout Stan Ternent has watched M...
2,Sportsmail have teamed up with Golfbidder to o...,123,Sportsmail have teamed up with Golfbidder for ...
3,The time is right for Manchester United target...,260,Mats Hummels admits he is considering his Boru...
4,A road rage incident caught on camera shows a ...,422,A man was caught on video savagely beating an ...
5,Usually parents get to work teaching children ...,213,American YouTube user KentuckyFriedIdiot filme...
6,Jose Mourinho insists Manchester United's inju...,1093,"United are without Marcos Rojo, Phil Jones, Mi..."
7,Real Madrid closed the gap on Barcelona to jus...,1226,Sergio Ramos opened the scoring for Real Madri...
8,A new model for HIV progression shows that it ...,820,New model accurately predicted patients' progr...


In [48]:
# add reference columns
# Reference-summary columns start out as empty strings; one column per external tool.
cols_to_add = ['notion', 'grammarly']
cnn_data_sampled = cnn_data_sampled.assign(**{column: "" for column in cols_to_add})
cnn_data_sampled

Unnamed: 0,text,word_count,golden,notion,grammarly
0,(CNN)A mammoth fire broke out Friday morning i...,143,Fire breaks out at the General Electric Applia...,,
1,"Hull, Leicester and Swansea City are following...",147,Hull's chief scout Stan Ternent has watched M...,,
2,Sportsmail have teamed up with Golfbidder to o...,123,Sportsmail have teamed up with Golfbidder for ...,,
3,The time is right for Manchester United target...,260,Mats Hummels admits he is considering his Boru...,,
4,A road rage incident caught on camera shows a ...,422,A man was caught on video savagely beating an ...,,
5,Usually parents get to work teaching children ...,213,American YouTube user KentuckyFriedIdiot filme...,,
6,Jose Mourinho insists Manchester United's inju...,1093,"United are without Marcos Rojo, Phil Jones, Mi...",,
7,Real Madrid closed the gap on Barcelona to jus...,1226,Sergio Ramos opened the scoring for Real Madri...,,
8,A new model for HIV progression shows that it ...,820,New model accurately predicted patients' progr...,,


In [68]:
json_path = 'data/summarization/text_files/txt_dataset.json'

# Later cells read this same file, add summary columns to it and save it back.
# Exporting the freshly sampled frame (whose reference columns are blank) over an
# existing file would discard that content, so only create the file when it is missing.
if Path(json_path).exists():
    print(f"{json_path} already exists; skipping export to avoid overwriting it")
else:
    cnn_data_sampled.to_json(json_path, orient='index', indent=4, force_ascii=True)

# Read JSON

In [177]:
# Location of the JSON dataset that the following cells load and later save back to.
json_path = 'data/summarization/text_files/txt_dataset.json'

In [178]:
# Load the dataset; the file is index-oriented (one top-level key per row).
data = pd.read_json(path_or_buf=json_path, orient='index')

In [109]:
# data = data.fillna("")
# data.head(3)

In [179]:
data

Unnamed: 0,text,word_count,golden,notion,grammarly,current_doc_summarizier,current_update_summarizer
0,(CNN)A mammoth fire broke out Friday morning i...,143,Fire breaks out at the General Electric Applia...,A massive fire broke out at the General Electr...,A massive fire broke out at the General Electr...,A large fire broke out Friday morning in the G...,A large fire broke out Friday morning in the G...
1,"Hull, Leicester and Swansea City are following...",147,Hull's chief scout Stan Ternent has watched M...,"Hull, Leicester, and Swansea City are interest...","Hull, Leicester, and Swansea City are interest...","Maciej Rybus, a 25-year-old Polish internation...","Maciej Rybus, a 25-year-old Polish internation..."
2,Sportsmail have teamed up with Golfbidder to o...,123,Sportsmail have teamed up with Golfbidder for ...,Sportsmail and Golfbidder have partnered to of...,Sportsmail and Golfbidder are running a compet...,Sportsmail have partnered with Golfbidder to o...,Sportsmail and Golfbidder have teamed up to of...
3,The time is right for Manchester United target...,260,Mats Hummels admits he is considering his Boru...,Germany legend Franz Beckenbauer believes that...,German football legend Franz Beckenbauer has s...,Germany legend Franz Beckenbauer believes that...,Mats Hummels is considering his future at Boru...
4,Jose Mourinho insists Manchester United's inju...,1093,"United are without Marcos Rojo, Phil Jones, Mi...",Jose Mourinho believes Manchester United's inj...,<not support this length>,Jose Mourinho believes Manchester United will ...,Jose Mourinho believes Manchester United will ...
5,\nWe’ve made a few updates to Workforms recent...,108,,Workforms has introduced new updates including...,"Workforms has recently undergone updates, incl...",Workforms has recently released several update...,Workforms has recently been updated with two n...


### Add a new record

In [130]:
# adding new record

text = """
We’ve made a few updates to Workforms recently, so we thought we’d summarise the enhancements into a  short and sweet  post for our community!
NEW UPDATES
\u2022 The People Column is now supported: Assign an item to a person or team without creating an automation!
\u2022 1:1 question format
COMING SOON 
\u2022 The Connect Board Column: Specify which items of the connected board are related to his/her submission.
\u2022 Submission analytics page: Take reporting to the next level, pull insightful analytics such as submission rate, average submission time, and split by geolocation, browser, and device of the form submitters.
Check out more details here 24 and stay tuned! 
"""

# Whitespace-token count, computed the same way as for the CNN articles.
word_count = len(text.split())

# Every summary column starts empty; the keys are the dataset's column names.
new_record_dict = {
    "text": text,
    "word_count": word_count,
    "golden": "",
    "notion": "",
    "grammarly": "",
    "current_doc_summarizier": "",
    "current_update_summarizer": "",
}

new_record = pd.DataFrame(new_record_dict, index=[0])

# Append only when this exact text is not in the dataset yet, so that re-running
# the cell does not add a duplicate row each time.
already_present = 'text' in data.columns and bool(data['text'].eq(text).any())
if not already_present:
    data = pd.concat([data, new_record]).reset_index(drop=True)
data


Unnamed: 0,text,word_count,golden,notion,grammarly,current_doc_summarizier,current_update_summarizer
0,\nWe’ve made a few updates to Workforms recent...,108,,,,,


In [132]:
# save
# Persist the dataset (including the appended record) back to the JSON file.
data.to_json(path_or_buf=json_path, orient='index', force_ascii=True, indent=4)

# Building new prompts

In [134]:
import openai
import os
from dotenv import load_dotenv

# load_dotenv() runs first, then the Azure OpenAI settings are read from the environment.
# A missing variable yields None here rather than raising.
load_dotenv()

azure_openai_key = os.getenv('AZURE_OPENAI_KEY')
azure_openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')

# Configure the module-level openai settings used by the completion calls below.
openai.api_type = "azure"
openai.api_base = azure_openai_endpoint
openai.api_key = azure_openai_key

In [180]:
class OpenaiCompletionBase:
    """Template-driven wrapper around ``openai.Completion.create``.

    The prompt is produced by calling ``str.format`` on ``prompt_template``;
    ``call_llm`` and ``call_llm_with_res`` supply a single ``input_text`` field.
    """

    def __init__(self, model_name="text-davinci-003",
                 api_version="2023-05-15", temperature=0.0, max_tokens=200, prompt_template=""):
        """Store the request settings that are sent with every completion call.

        Args:
            model_name: Passed as ``engine`` to the completion call.
            api_version: Passed as ``api_version`` to the completion call.
            temperature: Sampling temperature.
            max_tokens: Upper bound on generated tokens.
            prompt_template: ``str.format`` template used to build the prompt.
        """
        self.prompt_template = prompt_template
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.api_version = api_version
        self.model_name = model_name

    def _get_response(self, **input_args):
        """Format the template with ``input_args`` and return the raw completion response."""
        request = {
            "engine": self.model_name,
            "api_version": self.api_version,
            "prompt": self.prompt_template.format(**input_args),
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
        }
        return openai.Completion.create(**request)

    def call_llm(self, input_text):
        """Return the stripped text of the first completion choice for ``input_text``."""
        completion = self._get_response(input_text=input_text)
        first_choice = completion["choices"][0]
        return first_choice["text"].strip()

    def call_llm_with_res(self, input_text):
        """Like ``call_llm`` but also return the raw response: ``(text, response)``."""
        completion = self._get_response(input_text=input_text)
        first_choice = completion["choices"][0]
        return first_choice["text"].strip(), completion



class OpenaiCompletionWordLimit(OpenaiCompletionBase):
    """Completion wrapper whose prompt template also receives a ``max_words`` field.

    ``max_words`` is the input's whitespace-token count multiplied by
    ``max_word_ratio``, rounded, and never less than 1.
    """

    def __init__(self, model_name="text-davinci-003",
                 api_version="2023-05-15", temperature=0.0, max_tokens=200, prompt_template="", max_word_ratio=0.6):
        """Store the base request settings plus the word-limit ratio.

        Args:
            model_name, api_version, temperature, max_tokens, prompt_template:
                Forwarded unchanged to ``OpenaiCompletionBase``.
            max_word_ratio: Fraction of the input word count used as the
                ``max_words`` value given to the template.
        """
        super().__init__(model_name, api_version, temperature, max_tokens, prompt_template)
        self.max_word_ratio = max_word_ratio

    def _build_input_args(self, input_text):
        """Return the template fields (``max_words`` and ``input_text``) for ``input_text``."""
        input_text_word_count = len(input_text.split())
        # Clamp to 1: an empty/very short input or a small ratio would otherwise round
        # to 0 and yield a prompt asking for a summary "shorter than 0 words".
        max_words = max(1, int(round(input_text_word_count * self.max_word_ratio)))
        return {"max_words": max_words, "input_text": input_text}

    def call_llm(self, input_text):
        """Return the stripped text of the first completion choice for ``input_text``."""
        output_text, _ = self.call_llm_with_res(input_text)
        return output_text

    def call_llm_with_res(self, input_text):
        """Like ``call_llm`` but also return the raw response: ``(text, response)``."""
        response = self._get_response(**self._build_input_args(input_text))
        output_text = response["choices"][0]["text"].strip()
        return output_text, response


In [167]:
# Baseline prompt: a bare instruction followed by the article text.
test_template = "summarize the following text: {input_text}"

# Baseline summarizer; all settings are spelled out so the run configuration is visible.
current_update_sum = OpenaiCompletionBase(
    prompt_template=test_template,
    model_name="text-davinci-003",
    api_version="2023-05-15",
    max_tokens=200,
    temperature=0.0,
)

In [169]:
# Smoke test: run the baseline summarizer on one hard-coded article; the returned
# summary string is shown as the cell output.
current_update_sum.call_llm("(CNN)A mammoth fire broke out Friday morning in a Kentucky industrial park, sending plumes of thick smoke over the area as authorities worked to contain the damage. The blaze began shortly before 7 a.m. at the General Electric Appliance Park in Louisville, according to Mike Weimer from the city's emergency management agency. He said that there were no reports of anyone injured or trapped. Video showed both smoke and bright orange flames. Firefighters took up positions around the affected buildings, spraying water from the periphery. Weimer told CNN that authorities didn't know what had caused the fire, which had gone to at least four alarms. According to a GE website, its facility in the Louisville Appliance Park is \"revitalizing manufacturing in the United States.\" The park is large, such that 34 football fields could fit in one of its warehouses in the facility.")

'A large fire broke out Friday morning in a Kentucky industrial park, with no reports of injuries or people trapped. Video showed smoke and orange flames, and firefighters were spraying water from the periphery. The cause of the fire is unknown, and the facility is a large one, with enough space for 34 football fields in one of its warehouses.'

In [183]:
version_1_template = """Summarize the key points of the text provided between the []. The summary MUST be concise and shorter than {max_words} words. Also, the output should be in the following structure: 
\nTitle: <the title>\n
<the summary>.  
----------------------
The text to summarize: [{input_text}]

The concise summary:
"""

# Version-1 summarizer: same model settings as the baseline, plus a word budget of
# 60% of the input's word count that is filled into the template's {max_words} field.
version_1_sum = OpenaiCompletionWordLimit(
    prompt_template=version_1_template,
    max_word_ratio=0.6,
    model_name="text-davinci-003",
    api_version="2023-05-15",
    max_tokens=200,
    temperature=0.0,
)

In [182]:
# Smoke test: run the version-1 prompt on one hard-coded article and print the result
# so the line breaks in the returned text are rendered.
output = version_1_sum.call_llm("(CNN)A mammoth fire broke out Friday morning in a Kentucky industrial park, sending plumes of thick smoke over the area as authorities worked to contain the damage. The blaze began shortly before 7 a.m. at the General Electric Appliance Park in Louisville, according to Mike Weimer from the city's emergency management agency. He said that there were no reports of anyone injured or trapped. Video showed both smoke and bright orange flames. Firefighters took up positions around the affected buildings, spraying water from the periphery. Weimer told CNN that authorities didn't know what had caused the fire, which had gone to at least four alarms. According to a GE website, its facility in the Louisville Appliance Park is \"revitalizing manufacturing in the United States.\" The park is large, such that 34 football fields could fit in one of its warehouses in the facility.")
print(output)

Title: Fire Breaks Out in Kentucky Industrial Park

A mammoth fire broke out Friday morning in a Kentucky industrial park, with no reports of anyone injured or trapped. Firefighters sprayed water from the periphery of the affected buildings, and the cause of the fire is unknown. The park is large, with one of its warehouses being able to fit 34 football fields.


### Apply to the dataset

In [184]:
# completion api
# Completion API: one request per row, issued sequentially in row order.
data['version_1_summarizer'] = [
    version_1_sum.call_llm(article_text) for article_text in data['text']
]

In [185]:
# First row's summary from the existing update summarizer, for side-by-side comparison.
data['current_update_summarizer'].iloc[0]

"A large fire broke out Friday morning in the General Electric Appliance Park in Louisville, Kentucky. No injuries or people trapped were reported. Video showed smoke and orange flames. Firefighters were spraying water from the periphery to contain the blaze, which had gone to at least four alarms. The Louisville Appliance Park is a large facility owned by GE, which is 'revitalizing manufacturing in the United States'. It is so large that 34 football fields could fit in one of its warehouses."

In [186]:
# First row's summary from the version-1 prompt.
data['version_1_summarizer'].iloc[0]

'Title: Fire Breaks Out in Kentucky Industrial Park\n\nA mammoth fire broke out Friday morning in a Kentucky industrial park, with no reports of anyone injured or trapped. Firefighters sprayed water from the periphery of the affected buildings, and the cause of the fire is unknown. The park is large, with one of its warehouses being able to fit 34 football fields.'

In [187]:
# save
# Persist the dataset, now including the version_1_summarizer column.
data.to_json(path_or_buf=json_path, orient='index', force_ascii=True, indent=4)