In [2]:
import pandas as pd
import json

### Read the data and config

In [24]:
# Directory that holds both the dataset and its companion config file.
json_path = "data/summarization/text_files"
# Dataset file: source texts plus one summary column per summarizer.
data_file_name = "txt_dataset_summarizer.json"
# Config file: the settings of each summarizer, keyed by summarizer name.
conf_file_name = "txt_dataset_summarizer_config.json"

In [25]:
# Load the dataset; the file is laid out with orient='index' (one entry per row label).
data = pd.read_json(f'{json_path}/{data_file_name}', orient='index')

# Read the config explicitly as UTF-8. The notebook writes this file back as
# UTF-8 with ensure_ascii=False, so relying on the platform default encoding
# here could mis-decode non-ASCII prompt text on some systems.
with open(f'{json_path}/{conf_file_name}', encoding='utf-8') as f:
    conf_json = json.load(f)

data.head(3)

Unnamed: 0,text,word_count,golden,notion,grammarly,current_doc_summarizier,current_update_summarizer,version_1_summarizer
0,(CNN)A mammoth fire broke out Friday morning i...,143,Fire breaks out at the General Electric Applia...,A massive fire broke out at the General Electr...,A massive fire broke out at the General Electr...,A large fire broke out Friday morning in the G...,A large fire broke out Friday morning in the G...,Title: Fire Breaks Out in Kentucky Industrial ...
1,"Hull, Leicester and Swansea City are following...",147,Hull's chief scout Stan Ternent has watched M...,"Hull, Leicester, and Swansea City are interest...","Hull, Leicester, and Swansea City are interest...","Maciej Rybus, a 25-year-old Polish internation...","Maciej Rybus, a 25-year-old Polish internation...","Title: Maciej Rybus Followed by Hull, Leiceste..."
2,Sportsmail have teamed up with Golfbidder to o...,123,Sportsmail have teamed up with Golfbidder for ...,Sportsmail and Golfbidder have partnered to of...,Sportsmail and Golfbidder are running a compet...,Sportsmail have partnered with Golfbidder to o...,Sportsmail and Golfbidder have teamed up to of...,Title: Win a Callaway Golf Prize Bundle Worth ...


# Building new prompts

In [9]:
import openai
import os
from dotenv import load_dotenv

# load_dotenv() (python-dotenv) reads a .env file, if one is found, into the environment.
load_dotenv()

# Azure OpenAI credentials; each value is None when its variable is not set.
azure_openai_key = os.getenv('AZURE_OPENAI_KEY')
azure_openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')

# Point the openai module at the Azure endpoint.
openai.api_type = "azure"
openai.api_base = azure_openai_endpoint
openai.api_key = azure_openai_key

In [10]:
class OpenaiCompletionBase:
    """Thin wrapper around ``openai.Completion.create`` driven by a prompt template.

    Args:
        model_name: Value passed to the API as ``engine``.
        api_version: Value passed to the API as ``api_version``.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API.
        prompt_template: ``str.format`` template; ``call_llm`` fills the
            ``{input_text}`` field.
    """

    def __init__(self, model_name="text-davinci-003",
                 api_version="2023-05-15", temperature=0.0, max_tokens=200, prompt_template=""):
        self.model_name = model_name
        self.api_version = api_version
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.prompt_template = prompt_template

    def _get_response(self, **input_args):
        """Format the prompt template with ``input_args`` and return the raw API response."""
        prompt = self.prompt_template.format(**input_args)
        return openai.Completion.create(
            engine=self.model_name,
            api_version=self.api_version,
            prompt=prompt,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
        )

    def call_llm(self, input_text):
        """Return the stripped text of the first completion choice for ``input_text``."""
        # Delegate so the response-parsing logic lives in exactly one place.
        output_text, _ = self.call_llm_with_res(input_text)
        return output_text

    def call_llm_with_res(self, input_text):
        """Return ``(output_text, response)``: the stripped first choice and the raw response."""
        response = self._get_response(input_text=input_text)
        output_text = response["choices"][0]["text"].strip()
        return output_text, response

    def get_config(self):
        """Return the instance settings as a new dict.

        A shallow copy is returned rather than ``self.__dict__`` itself, so a
        caller that stores or edits the result cannot silently reconfigure
        this object.
        """
        return dict(self.__dict__)


class OpenaiCompletionWordLimit(OpenaiCompletionBase):
    """Completion wrapper whose prompt also receives a word budget.

    The budget is ``max_word_ratio`` times the whitespace-separated word count
    of the input, passed through ``round()`` and ``int()``. It is supplied to
    the template as ``{max_words}`` alongside ``{input_text}``.
    """

    def __init__(self, model_name="text-davinci-003",
                 api_version="2023-05-15", temperature=0.0, max_tokens=200, prompt_template="", max_word_ratio=0.6):
        super().__init__(
            model_name=model_name,
            api_version=api_version,
            temperature=temperature,
            max_tokens=max_tokens,
            prompt_template=prompt_template,
        )
        self.max_word_ratio = max_word_ratio

    def _template_fields(self, input_text):
        """Build the template arguments: the word budget and the text itself."""
        word_total = len(input_text.split())
        word_budget = int(round(word_total * self.max_word_ratio))
        return {"max_words": word_budget, "input_text": input_text}

    def call_llm(self, input_text):
        """Return the stripped text of the first completion choice."""
        completion = self._get_response(**self._template_fields(input_text))
        return completion["choices"][0]["text"].strip()

    def call_llm_with_res(self, input_text):
        """Return the stripped first-choice text together with the raw response."""
        completion = self._get_response(**self._template_fields(input_text))
        return completion["choices"][0]["text"].strip(), completion


In [13]:
# test_template = """summarize the following text: {input_text}"""
# 
# current_update_sum = OpenaiCompletionBase(
#     model_name="text-davinci-003",
#     api_version="2023-05-15",
#     temperature=0.0,
#     max_tokens=200,
#     prompt_template=test_template
# )

In [11]:
# Prompt for the word-limited summarizer; formatted with {max_words} and
# {input_text}. Each line of the prompt is its own literal so that the
# trailing spaces and blank lines it contains are visible in the source.
version_1_template = (
    "Summarize the key points of the text provided between the []. The summary MUST be concise and shorter than {max_words} words. Also, the output should be in the following structure: \n"
    "\n"
    "Title: <the title>\n"
    "\n"
    "<the summary>.  \n"
    "----------------------\n"
    "The text to summarize: [{input_text}]\n"
    "\n"
    "The concise summary:\n"
)

# Version-1 summarizer: the word-limited wrapper with the version-1 prompt and
# a summary budget of 0.6 x the input's word count.
version_1_sum = OpenaiCompletionWordLimit(
    prompt_template=version_1_template,
    max_word_ratio=0.6,
    max_tokens=200,
    temperature=0.0,
    api_version="2023-05-15",
    model_name="text-davinci-003",
)

In [12]:
# Try the summarizer on one news article and print the result.
sample_article = "(CNN)A mammoth fire broke out Friday morning in a Kentucky industrial park, sending plumes of thick smoke over the area as authorities worked to contain the damage. The blaze began shortly before 7 a.m. at the General Electric Appliance Park in Louisville, according to Mike Weimer from the city's emergency management agency. He said that there were no reports of anyone injured or trapped. Video showed both smoke and bright orange flames. Firefighters took up positions around the affected buildings, spraying water from the periphery. Weimer told CNN that authorities didn't know what had caused the fire, which had gone to at least four alarms. According to a GE website, its facility in the Louisville Appliance Park is \"revitalizing manufacturing in the United States.\" The park is large, such that 34 football fields could fit in one of its warehouses in the facility."
output = version_1_sum.call_llm(sample_article)
print(output)

Title: Fire Breaks Out in Kentucky Industrial Park

A mammoth fire broke out Friday morning in a Kentucky industrial park, with no reports of anyone injured or trapped. Firefighters sprayed water from the periphery of the affected buildings, and the cause of the fire is unknown. The park is large, with one of its warehouses being able to fit 34 football fields.


### Apply to the dataset

In [16]:
# Name used both as the new column in `data` and as the key in `conf_json`.
model_name = 'version_1_summarizer'

In [184]:
# Completion API: summarize every row's text and store the result under `model_name`.
data[model_name] = data['text'].map(version_1_sum.call_llm)

In [185]:
# First row's summary from the existing "current update" summarizer, for comparison.
data['current_update_summarizer'].iloc[0]

"A large fire broke out Friday morning in the General Electric Appliance Park in Louisville, Kentucky. No injuries or people trapped were reported. Video showed smoke and orange flames. Firefighters were spraying water from the periphery to contain the blaze, which had gone to at least four alarms. The Louisville Appliance Park is a large facility owned by GE, which is 'revitalizing manufacturing in the United States'. It is so large that 34 football fields could fit in one of its warehouses."

In [186]:
# First row's summary from the version-1 summarizer.
data['version_1_summarizer'].iloc[0]

'Title: Fire Breaks Out in Kentucky Industrial Park\n\nA mammoth fire broke out Friday morning in a Kentucky industrial park, with no reports of anyone injured or trapped. Firefighters sprayed water from the periphery of the affected buildings, and the cause of the fire is unknown. The park is large, with one of its warehouses being able to fit 34 football fields.'

## Save data file

In [187]:
# Save the dataset back to the file it was loaded from. `json_path` is only the
# containing directory, so the file name must be appended; passing the
# directory itself to to_json cannot write the dataset file.
data.to_json(f'{json_path}/{data_file_name}', orient='index', indent=4, force_ascii=True)

## Update config file

In [17]:
# Record (or overwrite) this summarizer's settings under its name.
conf_json[model_name] = version_1_sum.get_config()

In [21]:
# Display the whole config dict to check the entries before writing it to disk.
conf_json

{'current_doc_summarizer': {'model_name': 'text-davinci-003',
  'api_version': '2023-05-15',
  'temperature': 0.0,
  'max_tokens': 200,
  'prompt_template': 'summarize the following text: {input_text}'},
 'current_update_summarizer': {'model_name': 'text-davinci-003',
  'api_version': '2023-05-15',
  'temperature': 0.0,
  'max_tokens': 200,
  'prompt_template': 'summarize the following text: {input_text}'},
 'version_1_summarizer': {'model_name': 'text-davinci-003',
  'api_version': '2023-05-15',
  'temperature': 0.0,
  'max_tokens': 200,
  'prompt_template': 'Summarize the key points of the text provided between the []. The summary MUST be concise and shorter than {max_words} words. Also, the output should be in the following structure: \n\nTitle: <the title>\n\n<the summary>.  \n----------------------\nThe text to summarize: [{input_text}]\n\nThe concise summary:\n',
  'max_word_ratio': 0.6}}

In [23]:
# Print the stored template so its line breaks render instead of showing as \n.
stored_template = conf_json[model_name]['prompt_template']
print(stored_template)

Summarize the key points of the text provided between the []. The summary MUST be concise and shorter than {max_words} words. Also, the output should be in the following structure: 

Title: <the title>

<the summary>.  
----------------------
The text to summarize: [{input_text}]

The concise summary:


### Update the config JSON file

In [22]:
# Serialize before opening the file: open(..., 'w') truncates immediately, so a
# value that json cannot encode would otherwise leave the existing config file
# empty or half-written. The text produced is the same as json.dump would write.
conf_text = json.dumps(conf_json, ensure_ascii=False, indent=4)
with open(f'{json_path}/{conf_file_name}', 'w', encoding='utf-8') as f:
    f.write(conf_text)