In [2]:
import pandas as pd
import json

### Read data and config

In [87]:
# Directory holding the summarization dataset and its config (relative to the notebook's working dir).
json_path = 'data/summarization/text_files'
# Dataset file: loaded into `data` below and overwritten by the save cell at the end.
data_file_name = 'txt_dataset_summarizer.json'
# Config file: loaded into `conf_json` below and overwritten by the save cell at the end.
conf_file_name = 'txt_dataset_summarizer_config.json'

In [294]:
# Load the dataset; the JSON file is keyed by row index (orient='index').
data = pd.read_json(f'{json_path}/{data_file_name}', orient='index')

# Read the config explicitly as UTF-8: the save cell writes it with encoding='utf-8'
# and ensure_ascii=False, so relying on the platform default encoding here could
# fail (or garble text) when a prompt template contains non-ASCII characters.
with open(f'{json_path}/{conf_file_name}', encoding='utf-8') as f:
    conf_json = json.load(f)

data.head(3)

Unnamed: 0,text,word_count,golden,notion,grammarly,current_doc_summarizier,current_update_summarizer,version_1_summarizer,version_2_summarizer,version_3_summarizer
0,(CNN)A mammoth fire broke out Friday morning i...,143,Fire breaks out at the General Electric Applia...,A massive fire broke out at the General Electr...,A massive fire broke out at the General Electr...,A large fire broke out Friday morning in the G...,A large fire broke out Friday morning in the G...,Title: Fire Breaks Out in Kentucky Industrial ...,Title: Fire Breaks Out in Kentucky Industrial ...,A fire broke out Friday morning in a Kentucky ...
1,"Hull, Leicester and Swansea City are following...",147,Hull's chief scout Stan Ternent has watched M...,"Hull, Leicester, and Swansea City are interest...","Hull, Leicester, and Swansea City are interest...","Maciej Rybus, a 25-year-old Polish internation...","Maciej Rybus, a 25-year-old Polish internation...","Title: Maciej Rybus Followed by Hull, Leiceste...","Title: Maciej Rybus Followed by Hull, Leiceste...","Hull, Leicester and Swansea City are intereste..."
2,Sportsmail have teamed up with Golfbidder to o...,123,Sportsmail have teamed up with Golfbidder for ...,Sportsmail and Golfbidder have partnered to of...,Sportsmail and Golfbidder are running a compet...,Sportsmail have partnered with Golfbidder to o...,Sportsmail and Golfbidder have teamed up to of...,Title: Win a Callaway Golf Prize Bundle Worth ...,Title: Win a Callaway Golf Prize Bundle Worth ...,One lucky reader has the chance to win a bundl...


# Building new prompts

In [9]:
import openai
import os
from dotenv import load_dotenv

# Load variables from a .env file (if any), then read the Azure OpenAI credentials
# from the environment and configure the module-level openai client with them.
load_dotenv()

azure_openai_key = os.getenv('AZURE_OPENAI_KEY')
azure_openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')

openai.api_type = "azure"
openai.api_base = azure_openai_endpoint
openai.api_key = azure_openai_key

In [10]:
class OpenaiCompletionBase:
    """Template-driven wrapper around ``openai.Completion.create``.

    The prompt sent to the model is ``prompt_template.format(input_text=...)``,
    so the template is expected to contain an ``{input_text}`` placeholder.

    Args:
        model_name: Passed to the API as ``engine``.
        api_version: Passed to the API as ``api_version``.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API.
        prompt_template: ``str.format`` template used to build the prompt.
    """

    def __init__(self, model_name="text-davinci-003",
                 api_version="2023-05-15", temperature=0.0, max_tokens=200, prompt_template=""):
        self.model_name = model_name
        self.api_version = api_version
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.prompt_template = prompt_template

    def _get_response(self, **input_args):
        """Format the prompt template with ``input_args`` and return the raw API response.

        Raises:
            KeyError: If the template references a placeholder missing from ``input_args``.
        """
        prompt = self.prompt_template.format(**input_args)
        response = openai.Completion.create(
            engine=self.model_name,
            api_version=self.api_version,
            prompt=prompt,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
        )

        return response

    @staticmethod
    def _extract_text(response):
        """Return the text of the first choice in ``response``, stripped of surrounding whitespace."""
        return response["choices"][0]["text"].strip()

    def call_llm(self, input_text):
        """Run the model on ``input_text`` and return only the generated text."""
        response = self._get_response(input_text=input_text)
        return self._extract_text(response)

    def call_llm_with_res(self, input_text):
        """Run the model on ``input_text`` and return ``(generated_text, raw_response)``."""
        response = self._get_response(input_text=input_text)
        return self._extract_text(response), response

    def get_config(self):
        """Return a snapshot of the instance's settings as a plain dict.

        A shallow copy is returned rather than ``self.__dict__`` itself, so a
        config stored elsewhere (e.g. in a config JSON) does not change when
        the instance is later modified, and editing the stored config does not
        alter the instance.
        """
        return dict(self.__dict__)


class OpenaiCompletionWordLimit(OpenaiCompletionBase):
    """Completion wrapper whose prompt also receives a word budget.

    The budget ``max_words`` is the whitespace-separated word count of the
    input multiplied by ``max_word_ratio`` and rounded to an integer. The
    prompt template is formatted with both ``{max_words}`` and ``{input_text}``.
    """

    def __init__(self, model_name="text-davinci-003",
                 api_version="2023-05-15", temperature=0.0, max_tokens=200, prompt_template="", max_word_ratio=0.6):
        super().__init__(model_name, api_version, temperature, max_tokens, prompt_template)
        self.max_word_ratio = max_word_ratio

    def _prompt_args(self, input_text):
        """Build the template arguments (word budget + text) for ``input_text``."""
        n_words = len(input_text.split())
        budget = int(round(n_words * self.max_word_ratio))
        return {"max_words": budget, "input_text": input_text}

    def call_llm(self, input_text):
        """Return the stripped text of the first completion choice."""
        raw = self._get_response(**self._prompt_args(input_text))
        return raw["choices"][0]["text"].strip()

    def call_llm_with_res(self, input_text):
        """Return ``(stripped_text, raw_response)`` for the first completion choice."""
        raw = self._get_response(**self._prompt_args(input_text))
        return raw["choices"][0]["text"].strip(), raw


In [13]:
# test_template = """summarize the following text: {input_text}"""
# 
# current_update_sum = OpenaiCompletionBase(
#     model_name="text-davinci-003",
#     api_version="2023-05-15",
#     temperature=0.0,
#     max_tokens=200,
#     prompt_template=test_template
# )

In [186]:
# Prompt v1: asks for a "Title: ..." line followed by the summary, with a word cap.
# {max_words} and {input_text} are filled in via str.format by the summarizer class.
# Note: the literal \n escapes inside the triple-quoted string add extra newlines on
# top of the physical line breaks, so the Title line is surrounded by blank lines.
version_1_template = """Summarize the key points of the text provided between the []. The summary MUST be concise and shorter than {max_words} words. Also, the output should be in the following structure: 
\nTitle: <the title>\n
<the summary>.  
----------------------
The text to summarize: [{input_text}]

The concise summary:
"""

# max_word_ratio=0.6: the word cap is 60% of the input's whitespace-separated word count.
version_1_sum = OpenaiCompletionWordLimit(
    model_name="text-davinci-003",
    api_version="2023-05-15",
    temperature=0.0,
    max_tokens=200,
    prompt_template=version_1_template,
    max_word_ratio=0.6
)

In [260]:
# Prompt v2: more detailed instructions, still requesting a "Title: ..." line plus summary
# and a word cap. {max_words} and {input_text} are filled in via str.format by the
# summarizer class. Unlike v1, the input text is not wrapped in [].
version_2_template = """Using your deep linguistic understanding and knowledge, condense the essence of the provided content.\n Retain the core ideas, crucial details, and significant context while omitting any extraneous information.\n Please deliver a concise and coherent summary of the following text.\n Respond only in string using less than {max_words} words.\n The output MUST be in the following structure:\nTitle: <the title>\n<the summary>.\n----------------------
\nThe text to summarize: {input_text}\nThe summary:\n"""

# max_word_ratio=0.75: the word cap is 75% of the input's whitespace-separated word count.
version_2_sum = OpenaiCompletionWordLimit(
    model_name="text-davinci-003",
    api_version="2023-05-15",
    temperature=0.0,
    max_tokens=512,
    prompt_template=version_2_template,
    max_word_ratio=0.75
)

In [285]:
# Prompt v3: same instructions as v2 but with no word cap and no required Title structure;
# only {input_text} is filled in, so the base (non word-limited) summarizer class is used.
version_3_template = """Using your deep linguistic understanding and knowledge, condense the essence of the provided content. \n        Retain the core ideas, crucial details, and significant context while omitting any extraneous information.\n        Please deliver a concise and coherent summary of the following text of the user.\n        Respond only in string.\n        summarize this {input_text}"""

version_3_sum = OpenaiCompletionBase(
    model_name="text-davinci-003",
    api_version="2023-05-15",
    temperature=0.0,
    max_tokens=512,
    prompt_template=version_3_template,
)

### Apply the summarizer to the dataset

In [287]:
# Name under which this run is stored: used below both as the new column in `data`
# and as the key in `conf_json`. Keep it in sync with the summarizer object being applied.
model_name = "version_3_summarizer"

In [288]:
# Completion API: run the version-3 summarizer on every row's `text` (one API request
# per row) and store the generated summaries in the `model_name` column of `data`.
data[model_name] = data['text'].apply(version_3_sum.call_llm)

In [289]:
# Reference output: the version-1 summary of the first row, for side-by-side comparison.
print(data.iloc[0]["version_1_summarizer"])

Title: Fire Breaks Out in Kentucky Industrial Park

A mammoth fire broke out Friday morning in a Kentucky industrial park, with no reports of anyone injured or trapped. Firefighters sprayed water from the periphery of the affected buildings, and the cause of the fire is unknown. The park is large, with one of its warehouses being able to fit 34 football fields.


In [290]:
# Summary produced for the first row by the summarizer that was just applied.
print(data[model_name].iloc[0])

A fire broke out Friday morning in a Kentucky industrial park, sending plumes of smoke over the area. No injuries or trapped people were reported. Firefighters sprayed water from the periphery of the affected buildings. The cause of the fire is unknown, and it had gone to at least four alarms. The Louisville Appliance Park is owned by General Electric and is "revitalizing manufacturing in the United States," with one of its warehouses being large enough to fit 34 football fields.


In [73]:
# data = data.drop('larium_devinci_summarizer',axis=1)

### Update config

In [295]:
# Record the settings used for this run under the same name as the new data column.
conf_json[model_name] = version_3_sum.get_config()

In [296]:
# Display the full config dict to check the newly added entry.
conf_json

{'current_doc_summarizer': {'model_name': 'text-davinci-003',
  'api_version': '2023-05-15',
  'temperature': 0.0,
  'max_tokens': 200,
  'prompt_template': 'summarize the following text: {input_text}'},
 'current_update_summarizer': {'model_name': 'text-davinci-003',
  'api_version': '2023-05-15',
  'temperature': 0.0,
  'max_tokens': 200,
  'prompt_template': 'summarize the following text: {input_text}'},
 'version_1_summarizer': {'model_name': 'text-davinci-003',
  'api_version': '2023-05-15',
  'temperature': 0.0,
  'max_tokens': 200,
  'prompt_template': 'Summarize the key points of the text provided between the []. The summary MUST be concise and shorter than {max_words} words. Also, the output should be in the following structure: \n\nTitle: <the title>\n\n<the summary>.  \n----------------------\nThe text to summarize: [{input_text}]\n\nThe concise summary:\n',
  'max_word_ratio': 0.6},
 'version_2_summarizer': {'model_name': 'text-davinci-003',
  'api_version': '2023-05-15',
 

## Save data file and config JSON

In [297]:
# Persist results: overwrite the dataset and config files with the updated versions.
data_out_path = f"{json_path}/{data_file_name}"
conf_out_path = f'{json_path}/{conf_file_name}'

data.to_json(data_out_path, orient='index', indent=4, force_ascii=True)

with open(conf_out_path, 'w', encoding='utf-8') as f:
    json.dump(conf_json, f, ensure_ascii=False, indent=4)