# Title
[]()

In [1]:

import pandas as pd
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
from silvhua import *

In [104]:
# set the option to wrap text within cells
pd.set_option('display.max_colwidth', 100)
# pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Set up for summarization

In [2]:
qna_dict = dict()
chatbot_dict = dict()
simple_summaries_dict = dict()
relevance_dict = dict()
save = True
# save_outputs = False
save_outputs = True

# Copied from [previous notebook](2023-07-11%20create%20summaries%20table.ipynb)

In [None]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship


Base = declarative_base()

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)
    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    original_headline = mapped_column(String(255))
    simple_summary = mapped_column(Text)
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=5):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi']
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                # if table == 'summaries':

                    # Check if prompt already exists in the database
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        original_headline=row['headline'],
                        simple_summary=row['simple_summary'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

import pandas as pd
import sys
import os
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
from file_functions import *
from response_processing import *
import time
import pytz
import re
from itertools import product
import openai


class Chaining:
    def __init__(self, text_id, title, text, folder_path, system_role="You are a helpful assistant.", 
            model="gpt-3.5-turbo", temperature=0.7, max_tokens=9000, 
        ):
        self.reference_id = text_id
        self.title = title
        self.text = text
        self.folder = re.sub(r'(?:.*\/)?(.*\/.*)\/?$', r'\1', folder_path)
        self.system_role = system_role
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.model = model
        print(f'***OpenAI model: {self.model}')

    def create_prompt(self, task, text):
        system_role = f'{self.system_role}'
        user_input = f"""Given the following text delimited by triple backticks: ```{text}``` \n {task}"""
        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_input},]

        print('\tDone creating prompt')
        return messages

    def gpt(self, messages, n_choices, temperature, model=None):
        model = self.model if model == None else model
        print(f'\tSending request to {model}')
        print(f'\t\tRequesting {n_choices} choices using {model}')
        openai.api_key = os.getenv('api_openai')
        response = openai.ChatCompletion.create(
            model=model, messages=messages, 
            temperature=temperature, 
            max_tokens=self.max_tokens,
            n=n_choices
            )
        print('\tDone sending request to GPT-3')
        return response

    def summarize(
            self, task, prep_step, edit_task, simplify_task, simplify_audience,
            format_task,
            n_choices=5, task_first=True):
        if task_first == True:
            full_task = f'{task}\n\n{prep_step}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        else:
            full_task = f'{prep_step}\n\n{task}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        prompt = self.create_prompt(full_task, self.text)
        firstline_pattern = r'\s?(\S*)(\n*)(.+)'
        title = re.match(firstline_pattern, self.text)[0]
        self.qna = dict() 
        self.qna['timestamp'] = str(datetime.now(pytz.timezone('Canada/Pacific')))
        self.qna['reference_id'] = self.reference_id
        self.qna['article_title'] = self.title
        self.qna['text'] = self.text
        self.qna['system_role'] = self.system_role
        self.qna['model'] = self.model
        self.qna['prep_step'] = prep_step.strip()
        self.qna['summarize_task'] = task.strip()
        self.qna['edit_task'] = edit_task.strip()
        self.qna['simplify_task'] = simplify_task.strip()
        self.qna['simplify_audience'] = simplify_audience.strip()
        self.qna['format_task'] = format_task.strip()
        self.qna['full_summarize_task'] = full_task.strip()
        self.qna['folder'] = self.folder
        self.summaries_dict = dict()
        self.article_title = title
        self.response_regex = r'response_(.*)'
        self.simple_summary_dict = dict()
        self.relevance_dict = dict()
        self.n_previous_prompts = dict()

        try:
            response = self.gpt(prompt, n_choices=n_choices, temperature=self.temperature)
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**API request failed for `.summarize()`**')
            return self.qna
        try:
            for index, choice in enumerate(response.choices):
                self.summaries_dict[f'response_{"{:02d}".format(index+1)}'] = choice["message"]["content"]
            self.qna.setdefault('summary', [])
            self.qna['summary'].extend([value for value in self.summaries_dict.values()])
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**Error with response parsing**')
    
def batch_summarize(sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chaining_bot_dict, iteration_id, task_first=True,
    system_role=None, model='gpt-3.5-turbo', max_tokens=1000, temperature=0.7, pause_per_request=0, n_choices=5,
    save_outputs=False
    ):
    prompts_df = pd.DataFrame(product(prep_step, summarize_task, edit_task, simplify_task, simplify_audience, format_task), 
        columns=['prep_step', 'summarize_task', 'edit_task', 'simplify_task', 'simplify_audience', 'format_task'])

    chaining_bot_dict[iteration_id] = dict()
    def summarize_from_df_row(text_id, title, text, chaining_bot_dict):
        for index in prompts_df.index:
            print(f'**Text #{text_id} prompt #{index+1} of {prompts_df.index.max()+1}**')
            task = prompts_df.loc[index, 'summarize_task']
            prep_step = prompts_df.loc[index, 'prep_step']
            edit_task = prompts_df.loc[index, 'edit_task']
            simplify_task = prompts_df.loc[index, 'simplify_task']
            simplify_audience = prompts_df.loc[index, 'simplify_audience']
            format_task = prompts_df.loc[index, 'format_task']
            try:
                print('Creating Chaining class instance')
                chatbot = Chaining(
                    text_id, title, text, folder_path=folder_path, system_role=system_role, 
                    model=model, max_tokens=max_tokens, temperature=temperature)
                print('Chaining class instance created')
                chatbot.summarize(
                    task=task, prep_step=prep_step, edit_task=edit_task, 
                    simplify_task=simplify_task, simplify_audience=simplify_audience,
                    format_task=format_task, n_choices=n_choices, task_first=task_first
                    )
                chaining_bot_dict[iteration_id][f'{text_id}_prompt{"{:02d}".format(index)}'] = chatbot
                print('\t...Completed')
                if pause_per_request > 0:
                    print(f'[batch_summarize()] Sleeping {pause_per_request} sec to avoid exceeding API rate limit')
                    time.sleep(pause_per_request) # Account for API rate limit of 3 API requests/limit 
            except Exception as error:
                exc_type, exc_obj, tb = sys.exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                file = f.f_code.co_filename
                print("An error occurred on line", lineno, "in", file, ":", error)
                print('\t...Error making chatbot request')
                break
    sources_df.apply(lambda row: summarize_from_df_row(row['id'], row['title'], row['text'], chaining_bot_dict), axis=1)
    
    if save_outputs:
        try:
            save_instance_to_dict(
                chaining_bot_dict[iteration_id], 
                description=f'batch_Chaining_attributes_initial',
                ext=None, json_path=folder_path
                )
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            file = f.f_code.co_filename
            print(f'An error occurred on line {lineno} in {file}: {error}')
            print('[batch_summarize_chain()] Unable to save API response')

    return chaining_bot_dict

def create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=None, 
    ):
    """
    Create DataFrame from initial ChatGPT summaries.
    """
    dfs_list = []
    chatbot_id = iteration_id if chatbot_id == None else chatbot_id
    for chatbot_key in chatbot_dict[chatbot_id].keys():
        print(f'Processing {chatbot_key}...')
        dfs_list.append(pd.DataFrame(
            chatbot_dict[chatbot_id][chatbot_key].qna, 
            index=[choice for choice in range(1, len(chatbot_dict[chatbot_id][chatbot_key].qna['summary'])+1)])
            )
    
    qna_df = pd.concat(dfs_list).reset_index(names=['choice'])
    qna_df = extract_summary(qna_df, 'summary')
    columns = qna_df.columns.tolist()
    columns.remove('choice')
    columns.insert(3, 'choice') # Move 'choice' column

    # qna_df['date'] = pd.Series('2023-06-12', index=qna_df.index)
    # columns.insert(0, 'date')


    qna_dict[iteration_id] = qna_df[columns]
    print(f'Original summaries DataFrame shape: {qna_df.shape}')
    print(f'\tOriginal summaries Dataframe columns: {qna_df.columns}')
    return qna_dict


import json
def extract_summary(df, summary_column='summary'):
    # Convert the string to JSON
    df[summary_column] = df[summary_column].apply(json.loads)

    # Extract 'headline' and 'body' values
    df['headline'] = df[summary_column].apply(lambda x: x['headline'])
    df['simple_summary'] = df[summary_column].apply(lambda x: x['audience'])
    df[summary_column] = df[summary_column].apply(lambda x: x['body'])

    return df


# Set parameters
iteration_id = 1.5
n_choices = 2
pause_per_request=0
# summary_iteration_id = iteration_id
chatbot_id = iteration_id
model = 'gpt-3.5-turbo-16k-0613'
save_outputs=True
folder_path = '../text/2023-07-11 for db'

## Add rows from references dataframe
# bulk_append(references_df)


sources_df = get_table(table='sources', limit=3)

chaining_dict = batch_summarize(
    sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chatbot_dict,
    system_role=system_role, model=model, max_tokens=1000,
    n_choices=n_choices, pause_per_request=pause_per_request,
    iteration_id=iteration_id, save_outputs=save_outputs
    )
# # chaining_dict[iteration_id]
qna_dict = create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
    )
# Add rows from results to summaries and prompts table
bulk_append(qna_dict[iteration_id])
qna_dict[iteration_id]


# Update the SQLAlchemy objects

In [17]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship


Base = declarative_base()

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) #
    simple_summary = mapped_column(Text)
    # rating_simple_content = mapped_column(Integer) #
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi']
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                # if table == 'summaries':

                    # Check if prompt already exists in the database
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        original_headline=row['headline'],
                        simple_summary=row['simple_summary'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

import pandas as pd
import sys
import os
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
from file_functions import *
from response_processing import *
import time
import pytz
import re
from itertools import product
import openai


class Chaining:
    def __init__(self, text_id, title, text, folder_path, system_role="You are a helpful assistant.", 
            model="gpt-3.5-turbo", temperature=0.7, max_tokens=9000, 
        ):
        self.reference_id = text_id
        self.title = title
        self.text = text
        self.folder = re.sub(r'(?:.*\/)?(.*\/.*)\/?$', r'\1', folder_path)
        self.system_role = system_role
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.model = model
        print(f'***OpenAI model: {self.model}')

    def create_prompt(self, task, text):
        system_role = f'{self.system_role}'
        user_input = f"""Given the following text delimited by triple backticks: ```{text}``` \n {task}"""
        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_input},]

        print('\tDone creating prompt')
        return messages

    def gpt(self, messages, n_choices, temperature, model=None):
        model = self.model if model == None else model
        print(f'\tSending request to {model}')
        print(f'\t\tRequesting {n_choices} choices using {model}')
        openai.api_key = os.getenv('api_openai')
        response = openai.ChatCompletion.create(
            model=model, messages=messages, 
            temperature=temperature, 
            max_tokens=self.max_tokens,
            n=n_choices
            )
        print('\tDone sending request to GPT-3')
        return response

    def summarize(
            self, task, prep_step, edit_task, simplify_task, simplify_audience,
            format_task,
            n_choices=5, task_first=True):
        if task_first == True:
            full_task = f'{task}\n\n{prep_step}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        else:
            full_task = f'{prep_step}\n\n{task}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        prompt = self.create_prompt(full_task, self.text)
        firstline_pattern = r'\s?(\S*)(\n*)(.+)'
        title = re.match(firstline_pattern, self.text)[0]
        self.qna = dict() 
        self.qna['timestamp'] = str(datetime.now(pytz.timezone('Canada/Pacific')))
        self.qna['reference_id'] = self.reference_id
        self.qna['article_title'] = self.title
        self.qna['text'] = self.text
        self.qna['system_role'] = self.system_role
        self.qna['model'] = self.model        
        self.qna['temperature'] = self.temperature
        self.qna['prep_step'] = prep_step.strip()
        self.qna['summarize_task'] = task.strip()
        self.qna['edit_task'] = edit_task.strip()
        self.qna['simplify_task'] = simplify_task.strip()
        self.qna['simplify_audience'] = simplify_audience.strip()
        self.qna['format_task'] = format_task.strip()
        self.qna['full_summarize_task'] = full_task.strip()
        self.qna['folder'] = self.folder
        self.summaries_dict = dict()
        self.article_title = title
        self.response_regex = r'response_(.*)'
        self.simple_summary_dict = dict()
        self.relevance_dict = dict()
        self.n_previous_prompts = dict()

        try:
            response = self.gpt(prompt, n_choices=n_choices, temperature=self.temperature)
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**API request failed for `.summarize()`**')
            return self.qna
        try:
            for index, choice in enumerate(response.choices):
                self.summaries_dict[f'response_{"{:02d}".format(index+1)}'] = choice["message"]["content"]
            self.qna.setdefault('summary', [])
            self.qna['summary'].extend([value for value in self.summaries_dict.values()])
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**Error with response parsing**')
    
def batch_summarize(sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chaining_bot_dict, iteration_id, task_first=True,
    system_role=None, model='gpt-3.5-turbo', max_tokens=1000, temperature=0.7, pause_per_request=0, n_choices=5,
    save_outputs=False
    ):
    prompts_df = pd.DataFrame(product(prep_step, summarize_task, edit_task, simplify_task, simplify_audience, format_task), 
        columns=['prep_step', 'summarize_task', 'edit_task', 'simplify_task', 'simplify_audience', 'format_task'])

    chaining_bot_dict[iteration_id] = dict()
    def summarize_from_df_row(text_id, title, text, chaining_bot_dict):
        for index in prompts_df.index:
            print(f'**Text #{text_id} prompt #{index+1} of {prompts_df.index.max()+1}**')
            task = prompts_df.loc[index, 'summarize_task']
            prep_step = prompts_df.loc[index, 'prep_step']
            edit_task = prompts_df.loc[index, 'edit_task']
            simplify_task = prompts_df.loc[index, 'simplify_task']
            simplify_audience = prompts_df.loc[index, 'simplify_audience']
            format_task = prompts_df.loc[index, 'format_task']
            try:
                print('Creating Chaining class instance')
                chatbot = Chaining(
                    text_id, title, text, folder_path=folder_path, system_role=system_role, 
                    model=model, max_tokens=max_tokens, temperature=temperature)
                print('Chaining class instance created')
                chatbot.summarize(
                    task=task, prep_step=prep_step, edit_task=edit_task, 
                    simplify_task=simplify_task, simplify_audience=simplify_audience,
                    format_task=format_task, n_choices=n_choices, task_first=task_first
                    )
                chaining_bot_dict[iteration_id][f'{text_id}_prompt{"{:02d}".format(index)}'] = chatbot
                print('\t...Completed')
                if pause_per_request > 0:
                    print(f'[batch_summarize()] Sleeping {pause_per_request} sec to avoid exceeding API rate limit')
                    time.sleep(pause_per_request) # Account for API rate limit of 3 API requests/limit 
            except Exception as error:
                exc_type, exc_obj, tb = sys.exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                file = f.f_code.co_filename
                print("An error occurred on line", lineno, "in", file, ":", error)
                print('\t...Error making chatbot request')
                break
    sources_df.apply(lambda row: summarize_from_df_row(row['id'], row['title'], row['text'], chaining_bot_dict), axis=1)
    
    if save_outputs:
        try:
            save_instance_to_dict(
                chaining_bot_dict[iteration_id], 
                description=f'batch_Chaining_attributes_initial',
                ext=None, json_path=folder_path
                )
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            file = f.f_code.co_filename
            print(f'An error occurred on line {lineno} in {file}: {error}')
            print('[batch_summarize_chain()] Unable to save API response')

    return chaining_bot_dict

def create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=None, 
    ):
    """
    Create DataFrame from initial ChatGPT summaries.
    """
    dfs_list = []
    chatbot_id = iteration_id if chatbot_id == None else chatbot_id
    for chatbot_key in chatbot_dict[chatbot_id].keys():
        print(f'Processing {chatbot_key}...')
        dfs_list.append(pd.DataFrame(
            chatbot_dict[chatbot_id][chatbot_key].qna, 
            index=[choice for choice in range(1, len(chatbot_dict[chatbot_id][chatbot_key].qna['summary'])+1)])
            )
    
    qna_df = pd.concat(dfs_list).reset_index(names=['choice'])
    qna_df = extract_summary(qna_df, 'summary')
    columns = qna_df.columns.tolist()
    columns.remove('choice')
    columns.insert(3, 'choice') # Move 'choice' column

    # qna_df['date'] = pd.Series('2023-06-12', index=qna_df.index)
    # columns.insert(0, 'date')


    qna_dict[iteration_id] = qna_df[columns]
    print(f'Original summaries DataFrame shape: {qna_df.shape}')
    print(f'\tOriginal summaries Dataframe columns: {qna_df.columns}')
    return qna_dict


import json
def extract_summary(df, summary_column='summary'):
    # Convert the string to JSON
    df[summary_column] = df[summary_column].apply(json.loads)

    # Extract 'headline' and 'body' values
    df['headline'] = df[summary_column].apply(lambda x: x['headline'])
    df['simple_summary'] = df[summary_column].apply(lambda x: x['audience'])
    df[summary_column] = df[summary_column].apply(lambda x: x['body'])

    return df


# Set parameters
iteration_id = 1.5
n_choices = 2
pause_per_request=0
# summary_iteration_id = iteration_id
chatbot_id = iteration_id
model = 'gpt-3.5-turbo-16k-0613'
save_outputs=True
folder_path = '../text/2023-07-11 for db'

summaries = get_table(table='summaries')


# sources_df = get_table(table='sources', limit=3)

# chaining_dict = batch_summarize(
#     sources_df, folder_path, prep_step, summarize_task, edit_task, 
#     simplify_task, simplify_audience, format_task,
#     chatbot_dict,
#     system_role=system_role, model=model, max_tokens=1000,
#     n_choices=n_choices, pause_per_request=pause_per_request,
#     iteration_id=iteration_id, save_outputs=save_outputs
#     )
# # # chaining_dict[iteration_id]
# qna_dict = create_summaries_df(
#     qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
#     )
# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])
# qna_dict[iteration_id]


Query: SELECT * from summaries


In [18]:
summaries

Unnamed: 0,id,timestamp,original_summary,original_headline,simple_summary,prompt_id,reference_id,choice,rating_original_content,rating_simple_content,model,temperature
0,2,2023-07-12 06:35:24.458418+00:00,A recent study compared the recovery response ...,New Research on Recovery from Exercise in Midd...,Check out this new research that shows how par...,5,1,1,3,3,gpt-3.5-turbo-16k-0613,0.7
1,1,2023-07-12 06:12:32.880097+00:00,A recent study compared the recovery response ...,New Study Shows How Exercise Impacts Muscle Re...,A recent study compared the recovery response ...,5,1,1,3,3,gpt-3.5-turbo-16k-0613,0.7
2,3,2023-07-12 06:35:24.458418+00:00,A recent study compared the recovery response ...,New Research Shows How Recreational Training C...,New research suggests that engaging in regular...,5,1,2,4,2,gpt-3.5-turbo-16k-0613,0.7
3,4,2023-07-12 06:35:28.609483+00:00,A recent study found that a high calcium and h...,New Study Shows Nutritional Intervention Reduc...,A recent study has shown that a simple dietary...,5,2,1,4,3,gpt-3.5-turbo-16k-0613,0.7
4,5,2023-07-12 06:35:28.609483+00:00,A recent study found that a high calcium and h...,New Study Shows Nutritional Intervention Can R...,A recent study has found that a simple dietary...,5,2,2,4,4,gpt-3.5-turbo-16k-0613,0.7
5,6,2023-07-12 06:35:33.993335+00:00,"Exercise snacks, which are short bouts of vigo...",New Research Shows Exercise Snacks Improve Hea...,New research has found that short bursts of vi...,5,3,1,3,3,gpt-3.5-turbo-16k-0613,0.7
6,7,2023-07-12 06:35:33.993335+00:00,"Exercise snacks, short bursts of vigorous exer...",Exercise Snacks: A Time-Efficient Way to Impro...,Exercise snacks are a convenient and time-effi...,5,3,2,3,4,gpt-3.5-turbo-16k-0613,0.7


# Available models

In [21]:
from orm_summarize import openai_models
models_available = openai_models(env="api_openai", query='gpt')

gpt-4-0613
gpt-4-0314
gpt-3.5-turbo-16k-0613
gpt-3.5-turbo-0613
gpt-3.5-turbo-16k
gpt-4
gpt-3.5-turbo-0301
gpt-3.5-turbo


In [22]:
models_available = openai_models(env="api_openai", query='4')

gpt-4-0613
gpt-4-0314
gpt-4


# Generate summaries with prompts imported

In [24]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) #
    simple_summary = mapped_column(Text)
    # rating_simple_content = mapped_column(Integer) #
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi']
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                # if table == 'summaries':

                    # Check if prompt already exists in the database
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        original_headline=row['headline'],
                        simple_summary=row['simple_summary'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

import pandas as pd
import sys
import os
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
from file_functions import *
from response_processing import *
import time
import pytz
import re
from itertools import product
import openai
from prompts import *

class Chaining:
    def __init__(self, text_id, title, text, folder_path, system_role="You are a helpful assistant.", 
            model="gpt-3.5-turbo", temperature=0.7, max_tokens=9000, 
        ):
        self.reference_id = text_id
        self.title = title
        self.text = text
        self.folder = re.sub(r'(?:.*\/)?(.*\/.*)\/?$', r'\1', folder_path)
        self.system_role = system_role
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.model = model
        print(f'***OpenAI model: {self.model}')

    def create_prompt(self, task, text):
        system_role = f'{self.system_role}'
        user_input = f"""Given the following text delimited by triple backticks: ```{text}``` \n {task}"""
        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_input},]

        print('\tDone creating prompt')
        return messages

    def gpt(self, messages, n_choices, temperature, model=None):
        model = self.model if model == None else model
        print(f'\tSending request to {model}')
        print(f'\t\tRequesting {n_choices} choices using {model}')
        openai.api_key = os.getenv('api_openai')
        response = openai.ChatCompletion.create(
            model=model, messages=messages, 
            temperature=temperature, 
            max_tokens=self.max_tokens,
            n=n_choices
            )
        print('\tDone sending request to GPT-3')
        return response

    def summarize(
            self, task, prep_step, edit_task, simplify_task, simplify_audience,
            format_task,
            n_choices=5, task_first=True):
        if task_first == True:
            full_task = f'{task}\n\n{prep_step}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        else:
            full_task = f'{prep_step}\n\n{task}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        prompt = self.create_prompt(full_task, self.text)
        firstline_pattern = r'\s?(\S*)(\n*)(.+)'
        title = re.match(firstline_pattern, self.text)[0]
        self.qna = dict() 
        self.qna['timestamp'] = str(datetime.now(pytz.timezone('Canada/Pacific')))
        self.qna['reference_id'] = self.reference_id
        self.qna['article_title'] = self.title
        self.qna['text'] = self.text
        self.qna['system_role'] = self.system_role
        self.qna['model'] = self.model        
        self.qna['temperature'] = self.temperature
        self.qna['prep_step'] = prep_step.strip()
        self.qna['summarize_task'] = task.strip()
        self.qna['edit_task'] = edit_task.strip()
        self.qna['simplify_task'] = simplify_task.strip()
        self.qna['simplify_audience'] = simplify_audience.strip()
        self.qna['format_task'] = format_task.strip()
        self.qna['full_summarize_task'] = full_task.strip()
        self.qna['folder'] = self.folder
        self.summaries_dict = dict()
        self.article_title = title
        self.response_regex = r'response_(.*)'
        self.simple_summary_dict = dict()
        self.relevance_dict = dict()
        self.n_previous_prompts = dict()

        try:
            response = self.gpt(prompt, n_choices=n_choices, temperature=self.temperature)
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**API request failed for `.summarize()`**')
            return self.qna
        try:
            for index, choice in enumerate(response.choices):
                self.summaries_dict[f'response_{"{:02d}".format(index+1)}'] = choice["message"]["content"]
            self.qna.setdefault('summary', [])
            self.qna['summary'].extend([value for value in self.summaries_dict.values()])
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**Error with response parsing**')
    
def batch_summarize(sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chaining_bot_dict, iteration_id, task_first=True,
    system_role=None, model='gpt-3.5-turbo', max_tokens=1000, temperature=0.7, pause_per_request=0, n_choices=5,
    save_outputs=False
    ):
    prompts_df = pd.DataFrame(product(prep_step, summarize_task, edit_task, simplify_task, simplify_audience, format_task), 
        columns=['prep_step', 'summarize_task', 'edit_task', 'simplify_task', 'simplify_audience', 'format_task'])

    chaining_bot_dict[iteration_id] = dict()
    def summarize_from_df_row(text_id, title, text, chaining_bot_dict):
        for index in prompts_df.index:
            print(f'**Text #{text_id} prompt #{index+1} of {prompts_df.index.max()+1}**')
            task = prompts_df.loc[index, 'summarize_task']
            prep_step = prompts_df.loc[index, 'prep_step']
            edit_task = prompts_df.loc[index, 'edit_task']
            simplify_task = prompts_df.loc[index, 'simplify_task']
            simplify_audience = prompts_df.loc[index, 'simplify_audience']
            format_task = prompts_df.loc[index, 'format_task']
            try:
                print('Creating Chaining class instance')
                chatbot = Chaining(
                    text_id, title, text, folder_path=folder_path, system_role=system_role, 
                    model=model, max_tokens=max_tokens, temperature=temperature)
                print('Chaining class instance created')
                chatbot.summarize(
                    task=task, prep_step=prep_step, edit_task=edit_task, 
                    simplify_task=simplify_task, simplify_audience=simplify_audience,
                    format_task=format_task, n_choices=n_choices, task_first=task_first
                    )
                chaining_bot_dict[iteration_id][f'{text_id}_prompt{"{:02d}".format(index)}'] = chatbot
                print('\t...Completed')
                if pause_per_request > 0:
                    print(f'[batch_summarize()] Sleeping {pause_per_request} sec to avoid exceeding API rate limit')
                    time.sleep(pause_per_request) # Account for API rate limit of 3 API requests/limit 
            except Exception as error:
                exc_type, exc_obj, tb = sys.exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                file = f.f_code.co_filename
                print("An error occurred on line", lineno, "in", file, ":", error)
                print('\t...Error making chatbot request')
                break
    sources_df.apply(lambda row: summarize_from_df_row(row['id'], row['title'], row['text'], chaining_bot_dict), axis=1)
    
    if save_outputs:
        try:
            save_instance_to_dict(
                chaining_bot_dict[iteration_id], 
                description=f'batch_Chaining_attributes_initial',
                ext=None, json_path=folder_path
                )
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            file = f.f_code.co_filename
            print(f'An error occurred on line {lineno} in {file}: {error}')
            print('[batch_summarize_chain()] Unable to save API response')

    return chaining_bot_dict

def create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=None, 
    ):
    """
    Create DataFrame from initial ChatGPT summaries.
    """
    dfs_list = []
    chatbot_id = iteration_id if chatbot_id == None else chatbot_id
    for chatbot_key in chatbot_dict[chatbot_id].keys():
        print(f'Processing {chatbot_key}...')
        dfs_list.append(pd.DataFrame(
            chatbot_dict[chatbot_id][chatbot_key].qna, 
            index=[choice for choice in range(1, len(chatbot_dict[chatbot_id][chatbot_key].qna['summary'])+1)])
            )
    
    qna_df = pd.concat(dfs_list).reset_index(names=['choice'])
    qna_df = extract_summary(qna_df, 'summary')
    columns = qna_df.columns.tolist()
    columns.remove('choice')
    columns.insert(3, 'choice') # Move 'choice' column

    # qna_df['date'] = pd.Series('2023-06-12', index=qna_df.index)
    # columns.insert(0, 'date')


    qna_dict[iteration_id] = qna_df[columns]
    print(f'Original summaries DataFrame shape: {qna_df.shape}')
    print(f'\tOriginal summaries Dataframe columns: {qna_df.columns}')
    return qna_dict


import json
def extract_summary(df, summary_column='summary'):
    # Convert the string to JSON
    df[summary_column] = df[summary_column].apply(json.loads)

    # Extract 'headline' and 'body' values
    df['headline'] = df[summary_column].apply(lambda x: x['headline'])
    df['simple_summary'] = df[summary_column].apply(lambda x: x['audience'])
    df[summary_column] = df[summary_column].apply(lambda x: x['body'])

    return df


# Set parameters
iteration_id = 1
n_choices = 2
pause_per_request=0
# summary_iteration_id = iteration_id
chatbot_id = iteration_id
# model = 'gpt-3.5-turbo-16k-0613'
model = 'gpt-4'
save_outputs=True
folder_path = '../text/2023-07-11 for db'

# summaries = get_table(table='summaries')


sources_df = get_table(table='sources', limit=3)

chaining_dict = batch_summarize(
    sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chatbot_dict,
    system_role=system_role, model=model, max_tokens=1000,
    n_choices=n_choices, pause_per_request=pause_per_request,
    iteration_id=iteration_id, save_outputs=save_outputs
    )
# # chaining_dict[iteration_id]
qna_dict = create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
    )
# Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])
qna_dict[iteration_id]


Query: SELECT * from sources LIMIT 3
**Text #1 prompt #1 of 1**
Creating Chaining class instance
***OpenAI model: gpt-4
Chaining class instance created
	Done creating prompt
	Sending request to gpt-4
		Requesting 2 choices using gpt-4
An error occurred on line 246 in C:\Users\silvh\AppData\Local\Temp\ipykernel_23984\3007662155.py : Rate limit reached for 10KTPM-200RPM in organization org-4l8HUKDtXhH0T7iFErf1JSJg on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues.
	**API request failed for `.summarize()`**
	...Completed
**Text #2 prompt #1 of 1**
Creating Chaining class instance
***OpenAI model: gpt-4
Chaining class instance created
	Done creating prompt
	Sending request to gpt-4
		Requesting 2 choices using gpt-4
An error occurred on line 246 in C:\Users\silvh\AppData\Local\Temp\ipykernel_23984\3007662155.py : Rate limit reached for 10KTPM-200RPM in organization org-4l8HUKDtXhH0T7iFErf1JSJ

KeyError: 'summary'

# Iteration 1: Increase temperature

In [26]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) #
    simple_summary = mapped_column(Text)
    # rating_simple_content = mapped_column(Integer) #
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi']
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                # if table == 'summaries':

                    # Check if prompt already exists in the database
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        original_headline=row['headline'],
                        simple_summary=row['simple_summary'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

import pandas as pd
import sys
import os
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
from file_functions import *
from response_processing import *
import time
import pytz
import re
from itertools import product
import openai
from prompts import *

class Chaining:
    def __init__(self, text_id, title, text, folder_path, system_role="You are a helpful assistant.", 
            model="gpt-3.5-turbo", temperature=0.7, max_tokens=9000, 
        ):
        self.reference_id = text_id
        self.title = title
        self.text = text
        self.folder = re.sub(r'(?:.*\/)?(.*\/.*)\/?$', r'\1', folder_path)
        self.system_role = system_role
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.model = model
        print(f'***OpenAI model: {self.model}')

    def create_prompt(self, task, text):
        system_role = f'{self.system_role}'
        user_input = f"""Given the following text delimited by triple backticks: ```{text}``` \n {task}"""
        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_input},]

        print('\tDone creating prompt')
        return messages

    def gpt(self, messages, n_choices, temperature, model=None):
        model = self.model if model == None else model
        print(f'\tSending request to {model}')
        print(f'\t\tRequesting {n_choices} choices using {model}')
        openai.api_key = os.getenv('api_openai')
        response = openai.ChatCompletion.create(
            model=model, messages=messages, 
            temperature=temperature, 
            max_tokens=self.max_tokens,
            n=n_choices
            )
        print('\tDone sending request to GPT-3')
        return response

    def summarize(
            self, task, prep_step, edit_task, simplify_task, simplify_audience,
            format_task,
            n_choices=5, task_first=True):
        if task_first == True:
            full_task = f'{task}\n\n{prep_step}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        else:
            full_task = f'{prep_step}\n\n{task}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        prompt = self.create_prompt(full_task, self.text)
        firstline_pattern = r'\s?(\S*)(\n*)(.+)'
        title = re.match(firstline_pattern, self.text)[0]
        self.qna = dict() 
        self.qna['timestamp'] = str(datetime.now(pytz.timezone('Canada/Pacific')))
        self.qna['reference_id'] = self.reference_id
        self.qna['article_title'] = self.title
        self.qna['text'] = self.text
        self.qna['system_role'] = self.system_role
        self.qna['model'] = self.model        
        self.qna['temperature'] = self.temperature
        self.qna['prep_step'] = prep_step.strip()
        self.qna['summarize_task'] = task.strip()
        self.qna['edit_task'] = edit_task.strip()
        self.qna['simplify_task'] = simplify_task.strip()
        self.qna['simplify_audience'] = simplify_audience.strip()
        self.qna['format_task'] = format_task.strip()
        self.qna['full_summarize_task'] = full_task.strip()
        self.qna['folder'] = self.folder
        self.summaries_dict = dict()
        self.article_title = title
        self.response_regex = r'response_(.*)'
        self.simple_summary_dict = dict()
        self.relevance_dict = dict()
        self.n_previous_prompts = dict()

        try:
            response = self.gpt(prompt, n_choices=n_choices, temperature=self.temperature)
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**API request failed for `.summarize()`**')
            return self.qna
        try:
            for index, choice in enumerate(response.choices):
                self.summaries_dict[f'response_{"{:02d}".format(index+1)}'] = choice["message"]["content"]
            self.qna.setdefault('summary', [])
            self.qna['summary'].extend([value for value in self.summaries_dict.values()])
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**Error with response parsing**')
    
def batch_summarize(sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chaining_bot_dict, iteration_id, task_first=True,
    system_role=None, model='gpt-3.5-turbo', max_tokens=1000, temperature=0.7, pause_per_request=0, n_choices=5,
    save_outputs=False
    ):
    prompts_df = pd.DataFrame(product(prep_step, summarize_task, edit_task, simplify_task, simplify_audience, format_task), 
        columns=['prep_step', 'summarize_task', 'edit_task', 'simplify_task', 'simplify_audience', 'format_task'])

    chaining_bot_dict[iteration_id] = dict()
    def summarize_from_df_row(text_id, title, text, chaining_bot_dict):
        for index in prompts_df.index:
            print(f'**Text #{text_id} prompt #{index+1} of {prompts_df.index.max()+1}**')
            task = prompts_df.loc[index, 'summarize_task']
            prep_step = prompts_df.loc[index, 'prep_step']
            edit_task = prompts_df.loc[index, 'edit_task']
            simplify_task = prompts_df.loc[index, 'simplify_task']
            simplify_audience = prompts_df.loc[index, 'simplify_audience']
            format_task = prompts_df.loc[index, 'format_task']
            try:
                print('Creating Chaining class instance')
                chatbot = Chaining(
                    text_id, title, text, folder_path=folder_path, system_role=system_role, 
                    model=model, max_tokens=max_tokens, temperature=temperature)
                print('Chaining class instance created')
                chatbot.summarize(
                    task=task, prep_step=prep_step, edit_task=edit_task, 
                    simplify_task=simplify_task, simplify_audience=simplify_audience,
                    format_task=format_task, n_choices=n_choices, task_first=task_first
                    )
                chaining_bot_dict[iteration_id][f'{text_id}_prompt{"{:02d}".format(index)}'] = chatbot
                print('\t...Completed')
                if pause_per_request > 0:
                    print(f'[batch_summarize()] Sleeping {pause_per_request} sec to avoid exceeding API rate limit')
                    time.sleep(pause_per_request) # Account for API rate limit of 3 API requests/limit 
            except Exception as error:
                exc_type, exc_obj, tb = sys.exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                file = f.f_code.co_filename
                print("An error occurred on line", lineno, "in", file, ":", error)
                print('\t...Error making chatbot request')
                break
    sources_df.apply(lambda row: summarize_from_df_row(row['id'], row['title'], row['text'], chaining_bot_dict), axis=1)
    
    if save_outputs:
        try:
            save_instance_to_dict(
                chaining_bot_dict[iteration_id], 
                description=f'batch_Chaining_attributes_initial',
                ext=None, json_path=folder_path
                )
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            file = f.f_code.co_filename
            print(f'An error occurred on line {lineno} in {file}: {error}')
            print('[batch_summarize_chain()] Unable to save API response')

    return chaining_bot_dict

def create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=None, 
    ):
    """
    Create DataFrame from initial ChatGPT summaries.
    """
    dfs_list = []
    chatbot_id = iteration_id if chatbot_id == None else chatbot_id
    for chatbot_key in chatbot_dict[chatbot_id].keys():
        print(f'Processing {chatbot_key}...')
        dfs_list.append(pd.DataFrame(
            chatbot_dict[chatbot_id][chatbot_key].qna, 
            index=[choice for choice in range(1, len(chatbot_dict[chatbot_id][chatbot_key].qna['summary'])+1)])
            )
    
    qna_df = pd.concat(dfs_list).reset_index(names=['choice'])
    qna_df = extract_summary(qna_df, 'summary')
    columns = qna_df.columns.tolist()
    columns.remove('choice')
    columns.insert(3, 'choice') # Move 'choice' column

    # qna_df['date'] = pd.Series('2023-06-12', index=qna_df.index)
    # columns.insert(0, 'date')


    qna_dict[iteration_id] = qna_df[columns]
    print(f'Original summaries DataFrame shape: {qna_df.shape}')
    print(f'\tOriginal summaries Dataframe columns: {qna_df.columns}')
    return qna_dict


import json
def extract_summary(df, summary_column='summary'):
    # Convert the string to JSON
    df[summary_column] = df[summary_column].apply(json.loads)

    # Extract 'headline' and 'body' values
    df['headline'] = df[summary_column].apply(lambda x: x['headline'])
    df['simple_summary'] = df[summary_column].apply(lambda x: x['audience'])
    df[summary_column] = df[summary_column].apply(lambda x: x['body'])

    return df


# Set parameters
iteration_id = 1
article_limit = None
temperature = 1.5
n_choices = 2
pause_per_request=0
# summary_iteration_id = iteration_id
chatbot_id = iteration_id
model = 'gpt-3.5-turbo-16k-0613'
# model = 'gpt-4'
save_outputs=True
folder_path = '../text/2023-07-11 for db'

# summaries = get_table(table='summaries')


sources_df = get_table(table='sources', limit=article_limit)

chaining_dict = batch_summarize(
    sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chatbot_dict, temperature=temperature,
    system_role=system_role, model=model, max_tokens=1000,
    n_choices=n_choices, pause_per_request=pause_per_request,
    iteration_id=iteration_id, save_outputs=save_outputs
    )
# # chaining_dict[iteration_id]
qna_dict = create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
    )
# Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])
qna_dict[iteration_id]


Query: SELECT * from sources
**Text #1 prompt #1 of 1**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
**Text #2 prompt #1 of 1**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
**Text #3 prompt #1 of 1**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
**Text #4 prompt #1 of 1**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turb

Unnamed: 0,timestamp,reference_id,article_title,choice,text,system_role,model,temperature,prep_step,summarize_task,edit_task,simplify_task,simplify_audience,format_task,full_summarize_task,folder,summary,headline,simple_summary
0,2023-07-12 11:23:53.713910-07:00,1,Comparisons in the Recovery Response From Resi...,1,"Decreases in muscle mass, function, and neurom...",You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following informatio...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,1. Summarize the text for a LinkedIn post.\n\n...,text/2023-07-11 for db,A new study compared young and middle-aged adu...,New Research on the Impact of Aging and Exerci...,Check out this new study comparing young and m...
1,2023-07-12 11:23:53.713910-07:00,1,Comparisons in the Recovery Response From Resi...,2,"Decreases in muscle mass, function, and neurom...",You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following informatio...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,1. Summarize the text for a LinkedIn post.\n\n...,text/2023-07-11 for db,A recent study compared the recovery response ...,New Study on Aging and Exercise,A new study has found that regular resistance ...
2,2023-07-12 11:23:56.749327-07:00,2,Effect of dietary sources of calcium and prote...,1,Longevity increases the proportion of older ad...,You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following informatio...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,1. Summarize the text for a LinkedIn post.\n\n...,text/2023-07-11 for db,A recent study found that a high calcium and h...,New study shows calcium and protein interventi...,A recent study has shown that a simple dietary...
3,2023-07-12 11:23:56.749327-07:00,2,Effect of dietary sources of calcium and prote...,2,Longevity increases the proportion of older ad...,You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following informatio...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,1. Summarize the text for a LinkedIn post.\n\n...,text/2023-07-11 for db,A recent study found promising results for a h...,Nutritional Intervention Reduces Fracture Risk...,A recent study found that a tailored high calc...
4,2023-07-12 11:24:01.173618-07:00,3,Exercise Snacks A Novel Strategy to Improve Ca...,1,We define exercise snacks as isolated ?1-min b...,You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following informatio...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,1. Summarize the text for a LinkedIn post.\n\n...,text/2023-07-11 for db,Exercise snacks are short bouts of vigorous ex...,Discover the Benefits of Exercise Snacks: A Fa...,Exercise snacks are short bursts of vigorous e...
5,2023-07-12 11:24:01.173618-07:00,3,Exercise Snacks A Novel Strategy to Improve Ca...,2,We define exercise snacks as isolated ?1-min b...,You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following informatio...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,1. Summarize the text for a LinkedIn post.\n\n...,text/2023-07-11 for db,Key points: Exercise snacks are brief bouts of...,Improve your health with exercise snacks,Introducing exercise snacks- short and intense...
6,2023-07-12 11:24:08.390067-07:00,4,"Food craving, cortisol and ghrelin responses i...",1,The United States is at the forefront of the g...,You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following informatio...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,1. Summarize the text for a LinkedIn post.\n\n...,text/2023-07-11 for db,A recent study in a controlled hospital-based ...,New Study Reveals Surprising Links Between Foo...,This study offers valuable information about h...
7,2023-07-12 11:24:08.390067-07:00,4,"Food craving, cortisol and ghrelin responses i...",2,The United States is at the forefront of the g...,You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following informatio...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,1. Summarize the text for a LinkedIn post.\n\n...,text/2023-07-11 for db,A recent study conducted in a controlled hospi...,New study examines the impact of food cues and...,Researchers have found that exposure to certai...
8,2023-07-12 11:24:11.716996-07:00,5,Hypohydration but not Menstrual Phase Influenc...,1,Pain is recognized as a public health problem ...,You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following informatio...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,1. Summarize the text for a LinkedIn post.\n\n...,text/2023-07-11 for db,Chronic pain affects a significant portion of ...,Study finds that mild dehydration increases pa...,A recent study discovered that not getting eno...
9,2023-07-12 11:24:11.716996-07:00,5,Hypohydration but not Menstrual Phase Influenc...,2,Pain is recognized as a public health problem ...,You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following informatio...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,1. Summarize the text for a LinkedIn post.\n\n...,text/2023-07-11 for db,A study found that mild dehydration can increa...,New Research Reveals that Dehydration Increase...,Did you know that being properly hydrated can ...


In [27]:
bulk_append(qna_dict[iteration_id])

Adding 12 rows to the database...
	Reference #1: New Research on the Impact of Aging and Exercise on Muscle Function and Recovery
	Reference #1: New Study on Aging and Exercise
	Reference #2: New study shows calcium and protein intervention reduces fracture and fall risk in older adults
	Reference #2: Nutritional Intervention Reduces Fracture Risk in Older Adults
	Reference #3: Discover the Benefits of Exercise Snacks: A Fast and Feasible Way to Improve Health
	Reference #3: Improve your health with exercise snacks
	Reference #4: New Study Reveals Surprising Links Between Food Cravings, Stress, and Weight
	Reference #4: New study examines the impact of food cues and stress on cravings and intake of highly palatable foods
	Reference #5: Study finds that mild dehydration increases pain sensitivity in women
	Reference #5: New Research Reveals that Dehydration Increases Pain Sensitivity in Women
	Reference #6: Weight Stigma Linked to Negative Health Behaviors
	Reference #6: New research li

## 1.1

In [28]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi']
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                # if table == 'summaries':

                    # Check if prompt already exists in the database
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        rating_original_content=row['rating_original_content'],
                        simple_summary=row['simple_summary'],
                        rating_simple_content=row['rating_simple_content'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

import pandas as pd
import sys
import os
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
from file_functions import *
from response_processing import *
import time
import pytz
import re
from itertools import product
import openai
from prompts import *

class Chaining:
    def __init__(self, text_id, title, text, folder_path, system_role="You are a helpful assistant.", 
            model="gpt-3.5-turbo", temperature=0.7, max_tokens=9000, 
        ):
        self.reference_id = text_id
        self.title = title
        self.text = text
        self.folder = re.sub(r'(?:.*\/)?(.*\/.*)\/?$', r'\1', folder_path)
        self.system_role = system_role
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.model = model
        print(f'***OpenAI model: {self.model}')

    def create_prompt(self, task, text):
        system_role = f'{self.system_role}'
        user_input = f"""Given the following text delimited by triple backticks: ```{text}``` \n {task}"""
        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_input},]

        print('\tDone creating prompt')
        return messages

    def gpt(self, messages, n_choices, temperature, model=None):
        model = self.model if model == None else model
        print(f'\tSending request to {model}')
        print(f'\t\tRequesting {n_choices} choices using {model}')
        openai.api_key = os.getenv('api_openai')
        response = openai.ChatCompletion.create(
            model=model, messages=messages, 
            temperature=temperature, 
            max_tokens=self.max_tokens,
            n=n_choices
            )
        print('\tDone sending request to GPT-3')
        return response

    def summarize(
            self, task, prep_step, edit_task, simplify_task, simplify_audience,
            format_task,
            n_choices=5, task_first=True):
        if task_first == True:
            full_task = f'{task}\n\n{prep_step}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        else:
            full_task = f'{prep_step}\n\n{task}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        prompt = self.create_prompt(full_task, self.text)
        firstline_pattern = r'\s?(\S*)(\n*)(.+)'
        title = re.match(firstline_pattern, self.text)[0]
        self.qna = dict() 
        self.qna['timestamp'] = str(datetime.now(pytz.timezone('Canada/Pacific')))
        self.qna['reference_id'] = self.reference_id
        self.qna['article_title'] = self.title
        self.qna['text'] = self.text
        self.qna['system_role'] = self.system_role
        self.qna['model'] = self.model        
        self.qna['temperature'] = self.temperature
        self.qna['prep_step'] = prep_step.strip()
        self.qna['summarize_task'] = task.strip()
        self.qna['edit_task'] = edit_task.strip()
        self.qna['simplify_task'] = simplify_task.strip()
        self.qna['simplify_audience'] = simplify_audience.strip()
        self.qna['format_task'] = format_task.strip()
        self.qna['full_summarize_task'] = full_task.strip()
        self.qna['folder'] = self.folder
        self.summaries_dict = dict()
        self.article_title = title
        self.response_regex = r'response_(.*)'
        self.simple_summary_dict = dict()
        self.relevance_dict = dict()
        self.n_previous_prompts = dict()

        try:
            response = self.gpt(prompt, n_choices=n_choices, temperature=self.temperature)
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**API request failed for `.summarize()`**')
            return self.qna
        try:
            for index, choice in enumerate(response.choices):
                self.summaries_dict[f'response_{"{:02d}".format(index+1)}'] = choice["message"]["content"]
            self.qna.setdefault('summary', [])
            self.qna['summary'].extend([value for value in self.summaries_dict.values()])
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**Error with response parsing**')
    
def batch_summarize(sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chaining_bot_dict, iteration_id, task_first=True,
    system_role=None, model='gpt-3.5-turbo', max_tokens=1000, temperature=0.7, pause_per_request=0, n_choices=5,
    save_outputs=False
    ):
    prompts_df = pd.DataFrame(product(prep_step, summarize_task, edit_task, simplify_task, simplify_audience, format_task), 
        columns=['prep_step', 'summarize_task', 'edit_task', 'simplify_task', 'simplify_audience', 'format_task'])

    chaining_bot_dict[iteration_id] = dict()
    def summarize_from_df_row(text_id, title, text, chaining_bot_dict):
        for index in prompts_df.index:
            print(f'**Text #{text_id} prompt #{index+1} of {prompts_df.index.max()+1}**')
            task = prompts_df.loc[index, 'summarize_task']
            prep_step = prompts_df.loc[index, 'prep_step']
            edit_task = prompts_df.loc[index, 'edit_task']
            simplify_task = prompts_df.loc[index, 'simplify_task']
            simplify_audience = prompts_df.loc[index, 'simplify_audience']
            format_task = prompts_df.loc[index, 'format_task']
            try:
                print('Creating Chaining class instance')
                chatbot = Chaining(
                    text_id, title, text, folder_path=folder_path, system_role=system_role, 
                    model=model, max_tokens=max_tokens, temperature=temperature)
                print('Chaining class instance created')
                chatbot.summarize(
                    task=task, prep_step=prep_step, edit_task=edit_task, 
                    simplify_task=simplify_task, simplify_audience=simplify_audience,
                    format_task=format_task, n_choices=n_choices, task_first=task_first
                    )
                chaining_bot_dict[iteration_id][f'{text_id}_prompt{"{:02d}".format(index)}'] = chatbot
                print('\t...Completed')
                if pause_per_request > 0:
                    print(f'[batch_summarize()] Sleeping {pause_per_request} sec to avoid exceeding API rate limit')
                    time.sleep(pause_per_request) # Account for API rate limit of 3 API requests/limit 
            except Exception as error:
                exc_type, exc_obj, tb = sys.exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                file = f.f_code.co_filename
                print("An error occurred on line", lineno, "in", file, ":", error)
                print('\t...Error making chatbot request')
                break
    sources_df.apply(lambda row: summarize_from_df_row(row['id'], row['title'], row['text'], chaining_bot_dict), axis=1)
    
    if save_outputs:
        try:
            save_instance_to_dict(
                chaining_bot_dict[iteration_id], 
                description=f'batch_Chaining_attributes_initial',
                ext=None, json_path=folder_path
                )
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            file = f.f_code.co_filename
            print(f'An error occurred on line {lineno} in {file}: {error}')
            print('[batch_summarize_chain()] Unable to save API response')

    return chaining_bot_dict

def create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=None, 
    ):
    """
    Create DataFrame from initial ChatGPT summaries.
    """
    dfs_list = []
    chatbot_id = iteration_id if chatbot_id == None else chatbot_id
    for chatbot_key in chatbot_dict[chatbot_id].keys():
        print(f'Processing {chatbot_key}...')
        dfs_list.append(pd.DataFrame(
            chatbot_dict[chatbot_id][chatbot_key].qna, 
            index=[choice for choice in range(1, len(chatbot_dict[chatbot_id][chatbot_key].qna['summary'])+1)])
            )
    
    qna_df = pd.concat(dfs_list).reset_index(names=['choice'])
    qna_df = extract_summary(qna_df, 'summary')
    columns = qna_df.columns.tolist()
    columns.remove('choice')
    columns.insert(3, 'choice') # Move 'choice' column

    # qna_df['date'] = pd.Series('2023-06-12', index=qna_df.index)
    # columns.insert(0, 'date')


    qna_dict[iteration_id] = qna_df[columns]
    print(f'Original summaries DataFrame shape: {qna_df.shape}')
    print(f'\tOriginal summaries Dataframe columns: {qna_df.columns}')
    return qna_dict


import json
def extract_summary(df, summary_column='summary'):
    # Convert the string to JSON
    df[summary_column] = df[summary_column].apply(json.loads)

    # Extract 'headline' and 'body' values
    df['headline'] = df[summary_column].apply(lambda x: x['headline'])
    df['simple_summary'] = df[summary_column].apply(lambda x: x['audience'])
    df[summary_column] = df[summary_column].apply(lambda x: x['body'])

    return df


# Set parameters
iteration_id = 1.1
article_limit = None
temperature = 1.5
n_choices = 2
pause_per_request=0
# summary_iteration_id = iteration_id
chatbot_id = iteration_id
model = 'gpt-3.5-turbo-16k-0613'
# model = 'gpt-4'
save_outputs=True
folder_path = '../text/2023-07-11 for db'

# summaries = get_table(table='summaries')


sources_df = get_table(table='sources', limit=article_limit).tail(3)

chaining_dict = batch_summarize(
    sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chatbot_dict, temperature=temperature,
    system_role=system_role, model=model, max_tokens=1000,
    n_choices=n_choices, pause_per_request=pause_per_request,
    iteration_id=iteration_id, save_outputs=save_outputs
    )
# # chaining_dict[iteration_id]
qna_dict = create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
    )
# Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])
qna_dict[iteration_id]

Query: SELECT * from sources
**Text #4 prompt #1 of 1**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
**Text #5 prompt #1 of 1**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
**Text #6 prompt #1 of 1**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
4_prompt00
	reference_id
	title
	text
	folder
	system_role
	temperature
	max_tokens
	mode

JSONDecodeError: Expecting property name enclosed in double quotes: line 5 column 2 (char 848)

In [29]:
qna_dict[iteration_id]

KeyError: 1.1

## 1.11

In [107]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi']
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                # if table == 'summaries':

                    # Check if prompt already exists in the database
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        rating_original_content=row['rating_original_content'],
                        simple_summary=row['simple_summary'],
                        rating_simple_content=row['rating_simple_content'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

import pandas as pd
import sys
import os
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
from file_functions import *
from response_processing import *
import time
import pytz
import re
from itertools import product
import openai
from prompts import *

class Chaining:
    def __init__(self, text_id, title, text, folder_path, system_role="You are a helpful assistant.", 
            model="gpt-3.5-turbo", temperature=0.7, max_tokens=9000, 
        ):
        self.reference_id = text_id
        self.title = title
        self.text = text
        self.folder = re.sub(r'(?:.*\/)?(.*\/.*)\/?$', r'\1', folder_path)
        self.system_role = system_role
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.model = model
        print(f'***OpenAI model: {self.model}')

    def create_prompt(self, task, text):
        system_role = f'{self.system_role}'
        user_input = f"""Given the following text delimited by triple backticks: ```{text}``` \n {task}"""
        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_input},]

        print('\tDone creating prompt')
        return messages

    def gpt(self, messages, n_choices, temperature, model=None):
        model = self.model if model == None else model
        print(f'\tSending request to {model}')
        print(f'\t\tRequesting {n_choices} choices using {model}')
        openai.api_key = os.getenv('api_openai')
        response = openai.ChatCompletion.create(
            model=model, messages=messages, 
            temperature=temperature, 
            max_tokens=self.max_tokens,
            n=n_choices
            )
        print('\tDone sending request to GPT-3')
        return response

    def summarize(
            self, task, prep_step, edit_task, simplify_task, simplify_audience,
            format_task,
            n_choices=5, task_first=True):
        if task_first == True:
            full_task = f'{task}\n\n{prep_step}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        else:
            full_task = f'{prep_step}\n\n{task}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        prompt = self.create_prompt(full_task, self.text)
        firstline_pattern = r'\s?(\S*)(\n*)(.+)'
        title = re.match(firstline_pattern, self.text)[0]
        self.qna = dict() 
        self.qna['timestamp'] = str(datetime.now(pytz.timezone('Canada/Pacific')))
        self.qna['reference_id'] = self.reference_id
        self.qna['article_title'] = self.title
        self.qna['text'] = self.text
        self.qna['system_role'] = self.system_role
        self.qna['model'] = self.model        
        self.qna['temperature'] = self.temperature
        self.qna['prep_step'] = prep_step.strip()
        self.qna['summarize_task'] = task.strip()
        self.qna['edit_task'] = edit_task.strip()
        self.qna['simplify_task'] = simplify_task.strip()
        self.qna['simplify_audience'] = simplify_audience.strip()
        self.qna['format_task'] = format_task.strip()
        self.qna['full_summarize_task'] = full_task.strip()
        self.qna['folder'] = self.folder
        self.summaries_dict = dict()
        self.article_title = title
        self.response_regex = r'response_(.*)'
        self.simple_summary_dict = dict()
        self.relevance_dict = dict()
        self.n_previous_prompts = dict()

        try:
            response = self.gpt(prompt, n_choices=n_choices, temperature=self.temperature)
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**API request failed for `.summarize()`**')
            return self.qna
        try:
            for index, choice in enumerate(response.choices):
                self.summaries_dict[f'response_{"{:02d}".format(index+1)}'] = choice["message"]["content"]
            self.qna.setdefault('summary', [])
            self.qna['summary'].extend([value for value in self.summaries_dict.values()])
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**Error with response parsing**')
    
def batch_summarize(sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chaining_bot_dict, iteration_id, task_first=True,
    system_role=None, model='gpt-3.5-turbo', max_tokens=1000, temperature=0.7, pause_per_request=0, n_choices=5,
    save_outputs=False
    ):
    prompts_df = pd.DataFrame(product(prep_step, summarize_task, edit_task, simplify_task, simplify_audience, format_task), 
        columns=['prep_step', 'summarize_task', 'edit_task', 'simplify_task', 'simplify_audience', 'format_task'])

    chaining_bot_dict[iteration_id] = dict()
    def summarize_from_df_row(text_id, title, text, chaining_bot_dict):
        for index in prompts_df.index:
            print(f'**Text #{text_id} prompt #{index+1} of {prompts_df.index.max()+1}**')
            task = prompts_df.loc[index, 'summarize_task']
            prep_step = prompts_df.loc[index, 'prep_step']
            edit_task = prompts_df.loc[index, 'edit_task']
            simplify_task = prompts_df.loc[index, 'simplify_task']
            simplify_audience = prompts_df.loc[index, 'simplify_audience']
            format_task = prompts_df.loc[index, 'format_task']
            try:
                print('Creating Chaining class instance')
                chatbot = Chaining(
                    text_id, title, text, folder_path=folder_path, system_role=system_role, 
                    model=model, max_tokens=max_tokens, temperature=temperature)
                print('Chaining class instance created')
                chatbot.summarize(
                    task=task, prep_step=prep_step, edit_task=edit_task, 
                    simplify_task=simplify_task, simplify_audience=simplify_audience,
                    format_task=format_task, n_choices=n_choices, task_first=task_first
                    )
                chaining_bot_dict[iteration_id][f'{text_id}_prompt{"{:02d}".format(index)}'] = chatbot
                print('\t...Completed')
                if pause_per_request > 0:
                    print(f'[batch_summarize()] Sleeping {pause_per_request} sec to avoid exceeding API rate limit')
                    time.sleep(pause_per_request) # Account for API rate limit of 3 API requests/limit 
            except Exception as error:
                exc_type, exc_obj, tb = sys.exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                file = f.f_code.co_filename
                print("An error occurred on line", lineno, "in", file, ":", error)
                print('\t...Error making chatbot request')
                break
    sources_df.apply(lambda row: summarize_from_df_row(row['id'], row['title'], row['text'], chaining_bot_dict), axis=1)
    
    if save_outputs:
        try:
            save_instance_to_dict(
                chaining_bot_dict[iteration_id], 
                description=f'batch_Chaining_attributes_initial',
                ext=None, json_path=folder_path
                )
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            file = f.f_code.co_filename
            print(f'An error occurred on line {lineno} in {file}: {error}')
            print('[batch_summarize_chain()] Unable to save API response')

    return chaining_bot_dict

def create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=None, 
    ):
    """
    Create DataFrame from initial ChatGPT summaries.
    """
    dfs_list = []
    chatbot_id = iteration_id if chatbot_id == None else chatbot_id
    for chatbot_key in chatbot_dict[chatbot_id].keys():
        print(f'Processing {chatbot_key}...')
        dfs_list.append(pd.DataFrame(
            chatbot_dict[chatbot_id][chatbot_key].qna, 
            index=[choice for choice in range(1, len(chatbot_dict[chatbot_id][chatbot_key].qna['summary'])+1)])
            )
    
    qna_df = pd.concat(dfs_list).reset_index(names=['choice'])
    qna_df = extract_summary(qna_df, 'summary')
    columns = qna_df.columns.tolist()
    columns.remove('choice')
    columns.insert(3, 'choice') # Move 'choice' column

    # qna_df['date'] = pd.Series('2023-06-12', index=qna_df.index)
    # columns.insert(0, 'date')


    qna_dict[iteration_id] = qna_df[columns]
    print(f'Original summaries DataFrame shape: {qna_df.shape}')
    print(f'\tOriginal summaries Dataframe columns: {qna_df.columns}')
    return qna_dict


import json
def extract_summary(df, summary_column='summary'):
    # Convert the string to JSON
    try:
        df[summary_column] = df[summary_column].apply(json.loads)
    except Exception as error:
        print(f'Error converting {summary_column} column to JSON: {error}; will do row by row')
        summary_list = []
        for index, summary in df[summary_column].items():
            try:
                summary_list.append(json.loads(summary))
            except Exception as error:
                print(f'Error converting summary {index} to JSON: {error}')
                summary_list.append(summary)
    def extract_value_from_key(summary, key):
        try:
            return summary[key]
        except Exception as error:
            value = re.search(rf'"{key}":\s*"([^"]+)"', summary).group(1)
            return value

    # Extract 'headline' and 'body' values
    df['headline'] = df[summary_column].apply(lambda x: extract_value_from_key(x, 'headline'))
    df['simple_summary'] = df[summary_column].apply(lambda x: extract_value_from_key(x, 'audience'))
    df[summary_column] = df[summary_column].apply(lambda x: extract_value_from_key(x, 'body'))

    return df


# Set parameters
iteration_id = 1.1
article_limit = None
temperature = 1.5
n_choices = 2
pause_per_request=0
# summary_iteration_id = iteration_id
chatbot_id = iteration_id
model = 'gpt-3.5-turbo-16k-0613'
# model = 'gpt-4'
save_outputs=True
folder_path = '../text/2023-07-11 for db'

# summaries = get_table(table='summaries')


# sources_df = get_table(table='sources', limit=article_limit).tail(3)

# chaining_dict = batch_summarize(
#     sources_df, folder_path, prep_step, summarize_task, edit_task, 
#     simplify_task, simplify_audience, format_task,
#     chatbot_dict, temperature=temperature,
#     system_role=system_role, model=model, max_tokens=1000,
#     n_choices=n_choices, pause_per_request=pause_per_request,
#     iteration_id=iteration_id, save_outputs=save_outputs
#     )
# # chaining_dict[iteration_id]
qna_dict = create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
    )
# Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])
qna_dict[iteration_id]

Processing 4_prompt00...
Processing 5_prompt00...
Processing 6_prompt00...
Error converting summary column to JSON: Expecting property name enclosed in double quotes: line 5 column 2 (char 848); will do row by row
Error converting summary 5 to JSON: Expecting property name enclosed in double quotes: line 5 column 2 (char 848)
Original summaries DataFrame shape: (6, 19)
	Original summaries Dataframe columns: Index(['choice', 'timestamp', 'reference_id', 'article_title', 'text',
       'system_role', 'model', 'temperature', 'prep_step', 'summarize_task',
       'edit_task', 'simplify_task', 'simplify_audience', 'format_task',
       'full_summarize_task', 'folder', 'summary', 'headline',
       'simple_summary'],
      dtype='object')


Unnamed: 0,timestamp,reference_id,article_title,choice,text,system_role,model,temperature,prep_step,summarize_task,edit_task,simplify_task,simplify_audience,format_task,full_summarize_task,folder,summary,headline,simple_summary
0,2023-07-12 14:10:05.214512-07:00,4,"Food craving, cortisol and ghrelin responses in modeling highly palatable snack intake in the la...",1,The United States is at the forefront of the global obesity epidemic with 67% of its population ...,You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following information: \n- Identify the key points and statistics ...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \nEvaluate your text message to see if it may be co...,"3. If needed, rewrite the text using terms appropriate for the audience. If not keep it the same...",people without a science background,"4. Return your final response in a JSON format with the following format: \n{""headline"": <su...","1. Summarize the text for a LinkedIn post.\n\nIn the summary, cover the following information: ...",text/2023-07-11 for db,A recent study conducted in a controlled setting found that exposure to food cues and stress can...,New Study Reveals How Food Cues and Stress Increase Cravings for Unhealthy Foods,Discover how exposure to food cues and stress can affect our cravings for unhealthy foods: the l...
1,2023-07-12 14:10:05.214512-07:00,4,"Food craving, cortisol and ghrelin responses in modeling highly palatable snack intake in the la...",2,The United States is at the forefront of the global obesity epidemic with 67% of its population ...,You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following information: \n- Identify the key points and statistics ...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \nEvaluate your text message to see if it may be co...,"3. If needed, rewrite the text using terms appropriate for the audience. If not keep it the same...",people without a science background,"4. Return your final response in a JSON format with the following format: \n{""headline"": <su...","1. Summarize the text for a LinkedIn post.\n\nIn the summary, cover the following information: ...",text/2023-07-11 for db,"In a recent study conducted in a controlled hospital setting, researchers found that exposure to...",New Study Shows That Food Cues and Stress Increase Cravings and Intake of Unhealthy Foods,A recent study conducted in a controlled hospital setting found that exposures to food cues and ...
2,2023-07-12 14:10:08.562822-07:00,5,Hypohydration but not Menstrual Phase Influences Pain Perception in Healthy Women,1,"Pain is recognized as a public health problem (1). Chronic pain [i.e., pain that persists for ?3...",You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following information: \n- Identify the key points and statistics ...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \nEvaluate your text message to see if it may be co...,"3. If needed, rewrite the text using terms appropriate for the audience. If not keep it the same...",people without a science background,"4. Return your final response in a JSON format with the following format: \n{""headline"": <su...","1. Summarize the text for a LinkedIn post.\n\nIn the summary, cover the following information: ...",text/2023-07-11 for db,"A recent study found that mild dehydration increases pain sensitivity in women, supporting previ...",New Research Shows Dehydration Can Affect Pain Sensitivity in Women,"Dehydration can have a negative impact on pain response in women, according to new research. It'..."
3,2023-07-12 14:10:08.562822-07:00,5,Hypohydration but not Menstrual Phase Influences Pain Perception in Healthy Women,2,"Pain is recognized as a public health problem (1). Chronic pain [i.e., pain that persists for ?3...",You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following information: \n- Identify the key points and statistics ...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \nEvaluate your text message to see if it may be co...,"3. If needed, rewrite the text using terms appropriate for the audience. If not keep it the same...",people without a science background,"4. Return your final response in a JSON format with the following format: \n{""headline"": <su...","1. Summarize the text for a LinkedIn post.\n\nIn the summary, cover the following information: ...",text/2023-07-11 for db,"A recent study found that mild dehydration can increase experimental pain sensitivity in women, ...",New research shows that dehydration increases pain sensitivity in women,Have you ever considered the effects of dehydration on pain perception? Recent research shows th...
4,2023-07-12 14:10:12.012633-07:00,6,Weight stigma and health behaviors: evidence from the Eating in America Study,1,"Weight stigma is pervasive. Higher weight individuals are stigmatized across many contexts, incl...",You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following information: \n- Identify the key points and statistics ...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \nEvaluate your text message to see if it may be co...,"3. If needed, rewrite the text using terms appropriate for the audience. If not keep it the same...",people without a science background,"4. Return your final response in a JSON format with the following format: \n{""headline"": <su...","1. Summarize the text for a LinkedIn post.\n\nIn the summary, cover the following information: ...",text/2023-07-11 for db,"Weight stigma negatively affects health behaviors such as eating, physical activity, alcohol use...",Weight Stigma and its Impact on Health Behaviors,"New study highlights the negative impact of weight stigma on eating, exercise, alcohol use, and ..."
5,2023-07-12 14:10:12.012633-07:00,6,Weight stigma and health behaviors: evidence from the Eating in America Study,2,"Weight stigma is pervasive. Higher weight individuals are stigmatized across many contexts, incl...",You are a helpful assistant.,gpt-3.5-turbo-16k-0613,1.5,"In the summary, cover the following information: \n- Identify the key points and statistics ...",1. Summarize the text for a LinkedIn post.,Once you have written your text message: \nEvaluate your text message to see if it may be co...,"3. If needed, rewrite the text using terms appropriate for the audience. If not keep it the same...",people without a science background,"4. Return your final response in a JSON format with the following format: \n{""headline"": <su...","1. Summarize the text for a LinkedIn post.\n\nIn the summary, cover the following information: ...",text/2023-07-11 for db,Weight discrimination is associated with negative effects on physical health and multiple health...,Weight stigma and health behaviors: Key findings from recent research,"Weight discrimination can impact physical health and health behaviors like eating, alcohol use, ..."


In [46]:
qna_dict[iteration_id][['folder', 'summary']]

Unnamed: 0,folder,summary
0,text/2023-07-11 for db,"{""headline"": ""New Study Reveals How Food Cues and Stress Increase Cravings for Unhealthy Foods"",\n""body"": ""A recent study conducted in a controlled setting found that exposure to food cues and stress can significantly increase cravings for highly palatable foods. The study also showed that these cravings directly predicted subsequent intake of unhealthy snacks. Furthermore, the hormone ghrelin was found to play a role in promoting food cravings, particularly in individuals who are overweight. The findings highlight the need for greater understanding of the biobehavioral processes that contribute to overeating and weight gain."",\n""audience"": ""Discover how exposure to food cues and stress can affect our cravings for unhealthy foods: the latest research sheds light on the role of hormonal responses and the potential impact on weight management.""}"
1,text/2023-07-11 for db,"{\n ""headline"": ""New Study Shows That Food Cues and Stress Increase Cravings and Intake of Unhealthy Foods"",\n ""body"": ""In a recent study conducted in a controlled hospital setting, researchers found that exposure to food cues and stress both significantly increased cravings for highly palatable (HP) foods. These cravings, in turn, predicted greater intake of HP foods. Additionally, the study revealed that specific hormones, such as ghrelin and cortisol, may play a role in food motivation and intake. The findings highlight the potential factors influencing overeating and weight gain, and provide insights into the biobehavioral processes driving unhealthy food consumption."",\n ""audience"": ""A recent study conducted in a controlled hospital setting found that exposures to food cues and stress may increase unhealthy food cravings and intake. The findings suggest important factors to consider in controlling food consumption and understanding the impact on weight gain."" \n}"
2,text/2023-07-11 for db,"{""headline"": ""New Research Shows Dehydration Can Affect Pain Sensitivity in Women"",\n""body"": ""A recent study found that mild dehydration increases pain sensitivity in women, supporting previous research in men. The study also investigated the effects of menstrual phase on pain sensitivity, but found no significant difference. Interestingly, acute water ingestion did not reduce pain sensitivity in hypohydrated participants. This research highlights the importance of staying properly hydrated to manage pain symptoms."",\n""audience"": ""Dehydration can have a negative impact on pain response in women, according to new research. It's essential to focus on staying hydrated to help manage pain effectively.""}"
3,text/2023-07-11 for db,"{""headline"": ""New research shows that dehydration increases pain sensitivity in women"", \n""body"": ""A recent study found that mild dehydration can increase experimental pain sensitivity in women, leading to decreased pain tolerance and increased pain intensity and unpleasantness. The study also examined the effects of menstrual phase on pain sensitivity, but found that it did not greatly impact pain perception. Additionally, acute water ingestion did not reduce pain sensitivity. This research highlights the importance of maintaining adequate hydration for women's wellbeing and suggests a link between dehydration and pain perception."", \n""audience"": ""Have you ever considered the effects of dehydration on pain perception? Recent research shows that even mild dehydration can affect how we perceive pain. This is particularly important for women, as they may experience greater pain sensitivity under dehydrated conditions. Ensuring proper hydration can play a role in managing pain symptoms and supporting overall health.""}"
4,text/2023-07-11 for db,"{""headline"": ""Weight Stigma and its Impact on Health Behaviors"",\n ""body"": ""Weight stigma negatively affects health behaviors such as eating, physical activity, alcohol use, and sleep, according to a US study. The research highlights that weight stigma is associated with disordered eating, higher alcohol consumption, and poorer sleep quality. It also found that weight stigma affects health behaviors across different weights. This shows the need to reduce weight stigma and promote weight-inclusive health approaches."",\n ""audience"": ""New study highlights the negative impact of weight stigma on eating, exercise, alcohol use, and sleep. It affects people of all weights. Let's work towards promoting more inclusive health approaches to avoid detrimental effects.""}"
5,text/2023-07-11 for db,"{\n ""headline"": ""Weight stigma and health behaviors: Key findings from recent research"",\n ""body"": ""Weight discrimination is associated with negative effects on physical health and multiple health behaviors including eating behavior, alcohol use, and sleep. Research found that weight stigma is significantly associated with poorer health behaviors, independent of BMI. These findings suggest the need to reduce weight stigma and employ more weight-inclusive approaches to health promotion."",\n ""audience"": ""Weight discrimination can impact physical health and health behaviors like eating, alcohol use, and sleep. Research, in a diverse U.S. sample, found that weight stigma is associated with poorer health behaviors, regardless of body mass index. Addressing weight stigma is important for promoting healthy behaviors and overall well-being."",\n }"


In [57]:
qna_dict[iteration_id].loc[5, 'summary']

'{\n  "headline": "Weight stigma and health behaviors: Key findings from recent research",\n  "body": "Weight discrimination is associated with negative effects on physical health and multiple health behaviors including eating behavior, alcohol use, and sleep. Research found that weight stigma is significantly associated with poorer health behaviors, independent of BMI. These findings suggest the need to reduce weight stigma and employ more weight-inclusive approaches to health promotion.",\n  "audience": "Weight discrimination can impact physical health and health behaviors like eating, alcohol use, and sleep. Research, in a diverse U.S. sample, found that weight stigma is associated with poorer health behaviors, regardless of body mass index. Addressing weight stigma is important for promoting healthy behaviors and overall well-being.",\n }'

In [99]:
qna_dict[iteration_id].loc[5, 'summary'].replace(r'",\n', r'')

'{\n  "headline": "Weight stigma and health behaviors: Key findings from recent research",\n  "body": "Weight discrimination is associated with negative effects on physical health and multiple health behaviors including eating behavior, alcohol use, and sleep. Research found that weight stigma is significantly associated with poorer health behaviors, independent of BMI. These findings suggest the need to reduce weight stigma and employ more weight-inclusive approaches to health promotion.",\n  "audience": "Weight discrimination can impact physical health and health behaviors like eating, alcohol use, and sleep. Research, in a diverse U.S. sample, found that weight stigma is associated with poorer health behaviors, regardless of body mass index. Addressing weight stigma is important for promoting healthy behaviors and overall well-being.",\n }'

In [89]:
qna_dict[iteration_id].loc[5, 'summary'].rstrip(',\n ')

'{\n  "headline": "Weight stigma and health behaviors: Key findings from recent research",\n  "body": "Weight discrimination is associated with negative effects on physical health and multiple health behaviors including eating behavior, alcohol use, and sleep. Research found that weight stigma is significantly associated with poorer health behaviors, independent of BMI. These findings suggest the need to reduce weight stigma and employ more weight-inclusive approaches to health promotion.",\n  "audience": "Weight discrimination can impact physical health and health behaviors like eating, alcohol use, and sleep. Research, in a diverse U.S. sample, found that weight stigma is associated with poorer health behaviors, regardless of body mass index. Addressing weight stigma is important for promoting healthy behaviors and overall well-being.",\n }'

In [91]:
text = '{\n  "headline": "Weight stigma and health behaviors: Key findings from recent research",\n  "body": "Weight discrimination is associated with negative effects on physical health and multiple health behaviors including eating behavior, alcohol use, and sleep. Research found that weight stigma is significantly associated with poorer health behaviors, independent of BMI. These findings suggest the need to reduce weight stigma and employ more weight-inclusive approaches to health promotion.",\n  "audience": "Weight discrimination can impact physical health and health behaviors like eating, alcohol use, and sleep. Research, in a diverse U.S. sample, found that weight stigma is associated with poorer health behaviors, regardless of body mass index. Addressing weight stigma is important for promoting healthy behaviors and overall well-being.",\n }'
json.loads(text)

JSONDecodeError: Expecting property name enclosed in double quotes: line 5 column 2 (char 848)

In [92]:
text = '{\n  "headline": "Weight stigma and health behaviors: Key findings from recent research",\n  "body": "Weight discrimination is associated with negative effects on physical health and multiple health behaviors including eating behavior, alcohol use, and sleep. Research found that weight stigma is significantly associated with poorer health behaviors, independent of BMI. These findings suggest the need to reduce weight stigma and employ more weight-inclusive approaches to health promotion.",\n  "audience": "Weight discrimination can impact physical health and health behaviors like eating, alcohol use, and sleep. Research, in a diverse U.S. sample, found that weight stigma is associated with poorer health behaviors, regardless of body mass index. Addressing weight stigma is important for promoting healthy behaviors and overall well-being."\n }'
json.loads(text)

{'headline': 'Weight stigma and health behaviors: Key findings from recent research',
 'body': 'Weight discrimination is associated with negative effects on physical health and multiple health behaviors including eating behavior, alcohol use, and sleep. Research found that weight stigma is significantly associated with poorer health behaviors, independent of BMI. These findings suggest the need to reduce weight stigma and employ more weight-inclusive approaches to health promotion.',
 'audience': 'Weight discrimination can impact physical health and health behaviors like eating, alcohol use, and sleep. Research, in a diverse U.S. sample, found that weight stigma is associated with poorer health behaviors, regardless of body mass index. Addressing weight stigma is important for promoting healthy behaviors and overall well-being.'}

In [42]:
for index, row in qna_dict[iteration_id]['summary'].items():
    print(index, row)
    print()

0 {"headline": "New Study Reveals How Food Cues and Stress Increase Cravings for Unhealthy Foods",
"body": "A recent study conducted in a controlled setting found that exposure to food cues and stress can significantly increase cravings for highly palatable foods. The study also showed that these cravings directly predicted subsequent intake of unhealthy snacks. Furthermore, the hormone ghrelin was found to play a role in promoting food cravings, particularly in individuals who are overweight. The findings highlight the need for greater understanding of the biobehavioral processes that contribute to overeating and weight gain.",
"audience": "Discover how exposure to food cues and stress can affect our cravings for unhealthy foods: the latest research sheds light on the role of hormonal responses and the potential impact on weight management."}

1 {
  "headline": "New Study Shows That Food Cues and Stress Increase Cravings and Intake of Unhealthy Foods",
  "body": "In a recent study con

# *End of Page*