# Title
[]()

In [3]:
# set the option to wrap text within cells
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# set up

In [3]:

references_df_dict = {}

# Create text dictionary
folder_path = '../text/2023-07-14 full' # ** UPDATE REQUIRED**

encoding='ISO-8859-1'
subset=None

In [4]:
qna_dict = dict()
chatbot_dict = dict()
simple_summaries_dict = dict()
relevance_dict = dict()
save = True
# save_outputs = False
save_outputs = True

# Generate summaries from most recent notebook, `2023-07-13 article feed` notebook

In [6]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
from orm_summarize import *
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            for index in range(len(id_list)):
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                result_title_match = re.search(r'<articletitle>(.*?)</articletitle>', cleaned_result)
                if result_title_match:
                    result_title = result_title_match.group(1)
                    cleaned_result_title = re.sub(r'</?[ib]>', '', result_title)
                    cleaned_result_title = re.sub(r'/(?![^<>]*>)', '', cleaned_result_title) # Remove any / that is not within html tag
                    cleaned_result_title = re.sub(r'[^a-zA-Z0-9 <>/]', '', cleaned_result_title).lower().strip()
                else:
                    cleaned_result_title = cleaned_result
                if cleaned_title == cleaned_result_title:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                        return result
                else:
                    continue
            if cleaned_title != cleaned_result_title:
                print(f'Warning: Article title not found in PMIDs.')
                print(f'Check these PMIDs: {id_list}')
                print(f'\tInput title: {title.lower().strip()}')
                print(f'\tResult title: {result_title if result_title else cleaned_result}')
                print(f'\tCleaned input title: {cleaned_title}')
                print(f'\tCleaned result title: {cleaned_result_title}\n')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    flagged_indices = df[df['flag_title'] == True].index
    for index in flagged_indices:
        print(f'Flagged: ')
        print(f'\tArticle title: {df.loc[index, "title"]}')
        print(f'\tPubMed title: {df.loc[index, "pubmed_title"]}')
        print()
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class Feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')
                elif table == 'feed':
                    source = session.query(Feed).filter_by(
                        title=row['title'],
                        journal=row['journal'],
                        doi=row['doi']
                    ).first()
                    if source:
                        print(f'\tAlready exists in the database: {row["title"]}.')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

# iteration_id = 1
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

####### Create feed table
# feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
# feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])

####### Create sources table

iteration = 1

text_df = parse_fulltext(folder_path)
text_df
# references_df_dict[iteration] = add_pubmed_details(text_df, api_key)

references_df_dict[iteration] = create_sources_table(text_df, section=None)
references_df_dict[iteration]

# ##### Create summaries

# # Set parameters
# iteration_id = 1
# article_limit = None
# temperature = 1.5
# n_choices = 2
# pause_per_request=0
# # summary_iteration_id = iteration_id
# chatbot_id = iteration_id
# model = 'gpt-3.5-turbo-16k-0613'
# # model = 'gpt-4'
# save_outputs=True
# folder_path = folder_path

# sources_df = get_table(table='sources', limit=article_limit)
# # sources_df


# chaining_dict = batch_summarize(
#     sources_df, folder_path, prep_step, summarize_task, edit_task, 
#     simplify_task, simplify_audience, format_task,
#     chatbot_dict, temperature=temperature,
#     system_role=system_role, model=model, max_tokens=1000,
#     n_choices=n_choices, pause_per_request=pause_per_request,
#     iteration_id=iteration_id, save_outputs=save_outputs
#     )
# # # chaining_dict[iteration_id]
# qna_dict = create_summaries_df(
#     qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
#     )



Keys for text_dict: dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

Flagged: 
	Article title: Exploring participants perspectives on adverse events due to resistance training: a qualitative study
	PubMed title: Exploring participants' perspectives on adverse events due to resistance training: a qualitative study.



Unnamed: 0,title,text,pubmed_title,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section,flag_title
0,A systematic review examining associations bet...,There is strong evidence that moderate-to-vigo...,A systematic review examining associations bet...,This systematic review examined the associatio...,"Applied physiology, nutrition, and metabolism ...","Shawn Hakimi, Sahej Kaur, Amanda Ross-White, L...",2023,,48.0,2.0,97.0,162.0,10.1139/apnm-2022-0298,,False
1,Advertising expenditures across media on food ...,Unhealthy diets characterized by the consumpti...,Advertising expenditures across media on food ...,This research estimated and characterized adve...,"Applied physiology, nutrition, and metabolism ...","Monique Potvin Kent, Elise Pauz&#xe9;, Mariang...",2023,,48.0,1.0,27.0,37.0,10.1139/apnm-2022-0219,,False
2,Calcaneal tendon stiffness is not associated w...,The rates at which muscles generate torque and...,Calcaneal tendon stiffness is not associated w...,The ability to rapidly generate muscular torqu...,"Applied physiology, nutrition, and metabolism ...","Sohum V Kulkarni, Michael T Paris, Charles L Rice",2023,,48.0,4.0,331.0,339.0,10.1139/apnm-2022-0436,,False
3,Creatine supplementation combined with blood f...,Increases in muscular strength and hypertrophy...,Creatine supplementation combined with blood f...,This study aimed to compare the effects of an ...,"Applied physiology, nutrition, and metabolism ...","Rayssa Sousa-Silva, Jason M Cholewa, Kassiana ...",2023,,48.0,6.0,417.0,426.0,10.1139/apnm-2022-0209,,False
4,Economic burden of low muscle strength in Cana...,We used a prevalence-based approach to estimat...,Economic burden of low muscle strength in Cana...,The economic cost associated with low muscle s...,"Applied physiology, nutrition, and metabolism ...","Jean-Philippe Chaput, Ian Janssen, Hugues Samp...",2023,,,,,,10.1139/apnm-2022-0371,,False
5,Exploring participants perspectives on advers...,According to a cross-sectional analysis of the...,Exploring participants' perspectives on advers...,The objective of this study was to explore the...,"Applied physiology, nutrition, and metabolism ...","Rasha El-Kotob, Justin R Pagcanlungan, B Catha...",2023,,48.0,6.0,427.0,435.0,10.1139/apnm-2022-0117,,True
6,Fast food consumption in adults living in Cana...,"Since the 1980s, technological advancements an...",Fast food consumption in adults living in Cana...,Global industries and technological advancemen...,"Applied physiology, nutrition, and metabolism ...","Emily Seale, Margaret de Groh, Linda Greene-Fi...",2023,,48.0,2.0,163.0,171.0,10.1139/apnm-2022-0252,,False
7,Loneliness and resilience are associated with ...,"In December 2019, the coronavirus disease 2019...",Loneliness and resilience are associated with ...,"Nutrition risk is linked to hospitalization, f...","Applied physiology, nutrition, and metabolism ...","Cindy Wei, Marla K Beauchamp, Brenda Vrkljan, ...",2023,,48.0,1.0,38.0,48.0,10.1139/apnm-2022-0201,,False
8,Low vitamin K status in adults with cystic fib...,Cystic fibrosis (CF) is the most common geneti...,Low vitamin K status in adults with cystic fib...,Patients with cystic fibrosis&#xa0;(CF) are at...,"Applied physiology, nutrition, and metabolism ...","Cindy Bergeron, Kathryn J Potter, Val&#xe9;rie...",2023,,48.0,4.0,321.0,330.0,10.1139/apnm-2022-0163,,False
9,Milk protein ingestion does not enhance recove...,Milk-based foods provide a rich source of carb...,Milk protein ingestion does not enhance recove...,Milk-based proteins are a common choice of pos...,"Applied physiology, nutrition, and metabolism ...","Alice G Pearson, Lindsay S Macnaughton, Karen ...",2023,,48.0,6.0,455.0,468.0,10.1139/apnm-2022-0385,,False


## Append rows to sources table

In [7]:
# Add rows from results to summaries and prompts table
bulk_append(table='sources', input_df=references_df_dict[iteration])

Adding 14 rows to the database...
	A systematic review examining associations between physical activity, sedentary behaviour, and sleep duration with quality of life in older adults aged 65 years and above
	Advertising expenditures across media on food and beverage products heavily advertised on youth-appealing television stations in Canada
	Calcaneal tendon stiffness is not associated with dynamic time-dependent contractile output
	Creatine supplementation combined with blood flow restriction training enhances muscle thickness and performance: a randomized, placebo-controlled, and double-blind study
	Economic burden of low muscle strength in Canadian adults
	Exploring participants perspectives on adverse events due to resistance training: a qualitative study
	Fast food consumption in adults living in Canada: alternative measurement methods, consumption choices, and correlates
	Loneliness and resilience are associated with nutrition risk after the first wave of COVID-19 in community-d

## Create summaries

In [11]:
##### Create summaries

# Set parameters
iteration_id = 1
article_limit = None
temperature = 1
n_choices = 2
pause_per_request=10
# summary_iteration_id = iteration_id
chatbot_id = iteration_id
model = 'gpt-3.5-turbo-16k-0613'
# model = 'gpt-4'
save_outputs=True
folder_path = folder_path

sources_df = get_table(table='sources', limit=article_limit)
# sources_df


chaining_dict = batch_summarize(
    sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chatbot_dict, temperature=temperature,
    system_role=system_role, model=model, max_tokens=1000,
    n_choices=n_choices, pause_per_request=pause_per_request,
    iteration_id=iteration_id, save_outputs=save_outputs
    )
# # chaining_dict[iteration_id]
qna_dict = create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
    )

Query: SELECT * from sources ORDER BY id ASC
**Text #1 prompt #1 of 2**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
[batch_summarize()] Sleeping 10 sec to avoid exceeding API rate limit
**Text #1 prompt #2 of 2**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
[batch_summarize()] Sleeping 10 sec to avoid exceeding API rate limit
**Text #2 prompt #1 of 2**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using g

In [13]:
qna_dict[iteration_id]

Unnamed: 0,timestamp,reference_id,article_title,choice,text,system_role,model,temperature,prep_step,summarize_task,edit_task,simplify_task,simplify_audience,format_task,full_summarize_task,folder,summary,headline,simple_summary
0,2023-07-14 09:18:53.342148-07:00,1,Comparisons in the Recovery Response From Resi...,1,"Decreases in muscle mass, function, and neurom...",You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,text/2023-07-14 full,A recent study compared the recovery response ...,New Study: Aging and Recovery from Exercise,A recent study compared the recovery response ...
1,2023-07-14 09:18:53.342148-07:00,1,Comparisons in the Recovery Response From Resi...,2,"Decreases in muscle mass, function, and neurom...",You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,text/2023-07-14 full,"According to a recent study, changes in muscle...",New Research on Aging and Exercise Recovery,"New research has shown that as we age, changes..."
2,2023-07-14 09:19:07.547679-07:00,1,Comparisons in the Recovery Response From Resi...,1,"Decreases in muscle mass, function, and neurom...",You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,text/2023-07-14 full,A recent study compared the recovery response ...,New Research Shows How Exercise Recovery May B...,New research has found that exercise recovery ...
3,2023-07-14 09:19:07.547679-07:00,1,Comparisons in the Recovery Response From Resi...,2,"Decreases in muscle mass, function, and neurom...",You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,text/2023-07-14 full,A recent study explored the differences in exe...,New Research on Age and Exercise Recovery,Hey! I just read a fascinating study about exe...
4,2023-07-14 09:19:21.988746-07:00,2,Effect of dietary sources of calcium and prote...,1,Longevity increases the proportion of older ad...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,text/2023-07-14 full,A recent research study found that a high calc...,New Research Shows Nutritional Intervention Ca...,New research shows that a simple change in die...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,2023-07-14 09:31:18.055983-07:00,30,"Similar body composition, muscle size, and str...",2,Strength training (ST) is widely known for bri...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,text/2023-07-14 full,A recent study found that following a plant-ba...,New Research Shows Plant-Based Diet Doesn't Hi...,New research reveals that following a plant-ba...
90,2023-07-14 09:31:34.025736-07:00,31,The influence of training status and parasympa...,1,Parasympathetic control of the heart is largel...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,text/2023-07-14 full,Recent research examined the impact of enduran...,New Research on Endurance Training and Heart H...,Hey! I just read a study about the impact of e...
91,2023-07-14 09:31:34.025736-07:00,31,The influence of training status and parasympa...,2,Parasympathetic control of the heart is largel...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,text/2023-07-14 full,A recent study explored the effects of enduran...,New Research Reveals Impact of Training on Hea...,A recent study found that endurance-trained at...
92,2023-07-14 09:31:52.533051-07:00,31,The influence of training status and parasympa...,1,Parasympathetic control of the heart is largel...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,text/2023-07-14 full,A recent study looked at the effects of endura...,New Research on the Cardiac Effects of Enduran...,I wanted to share this interesting research on...


In [14]:
bulk_append(table='summaries', input_df=qna_dict[iteration_id])

Adding 94 rows to the database...
	Reference #1: New Study: Aging and Recovery from Exercise
	Reference #1: New Research on Aging and Exercise Recovery
	Reference #1: New Research Shows How Exercise Recovery May Be Similar in Young and Middle-Aged Adults
	Reference #1: New Research on Age and Exercise Recovery
	Reference #2: New Research Shows Nutritional Intervention Can Reduce Fracture Risk in Older Adults
	Reference #2: New Research Shows Nutritional Intervention Can Reduce Fracture Risk in Older Adults
	Reference #2: Exciting new research on reducing fractures in the elderly
	Reference #2: New Research Shows Nutritional Intervention Can Reduce Falls and Fractures in Older Adults
	Reference #3: Exciting New Approach to Exercise: Exercise Snacks
	Reference #3: New Research Reveals the Power of 'Exercise Snacks' for Health
	Reference #3: Exciting New Research on Exercise Snacks for Busy People
	Reference #3: Exciting New Research on Exercise Snacks for Busy People
	Reference #4: New R

# Batch 2: Check that I can run all code without it breaking

In [8]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
from orm_summarize import *
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            for index in range(len(id_list)):
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                result_title_match = re.search(r'<articletitle>(.*?)</articletitle>', cleaned_result)
                if result_title_match:
                    result_title = result_title_match.group(1)
                    cleaned_result_title = re.sub(r'</?[ib]>', '', result_title)
                    cleaned_result_title = re.sub(r'/(?![^<>]*>)', '', cleaned_result_title) # Remove any / that is not within html tag
                    cleaned_result_title = re.sub(r'[^a-zA-Z0-9 <>/]', '', cleaned_result_title).lower().strip()
                else:
                    cleaned_result_title = cleaned_result
                if cleaned_title == cleaned_result_title:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                        return result
                else:
                    continue
            if cleaned_title != cleaned_result_title:
                print(f'Warning: Article title not found in PMIDs.')
                print(f'Check these PMIDs: {id_list}')
                print(f'\tInput title: {title.lower().strip()}')
                print(f'\tResult title: {result_title if result_title else cleaned_result}')
                print(f'\tCleaned input title: {cleaned_title}')
                print(f'\tCleaned result title: {cleaned_result_title}\n')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
        flagged_indices = df[df['flag_title'] == True].index
        for index in flagged_indices:
            print(f'Flagged: ')
            print(f'\tArticle title: {df.loc[index, "title"]}')
            print(f'\tPubMed title: {df.loc[index, "pubmed_title"]}')
            print()
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class Feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')
                elif table == 'feed':
                    source = session.query(Feed).filter_by(
                        title=row['title'],
                        journal=row['journal'],
                        doi=row['doi']
                    ).first()
                    if source:
                        print(f'\tAlready exists in the database: {row["title"]}.')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

# iteration_id = 1
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

####### Create feed table
# feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
# feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])

####### Create sources table

iteration = 2
folder_path = '../text/2023-07-14 full2/'
text_df = parse_fulltext(folder_path)
text_df
# references_df_dict[iteration] = add_pubmed_details(text_df, api_key)

references_df_dict[iteration] = create_sources_table(text_df, section=None)
references_df_dict[iteration]

# ##### Create summaries

# Set parameters
iteration_id = 2
article_limit = len(references_df_dict[iteration])
temperature = 1
n_choices = 2
pause_per_request=10
# summary_iteration_id = iteration_id
chatbot_id = iteration_id
model = 'gpt-3.5-turbo-16k-0613'
# model = 'gpt-4'
save_outputs=True
folder_path = folder_path

sources_df = get_table(table='sources', limit=article_limit, order='DESC')
sources_df

# Add rows from results to summaries and prompts table
bulk_append(table='sources', input_df=references_df_dict[iteration])

chaining_dict = batch_summarize(
    sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chatbot_dict, temperature=temperature,
    system_role=system_role, model=model, max_tokens=1000,
    n_choices=n_choices, pause_per_request=pause_per_request,
    iteration_id=iteration_id, save_outputs=save_outputs
    )
# # chaining_dict[iteration_id]
qna_dict = create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
    )



Keys for text_dict: dict_keys([1, 2, 3, 4])

Query: SELECT * from sources ORDER BY id DESC LIMIT 4
Adding 4 rows to the database...
	A systematic review of patient barriers and facilitators for implementing lifestyle interventions targeting weight loss in primary care
	Dietary pulses as a means to improve the gut microbiome, inflammation, and appetite control in obesity
	Effects of exercise timing on metabolic health
	The burden of type 2 diabetes mellitus in states of the European Union and United Kingdom at the national and subnational levels: A systematic review
Data added successfully!
**Text #31 prompt #1 of 2**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
[batch_summarize()] Sleeping 10 sec to avoid exceeding API rate limit
**Text #31 prompt #2 of 2**
Creating 

In [9]:
qna_dict[iteration_id]

Unnamed: 0,timestamp,reference_id,article_title,choice,text,system_role,model,temperature,prep_step,summarize_task,edit_task,simplify_task,simplify_audience,format_task,full_summarize_task,folder,summary,headline,simple_summary
0,2023-07-14 12:01:47.626309-07:00,31,The influence of training status and parasympa...,1,Parasympathetic control of the heart is largel...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,A recent study found that endurance-trained at...,New Research on Endurance Training and Heart H...,Hey! Just read this interesting study that fou...
1,2023-07-14 12:01:47.626309-07:00,31,The influence of training status and parasympa...,2,Parasympathetic control of the heart is largel...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,A recent study investigated how the heart resp...,New research on the heart's response to breath...,New research suggests that endurance-trained a...
2,2023-07-14 12:02:02.778457-07:00,31,The influence of training status and parasympa...,1,Parasympathetic control of the heart is largel...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,A recent study found that endurance-trained at...,Research on Endurance Athletes and Heart Function,A recent study suggests that endurance-trained...
3,2023-07-14 12:02:02.778457-07:00,31,The influence of training status and parasympa...,2,Parasympathetic control of the heart is largel...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,A recent study found that endurance-trained at...,New research on the impact of exercise on hear...,Hey! I read an interesting study about the imp...
4,2023-07-14 12:02:20.287810-07:00,30,"Similar body composition, muscle size, and str...",1,Strength training (ST) is widely known for bri...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,A recent study compared muscle and body compos...,New Study Shows Vegetarian Diet Doesn't Impair...,New research has found that following a vegeta...
5,2023-07-14 12:02:20.287810-07:00,30,"Similar body composition, muscle size, and str...",2,Strength training (ST) is widely known for bri...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,Hey! I came across an interesting research stu...,New Research Shows Vegetarian Diet Does Not Hi...,Hey! I read an interesting study that showed p...
6,2023-07-14 12:02:35.266234-07:00,30,"Similar body composition, muscle size, and str...",1,Strength training (ST) is widely known for bri...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,A recent study compared the muscle and body co...,New Research Shows Plant-based Diets Don't Imp...,Exciting new research has found that following...
7,2023-07-14 12:02:35.266234-07:00,30,"Similar body composition, muscle size, and str...",2,Strength training (ST) is widely known for bri...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,Hey! I just read a really interesting research...,Exciting new research on strength training and...,Hey! I just read an awesome article about how ...
8,2023-07-14 12:02:50.747117-07:00,29,Sex differences and indications of metabolic c...,1,Elite collegiate athletes experience high 24 h...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,A recent study examined the energy balance of ...,New Research on Energy Balance and Athlete Health,A recent study examined the energy balance of ...
9,2023-07-14 12:02:50.747117-07:00,29,Sex differences and indications of metabolic c...,2,Elite collegiate athletes experience high 24 h...,You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,Elite collegiate athletes often struggle to ma...,New Research Reveals the Impact of Energy Bala...,New research has found that male and female sw...


In [10]:
bulk_append(table='summaries', input_df=qna_dict[iteration_id])

Adding 16 rows to the database...
	Reference #31: New Research on Endurance Training and Heart Health
	Reference #31: New research on the heart's response to breath holding
	Reference #31: Research on Endurance Athletes and Heart Function
	Reference #31: New research on the impact of exercise on heart health
	Reference #30: New Study Shows Vegetarian Diet Doesn't Impair Muscle Gain from Strength Training
	Reference #30: New Research Shows Vegetarian Diet Does Not Hinder Muscle and Strength Gains from Strength Training
	Reference #30: New Research Shows Plant-based Diets Don't Impair Muscle Growth in Strength Training
	Reference #30: Exciting new research on strength training and vegetarian diets
	Reference #29: New Research on Energy Balance and Athlete Health
	Reference #29: New Research Reveals the Impact of Energy Balance on Athlete's Health
	Reference #29: New research reveals the importance of monitoring energy status in athletes
	Reference #29: New research reveals the impact of 

In [11]:
get_table(table='sources', limit=None, order='DESC')

Query: SELECT * from sources ORDER BY id DESC


Unnamed: 0,id,title,text,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section
0,35,The burden of type 2 diabetes mellitus in stat...,Type 2 diabetes mellitus (T2D) is a complex me...,Type 2 diabetes mellitus (T2D) is a highly pre...,Obesity reviews : an official journal of the I...,"Carlos Alexandre Soares Andrade, Balqees Shahi...",2023,,,,e13593,,10.1111/obr.13593,
1,34,Effects of exercise timing on metabolic health,"In the last decades, the dramatic increase in ...",The increasing prevalence of metabolic syndrom...,Obesity reviews : an official journal of the I...,"Jos&#xe9; Ignacio Mart&#xed;nez-Montoro, Javie...",2023,,,,e13599,,10.1111/obr.13599,
2,33,Dietary pulses as a means to improve the gut m...,"According to the World Health Organization, th...",A dysbiotic intestinal microbiome has been lin...,Obesity reviews : an official journal of the I...,"Hannah St John, &#xc9;ric Doucet, Krista A Power",2023,,,,e13598,,10.1111/obr.13598,
3,32,A systematic review of patient barriers and fa...,Despite increasing awareness of the consequenc...,Numerous barriers are experienced by people wi...,Obesity reviews : an official journal of the I...,"Maxim de Jong, N&#xfa;ria Jansen, Marienke van...",2023,Aug,24.0,8.0,e13571,,10.1111/obr.13571,
4,31,The influence of training status and parasympa...,Parasympathetic control of the heart is largel...,Apnea (breath-holding) elicits co-activation o...,"Applied physiology, nutrition, and metabolism ...","Lindsey F Berthelsen, Andrew J M Douglas, Tony...",2023,,48.0,3.0,270,282.0,10.1139/apnm-2022-0340,
5,30,"Similar body composition, muscle size, and str...",Strength training (ST) is widely known for bri...,There is a popular belief that meat consumptio...,"Applied physiology, nutrition, and metabolism ...","Gabriela Lucciana Martini, Ronei Silveira Pint...",2023,,48.0,6.0,469,478.0,10.1139/apnm-2022-0258,
6,29,Sex differences and indications of metabolic c...,Elite collegiate athletes experience high 24 h...,To determine whether mismatched energy intake ...,"Applied physiology, nutrition, and metabolism ...","Emily A Lundstrom, Mary Jane De Souza, Hannah ...",2023,,48.0,1.0,74,87.0,10.1139/apnm-2022-0161,
7,28,Prevalence of sarcopenia indicators and sub-op...,Sarcopenia is the age-related loss of muscle m...,"Sarcopenia is associated with falls, and can c...","Applied physiology, nutrition, and metabolism ...","Giulia Coletta, Josephine S Jakubowski, Stuart...",2023,,48.0,7.0,498,506.0,10.1139/apnm-2022-0125,
8,27,Milk protein ingestion does not enhance recove...,Milk-based foods provide a rich source of carb...,Milk-based proteins are a common choice of pos...,"Applied physiology, nutrition, and metabolism ...","Alice G Pearson, Lindsay S Macnaughton, Karen ...",2023,,48.0,6.0,455,468.0,10.1139/apnm-2022-0385,
9,26,Low vitamin K status in adults with cystic fib...,Cystic fibrosis (CF) is the most common geneti...,Patients with cystic fibrosis&#xa0;(CF) are at...,"Applied physiology, nutrition, and metabolism ...","Cindy Bergeron, Kathryn J Potter, Val&#xe9;rie...",2023,,48.0,4.0,321,330.0,10.1139/apnm-2022-0163,


# batch 3

In [16]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
from orm_summarize import *
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            for index in range(len(id_list)):
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                result_title_match = re.search(r'<articletitle>(.*?)</articletitle>', cleaned_result)
                if result_title_match:
                    result_title = result_title_match.group(1)
                    cleaned_result_title = re.sub(r'</?[ib]>', '', result_title)
                    cleaned_result_title = re.sub(r'/(?![^<>]*>)', '', cleaned_result_title) # Remove any / that is not within html tag
                    cleaned_result_title = re.sub(r'[^a-zA-Z0-9 <>/]', '', cleaned_result_title).lower().strip()
                else:
                    cleaned_result_title = cleaned_result
                if cleaned_title == cleaned_result_title:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                        return result
                else:
                    continue
            if cleaned_title != cleaned_result_title:
                print(f'Warning: Article title not found in PMIDs.')
                print(f'Check these PMIDs: {id_list}')
                print(f'\tInput title: {title.lower().strip()}')
                print(f'\tResult title: {result_title if result_title else cleaned_result}')
                print(f'\tCleaned input title: {cleaned_title}')
                print(f'\tCleaned result title: {cleaned_result_title}\n')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
        flagged_indices = df[df['flag_title'] == True].index
        for index in flagged_indices:
            print(f'Flagged: ')
            print(f'\tArticle title: {df.loc[index, "title"]}')
            print(f'\tPubMed title: {df.loc[index, "pubmed_title"]}')
            print()
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class Feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')
                elif table == 'feed':
                    source = session.query(Feed).filter_by(
                        title=row['title'],
                        journal=row['journal'],
                        doi=row['doi']
                    ).first()
                    if source:
                        print(f'\tAlready exists in the database: {row["title"]}.')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

# iteration_id = 1
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

####### Create feed table
# feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
# feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])



# #########
# Prep: Set parameters
iteration_id = 3
temperature = 1
n_choices = 2
pause_per_request=10
# summary_iteration_id = iteration_id
chatbot_id = iteration_id
model = 'gpt-3.5-turbo-16k-0613'
# model = 'gpt-4'
save_outputs=True
folder_path = '../text/2023-07-14 full2/'

####### 
# Step 1: Create sources table

iteration = 2
text_df = parse_fulltext(folder_path)
text_df
# references_df_dict[iteration] = add_pubmed_details(text_df, api_key)

references_df_dict[iteration] = create_sources_table(text_df, section=None)
references_df_dict[iteration]

####### 
# Step 2:  Add rows from results to summaries and prompts table IF NOT ALREADY ADDED
# bulk_append(table='sources', input_df=references_df_dict[iteration])

# ##### 
# Step 3: Get the new sources for summarization
article_limit = len(references_df_dict[iteration])
sources_df = get_table(table='sources', limit=article_limit, order='DESC')
sources_df = sources_df.loc[1:] ## exclude first one due to too many tokens


# ##### 
# Step 3: Create summaries

chaining_dict = batch_summarize(
    sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chatbot_dict, temperature=temperature,
    system_role=system_role, model=model, max_tokens=1000,
    n_choices=n_choices, pause_per_request=pause_per_request,
    iteration_id=iteration_id, save_outputs=save_outputs
    )
# # chaining_dict[iteration_id]
qna_dict = create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
    )

bulk_append(table='summaries', input_df=qna_dict[iteration_id])
qna_dict[iteration_id]


Keys for text_dict: dict_keys([1, 2, 3, 4])

Query: SELECT * from sources ORDER BY id DESC LIMIT 4
**Text #34 prompt #1 of 2**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
[batch_summarize()] Sleeping 10 sec to avoid exceeding API rate limit
**Text #34 prompt #2 of 2**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending request to gpt-3.5-turbo-16k-0613
		Requesting 2 choices using gpt-3.5-turbo-16k-0613
	Done sending request to GPT-3
	...Completed
[batch_summarize()] Sleeping 10 sec to avoid exceeding API rate limit
**Text #33 prompt #1 of 2**
Creating Chaining class instance
***OpenAI model: gpt-3.5-turbo-16k-0613
Chaining class instance created
	Done creating prompt
	Sending reques

Unnamed: 0,timestamp,reference_id,article_title,choice,text,system_role,model,temperature,prep_step,summarize_task,edit_task,simplify_task,simplify_audience,format_task,full_summarize_task,folder,summary,headline,simple_summary
0,2023-07-14 13:15:51.071151-07:00,34,Effects of exercise timing on metabolic health,1,"In the last decades, the dramatic increase in ...",You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,Hey! I just came across some interesting resea...,Exciting New Research on Exercise Timing and M...,Hey! I just read some really cool research abo...
1,2023-07-14 13:15:51.071151-07:00,34,Effects of exercise timing on metabolic health,2,"In the last decades, the dramatic increase in ...",You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,A recent study found that the timing of exerci...,Exciting new research on exercise timing and m...,New research suggests that the timing of exerc...
2,2023-07-14 13:16:07.325388-07:00,34,Effects of exercise timing on metabolic health,1,"In the last decades, the dramatic increase in ...",You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,Hey there! I just read some interesting resear...,New Research on Exercise Timing and Metabolic ...,Hey there! I just read some fascinating resear...
3,2023-07-14 13:16:07.325388-07:00,34,Effects of exercise timing on metabolic health,2,"In the last decades, the dramatic increase in ...",You are someone who loves to read health resea...,gpt-3.5-turbo-16k-0613,1,"In the summary, cover the following informatio...",Write a casual text message to your friend abo...,Once you have written your text message: \...,"3. If needed, rewrite the text using terms app...",people without a science background,4. Return your final response in a JSON format...,Write a casual text message to your friend abo...,2023-07-14 full2/,Hey friend! I just read a really interesting r...,New research reveals the best time to exercise...,Hey friend! I just read a really interesting a...


In [14]:
references_df_dict[iteration]

Unnamed: 0,title,text,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section
0,A systematic review of patient barriers and fa...,Despite increasing awareness of the consequenc...,Numerous barriers are experienced by people wi...,Obesity reviews : an official journal of the I...,"Maxim de Jong, N&#xfa;ria Jansen, Marienke van...",2023,Aug,24.0,8.0,e13571,,10.1111/obr.13571,
1,Dietary pulses as a means to improve the gut m...,"According to the World Health Organization, th...",A dysbiotic intestinal microbiome has been lin...,Obesity reviews : an official journal of the I...,"Hannah St John, &#xc9;ric Doucet, Krista A Power",2023,,,,e13598,,10.1111/obr.13598,
2,Effects of exercise timing on metabolic health,"In the last decades, the dramatic increase in ...",The increasing prevalence of metabolic syndrom...,Obesity reviews : an official journal of the I...,"Jos&#xe9; Ignacio Mart&#xed;nez-Montoro, Javie...",2023,,,,e13599,,10.1111/obr.13599,
3,The burden of type 2 diabetes mellitus in stat...,Type 2 diabetes mellitus (T2D) is a complex me...,Type 2 diabetes mellitus (T2D) is a highly pre...,Obesity reviews : an official journal of the I...,"Carlos Alexandre Soares Andrade, Balqees Shahi...",2023,,,,e13593,,10.1111/obr.13593,


In [15]:
sources_df.loc[1:]

Unnamed: 0,id,title,text,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section
1,34,Effects of exercise timing on metabolic health,"In the last decades, the dramatic increase in ...",The increasing prevalence of metabolic syndrom...,Obesity reviews : an official journal of the I...,"Jos&#xe9; Ignacio Mart&#xed;nez-Montoro, Javie...",2023,,,,e13599,,10.1111/obr.13599,
2,33,Dietary pulses as a means to improve the gut m...,"According to the World Health Organization, th...",A dysbiotic intestinal microbiome has been lin...,Obesity reviews : an official journal of the I...,"Hannah St John, &#xc9;ric Doucet, Krista A Power",2023,,,,e13598,,10.1111/obr.13598,
3,32,A systematic review of patient barriers and fa...,Despite increasing awareness of the consequenc...,Numerous barriers are experienced by people wi...,Obesity reviews : an official journal of the I...,"Maxim de Jong, N&#xfa;ria Jansen, Marienke van...",2023,Aug,24.0,8.0,e13571,,10.1111/obr.13571,


# *End of Page*