# Title
[]()

In [7]:

import pandas as pd
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
from silvhua import *
# sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\portfolio-projects\online-PT-social-media-NLP\src")
# import json
# from pandas import json_normalize  
# from plotly.subplots import make_subplots
# import requests

In [12]:
# set the option to wrap text within cells
pd.set_option('display.max_colwidth', 500)
# pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# code from `2023-07-11 create references table` notebook

In [None]:
import requests
import json
import re
import os
import string
import pandas as pd
api_key = os.getenv('api_ncbi')

def parse_fulltext(series, title_pattern=r'^(.*)\n*.+'):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    # Iterate over each element in the series
    for text in series:
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)

        
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(f'[{string.punctuation}]', '', title).lower().strip()

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            for article_id in id_list:
                result = retrieve_citation(article_id, api_key).decode('utf-8')
                if cleaned_title in cleaned_result:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            print(f'\tInput title: {title.lower().strip()}')
            # print(f'Result title: {re.sub(r":", r"", result.lower())}')
            return id_list        
    except:
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

iteration = 11



text_df = parse_fulltext(fulltext)
text_df
# references_df_dict = {}
references_df_dict[iteration] = add_pubmed_details(dummy_df, api_key)
references_df_dict[iteration] 
compare_columns(references_df_dict[iteration])

# test_id = search_article(text_df.loc[4,'Title'], api_key)
# test_id

# Set up

In [9]:

references_df_dict = {}

# Create text dictionary
folder_path = '../text/2023-06-20 discussion' # ** UPDATE REQUIRED**

encoding='ISO-8859-1'
subset=None

# Update code

In [15]:
import requests
import json
import re
import os
import string
import pandas as pd
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.DataFrame.from_dict(text_dict, orient='index', columns=['text'])
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    return text_df
    # Iterate over each element in the series
    for text in text_df:
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(f'[{string.punctuation}]', '', title).lower().strip()

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            for article_id in id_list:
                result = retrieve_citation(article_id, api_key).decode('utf-8')
                if cleaned_title in cleaned_result:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            print(f'\tInput title: {title.lower().strip()}')
            # print(f'Result title: {re.sub(r":", r"", result.lower())}')
            return id_list        
    except:
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

iteration = 1



text_df = parse_fulltext(folder_path)
text_df
# references_df_dict[iteration] = add_pubmed_details(dummy_df, api_key)
# references_df_dict[iteration] 
# compare_columns(references_df_dict[iteration])

# test_id = search_article(text_df.loc[4,'Title'], api_key)
# test_id


Keys for text_dict: dict_keys([1, 2, 3, 4, 5])



Unnamed: 0,text
1,"Comparisons in the Recovery Response From Resistance Exercise Between Young and Middle-Aged Men\n\nDiscussion\nResults of this study indicated no differences in the recovery response between YA and MA for any of the performance measures, nor in subjective levels of muscle pain or soreness. Furthermore, no between-group differences were observed in the inflammatory or muscle damage response to the exercise protocol. To the best of our knowledge, this is the first study to examine differences ..."
2,"Effect of dietary sources of calcium and protein on hip fractures and falls in older adults in residential care cluster randomised controlled trial\nDiscussion\nThis nutritional approach using high calcium and high protein dairy foods to increase calcium and protein intakes in institutionalised older adults replete in vitamin D was associated with a 33% reduction in risk of fractures of any type, a 46% reduction in risk of hip fractures, and an 11% reduction in risk of falls relative to cont..."
3,"Food craving, cortisol and ghrelin responses in modeling highly palatable snack intake in the laboratory\n\nDISCUSSION\nThis preliminary study is the first to directly compare the effects of both food cue and stress exposure on HP food craving and HP food intake in a 3-day human laboratory experiment conducted within a controlled hospital-based setting with healthy community adults. As both the ubiquitous HP food environment and stressors are known to increase HP food intake and obesity risk..."
4,"Hypohydration but not Menstrual Phase Influences Pain Perception in Healthy Women\n\nDISCUSSION\nThis study examined the independent and combined effects of hypohydration and menstrual phase on experimental pain sensitivity in healthy eumenorrheic women, and the potential efficacy of acute water ingestion as a remedy to the deleterious impact of hypohydration. The main findings were that: 1) mild hypohydration increased pain sensitivity, 2) menstrual phase did not affect pain sensitivity, no..."
5,"Weight stigma and health behaviors: evidence from the Eating in America Study. International Journal of Obesity\n\nDiscussion\nThe present study employed a two-stage research investigation to examine the relationship between weight stigma and several health behaviors in a large sample of U.S. adults. As predicted, weight stigma was significantly associated with greater disordered eating, comfort eating, alcohol use, and sleep disturbance, after controlling for covariates. No such relationshi..."


# 1.1

In [21]:
text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
type(text_dict[1])


Keys for text_dict: dict_keys([1, 2, 3, 4, 5])



str

In [25]:
import requests
import json
import re
import os
import string
import pandas as pd
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df
    # return text_df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(f'[{string.punctuation}]', '', title).lower().strip()

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            for article_id in id_list:
                result = retrieve_citation(article_id, api_key).decode('utf-8')
                if cleaned_title in cleaned_result:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            print(f'\tInput title: {title.lower().strip()}')
            # print(f'Result title: {re.sub(r":", r"", result.lower())}')
            return id_list        
    except:
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

iteration = 1.1



text_df = parse_fulltext(folder_path)
text_df
references_df_dict[iteration] = add_pubmed_details(text_df, api_key)

references_df_dict[iteration] = compare_columns(references_df_dict[iteration])

# test_id = search_article(text_df.loc[4,'Title'], api_key)
# test_id


Keys for text_dict: dict_keys([1, 2, 3, 4, 5])



In [26]:
references_df_dict[iteration]

Unnamed: 0,title,text,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi
0,Comparisons in the Recovery Response From Resistance Exercise Between Young and Middle-Aged Men,"Results of this study indicated no differences in the recovery response between YA and MA for any of the performance measures, nor in subjective levels of muscle pain or soreness. Furthermore, no between-group differences were observed in the inflammatory or muscle damage response to the exercise protocol. To the best of our knowledge, this is the first study to examine differences in the recovery response from high-volume resistance exercise between recreationally trained young and middle-a...","Gordon, JA III, Hoffman, JR, Arroyo, E, Varanoske, AN, Coker, NA, Gepner, Y, Wells, AJ, Stout, JR, and Fukuda, DH. Comparisons in the recovery response from resistance exercise between young and middle-aged men. J Strength Cond Res 31(12): 3454-3462, 2017-The purpose of this study was to compare the effects of a bout of high-volume isokinetic resistance exercise protocol (HVP) on lower-body strength and markers of inflammation and muscle damage during recovery between young and middle-aged a...",Journal of strength and conditioning research,"Joseph A Gordon, Jay R Hoffman, Eliott Arroyo, Alyssa N Varanoske, Nicholas A Coker, Yftach Gepner, Adam J Wells, Jeffrey R Stout, David H Fukuda",2017.0,,31.0,12.0,3454,3462.0,10.1519/JSC.0000000000002219
1,Effect of dietary sources of calcium and protein on hip fractures and falls in older adults in residential care cluster randomised controlled trial,"This nutritional approach using high calcium and high protein dairy foods to increase calcium and protein intakes in institutionalised older adults replete in vitamin D was associated with a 33% reduction in risk of fractures of any type, a 46% reduction in risk of hip fractures, and an 11% reduction in risk of falls relative to controls. We found no group difference in all cause mortality.\nMost interventions aimed at reducing fracture risk target a drug therapy to people with osteoporosis ...",To assess the antifracture efficacy and safety of a nutritional intervention in institutionalised older adults replete in vitamin D but with mean intakes of 600 mg/day calcium and &lt;1 g/kg body weight protein/day.,BMJ (Clinical research ed.),"S Iuliano, S Poon, J Robbins, M Bui, X Wang, L De Groot, M Van Loan, A Ghasem Zadeh, T Nguyen, E Seeman",2021.0,,375.0,,n2364,,10.1136/bmj.n2364
2,"Food craving, cortisol and ghrelin responses in modeling highly palatable snack intake in the laboratory","This preliminary study is the first to directly compare the effects of both food cue and stress exposure on HP food craving and HP food intake in a 3-day human laboratory experiment conducted within a controlled hospital-based setting with healthy community adults. As both the ubiquitous HP food environment and stressors are known to increase HP food intake and obesity risk, a direct comparison of these contexts could identify similar and differential processes that may underlie food motivat...","Overeating of highly palatable (HP) foods in the ubiquitous HP food cue environment and under stress is associated with weight gain and contributes to the global obesity epidemic. However, subjective and biobehavioral processes that may increase HP overeating are not clear. Using a novel experimental approach, we examined HP food motivation and intake and neuroendocrine responses in the context of food cues, stress and a control neutral relaxing cue exposure in healthy individuals.",Physiology &amp; behavior,"Rajita Sinha, Peihua Gu, Rachel Hart, J B Guarnaccia",2019.0,,208.0,,112563,,10.1016/j.physbeh.2019.112563
3,Hypohydration but not Menstrual Phase Influences Pain Perception in Healthy Women,"This study examined the independent and combined effects of hypohydration and menstrual phase on experimental pain sensitivity in healthy eumenorrheic women, and the potential efficacy of acute water ingestion as a remedy to the deleterious impact of hypohydration. The main findings were that: 1) mild hypohydration increased pain sensitivity, 2) menstrual phase did not affect pain sensitivity, nor did it influence the effect of hypohydration on pain, and 3) acute water ingestion did not redu...","Chronic pain is a pervasive health problem and is associated with tremendous socioeconomic costs. However, current pain treatments are often ineffective due, in part, to the multifactorial nature of pain. Mild hypohydration was shown to increase experimental pain sensitivity in men, but whether this also occurs in women has not been examined. Fluctuations in ovarian hormones (i.e., 17&#x3b2;-estradiol and progesterone) throughout the menstrual cycle may influence a woman's pain sensitivity, ...","Journal of applied physiology (Bethesda, Md. : 1985)","Beverly Tan, Michael C Philipp, Ahmad Munir Che Muhamed, Toby M&#xfc;ndel",2022.0,,132.0,3.0,611,621.0,10.1152/japplphysiol.00402.2021
4,Weight stigma and health behaviors: evidence from the Eating in America Study. International Journal of Obesity,"The present study employed a two-stage research investigation to examine the relationship between weight stigma and several health behaviors in a large sample of U.S. adults. As predicted, weight stigma was significantly associated with greater disordered eating, comfort eating, alcohol use, and sleep disturbance, after controlling for covariates. No such relationship was observed for physical activity.\nTargeting health behaviors (e.g., eating) to achieve weight loss is common in weight-foc...",,,,,,,,,,


## 1.11 Add the section 

In [41]:
import requests
import json
import re
import os
import string
import pandas as pd
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df
    # return text_df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(f'[{string.punctuation}]', '', title).lower().strip()

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            for article_id in id_list:
                result = retrieve_citation(article_id, api_key).decode('utf-8')
                if cleaned_title in cleaned_result:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            print(f'\tInput title: {title.lower().strip()}')
            # print(f'Result title: {re.sub(r":", r"", result.lower())}')
            return id_list        
    except:
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

iteration = 1.11



text_df = parse_fulltext(folder_path)
text_df
# references_df_dict[iteration] = add_pubmed_details(text_df, api_key)

references_df_dict[iteration] = create_sources_table(text_df, section='discussion')
references_df_dict[iteration]



Keys for text_dict: dict_keys([1, 2, 3, 4, 5])



Unnamed: 0,title,text,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section
0,Comparisons in the Recovery Response From Resistance Exercise Between Young and Middle-Aged Men,"Results of this study indicated no differences in the recovery response between YA and MA for any of the performance measures, nor in subjective levels of muscle pain or soreness. Furthermore, no between-group differences were observed in the inflammatory or muscle damage response to the exercise protocol. To the best of our knowledge, this is the first study to examine differences in the recovery response from high-volume resistance exercise between recreationally trained young and middle-a...","Gordon, JA III, Hoffman, JR, Arroyo, E, Varanoske, AN, Coker, NA, Gepner, Y, Wells, AJ, Stout, JR, and Fukuda, DH. Comparisons in the recovery response from resistance exercise between young and middle-aged men. J Strength Cond Res 31(12): 3454-3462, 2017-The purpose of this study was to compare the effects of a bout of high-volume isokinetic resistance exercise protocol (HVP) on lower-body strength and markers of inflammation and muscle damage during recovery between young and middle-aged a...",Journal of strength and conditioning research,"Joseph A Gordon, Jay R Hoffman, Eliott Arroyo, Alyssa N Varanoske, Nicholas A Coker, Yftach Gepner, Adam J Wells, Jeffrey R Stout, David H Fukuda",2017.0,,31.0,12.0,3454,3462.0,10.1519/JSC.0000000000002219,discussion
1,Effect of dietary sources of calcium and protein on hip fractures and falls in older adults in residential care cluster randomised controlled trial,"This nutritional approach using high calcium and high protein dairy foods to increase calcium and protein intakes in institutionalised older adults replete in vitamin D was associated with a 33% reduction in risk of fractures of any type, a 46% reduction in risk of hip fractures, and an 11% reduction in risk of falls relative to controls. We found no group difference in all cause mortality.\nMost interventions aimed at reducing fracture risk target a drug therapy to people with osteoporosis ...",To assess the antifracture efficacy and safety of a nutritional intervention in institutionalised older adults replete in vitamin D but with mean intakes of 600 mg/day calcium and &lt;1 g/kg body weight protein/day.,BMJ (Clinical research ed.),"S Iuliano, S Poon, J Robbins, M Bui, X Wang, L De Groot, M Van Loan, A Ghasem Zadeh, T Nguyen, E Seeman",2021.0,,375.0,,n2364,,10.1136/bmj.n2364,discussion
2,"Food craving, cortisol and ghrelin responses in modeling highly palatable snack intake in the laboratory","This preliminary study is the first to directly compare the effects of both food cue and stress exposure on HP food craving and HP food intake in a 3-day human laboratory experiment conducted within a controlled hospital-based setting with healthy community adults. As both the ubiquitous HP food environment and stressors are known to increase HP food intake and obesity risk, a direct comparison of these contexts could identify similar and differential processes that may underlie food motivat...","Overeating of highly palatable (HP) foods in the ubiquitous HP food cue environment and under stress is associated with weight gain and contributes to the global obesity epidemic. However, subjective and biobehavioral processes that may increase HP overeating are not clear. Using a novel experimental approach, we examined HP food motivation and intake and neuroendocrine responses in the context of food cues, stress and a control neutral relaxing cue exposure in healthy individuals.",Physiology &amp; behavior,"Rajita Sinha, Peihua Gu, Rachel Hart, J B Guarnaccia",2019.0,,208.0,,112563,,10.1016/j.physbeh.2019.112563,discussion
3,Hypohydration but not Menstrual Phase Influences Pain Perception in Healthy Women,"This study examined the independent and combined effects of hypohydration and menstrual phase on experimental pain sensitivity in healthy eumenorrheic women, and the potential efficacy of acute water ingestion as a remedy to the deleterious impact of hypohydration. The main findings were that: 1) mild hypohydration increased pain sensitivity, 2) menstrual phase did not affect pain sensitivity, nor did it influence the effect of hypohydration on pain, and 3) acute water ingestion did not redu...","Chronic pain is a pervasive health problem and is associated with tremendous socioeconomic costs. However, current pain treatments are often ineffective due, in part, to the multifactorial nature of pain. Mild hypohydration was shown to increase experimental pain sensitivity in men, but whether this also occurs in women has not been examined. Fluctuations in ovarian hormones (i.e., 17&#x3b2;-estradiol and progesterone) throughout the menstrual cycle may influence a woman's pain sensitivity, ...","Journal of applied physiology (Bethesda, Md. : 1985)","Beverly Tan, Michael C Philipp, Ahmad Munir Che Muhamed, Toby M&#xfc;ndel",2022.0,,132.0,3.0,611,621.0,10.1152/japplphysiol.00402.2021,discussion
4,Weight stigma and health behaviors: evidence from the Eating in America Study. International Journal of Obesity,"The present study employed a two-stage research investigation to examine the relationship between weight stigma and several health behaviors in a large sample of U.S. adults. As predicted, weight stigma was significantly associated with greater disordered eating, comfort eating, alcohol use, and sleep disturbance, after controlling for covariates. No such relationship was observed for physical activity.\nTargeting health behaviors (e.g., eating) to achieve weight loss is common in weight-foc...",,,,,,,,,,,discussion


In [39]:
pd.Series(None, index=[1,2,3], dtype="O")

1    NaN
2    NaN
3    NaN
dtype: object

## Add to the database

In [42]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries'):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] if 'section' in row.index else None
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

import pandas as pd
import sys
import os
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
from file_functions import *
from response_processing import *
import time
import pytz
import re
from itertools import product
import openai
from prompts import *

class Chaining:
    def __init__(self, text_id, title, text, folder_path, system_role="You are a helpful assistant.", 
            model="gpt-3.5-turbo", temperature=0.7, max_tokens=9000, 
        ):
        self.reference_id = text_id
        self.title = title
        self.text = text
        self.folder = re.sub(r'(?:.*\/)?(.*\/.*)\/?$', r'\1', folder_path)
        self.system_role = system_role
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.model = model
        print(f'***OpenAI model: {self.model}')

    def create_prompt(self, task, text):
        system_role = f'{self.system_role}'
        user_input = f"""Given the following text delimited by triple backticks: ```{text}``` \n {task}"""
        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_input},]

        print('\tDone creating prompt')
        return messages

    def gpt(self, messages, n_choices, temperature, model=None):
        model = self.model if model == None else model
        print(f'\tSending request to {model}')
        print(f'\t\tRequesting {n_choices} choices using {model}')
        openai.api_key = os.getenv('api_openai')
        response = openai.ChatCompletion.create(
            model=model, messages=messages, 
            temperature=temperature, 
            max_tokens=self.max_tokens,
            n=n_choices
            )
        print('\tDone sending request to GPT-3')
        return response

    def summarize(
            self, task, prep_step, edit_task, simplify_task, simplify_audience,
            format_task,
            n_choices=5, task_first=True):
        if task_first == True:
            full_task = f'{task}\n\n{prep_step}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        else:
            full_task = f'{prep_step}\n\n{task}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        prompt = self.create_prompt(full_task, self.text)
        firstline_pattern = r'\s?(\S*)(\n*)(.+)'
        title = re.match(firstline_pattern, self.text)[0]
        self.qna = dict() 
        self.qna['timestamp'] = str(datetime.now(pytz.timezone('Canada/Pacific')))
        self.qna['reference_id'] = self.reference_id
        self.qna['article_title'] = self.title
        self.qna['text'] = self.text
        self.qna['system_role'] = self.system_role
        self.qna['model'] = self.model        
        self.qna['temperature'] = self.temperature
        self.qna['prep_step'] = prep_step.strip()
        self.qna['summarize_task'] = task.strip()
        self.qna['edit_task'] = edit_task.strip()
        self.qna['simplify_task'] = simplify_task.strip()
        self.qna['simplify_audience'] = simplify_audience.strip()
        self.qna['format_task'] = format_task.strip()
        self.qna['full_summarize_task'] = full_task.strip()
        self.qna['folder'] = self.folder
        self.summaries_dict = dict()
        self.article_title = title
        self.response_regex = r'response_(.*)'
        self.simple_summary_dict = dict()
        self.relevance_dict = dict()
        self.n_previous_prompts = dict()

        try:
            response = self.gpt(prompt, n_choices=n_choices, temperature=self.temperature)
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**API request failed for `.summarize()`**')
            return self.qna
        try:
            for index, choice in enumerate(response.choices):
                self.summaries_dict[f'response_{"{:02d}".format(index+1)}'] = choice["message"]["content"]
            self.qna.setdefault('summary', [])
            self.qna['summary'].extend([value for value in self.summaries_dict.values()])
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**Error with response parsing**')
    
def batch_summarize(sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chaining_bot_dict, iteration_id, task_first=True,
    system_role=None, model='gpt-3.5-turbo', max_tokens=1000, temperature=0.7, pause_per_request=0, n_choices=5,
    save_outputs=False
    ):
    prompts_df = pd.DataFrame(product(prep_step, summarize_task, edit_task, simplify_task, simplify_audience, format_task), 
        columns=['prep_step', 'summarize_task', 'edit_task', 'simplify_task', 'simplify_audience', 'format_task'])

    chaining_bot_dict[iteration_id] = dict()
    def summarize_from_df_row(text_id, title, text, chaining_bot_dict):
        for index in prompts_df.index:
            print(f'**Text #{text_id} prompt #{index+1} of {prompts_df.index.max()+1}**')
            task = prompts_df.loc[index, 'summarize_task']
            prep_step = prompts_df.loc[index, 'prep_step']
            edit_task = prompts_df.loc[index, 'edit_task']
            simplify_task = prompts_df.loc[index, 'simplify_task']
            simplify_audience = prompts_df.loc[index, 'simplify_audience']
            format_task = prompts_df.loc[index, 'format_task']
            try:
                print('Creating Chaining class instance')
                chatbot = Chaining(
                    text_id, title, text, folder_path=folder_path, system_role=system_role, 
                    model=model, max_tokens=max_tokens, temperature=temperature)
                print('Chaining class instance created')
                chatbot.summarize(
                    task=task, prep_step=prep_step, edit_task=edit_task, 
                    simplify_task=simplify_task, simplify_audience=simplify_audience,
                    format_task=format_task, n_choices=n_choices, task_first=task_first
                    )
                chaining_bot_dict[iteration_id][f'{text_id}_prompt{"{:02d}".format(index)}'] = chatbot
                print('\t...Completed')
                if pause_per_request > 0:
                    print(f'[batch_summarize()] Sleeping {pause_per_request} sec to avoid exceeding API rate limit')
                    time.sleep(pause_per_request) # Account for API rate limit of 3 API requests/limit 
            except Exception as error:
                exc_type, exc_obj, tb = sys.exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                file = f.f_code.co_filename
                print("An error occurred on line", lineno, "in", file, ":", error)
                print('\t...Error making chatbot request')
                break
    sources_df.apply(lambda row: summarize_from_df_row(row['id'], row['title'], row['text'], chaining_bot_dict), axis=1)
    
    if save_outputs:
        try:
            save_instance_to_dict(
                chaining_bot_dict[iteration_id], 
                description=f'batch_Chaining_attributes_initial',
                ext=None, json_path=folder_path
                )
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            file = f.f_code.co_filename
            print(f'An error occurred on line {lineno} in {file}: {error}')
            print('[batch_summarize_chain()] Unable to save API response')

    return chaining_bot_dict

def create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=None, 
    ):
    """
    Create DataFrame from initial ChatGPT summaries.
    """
    dfs_list = []
    chatbot_id = iteration_id if chatbot_id == None else chatbot_id
    for chatbot_key in chatbot_dict[chatbot_id].keys():
        print(f'Processing {chatbot_key}...')
        dfs_list.append(pd.DataFrame(
            chatbot_dict[chatbot_id][chatbot_key].qna, 
            index=[choice for choice in range(1, len(chatbot_dict[chatbot_id][chatbot_key].qna['summary'])+1)])
            )
    
    qna_df = pd.concat(dfs_list).reset_index(names=['choice'])
    qna_df = extract_summary(qna_df, 'summary')
    columns = qna_df.columns.tolist()
    columns.remove('choice')
    columns.insert(3, 'choice') # Move 'choice' column

    # qna_df['date'] = pd.Series('2023-06-12', index=qna_df.index)
    # columns.insert(0, 'date')


    qna_dict[iteration_id] = qna_df[columns]
    print(f'Original summaries DataFrame shape: {qna_df.shape}')
    print(f'\tOriginal summaries Dataframe columns: {qna_df.columns}')
    return qna_dict


import json
def extract_summary(df, summary_column='summary'):
    # Convert the string to JSON
    try:
        df[summary_column] = df[summary_column].apply(json.loads)
    except Exception as error:
        print(f'Error converting {summary_column} column to JSON: {error}; will do row by row')
        summary_list = []
        for index, summary in df[summary_column].items():
            try:
                summary_list.append(json.loads(summary))
            except Exception as error:
                print(f'Error converting summary {index} to JSON: {error}')
                summary_list.append(summary)
    def extract_value_from_key(summary, key):
        try:
            return summary[key]
        except Exception as error:
            match = re.search(rf'"{key}":\s*"([^"]+)"', summary)
            value = match.group(1) if match else None
            return value

    # Extract 'headline' and 'body' values
    df['headline'] = df[summary_column].apply(lambda x: extract_value_from_key(x, 'headline'))
    df['simple_summary'] = df[summary_column].apply(lambda x: extract_value_from_key(x, 'audience'))
    df[summary_column] = df[summary_column].apply(lambda x: extract_value_from_key(x, 'body'))
    df['simple_summary'] = df['simple_summary'].fillna(df[summary_column])

    return df

# # Set parameters
# iteration_id = 1.6
# article_limit = None
# temperature = 1.5
# n_choices = 2
# pause_per_request=0
# # summary_iteration_id = iteration_id
# chatbot_id = iteration_id
# model = 'gpt-3.5-turbo-16k-0613'
# # model = 'gpt-4'
# save_outputs=True
# summaries = get_table(table='summaries')


# sources_df = get_table(table='sources', limit=article_limit)
# # sources_df

# chaining_dict = batch_summarize(
#     sources_df, folder_path, prep_step, summarize_task, edit_task, 
#     simplify_task, simplify_audience, format_task,
#     chatbot_dict, temperature=temperature,
#     system_role=system_role, model=model, max_tokens=1000,
#     n_choices=n_choices, pause_per_request=pause_per_request,
#     iteration_id=iteration_id, save_outputs=save_outputs
#     )
# # # chaining_dict[iteration_id]
# qna_dict = create_summaries_df(
#     qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
#     )

# qna_dict[iteration_id]

# # Add rows from results to summaries and prompts table
bulk_append(table='sources', input_df=references_df_dict[iteration])

Adding 5 rows to the database...
	Comparisons in the Recovery Response From Resistance Exercise Between Young and Middle-Aged Men
	Effect of dietary sources of calcium and protein on hip fractures and falls in older adults in residential care cluster randomised controlled trial
	Food craving, cortisol and ghrelin responses in modeling highly palatable snack intake in the laboratory
	Hypohydration but not Menstrual Phase Influences Pain Perception in Healthy Women
	Weight stigma and health behaviors: evidence from the Eating in America Study. International Journal of Obesity
Error adding data to the database: (psycopg2.errors.InvalidTextRepresentation) invalid input syntax for type integer: ""

[SQL: INSERT INTO sources (title, text, abstract, publication, authors, year, month, pub_volume, pub_issue, start_page, end_page, doi, section) SELECT p0::VARCHAR, p1::VARCHAR, p2::VARCHAR, p3::VARCHAR, p4::VARCHAR, p5::INTEGER, p6::VARCHAR, p7::VARCHAR, p ... 1185 characters truncated ... , p8, 

# iteration 2: fix issues if null values from pubmed

In [43]:
import requests
import json
import re
import os
import string
import pandas as pd
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df
    # return text_df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(f'[{string.punctuation}]', '', title).lower().strip()

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            for article_id in id_list:
                result = retrieve_citation(article_id, api_key).decode('utf-8')
                if cleaned_title in cleaned_result:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            print(f'\tInput title: {title.lower().strip()}')
            # print(f'Result title: {re.sub(r":", r"", result.lower())}')
            return id_list        
    except:
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries'):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] if 'section' in row.index else None
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

import pandas as pd
import sys
import os
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
from file_functions import *
from response_processing import *
import time
import pytz
import re
from itertools import product
import openai
from prompts import *

class Chaining:
    def __init__(self, text_id, title, text, folder_path, system_role="You are a helpful assistant.", 
            model="gpt-3.5-turbo", temperature=0.7, max_tokens=9000, 
        ):
        self.reference_id = text_id
        self.title = title
        self.text = text
        self.folder = re.sub(r'(?:.*\/)?(.*\/.*)\/?$', r'\1', folder_path)
        self.system_role = system_role
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.model = model
        print(f'***OpenAI model: {self.model}')

    def create_prompt(self, task, text):
        system_role = f'{self.system_role}'
        user_input = f"""Given the following text delimited by triple backticks: ```{text}``` \n {task}"""
        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_input},]

        print('\tDone creating prompt')
        return messages

    def gpt(self, messages, n_choices, temperature, model=None):
        model = self.model if model == None else model
        print(f'\tSending request to {model}')
        print(f'\t\tRequesting {n_choices} choices using {model}')
        openai.api_key = os.getenv('api_openai')
        response = openai.ChatCompletion.create(
            model=model, messages=messages, 
            temperature=temperature, 
            max_tokens=self.max_tokens,
            n=n_choices
            )
        print('\tDone sending request to GPT-3')
        return response

    def summarize(
            self, task, prep_step, edit_task, simplify_task, simplify_audience,
            format_task,
            n_choices=5, task_first=True):
        if task_first == True:
            full_task = f'{task}\n\n{prep_step}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        else:
            full_task = f'{prep_step}\n\n{task}\n\n{edit_task}\n\n{simplify_task} {simplify_audience}\n\n{format_task}'
        prompt = self.create_prompt(full_task, self.text)
        firstline_pattern = r'\s?(\S*)(\n*)(.+)'
        title = re.match(firstline_pattern, self.text)[0]
        self.qna = dict() 
        self.qna['timestamp'] = str(datetime.now(pytz.timezone('Canada/Pacific')))
        self.qna['reference_id'] = self.reference_id
        self.qna['article_title'] = self.title
        self.qna['text'] = self.text
        self.qna['system_role'] = self.system_role
        self.qna['model'] = self.model        
        self.qna['temperature'] = self.temperature
        self.qna['prep_step'] = prep_step.strip()
        self.qna['summarize_task'] = task.strip()
        self.qna['edit_task'] = edit_task.strip()
        self.qna['simplify_task'] = simplify_task.strip()
        self.qna['simplify_audience'] = simplify_audience.strip()
        self.qna['format_task'] = format_task.strip()
        self.qna['full_summarize_task'] = full_task.strip()
        self.qna['folder'] = self.folder
        self.summaries_dict = dict()
        self.article_title = title
        self.response_regex = r'response_(.*)'
        self.simple_summary_dict = dict()
        self.relevance_dict = dict()
        self.n_previous_prompts = dict()

        try:
            response = self.gpt(prompt, n_choices=n_choices, temperature=self.temperature)
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**API request failed for `.summarize()`**')
            return self.qna
        try:
            for index, choice in enumerate(response.choices):
                self.summaries_dict[f'response_{"{:02d}".format(index+1)}'] = choice["message"]["content"]
            self.qna.setdefault('summary', [])
            self.qna['summary'].extend([value for value in self.summaries_dict.values()])
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            print("An error occurred on line", lineno, "in", filename, ":", error)
            print('\t**Error with response parsing**')
    
def batch_summarize(sources_df, folder_path, prep_step, summarize_task, edit_task, 
    simplify_task, simplify_audience, format_task,
    chaining_bot_dict, iteration_id, task_first=True,
    system_role=None, model='gpt-3.5-turbo', max_tokens=1000, temperature=0.7, pause_per_request=0, n_choices=5,
    save_outputs=False
    ):
    prompts_df = pd.DataFrame(product(prep_step, summarize_task, edit_task, simplify_task, simplify_audience, format_task), 
        columns=['prep_step', 'summarize_task', 'edit_task', 'simplify_task', 'simplify_audience', 'format_task'])

    chaining_bot_dict[iteration_id] = dict()
    def summarize_from_df_row(text_id, title, text, chaining_bot_dict):
        for index in prompts_df.index:
            print(f'**Text #{text_id} prompt #{index+1} of {prompts_df.index.max()+1}**')
            task = prompts_df.loc[index, 'summarize_task']
            prep_step = prompts_df.loc[index, 'prep_step']
            edit_task = prompts_df.loc[index, 'edit_task']
            simplify_task = prompts_df.loc[index, 'simplify_task']
            simplify_audience = prompts_df.loc[index, 'simplify_audience']
            format_task = prompts_df.loc[index, 'format_task']
            try:
                print('Creating Chaining class instance')
                chatbot = Chaining(
                    text_id, title, text, folder_path=folder_path, system_role=system_role, 
                    model=model, max_tokens=max_tokens, temperature=temperature)
                print('Chaining class instance created')
                chatbot.summarize(
                    task=task, prep_step=prep_step, edit_task=edit_task, 
                    simplify_task=simplify_task, simplify_audience=simplify_audience,
                    format_task=format_task, n_choices=n_choices, task_first=task_first
                    )
                chaining_bot_dict[iteration_id][f'{text_id}_prompt{"{:02d}".format(index)}'] = chatbot
                print('\t...Completed')
                if pause_per_request > 0:
                    print(f'[batch_summarize()] Sleeping {pause_per_request} sec to avoid exceeding API rate limit')
                    time.sleep(pause_per_request) # Account for API rate limit of 3 API requests/limit 
            except Exception as error:
                exc_type, exc_obj, tb = sys.exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                file = f.f_code.co_filename
                print("An error occurred on line", lineno, "in", file, ":", error)
                print('\t...Error making chatbot request')
                break
    sources_df.apply(lambda row: summarize_from_df_row(row['id'], row['title'], row['text'], chaining_bot_dict), axis=1)
    
    if save_outputs:
        try:
            save_instance_to_dict(
                chaining_bot_dict[iteration_id], 
                description=f'batch_Chaining_attributes_initial',
                ext=None, json_path=folder_path
                )
        except Exception as error:
            exc_type, exc_obj, tb = sys.exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            file = f.f_code.co_filename
            print(f'An error occurred on line {lineno} in {file}: {error}')
            print('[batch_summarize_chain()] Unable to save API response')

    return chaining_bot_dict

def create_summaries_df(
    qna_dict, chatbot_dict, iteration_id, chatbot_id=None, 
    ):
    """
    Create DataFrame from initial ChatGPT summaries.
    """
    dfs_list = []
    chatbot_id = iteration_id if chatbot_id == None else chatbot_id
    for chatbot_key in chatbot_dict[chatbot_id].keys():
        print(f'Processing {chatbot_key}...')
        dfs_list.append(pd.DataFrame(
            chatbot_dict[chatbot_id][chatbot_key].qna, 
            index=[choice for choice in range(1, len(chatbot_dict[chatbot_id][chatbot_key].qna['summary'])+1)])
            )
    
    qna_df = pd.concat(dfs_list).reset_index(names=['choice'])
    qna_df = extract_summary(qna_df, 'summary')
    columns = qna_df.columns.tolist()
    columns.remove('choice')
    columns.insert(3, 'choice') # Move 'choice' column

    # qna_df['date'] = pd.Series('2023-06-12', index=qna_df.index)
    # columns.insert(0, 'date')


    qna_dict[iteration_id] = qna_df[columns]
    print(f'Original summaries DataFrame shape: {qna_df.shape}')
    print(f'\tOriginal summaries Dataframe columns: {qna_df.columns}')
    return qna_dict


import json
def extract_summary(df, summary_column='summary'):
    # Convert the string to JSON
    try:
        df[summary_column] = df[summary_column].apply(json.loads)
    except Exception as error:
        print(f'Error converting {summary_column} column to JSON: {error}; will do row by row')
        summary_list = []
        for index, summary in df[summary_column].items():
            try:
                summary_list.append(json.loads(summary))
            except Exception as error:
                print(f'Error converting summary {index} to JSON: {error}')
                summary_list.append(summary)
    def extract_value_from_key(summary, key):
        try:
            return summary[key]
        except Exception as error:
            match = re.search(rf'"{key}":\s*"([^"]+)"', summary)
            value = match.group(1) if match else None
            return value

    # Extract 'headline' and 'body' values
    df['headline'] = df[summary_column].apply(lambda x: extract_value_from_key(x, 'headline'))
    df['simple_summary'] = df[summary_column].apply(lambda x: extract_value_from_key(x, 'audience'))
    df[summary_column] = df[summary_column].apply(lambda x: extract_value_from_key(x, 'body'))
    df['simple_summary'] = df['simple_summary'].fillna(df[summary_column])

    return df

# # Set parameters
# iteration_id = 1.6
# article_limit = None
# temperature = 1.5
# n_choices = 2
# pause_per_request=0
# # summary_iteration_id = iteration_id
# chatbot_id = iteration_id
# model = 'gpt-3.5-turbo-16k-0613'
# # model = 'gpt-4'
# save_outputs=True
# summaries = get_table(table='summaries')


# sources_df = get_table(table='sources', limit=article_limit)
# # sources_df

# chaining_dict = batch_summarize(
#     sources_df, folder_path, prep_step, summarize_task, edit_task, 
#     simplify_task, simplify_audience, format_task,
#     chatbot_dict, temperature=temperature,
#     system_role=system_role, model=model, max_tokens=1000,
#     n_choices=n_choices, pause_per_request=pause_per_request,
#     iteration_id=iteration_id, save_outputs=save_outputs
#     )
# # # chaining_dict[iteration_id]
# qna_dict = create_summaries_df(
#     qna_dict, chatbot_dict, iteration_id, chatbot_id=chatbot_id
#     )

# qna_dict[iteration_id]


iteration = 2



text_df = parse_fulltext(folder_path)
text_df
# references_df_dict[iteration] = add_pubmed_details(text_df, api_key)

references_df_dict[iteration] = create_sources_table(text_df, section='discussion')
references_df_dict[iteration]


# # # Add rows from results to summaries and prompts table
# bulk_append(table='sources', input_df=references_df_dict[iteration])


Keys for text_dict: dict_keys([1, 2, 3, 4, 5])



Unnamed: 0,title,text,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section
0,Comparisons in the Recovery Response From Resistance Exercise Between Young and Middle-Aged Men,"Results of this study indicated no differences in the recovery response between YA and MA for any of the performance measures, nor in subjective levels of muscle pain or soreness. Furthermore, no between-group differences were observed in the inflammatory or muscle damage response to the exercise protocol. To the best of our knowledge, this is the first study to examine differences in the recovery response from high-volume resistance exercise between recreationally trained young and middle-a...","Gordon, JA III, Hoffman, JR, Arroyo, E, Varanoske, AN, Coker, NA, Gepner, Y, Wells, AJ, Stout, JR, and Fukuda, DH. Comparisons in the recovery response from resistance exercise between young and middle-aged men. J Strength Cond Res 31(12): 3454-3462, 2017-The purpose of this study was to compare the effects of a bout of high-volume isokinetic resistance exercise protocol (HVP) on lower-body strength and markers of inflammation and muscle damage during recovery between young and middle-aged a...",Journal of strength and conditioning research,"Joseph A Gordon, Jay R Hoffman, Eliott Arroyo, Alyssa N Varanoske, Nicholas A Coker, Yftach Gepner, Adam J Wells, Jeffrey R Stout, David H Fukuda",2017,,31,12.0,3454,3462.0,10.1519/JSC.0000000000002219,discussion
1,Effect of dietary sources of calcium and protein on hip fractures and falls in older adults in residential care cluster randomised controlled trial,"This nutritional approach using high calcium and high protein dairy foods to increase calcium and protein intakes in institutionalised older adults replete in vitamin D was associated with a 33% reduction in risk of fractures of any type, a 46% reduction in risk of hip fractures, and an 11% reduction in risk of falls relative to controls. We found no group difference in all cause mortality.\nMost interventions aimed at reducing fracture risk target a drug therapy to people with osteoporosis ...",To assess the antifracture efficacy and safety of a nutritional intervention in institutionalised older adults replete in vitamin D but with mean intakes of 600 mg/day calcium and &lt;1 g/kg body weight protein/day.,BMJ (Clinical research ed.),"S Iuliano, S Poon, J Robbins, M Bui, X Wang, L De Groot, M Van Loan, A Ghasem Zadeh, T Nguyen, E Seeman",2021,,375,,n2364,,10.1136/bmj.n2364,discussion
2,"Food craving, cortisol and ghrelin responses in modeling highly palatable snack intake in the laboratory","This preliminary study is the first to directly compare the effects of both food cue and stress exposure on HP food craving and HP food intake in a 3-day human laboratory experiment conducted within a controlled hospital-based setting with healthy community adults. As both the ubiquitous HP food environment and stressors are known to increase HP food intake and obesity risk, a direct comparison of these contexts could identify similar and differential processes that may underlie food motivat...","Overeating of highly palatable (HP) foods in the ubiquitous HP food cue environment and under stress is associated with weight gain and contributes to the global obesity epidemic. However, subjective and biobehavioral processes that may increase HP overeating are not clear. Using a novel experimental approach, we examined HP food motivation and intake and neuroendocrine responses in the context of food cues, stress and a control neutral relaxing cue exposure in healthy individuals.",Physiology &amp; behavior,"Rajita Sinha, Peihua Gu, Rachel Hart, J B Guarnaccia",2019,,208,,112563,,10.1016/j.physbeh.2019.112563,discussion
3,Hypohydration but not Menstrual Phase Influences Pain Perception in Healthy Women,"This study examined the independent and combined effects of hypohydration and menstrual phase on experimental pain sensitivity in healthy eumenorrheic women, and the potential efficacy of acute water ingestion as a remedy to the deleterious impact of hypohydration. The main findings were that: 1) mild hypohydration increased pain sensitivity, 2) menstrual phase did not affect pain sensitivity, nor did it influence the effect of hypohydration on pain, and 3) acute water ingestion did not redu...","Chronic pain is a pervasive health problem and is associated with tremendous socioeconomic costs. However, current pain treatments are often ineffective due, in part, to the multifactorial nature of pain. Mild hypohydration was shown to increase experimental pain sensitivity in men, but whether this also occurs in women has not been examined. Fluctuations in ovarian hormones (i.e., 17&#x3b2;-estradiol and progesterone) throughout the menstrual cycle may influence a woman's pain sensitivity, ...","Journal of applied physiology (Bethesda, Md. : 1985)","Beverly Tan, Michael C Philipp, Ahmad Munir Che Muhamed, Toby M&#xfc;ndel",2022,,132,3.0,611,621.0,10.1152/japplphysiol.00402.2021,discussion
4,Weight stigma and health behaviors: evidence from the Eating in America Study,"The present study employed a two-stage research investigation to examine the relationship between weight stigma and several health behaviors in a large sample of U.S. adults. As predicted, weight stigma was significantly associated with greater disordered eating, comfort eating, alcohol use, and sleep disturbance, after controlling for covariates. No such relationship was observed for physical activity.\nTargeting health behaviors (e.g., eating) to achieve weight loss is common in weight-foc...","Weight stigma is pervasive across the U.S. and is associated with poor health outcomes including all-cause mortality. One potential reason that weight stigma may be detrimental to health is that it begets poorer health behaviors. Therefore, the present study tested for associations between weight stigma and four health behaviors (i.e., eating behavior, alcohol use, sleep disturbance, and physical activity), while controlling for BMI and other potential confounds.",International journal of obesity (2005),"Kristen M Lee, Jeffrey M Hunger, A Janet Tomiyama",2021,,45,7.0,1499,1509.0,10.1038/s41366-021-00814-5,discussion


## Add to database

In [44]:
bulk_append(table='sources', input_df=references_df_dict[iteration])

Adding 5 rows to the database...
	Comparisons in the Recovery Response From Resistance Exercise Between Young and Middle-Aged Men
	Effect of dietary sources of calcium and protein on hip fractures and falls in older adults in residential care cluster randomised controlled trial
	Food craving, cortisol and ghrelin responses in modeling highly palatable snack intake in the laboratory
	Hypohydration but not Menstrual Phase Influences Pain Perception in Healthy Women
	Weight stigma and health behaviors: evidence from the Eating in America Study
Data added successfully!


# *End of Page*