# Title
[]()

In [64]:
import pandas as pd
# set the option to wrap text within cells
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Code from `2023-07-12 summaries from journal` notebook

In [None]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        # text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        iframe = response.xpath('//iframe/@src').extract()
        article_dict[key]['iframe'] = iframe
        text = response.xpath('//div[@class="core-container"]').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration_id = 2.5
text_id = iteration_id
n_articles = 1
####
article_dict = dict()
run_RSS_spider(n_articles)
root_article_dict[iteration_id] = article_dict
article_dict
# article_titles(article_dict)


# partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals='all')

# text_dict[iteration_id], display_dict = text_dict_from_web(
#     partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

# root_display_dict[iteration_id] = display_dict
# display_html(display_dict, type='body')

# set up

In [1]:
root_article_dict = dict()

In [13]:
partial_article_dict = dict()
feed_df_dict = dict()

# Convert into just extracting article metadata, not text

## ** final scraping script

In [3]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS_feed(scrapy.Spider):
    name = "crawler_RSS_feed"
    
    def __init__(self, journals, n_articles='all'):
        self.n_articles = n_articles
        self.journals = journals
    
    def start_requests(self):
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
    #         yield response.follow(
    #             url=url, callback=self.parse_pages, 
    #             cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    # def parse_pages(self, response, key, article_dict):
    #     # print(f'Journal #{key}')
    #     # text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
    #     iframe = response.xpath('//iframe/@src').extract()
    #     article_dict[key]['iframe'] = iframe
    #     text = response.xpath('//div[@class="core-container"]').extract()
    #     article_dict[key]['text'] = ''.join(['\n'+line for line in text])
    #     if key - int(key) == 0:
    #         print(f'\t{article_dict[key]["journal"]}')
    #         print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(40)
def get_RSS_feed(n_articles='all', journals='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    get_RSS_feed(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS_feed, n_articles, journals)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict



journals = {
    'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
    # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
    # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
    # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
    'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

    # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
    }


iteration_id = 1
text_id = iteration_id
n_articles = 1
####
article_dict = dict()
get_RSS_feed(n_articles, journals)
root_article_dict[iteration_id] = article_dict
article_dict
# article_titles(article_dict)


# partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals='all')

# text_dict[iteration_id], display_dict = text_dict_from_web(
#     partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

# root_display_dict[iteration_id] = display_dict
# display_html(display_dict, type='body')

Found 30 articles and 30 URLs for PLOS One
	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 44 articles and 22 URLs for Applied Physiology, Nutrition, and Metabolism
	Corrected number of article titles: 22


{0.0: {'journal': 'PLOS One',
  'title': 'Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it',
  'url': 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288725'},
 0.01: {'journal': 'PLOS One',
  'title': 'Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model',
  'url': 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288659'},
 0.02: {'journal': 'PLOS One',
  'title': 'Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation',
  'url': 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288658'},
 0.03: {'journal': 'PLOS One',
  'title': 'Development of a neural network model to predict the presence of fentanyl in community drug samples',
  'url': 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288656'},
 0.04: {'journal': 'PLOS One',
  'title': 'Val

In [5]:
article_titles(article_dict)

0.0: Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it
	PLOS One https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288725

0.01: Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model
	PLOS One https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288659

0.02: Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation
	PLOS One https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288658

0.03: Development of a neural network model to predict the presence of fentanyl in community drug samples
	PLOS One https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288656

0.04: Validity and reliability of upper body push and pull tests to determine one-repetition maximum
	PLOS One https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288649

0.05: Machine learning‐b

In [9]:
import pandas as pd
pd.DataFrame(article_dict).transpose()

Unnamed: 0,journal,title,url
0.0,PLOS One,Online classified adverts reflect the broader ...,https://journals.plos.org/plosone/article?id=1...
0.01,PLOS One,"Biomarkers related to gas embolism: Gas score,...",https://journals.plos.org/plosone/article?id=1...
0.02,PLOS One,Feature interaction network based on hierarchi...,https://journals.plos.org/plosone/article?id=1...
0.03,PLOS One,Development of a neural network model to predi...,https://journals.plos.org/plosone/article?id=1...
0.04,PLOS One,Validity and reliability of upper body push an...,https://journals.plos.org/plosone/article?id=1...
0.05,PLOS One,Machine learning‐based identification and rela...,https://journals.plos.org/plosone/article?id=1...
0.06,PLOS One,Change in the faunal composition of mosquitoes...,https://journals.plos.org/plosone/article?id=1...
0.07,PLOS One,Knowledge exchange in the implementation of Na...,https://journals.plos.org/plosone/article?id=1...
0.08,PLOS One,Learning curve for flexible bronchoscope-guide...,https://journals.plos.org/plosone/article?id=1...
0.09,PLOS One,Highly mismatch-tolerant homology testing by R...,https://journals.plos.org/plosone/article?id=1...


# Create `feed` table

In [14]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(f'[{string.punctuation}]', '', title).lower().strip()

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            for article_id in id_list:
                result = retrieve_citation(article_id, api_key).decode('utf-8')
                if cleaned_title in cleaned_result:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            print(f'\tInput title: {title.lower().strip()}')
            # print(f'Result title: {re.sub(r":", r"", result.lower())}')
            return id_list        
    except:
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()


partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])

Keys for article_dict: [0.0, 0.01, 1.0, 1.01]
Journals:
	Applied Physiology, Nutrition, and Metabolism
	PLOS One


Unnamed: 0,journal,title,url,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section
0,PLOS One,Online classified adverts reflect the broader ...,https://journals.plos.org/plosone/article?id=1...,Online sales are increasingly a route by which...,PloS one,"Jon Bielby, Andy Ferguson, Matthew Rendle, Kir...",2023,,18,7,e0288725,,10.1371/journal.pone.0288725,
1,PLOS One,"Biomarkers related to gas embolism: Gas score,...",https://journals.plos.org/plosone/article?id=1...,Fish exposed to water supersaturated with diss...,PloS one,"Alicia Vel&#xe1;zquez-Wallraf, Maria Jos&#xe9;...",2023,,18,7,e0288659,,10.1371/journal.pone.0288659,
2,"Applied Physiology, Nutrition, and Metabolism",Nutrition and immunity: perspectives on key is...,https://cdnsciencepub.com/doi/abs/10.1139/apnm...,"In January 2022, a group of experts came toget...","Applied physiology, nutrition, and metabolism ...","Carolyn L Dunbar, Harold M Aukema, Philip C Ca...",2023,,48,7,484,497.0,10.1139/apnm-2022-0276,
3,"Applied Physiology, Nutrition, and Metabolism",Prevalence of sarcopenia indicators and sub-op...,https://cdnsciencepub.com/doi/abs/10.1139/apnm...,"Sarcopenia is associated with falls, and can c...","Applied physiology, nutrition, and metabolism ...","Giulia Coletta, Josephine S Jakubowski, Stuart...",2023,,48,7,498,506.0,10.1139/apnm-2022-0125,


doi don't function as URLs on their own, but are the branch of a root URL.

In [15]:
feed_df_dict[iteration_id].loc[0, 'url']

'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288725'

In [16]:
feed_df_dict[iteration_id].loc[3, 'url']

'https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0125?af=R'

## iteration 2: Use all references

In [20]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(f'[{string.punctuation}]', '', title).lower().strip()

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            for article_id in id_list:
                result = retrieve_citation(article_id, api_key).decode('utf-8')
                if cleaned_title in cleaned_result:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            print(f'\tInput title: {title.lower().strip()}')
            # print(f'Result title: {re.sub(r":", r"", result.lower())}')
            return id_list        
    except:
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')
feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])

Article title not found in PMIDs.
	Input title: machine learning‐based identification and related features of depression in patients with diabetes mellitus based on the korea national health and nutrition examination survey: a cross-sectional study


TypeError: expected string or bytes-like object, got 'list'

### trouble shoot pubmed

In [21]:
title = "machine learning‐based identification and related features of depression in patients with diabetes mellitus based on the korea national health and nutrition examination survey: a cross-sectional study"
feed_df_dict[2.001] = search_article(title, api_key, verbose=True)

Article title not found in PMIDs.
	Input title: machine learning‐based identification and related features of depression in patients with diabetes mellitus based on the korea national health and nutrition examination survey: a cross-sectional study


In [22]:
feed_df_dict[2.001]

['37440591']

## 2.1

In [37]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(rf'[{string.punctuation}]', '', title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            for article_id in id_list:
                result = retrieve_citation(article_id, api_key).decode('utf-8')
                if cleaned_title in cleaned_result:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            print(f'\tInput title: {title.lower().strip()}')
            print(f'\tcleaned title: {cleaned_title}')
            print(f'\tResult title: {cleaned_result}')
            # print(f'Result title: {re.sub(r":", r"", result.lower())}')
            return id_list        
    except:
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.1
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

# ## Create feed table
# feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
# feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])

title = "machine learning‐based identification and related features of depression in patients with diabetes mellitus based on the korea national health and nutrition examination survey: a cross-sectional study"
feed_df_dict[2.101] = search_article(title, api_key, verbose=True)

Match found for machine learning‐based identification and related features of depression in patients with diabetes mellitus based on the korea national health and nutrition examination survey: a cross-sectional study: PMID = 37440591.


In [29]:
text = 'learning‐based'
re.sub(r'‐', '-', text)
re.sub(r"\u2010", '', text)

'learningbased'

In [33]:
text = 'cross-sectional study'
re.sub(r"\u2010", '', text)
re.sub(r"\u002D", '', text)

'crosssectional study'

In [34]:
text = "survey: a cross-sectional study"
re.sub(r":", '', text)

'survey a cross-sectional study'

## 2.2

In [38]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(rf'[{string.punctuation}]', '', title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            for article_id in id_list:
                result = retrieve_citation(article_id, api_key).decode('utf-8')
                if cleaned_title in cleaned_result:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            print(f'\tInput title: {title.lower().strip()}')
            print(f'\tcleaned title: {cleaned_title}')
            print(f'\tResult title: {cleaned_result}')
            # print(f'Result title: {re.sub(r":", r"", result.lower())}')
            return id_list        
    except:
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.2
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

## Create feed table
feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])


Article title not found in PMIDs.
	Input title: standardized tissue sampling guidelines for histopathological and molecular analyses of rainbow trout (<i>oncorhynchus mykiss</i>) in ecotoxicological studies
	cleaned title: standardized tissue sampling guidelines for histopathological and molecular analyses of rainbow trout ioncorhynchus mykissi in ecotoxicological studies
	Result title: xml version10 
doctype pubmedarticleset public nlmdtd pubmedarticle 1st january 2023en httpsdtdnlmnihgovncbipubmedoutpubmed230101dtd
pubmedarticleset
pubmedarticlemedlinecitation statuspublisher ownernlm indexingmethodautomatedpmid version137440561pmiddaterevisedyear2023yearmonth07monthday13daydaterevisedarticle pubmodelelectronicecollectionjournalissn issntypeelectronic19326203issnjournalissue citedmediuminternetvolume18volumeissue7issuepubdateyear2023yearpubdatejournalissuetitleplos onetitleisoabbreviationplos oneisoabbreviationjournalarticletitlestandardized tissue sampling guidelines for histopathol

TypeError: expected string or bytes-like object, got 'list'

## 2.21

In [41]:
text = 'trout (<i>oncorhynchus mykiss</i>) in'
re.sub(r'</?[ib]>', '', text)

'trout (oncorhynchus mykiss) in'

In [47]:
text = 'exposure levels of animal allergens, endotoxin, and β-(1,3)-glucan on a university campus of veterinary medicine'
re.sub(r'[^a-zA-Z0-9 ]', '', text)

'exposure levels of animal allergens endotoxin and 13glucan on a university campus of veterinary medicine'

In [51]:
text = 'and β-(1,3)-glucan'
re.sub(r'[^a-zA-Z0-9 ]', '', text)

'and 13glucan'

In [55]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            # cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
            for article_id in id_list:
                result = retrieve_citation(article_id, api_key).decode('utf-8')
                if cleaned_title in cleaned_result:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                    return result
            print('Article title not found in PMIDs.')
            print(f'\tInput title: {title.lower().strip()}')
            print(f'\tcleaned title: {cleaned_title}')
            print(f'\tResult title: {cleaned_result}')
            # print(f'Result title: {re.sub(r":", r"", result.lower())}')
            return id_list        
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.21
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

## Create feed table
feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])


Article title not found in PMIDs.
	Input title: exposure levels of animal allergens, endotoxin, and β-(1,3)-glucan on a university campus of veterinary medicine
	cleaned title: exposure levels of animal allergens endotoxin and 13glucan on a university campus of veterinary medicine
	Result title: <xml version10 ><doctype pubmedarticleset public //nlm//dtd pubmedarticle 1st january 2023//en https//dtdnlmnihgov/ncbi/pubmed/out/pubmed230101dtd><pubmedarticleset><pubmedarticle><medlinecitation statuspublisher ownernlm indexingmethodautomated><pmid version1>37440536</pmid><daterevised><year>2023</year><month>07</month><day>13</day></daterevised><article pubmodelelectronicecollection><journal><issn issntypeelectronic>19326203</issn><journalissue citedmediuminternet><volume>18</volume><issue>7</issue><pubdate><year>2023</year></pubdate></journalissue><title>plos one</title><isoabbreviation>plos one</isoabbreviation></journal><articletitle>exposure levels of animal allergens endotoxin and x3b21

TypeError: expected string or bytes-like object, got 'list'

## 2.3 Print warning if title not found in pubmed record, but still allow for pubmed details to be added

In [59]:
text = '<articletitle>exposure levels of animal allergens endotoxin and x3b213glucan on a university campus of veterinary medicine</articletitle>'
if re.match(r'<articletitle>(.*?)</articletitle>', text):
    print('Match found')

Match found


In [60]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            # cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
            # for article_id in id_list:
            #     result = retrieve_citation(article_id, api_key).decode('utf-8')
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            if cleaned_title in cleaned_result:
                if verbose:
                    print(f'Match found for {title}: PMID = {article_id}.')
            else:
                print('Warning: Article title not found in PMIDs.')
                print(f'\tInput title: {title.lower().strip()}')
                print(f'\tcleaned title: {cleaned_title}')
                result_title = re.match(r'<articletitle>(.*?)</articletitle>', cleaned_result)
                print(f'\tResult title: {result_title.match(1) if result_title else cleaned_result}')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.3
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

## Create feed table
feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])


	Input title: exposure levels of animal allergens, endotoxin, and β-(1,3)-glucan on a university campus of veterinary medicine
	cleaned title: exposure levels of animal allergens endotoxin and 13glucan on a university campus of veterinary medicine
	Result title: <xml version10 ><doctype pubmedarticleset public //nlm//dtd pubmedarticle 1st january 2023//en https//dtdnlmnihgov/ncbi/pubmed/out/pubmed230101dtd><pubmedarticleset><pubmedarticle><medlinecitation statuspublisher ownernlm indexingmethodautomated><pmid version1>37440536</pmid><daterevised><year>2023</year><month>07</month><day>13</day></daterevised><article pubmodelelectronicecollection><journal><issn issntypeelectronic>19326203</issn><journalissue citedmediuminternet><volume>18</volume><issue>7</issue><pubdate><year>2023</year></pubdate></journalissue><title>plos one</title><isoabbreviation>plos one</isoabbreviation></journal><articletitle>exposure levels of animal allergens endotoxin and x3b213glucan on a university campus of 

Unnamed: 0,journal,title,url,pubmed_title,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section,flag_title
0,PLOS One,Online classified adverts reflect the broader ...,https://journals.plos.org/plosone/article?id=1...,Online classified adverts reflect the broader ...,Online sales are increasingly a route by which...,PloS one,"Jon Bielby, Andy Ferguson, Matthew Rendle, Kir...",2023.0,,18.0,7.0,e0288725,,10.1371/journal.pone.0288725,,False
1,PLOS One,"Biomarkers related to gas embolism: Gas score,...",https://journals.plos.org/plosone/article?id=1...,"Biomarkers related to gas embolism: Gas score,...",Fish exposed to water supersaturated with diss...,PloS one,"Alicia Vel&#xe1;zquez-Wallraf, Maria Jos&#xe9;...",2023.0,,18.0,7.0,e0288659,,10.1371/journal.pone.0288659,,False
2,PLOS One,Feature interaction network based on hierarchi...,https://journals.plos.org/plosone/article?id=1...,Feature interaction network based on hierarchi...,Manual image segmentation consumes time. An au...,PloS one,"Longfeng Shen, Yingjie Zhang, Qiong Wang, Feng...",2023.0,,18.0,7.0,e0288658,,10.1371/journal.pone.0288658,,False
3,PLOS One,Development of a neural network model to predi...,https://journals.plos.org/plosone/article?id=1...,Development of a neural network model to predi...,"Increasingly, Fourier-transform infrared (FTIR...",PloS one,"Lianping Ti, Cameron J Grant, Samuel Tobias, D...",2023.0,,18.0,7.0,e0288656,,10.1371/journal.pone.0288656,,False
4,PLOS One,Validity and reliability of upper body push an...,https://journals.plos.org/plosone/article?id=1...,Validity and reliability of upper body push an...,The purpose of this study was to explore the v...,PloS one,"Eirik Sigvaldsen, Irineu Loturco, Fredrik Lars...",2023.0,,18.0,7.0,e0288649,,10.1371/journal.pone.0288649,,False
5,PLOS One,Machine learning‐based identification and rela...,https://journals.plos.org/plosone/article?id=1...,Machine learning-based identification and rela...,Patients with diabetes mellitus (DM) are twice...,PloS one,"Ji-Yoon Lee, Doyeon Won, Kiheon Lee",2023.0,,18.0,7.0,e0288648,,10.1371/journal.pone.0288648,,True
6,PLOS One,Change in the faunal composition of mosquitoes...,https://journals.plos.org/plosone/article?id=1...,Change in the faunal composition of mosquitoes...,This study aimed to evaluate the influence of ...,PloS one,"Jessica Feij&#xf3; Almeida, Heliana Christy Ma...",2023.0,,18.0,7.0,e0288646,,10.1371/journal.pone.0288646,,False
7,PLOS One,Knowledge exchange in the implementation of Na...,https://journals.plos.org/plosone/article?id=1...,Knowledge exchange in the implementation of Na...,Knowledge is an intrinsic element of environme...,PloS one,"Zheng-Hong Kong, Lindsay C Stringer, Jouni Paa...",2023.0,,18.0,7.0,e0288641,,10.1371/journal.pone.0288641,,False
8,PLOS One,Learning curve for flexible bronchoscope-guide...,https://journals.plos.org/plosone/article?id=1...,Learning curve for flexible bronchoscope-guide...,Endotracheal intubation with a flexible bronch...,PloS one,"Xingzhi Cai, Mingming Yue, Xiaohui Liu, Lize Z...",2023.0,,18.0,7.0,e0288617,,10.1371/journal.pone.0288617,,False
9,PLOS One,Highly mismatch-tolerant homology testing by R...,https://journals.plos.org/plosone/article?id=1...,Highly mismatch-tolerant homology testing by R...,"In E. coli, double strand breaks (DSBs) are re...",PloS one,"Mara Prentiss, Dianzhuo Wang, Jonathan Fu, Cha...",2023.0,,18.0,7.0,e0288611,,10.1371/journal.pone.0288611,,False


### Check flagged titles

In [65]:

sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
from wrangling import filter_df_any_condition
filter = {'flag_title': True}
filter_df_any_condition(feed_df_dict[iteration_id], filter)[['title', 'pubmed_title']]

Results where any condition is met:
	DataFrame indices: [5, 17, 20, 26, 34, 37, 38, 40, 45, 48, 51]
	DataFrame shape: (11, 16)


Unnamed: 0,title,pubmed_title
5,Machine learning‐based identification and related features of depression in patients with diabetes mellitus based on the Korea National Health and Nutrition Examination Survey: A cross-sectional s...,Machine learning-based identification and related features of depression in patients with diabetes mellitus based on the Korea National Health and Nutrition Examination Survey: A cross-sectional s...
17,Standardized tissue sampling guidelines for histopathological and molecular analyses of rainbow trout (<i>Oncorhynchus mykiss</i>) in ecotoxicological studies,Standardized tissue sampling guidelines for histopathological and molecular analyses of rainbow trout (Oncorhynchus mykiss) in ecotoxicological studies.
20,"Exposure levels of animal allergens, endotoxin, and β-(1,3)-glucan on a university campus of veterinary medicine","Exposure levels of animal allergens, endotoxin, and &#x3b2;-(1,3)-glucan on a university campus of veterinary medicine."
26,Global phylogeographical distribution of <i>Gloeoporus dichrous</i>,Global phylogeographical distribution of Gloeoporus dichrous.
34,Association between low blood selenium concentrations and poor hand grip strength in United States adults participating in NHANES (2011–2014),Association between low blood selenium concentrations and poor hand grip strength in United States adults participating in NHANES (2011-2014).
37,Discussion of “Efficacy of aerobic exercise following concussion: a narrative review”,"Reply to the discussion by Wang et al. of ""Efficacy of aerobic exercise following concussion: a narrative review""."
38,Reply to the discussion by Wang et al. of “Efficacy of aerobic exercise following concussion: a narrative review”,"Reply to the discussion by Wang et al. of ""Efficacy of aerobic exercise following concussion: a narrative review""."
40,Canadian Nutrition Society Dialogue on disease-related malnutrition: a commentary from the 2022 Food For Health Workshop,<i>Canadian Nutrition Society</i> Dialogue on disease-related malnutrition: a commentary from the 2022 Food For Health Workshop.
45,The Canadian Food Intake Screener for assessing alignment of adults’ dietary intake with the 2019 Canada’s Food Guide healthy food choices recommendations: scoring system and construct validity,The Canadian Food Intake Screener for assessing alignment of adults' dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations: scoring system and construct validity.
48,Development of the Canadian Food Intake Screener to assess alignment of adults’ dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations,Development of the Canadian Food Intake Screener to assess alignment of adults' dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations.


## 2.31 Update print statements

In [76]:
title = "ab</isoabbreviation></journal><articletitle>reply to the discussion by wang et al of efficacy of aerobic exercise following concussion a narrative review</articletitle><pagination><startpage>552</startpage><endpage>553</endpage><medlinepgn>552553</medlinepgn></pagination><elocationid eidtypedoi"
re.search(r'<articletitle>(.*?)</articletitle>', title).group(1)
# re.match(r'\<articletitle\>(.*?)\</articletitle\>', title).group(1)

'reply to the discussion by wang et al of efficacy of aerobic exercise following concussion a narrative review'

In [77]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            # cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
            # for article_id in id_list:
            #     result = retrieve_citation(article_id, api_key).decode('utf-8')
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            if cleaned_title in cleaned_result:
                if verbose:
                    print(f'Match found for {title}: PMID = {article_id}.')
            else:
                print('Warning: Article title not found in PMIDs.')
                print(f'\tInput title: {title.lower().strip()}')
                print(f'\tcleaned title: {cleaned_title}')
                result_title = re.search(r'<articletitle>(.*?)</articletitle>', cleaned_result)
                print(f'\tResult title: {result_title.group(1) if result_title else cleaned_result}')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    flagged_indices = df[df['flag_title'] == True].index
    for index in flagged_indices:
        print(f'Flagged: ')
        print(f'\tArticle title: {df.loc[index, "title"]}')
        print(f'\tPubMed title: {df.loc[index, "pubmed_title"]}')
        print()
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.31
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

## Create feed table
feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])


	Input title: exposure levels of animal allergens, endotoxin, and β-(1,3)-glucan on a university campus of veterinary medicine
	cleaned title: exposure levels of animal allergens endotoxin and 13glucan on a university campus of veterinary medicine
	Result title: exposure levels of animal allergens endotoxin and x3b213glucan on a university campus of veterinary medicine
	Input title: exposure to the non-phthalate plasticizer di-heptyl succinate is less disruptive to c57bl/6n mouse recovery from a myocardial infarction than dehp, totm or related di-octyl succinate
	cleaned title: exposure to the nonphthalate plasticizer diheptyl succinate is less disruptive to c57bl6n mouse recovery from a myocardial infarction than dehp totm or related dioctyl succinate
	Result title: exposure to the nonphthalate plasticizer diheptyl succinate is less disruptive to c57bl/6n mouse recovery from a myocardial infarction than dehp totm or related dioctyl succinate
	Input title: discussion of “efficacy of ae

Unnamed: 0,journal,title,url,pubmed_title,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section,flag_title
0,PLOS One,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288725,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it.,"Online sales are increasingly a route by which exotic animals are sold in the global pet trade. There are numerous types of online platforms and transaction types, and dedicated classified adverti...",PloS one,"Jon Bielby, Andy Ferguson, Matthew Rendle, Kirsten M McMillan",2023,,18,7,e0288725,,10.1371/journal.pone.0288725,,False
1,PLOS One,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model",https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288659,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model.","Fish exposed to water supersaturated with dissolved gas experience gas embolism similar to decompression sickness (DCS), known as gas bubble disease (GBD) in fish. GBD has been postulated as an al...",PloS one,"Alicia Vel&#xe1;zquez-Wallraf, Maria Jos&#xe9; Caballero, Antonio Fern&#xe1;ndez, M&#xf3;nica B Betancor, Pedro Saavedra, Holden W Hemingway, Yara Bernaldo de Quir&#xf3;s",2023,,18,7,e0288659,,10.1371/journal.pone.0288659,,False
2,PLOS One,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288658,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation.,Manual image segmentation consumes time. An automatic and accurate method to segment multimodal brain tumors using context information rich three-dimensional medical images that can be used for cl...,PloS one,"Longfeng Shen, Yingjie Zhang, Qiong Wang, Fenglan Qin, Dengdi Sun, Hai Min, Qianqian Meng, Chengzhen Xu, Wei Zhao, Xin Song",2023,,18,7,e0288658,,10.1371/journal.pone.0288658,,False
3,PLOS One,Development of a neural network model to predict the presence of fentanyl in community drug samples,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288656,Development of a neural network model to predict the presence of fentanyl in community drug samples.,"Increasingly, Fourier-transform infrared (FTIR) spectroscopy is being used as a harm reduction tool to provide people who use drugs real-time information about the contents of their substances. Ho...",PloS one,"Lianping Ti, Cameron J Grant, Samuel Tobias, Dennis K Hore, Richard Laing, Brandon D L Marshall",2023,,18,7,e0288656,,10.1371/journal.pone.0288656,,False
4,PLOS One,Validity and reliability of upper body push and pull tests to determine one-repetition maximum,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288649,Validity and reliability of upper body push and pull tests to determine one-repetition maximum.,The purpose of this study was to explore the validity and reliability of three different strength testing approaches to determine one-repetition maximum (1RM) in the bench press and prone bench pull.,PloS one,"Eirik Sigvaldsen, Irineu Loturco, Fredrik Larsen, Jo Bruusgaard, John Magne Kalhovde, Thomas Haugen",2023,,18,7,e0288649,,10.1371/journal.pone.0288649,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,"Applied Physiology, Nutrition, and Metabolism",Economic burden of low muscle strength in Canadian adults,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0371?af=R,Economic burden of low muscle strength in Canadian adults.,The economic cost associated with low muscle strength in Canadian adults is unknown. The total annual economic burden of low muscle strength in Canadian adults represents 2.2% of the overall burde...,"Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Jean-Philippe Chaput, Ian Janssen, Hugues Sampasa-Kanyinga, Grant R Tomkinson, Justin J Lang",2023,,,,,,10.1139/apnm-2022-0371,,False
48,"Applied Physiology, Nutrition, and Metabolism",Development of the Canadian Food Intake Screener to assess alignment of adults’ dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2023-0019?af=R,Development of the Canadian Food Intake Screener to assess alignment of adults' dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations.,The Canadian Food Intake Screener was developed to rapidly assess alignment of adults' dietary intake over the past month with the Food Guide's healthy food choices recommendations. The screener w...,"Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Joy M Hutchinson, Tabitha E Williams, Ailish M Westaway, Alexandra B&#xe9;dard, Camille Pitre, Simone Lemieux, Kevin W Dodd, Beno&#xee;t Lamarche, Patricia M Guenther, Jess Haines, Angela Wallace,...",2023,,,,,,10.1139/apnm-2023-0019,,True
49,"Applied Physiology, Nutrition, and Metabolism",Achievement of the ABC goal among Canadians with type 2 diabetes and the influence of physical activity: data from the Canadian Health Measures Survey,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0395?af=R,Achievement of the ABC goal among Canadians with type 2 diabetes and the influence of physical activity: data from the Canadian Health Measures Survey.,"Achieving the three therapeutics targets known as ABC (A1c&#xa0;&#x2264;&#xa0;7.0%, LDL-C&#xa0;&lt;&#xa0;2.0&#xa0;mmol/L, and resting BP&#xa0;&lt;&#xa0;130/80&#xa0;mmHg), limiting sedentary behavi...","Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Alexis Marcotte-Ch&#xe9;nard, Ren&#xe9; Mar&#xe9;chal, Ahmed Ghachem, Alan Cohen, El&#xe9;onor Riesco",2023,,,,,,10.1139/apnm-2022-0395,,False
50,"Applied Physiology, Nutrition, and Metabolism",Determinants of recreational screen time behavior following the COVID-19 pandemic among Canadian adults,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0379?af=R,Determinants of recreational screen time behavior following the COVID-19 pandemic among Canadian adults.,"The objectives of our study were to examine recreational screen time behavior before and 2&#x2009;years following the COVID-19 pandemic lockdown, and explore whether components of the capability-o...","Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Sam Liu, Rebecca Coulter, Wuyou Sui, Kayla Nuss, Ryan E Rhodes",2023,,,,,,10.1139/apnm-2022-0379,,False


## 2.32

In [None]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            # cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
            # for article_id in id_list:
            #     result = retrieve_citation(article_id, api_key).decode('utf-8')
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            if cleaned_title in cleaned_result:
                if verbose:
                    print(f'Match found for {title}: PMID = {article_id}.')
            else:
                print(f'Warning: Article title not found in PMIDs.')
                print(f'Check these PMIDs: {id_list}')
                print(f'\tInput title: {title.lower().strip()}')
                print(f'\tcleaned title: {cleaned_title}')
                result_title = re.search(r'<articletitle>(.*?)</articletitle>', cleaned_result)
                print(f'\tResult title: {result_title.group(1) if result_title else cleaned_result}\n')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    flagged_indices = df[df['flag_title'] == True].index
    for index in flagged_indices:
        print(f'Flagged: ')
        print(f'\tArticle title: {df.loc[index, "title"]}')
        print(f'\tPubMed title: {df.loc[index, "pubmed_title"]}')
        print()
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.32
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

## Create feed table
feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])


## 2.33

In [79]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            # cleaned_result = re.sub(f'[{string.punctuation}]', '', result).lower().strip()
            cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
            # for article_id in id_list:
            #     result = retrieve_citation(article_id, api_key).decode('utf-8')
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            result_title_match = re.search(r'<articletitle>(.*?)</articletitle>', cleaned_result)
            if result_title_match:
                result_title = result_title_match.group(1)
                cleaned_result_title = re.sub(r'[^a-zA-Z0-9 ]', '', result_title).lower().strip()
            else:
                cleaned_result_title = cleaned_result
            if cleaned_title in cleaned_result_title:
                if verbose:
                    print(f'Match found for {title}: PMID = {article_id}.')
            else:
                print(f'Warning: Article title not found in PMIDs.')
                print(f'Check these PMIDs: {id_list}')
                print(f'\tInput title: {title.lower().strip()}')
                print(f'\tResult title: {result_title if result_title else cleaned_result}')
                print(f'\tcleaned input title: {cleaned_title}')
                print(f'\tCleaned result title: {cleaned_result_title}\n')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    flagged_indices = df[df['flag_title'] == True].index
    for index in flagged_indices:
        print(f'Flagged: ')
        print(f'\tArticle title: {df.loc[index, "title"]}')
        print(f'\tPubMed title: {df.loc[index, "pubmed_title"]}')
        print()
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.33
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

## Create feed table
feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])


Check these PMIDs: ['37440536']
	Input title: exposure levels of animal allergens, endotoxin, and β-(1,3)-glucan on a university campus of veterinary medicine
	Result title: exposure levels of animal allergens endotoxin and x3b213glucan on a university campus of veterinary medicine
	cleaned input title: exposure levels of animal allergens endotoxin and 13glucan on a university campus of veterinary medicine
	Cleaned result title: exposure levels of animal allergens endotoxin and x3b213glucan on a university campus of veterinary medicine

Check these PMIDs: ['37222426', '37222399']
	Input title: discussion of “efficacy of aerobic exercise following concussion: a narrative review”
	Result title: reply to the discussion by wang et al of efficacy of aerobic exercise following concussion a narrative review
	cleaned input title: discussion of efficacy of aerobic exercise following concussion a narrative review
	Cleaned result title: reply to the discussion by wang et al of efficacy of aerobic

Unnamed: 0,journal,title,url,pubmed_title,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section,flag_title
0,PLOS One,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288725,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it.,"Online sales are increasingly a route by which exotic animals are sold in the global pet trade. There are numerous types of online platforms and transaction types, and dedicated classified adverti...",PloS one,"Jon Bielby, Andy Ferguson, Matthew Rendle, Kirsten M McMillan",2023,,18,7,e0288725,,10.1371/journal.pone.0288725,,False
1,PLOS One,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model",https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288659,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model.","Fish exposed to water supersaturated with dissolved gas experience gas embolism similar to decompression sickness (DCS), known as gas bubble disease (GBD) in fish. GBD has been postulated as an al...",PloS one,"Alicia Vel&#xe1;zquez-Wallraf, Maria Jos&#xe9; Caballero, Antonio Fern&#xe1;ndez, M&#xf3;nica B Betancor, Pedro Saavedra, Holden W Hemingway, Yara Bernaldo de Quir&#xf3;s",2023,,18,7,e0288659,,10.1371/journal.pone.0288659,,False
2,PLOS One,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288658,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation.,Manual image segmentation consumes time. An automatic and accurate method to segment multimodal brain tumors using context information rich three-dimensional medical images that can be used for cl...,PloS one,"Longfeng Shen, Yingjie Zhang, Qiong Wang, Fenglan Qin, Dengdi Sun, Hai Min, Qianqian Meng, Chengzhen Xu, Wei Zhao, Xin Song",2023,,18,7,e0288658,,10.1371/journal.pone.0288658,,False
3,PLOS One,Development of a neural network model to predict the presence of fentanyl in community drug samples,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288656,Development of a neural network model to predict the presence of fentanyl in community drug samples.,"Increasingly, Fourier-transform infrared (FTIR) spectroscopy is being used as a harm reduction tool to provide people who use drugs real-time information about the contents of their substances. Ho...",PloS one,"Lianping Ti, Cameron J Grant, Samuel Tobias, Dennis K Hore, Richard Laing, Brandon D L Marshall",2023,,18,7,e0288656,,10.1371/journal.pone.0288656,,False
4,PLOS One,Validity and reliability of upper body push and pull tests to determine one-repetition maximum,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288649,Validity and reliability of upper body push and pull tests to determine one-repetition maximum.,The purpose of this study was to explore the validity and reliability of three different strength testing approaches to determine one-repetition maximum (1RM) in the bench press and prone bench pull.,PloS one,"Eirik Sigvaldsen, Irineu Loturco, Fredrik Larsen, Jo Bruusgaard, John Magne Kalhovde, Thomas Haugen",2023,,18,7,e0288649,,10.1371/journal.pone.0288649,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,"Applied Physiology, Nutrition, and Metabolism",Economic burden of low muscle strength in Canadian adults,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0371?af=R,Economic burden of low muscle strength in Canadian adults.,The economic cost associated with low muscle strength in Canadian adults is unknown. The total annual economic burden of low muscle strength in Canadian adults represents 2.2% of the overall burde...,"Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Jean-Philippe Chaput, Ian Janssen, Hugues Sampasa-Kanyinga, Grant R Tomkinson, Justin J Lang",2023,,,,,,10.1139/apnm-2022-0371,,False
48,"Applied Physiology, Nutrition, and Metabolism",Development of the Canadian Food Intake Screener to assess alignment of adults’ dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2023-0019?af=R,Development of the Canadian Food Intake Screener to assess alignment of adults' dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations.,The Canadian Food Intake Screener was developed to rapidly assess alignment of adults' dietary intake over the past month with the Food Guide's healthy food choices recommendations. The screener w...,"Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Joy M Hutchinson, Tabitha E Williams, Ailish M Westaway, Alexandra B&#xe9;dard, Camille Pitre, Simone Lemieux, Kevin W Dodd, Beno&#xee;t Lamarche, Patricia M Guenther, Jess Haines, Angela Wallace,...",2023,,,,,,10.1139/apnm-2023-0019,,True
49,"Applied Physiology, Nutrition, and Metabolism",Achievement of the ABC goal among Canadians with type 2 diabetes and the influence of physical activity: data from the Canadian Health Measures Survey,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0395?af=R,Achievement of the ABC goal among Canadians with type 2 diabetes and the influence of physical activity: data from the Canadian Health Measures Survey.,"Achieving the three therapeutics targets known as ABC (A1c&#xa0;&#x2264;&#xa0;7.0%, LDL-C&#xa0;&lt;&#xa0;2.0&#xa0;mmol/L, and resting BP&#xa0;&lt;&#xa0;130/80&#xa0;mmHg), limiting sedentary behavi...","Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Alexis Marcotte-Ch&#xe9;nard, Ren&#xe9; Mar&#xe9;chal, Ahmed Ghachem, Alan Cohen, El&#xe9;onor Riesco",2023,,,,,,10.1139/apnm-2022-0395,,False
50,"Applied Physiology, Nutrition, and Metabolism",Determinants of recreational screen time behavior following the COVID-19 pandemic among Canadian adults,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0379?af=R,Determinants of recreational screen time behavior following the COVID-19 pandemic among Canadian adults.,"The objectives of our study were to examine recreational screen time behavior before and 2&#x2009;years following the COVID-19 pandemic lockdown, and explore whether components of the capability-o...","Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Sam Liu, Rebecca Coulter, Wuyou Sui, Kayla Nuss, Ryan E Rhodes",2023,,,,,,10.1139/apnm-2022-0379,,False


## 2.34

In [109]:
text = 'disruptive! to c57bl/6n mouse recover</b>'
text = re.sub(r'/(?![^<>]*>)', '', text)
text = re.sub(r'[^a-zA-Z0-9 <>/]', '', text)
text

'disruptive to c57bl6n mouse recover</b>'

In [110]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
            result = retrieve_citation(id_list[0], api_key).decode('utf-8')
            result_title_match = re.search(r'<articletitle>(.*?)</articletitle>', cleaned_result)
            if result_title_match:
                result_title = result_title_match.group(1)
                cleaned_result_title = re.sub(r'</?[ib]>', '', result_title)
                cleaned_result_title = re.sub(r'/(?![^<>]*>)', '', cleaned_result_title) # Remove any / that is not within html tag
                cleaned_result_title = re.sub(r'[^a-zA-Z0-9 <>/]', '', cleaned_result_title).lower().strip()
            else:
                cleaned_result_title = cleaned_result
            if cleaned_title in cleaned_result_title:
                if verbose:
                    print(f'Match found for {title}: PMID = {article_id}.')
            else:
                print(f'Warning: Article title not found in PMIDs.')
                print(f'Check these PMIDs: {id_list}')
                print(f'\tInput title: {title.lower().strip()}')
                print(f'\tResult title: {result_title if result_title else cleaned_result}')
                print(f'\tCleaned input title: {cleaned_title}')
                print(f'\tCleaned result title: {cleaned_result_title}\n')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    flagged_indices = df[df['flag_title'] == True].index
    for index in flagged_indices:
        print(f'Flagged: ')
        print(f'\tArticle title: {df.loc[index, "title"]}')
        print(f'\tPubMed title: {df.loc[index, "pubmed_title"]}')
        print()
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.34
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

## Create feed table
feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])


Check these PMIDs: ['37440536']
	Input title: exposure levels of animal allergens, endotoxin, and β-(1,3)-glucan on a university campus of veterinary medicine
	Result title: exposure levels of animal allergens endotoxin and x3b213glucan on a university campus of veterinary medicine
	Cleaned input title: exposure levels of animal allergens endotoxin and 13glucan on a university campus of veterinary medicine
	Cleaned result title: exposure levels of animal allergens endotoxin and x3b213glucan on a university campus of veterinary medicine

Check these PMIDs: ['37222426', '37222399']
	Input title: discussion of “efficacy of aerobic exercise following concussion: a narrative review”
	Result title: reply to the discussion by wang et al of efficacy of aerobic exercise following concussion a narrative review
	Cleaned input title: discussion of efficacy of aerobic exercise following concussion a narrative review
	Cleaned result title: reply to the discussion by wang et al of efficacy of aerobic

Unnamed: 0,journal,title,url,pubmed_title,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section,flag_title
0,PLOS One,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288725,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it.,"Online sales are increasingly a route by which exotic animals are sold in the global pet trade. There are numerous types of online platforms and transaction types, and dedicated classified adverti...",PloS one,"Jon Bielby, Andy Ferguson, Matthew Rendle, Kirsten M McMillan",2023,,18,7,e0288725,,10.1371/journal.pone.0288725,,False
1,PLOS One,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model",https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288659,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model.","Fish exposed to water supersaturated with dissolved gas experience gas embolism similar to decompression sickness (DCS), known as gas bubble disease (GBD) in fish. GBD has been postulated as an al...",PloS one,"Alicia Vel&#xe1;zquez-Wallraf, Maria Jos&#xe9; Caballero, Antonio Fern&#xe1;ndez, M&#xf3;nica B Betancor, Pedro Saavedra, Holden W Hemingway, Yara Bernaldo de Quir&#xf3;s",2023,,18,7,e0288659,,10.1371/journal.pone.0288659,,False
2,PLOS One,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288658,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation.,Manual image segmentation consumes time. An automatic and accurate method to segment multimodal brain tumors using context information rich three-dimensional medical images that can be used for cl...,PloS one,"Longfeng Shen, Yingjie Zhang, Qiong Wang, Fenglan Qin, Dengdi Sun, Hai Min, Qianqian Meng, Chengzhen Xu, Wei Zhao, Xin Song",2023,,18,7,e0288658,,10.1371/journal.pone.0288658,,False
3,PLOS One,Development of a neural network model to predict the presence of fentanyl in community drug samples,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288656,Development of a neural network model to predict the presence of fentanyl in community drug samples.,"Increasingly, Fourier-transform infrared (FTIR) spectroscopy is being used as a harm reduction tool to provide people who use drugs real-time information about the contents of their substances. Ho...",PloS one,"Lianping Ti, Cameron J Grant, Samuel Tobias, Dennis K Hore, Richard Laing, Brandon D L Marshall",2023,,18,7,e0288656,,10.1371/journal.pone.0288656,,False
4,PLOS One,Validity and reliability of upper body push and pull tests to determine one-repetition maximum,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288649,Validity and reliability of upper body push and pull tests to determine one-repetition maximum.,The purpose of this study was to explore the validity and reliability of three different strength testing approaches to determine one-repetition maximum (1RM) in the bench press and prone bench pull.,PloS one,"Eirik Sigvaldsen, Irineu Loturco, Fredrik Larsen, Jo Bruusgaard, John Magne Kalhovde, Thomas Haugen",2023,,18,7,e0288649,,10.1371/journal.pone.0288649,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,"Applied Physiology, Nutrition, and Metabolism",Economic burden of low muscle strength in Canadian adults,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0371?af=R,Economic burden of low muscle strength in Canadian adults.,The economic cost associated with low muscle strength in Canadian adults is unknown. The total annual economic burden of low muscle strength in Canadian adults represents 2.2% of the overall burde...,"Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Jean-Philippe Chaput, Ian Janssen, Hugues Sampasa-Kanyinga, Grant R Tomkinson, Justin J Lang",2023,,,,,,10.1139/apnm-2022-0371,,False
48,"Applied Physiology, Nutrition, and Metabolism",Development of the Canadian Food Intake Screener to assess alignment of adults’ dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2023-0019?af=R,Development of the Canadian Food Intake Screener to assess alignment of adults' dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations.,The Canadian Food Intake Screener was developed to rapidly assess alignment of adults' dietary intake over the past month with the Food Guide's healthy food choices recommendations. The screener w...,"Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Joy M Hutchinson, Tabitha E Williams, Ailish M Westaway, Alexandra B&#xe9;dard, Camille Pitre, Simone Lemieux, Kevin W Dodd, Beno&#xee;t Lamarche, Patricia M Guenther, Jess Haines, Angela Wallace,...",2023,,,,,,10.1139/apnm-2023-0019,,True
49,"Applied Physiology, Nutrition, and Metabolism",Achievement of the ABC goal among Canadians with type 2 diabetes and the influence of physical activity: data from the Canadian Health Measures Survey,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0395?af=R,Achievement of the ABC goal among Canadians with type 2 diabetes and the influence of physical activity: data from the Canadian Health Measures Survey.,"Achieving the three therapeutics targets known as ABC (A1c&#xa0;&#x2264;&#xa0;7.0%, LDL-C&#xa0;&lt;&#xa0;2.0&#xa0;mmol/L, and resting BP&#xa0;&lt;&#xa0;130/80&#xa0;mmHg), limiting sedentary behavi...","Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Alexis Marcotte-Ch&#xe9;nard, Ren&#xe9; Mar&#xe9;chal, Ahmed Ghachem, Alan Cohen, El&#xe9;onor Riesco",2023,,,,,,10.1139/apnm-2022-0395,,False
50,"Applied Physiology, Nutrition, and Metabolism",Determinants of recreational screen time behavior following the COVID-19 pandemic among Canadian adults,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0379?af=R,Determinants of recreational screen time behavior following the COVID-19 pandemic among Canadian adults.,"The objectives of our study were to examine recreational screen time behavior before and 2&#x2009;years following the COVID-19 pandemic lockdown, and explore whether components of the capability-o...","Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Sam Liu, Rebecca Coulter, Wuyou Sui, Kayla Nuss, Ryan E Rhodes",2023,,,,,,10.1139/apnm-2022-0379,,False


## 2.4 check the next PMID if title doesn't match from first result

In [112]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            for index in range(len(id_list)):
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                result_title_match = re.search(r'<articletitle>(.*?)</articletitle>', cleaned_result)
                if result_title_match:
                    result_title = result_title_match.group(1)
                    cleaned_result_title = re.sub(r'</?[ib]>', '', result_title)
                    cleaned_result_title = re.sub(r'/(?![^<>]*>)', '', cleaned_result_title) # Remove any / that is not within html tag
                    cleaned_result_title = re.sub(r'[^a-zA-Z0-9 <>/]', '', cleaned_result_title).lower().strip()
                else:
                    cleaned_result_title = cleaned_result
                if cleaned_title == cleaned_result_title:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                        return result
                else:
                    continue
            print(f'Warning: Article title not found in PMIDs.')
            print(f'Check these PMIDs: {id_list}')
            print(f'\tInput title: {title.lower().strip()}')
            print(f'\tResult title: {result_title if result_title else cleaned_result}')
            print(f'\tCleaned input title: {cleaned_title}')
            print(f'\tCleaned result title: {cleaned_result_title}\n')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    flagged_indices = df[df['flag_title'] == True].index
    for index in flagged_indices:
        print(f'Flagged: ')
        print(f'\tArticle title: {df.loc[index, "title"]}')
        print(f'\tPubMed title: {df.loc[index, "pubmed_title"]}')
        print()
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.4
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

## Create feed table
feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])


Check these PMIDs: ['37440593']
	Input title: online classified adverts reflect the broader united kingdom trade in turtles and tortoises rather than drive it
	Result title: online classified adverts reflect the broader united kingdom trade in turtles and tortoises rather than drive it
	Cleaned input title: online classified adverts reflect the broader united kingdom trade in turtles and tortoises rather than drive it
	Cleaned result title: online classified adverts reflect the broader united kingdom trade in turtles and tortoises rather than drive it

Check these PMIDs: ['37440588']
	Input title: biomarkers related to gas embolism: gas score, pathology, and gene expression in a gas bubble disease model
	Result title: biomarkers related to gas embolism gas score pathology and gene expression in a gas bubble disease model
	Cleaned input title: biomarkers related to gas embolism gas score pathology and gene expression in a gas bubble disease model
	Cleaned result title: biomarkers relate

Unnamed: 0,journal,title,url,pubmed_title,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section,flag_title
0,PLOS One,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288725,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it.,"Online sales are increasingly a route by which exotic animals are sold in the global pet trade. There are numerous types of online platforms and transaction types, and dedicated classified adverti...",PloS one,"Jon Bielby, Andy Ferguson, Matthew Rendle, Kirsten M McMillan",2023,,18,7,e0288725,,10.1371/journal.pone.0288725,,False
1,PLOS One,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model",https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288659,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model.","Fish exposed to water supersaturated with dissolved gas experience gas embolism similar to decompression sickness (DCS), known as gas bubble disease (GBD) in fish. GBD has been postulated as an al...",PloS one,"Alicia Vel&#xe1;zquez-Wallraf, Maria Jos&#xe9; Caballero, Antonio Fern&#xe1;ndez, M&#xf3;nica B Betancor, Pedro Saavedra, Holden W Hemingway, Yara Bernaldo de Quir&#xf3;s",2023,,18,7,e0288659,,10.1371/journal.pone.0288659,,False
2,PLOS One,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288658,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation.,Manual image segmentation consumes time. An automatic and accurate method to segment multimodal brain tumors using context information rich three-dimensional medical images that can be used for cl...,PloS one,"Longfeng Shen, Yingjie Zhang, Qiong Wang, Fenglan Qin, Dengdi Sun, Hai Min, Qianqian Meng, Chengzhen Xu, Wei Zhao, Xin Song",2023,,18,7,e0288658,,10.1371/journal.pone.0288658,,False
3,PLOS One,Development of a neural network model to predict the presence of fentanyl in community drug samples,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288656,Development of a neural network model to predict the presence of fentanyl in community drug samples.,"Increasingly, Fourier-transform infrared (FTIR) spectroscopy is being used as a harm reduction tool to provide people who use drugs real-time information about the contents of their substances. Ho...",PloS one,"Lianping Ti, Cameron J Grant, Samuel Tobias, Dennis K Hore, Richard Laing, Brandon D L Marshall",2023,,18,7,e0288656,,10.1371/journal.pone.0288656,,False
4,PLOS One,Validity and reliability of upper body push and pull tests to determine one-repetition maximum,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288649,Validity and reliability of upper body push and pull tests to determine one-repetition maximum.,The purpose of this study was to explore the validity and reliability of three different strength testing approaches to determine one-repetition maximum (1RM) in the bench press and prone bench pull.,PloS one,"Eirik Sigvaldsen, Irineu Loturco, Fredrik Larsen, Jo Bruusgaard, John Magne Kalhovde, Thomas Haugen",2023,,18,7,e0288649,,10.1371/journal.pone.0288649,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,"Applied Physiology, Nutrition, and Metabolism",Economic burden of low muscle strength in Canadian adults,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0371?af=R,Economic burden of low muscle strength in Canadian adults.,The economic cost associated with low muscle strength in Canadian adults is unknown. The total annual economic burden of low muscle strength in Canadian adults represents 2.2% of the overall burde...,"Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Jean-Philippe Chaput, Ian Janssen, Hugues Sampasa-Kanyinga, Grant R Tomkinson, Justin J Lang",2023,,,,,,10.1139/apnm-2022-0371,,False
48,"Applied Physiology, Nutrition, and Metabolism",Development of the Canadian Food Intake Screener to assess alignment of adults’ dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2023-0019?af=R,Development of the Canadian Food Intake Screener to assess alignment of adults' dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations.,The Canadian Food Intake Screener was developed to rapidly assess alignment of adults' dietary intake over the past month with the Food Guide's healthy food choices recommendations. The screener w...,"Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Joy M Hutchinson, Tabitha E Williams, Ailish M Westaway, Alexandra B&#xe9;dard, Camille Pitre, Simone Lemieux, Kevin W Dodd, Beno&#xee;t Lamarche, Patricia M Guenther, Jess Haines, Angela Wallace,...",2023,,,,,,10.1139/apnm-2023-0019,,True
49,"Applied Physiology, Nutrition, and Metabolism",Achievement of the ABC goal among Canadians with type 2 diabetes and the influence of physical activity: data from the Canadian Health Measures Survey,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0395?af=R,Achievement of the ABC goal among Canadians with type 2 diabetes and the influence of physical activity: data from the Canadian Health Measures Survey.,"Achieving the three therapeutics targets known as ABC (A1c&#xa0;&#x2264;&#xa0;7.0%, LDL-C&#xa0;&lt;&#xa0;2.0&#xa0;mmol/L, and resting BP&#xa0;&lt;&#xa0;130/80&#xa0;mmHg), limiting sedentary behavi...","Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Alexis Marcotte-Ch&#xe9;nard, Ren&#xe9; Mar&#xe9;chal, Ahmed Ghachem, Alan Cohen, El&#xe9;onor Riesco",2023,,,,,,10.1139/apnm-2022-0395,,False
50,"Applied Physiology, Nutrition, and Metabolism",Determinants of recreational screen time behavior following the COVID-19 pandemic among Canadian adults,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0379?af=R,Determinants of recreational screen time behavior following the COVID-19 pandemic among Canadian adults.,"The objectives of our study were to examine recreational screen time behavior before and 2&#x2009;years following the COVID-19 pandemic lockdown, and explore whether components of the capability-o...","Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Sam Liu, Rebecca Coulter, Wuyou Sui, Kayla Nuss, Ryan E Rhodes",2023,,,,,,10.1139/apnm-2022-0379,,False


## 2.41 fix print statements

In [113]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            for index in range(len(id_list)):
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                result_title_match = re.search(r'<articletitle>(.*?)</articletitle>', cleaned_result)
                if result_title_match:
                    result_title = result_title_match.group(1)
                    cleaned_result_title = re.sub(r'</?[ib]>', '', result_title)
                    cleaned_result_title = re.sub(r'/(?![^<>]*>)', '', cleaned_result_title) # Remove any / that is not within html tag
                    cleaned_result_title = re.sub(r'[^a-zA-Z0-9 <>/]', '', cleaned_result_title).lower().strip()
                else:
                    cleaned_result_title = cleaned_result
                if cleaned_title == cleaned_result_title:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                        return result
                else:
                    continue
            if cleaned_title != cleaned_result_title:
                print(f'Warning: Article title not found in PMIDs.')
                print(f'Check these PMIDs: {id_list}')
                print(f'\tInput title: {title.lower().strip()}')
                print(f'\tResult title: {result_title if result_title else cleaned_result}')
                print(f'\tCleaned input title: {cleaned_title}')
                print(f'\tCleaned result title: {cleaned_result_title}\n')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    flagged_indices = df[df['flag_title'] == True].index
    for index in flagged_indices:
        print(f'Flagged: ')
        print(f'\tArticle title: {df.loc[index, "title"]}')
        print(f'\tPubMed title: {df.loc[index, "pubmed_title"]}')
        print()
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.41
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

## Create feed table
feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])


Check these PMIDs: ['37440536']
	Input title: exposure levels of animal allergens, endotoxin, and β-(1,3)-glucan on a university campus of veterinary medicine
	Result title: exposure levels of animal allergens endotoxin and x3b213glucan on a university campus of veterinary medicine
	Cleaned input title: exposure levels of animal allergens endotoxin and 13glucan on a university campus of veterinary medicine
	Cleaned result title: exposure levels of animal allergens endotoxin and x3b213glucan on a university campus of veterinary medicine

Flagged: 
	Article title: Machine learning‐based identification and related features of depression in patients with diabetes mellitus based on the Korea National Health and Nutrition Examination Survey: A cross-sectional study
	PubMed title: Machine learning-based identification and related features of depression in patients with diabetes mellitus based on the Korea National Health and Nutrition Examination Survey: A cross-sectional study.

Flagged: 
	A

Unnamed: 0,journal,title,url,pubmed_title,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section,flag_title
0,PLOS One,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288725,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it.,"Online sales are increasingly a route by which exotic animals are sold in the global pet trade. There are numerous types of online platforms and transaction types, and dedicated classified adverti...",PloS one,"Jon Bielby, Andy Ferguson, Matthew Rendle, Kirsten M McMillan",2023,,18,7,e0288725,,10.1371/journal.pone.0288725,,False
1,PLOS One,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model",https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288659,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model.","Fish exposed to water supersaturated with dissolved gas experience gas embolism similar to decompression sickness (DCS), known as gas bubble disease (GBD) in fish. GBD has been postulated as an al...",PloS one,"Alicia Vel&#xe1;zquez-Wallraf, Maria Jos&#xe9; Caballero, Antonio Fern&#xe1;ndez, M&#xf3;nica B Betancor, Pedro Saavedra, Holden W Hemingway, Yara Bernaldo de Quir&#xf3;s",2023,,18,7,e0288659,,10.1371/journal.pone.0288659,,False
2,PLOS One,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288658,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation.,Manual image segmentation consumes time. An automatic and accurate method to segment multimodal brain tumors using context information rich three-dimensional medical images that can be used for cl...,PloS one,"Longfeng Shen, Yingjie Zhang, Qiong Wang, Fenglan Qin, Dengdi Sun, Hai Min, Qianqian Meng, Chengzhen Xu, Wei Zhao, Xin Song",2023,,18,7,e0288658,,10.1371/journal.pone.0288658,,False
3,PLOS One,Development of a neural network model to predict the presence of fentanyl in community drug samples,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288656,Development of a neural network model to predict the presence of fentanyl in community drug samples.,"Increasingly, Fourier-transform infrared (FTIR) spectroscopy is being used as a harm reduction tool to provide people who use drugs real-time information about the contents of their substances. Ho...",PloS one,"Lianping Ti, Cameron J Grant, Samuel Tobias, Dennis K Hore, Richard Laing, Brandon D L Marshall",2023,,18,7,e0288656,,10.1371/journal.pone.0288656,,False
4,PLOS One,Validity and reliability of upper body push and pull tests to determine one-repetition maximum,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288649,Validity and reliability of upper body push and pull tests to determine one-repetition maximum.,The purpose of this study was to explore the validity and reliability of three different strength testing approaches to determine one-repetition maximum (1RM) in the bench press and prone bench pull.,PloS one,"Eirik Sigvaldsen, Irineu Loturco, Fredrik Larsen, Jo Bruusgaard, John Magne Kalhovde, Thomas Haugen",2023,,18,7,e0288649,,10.1371/journal.pone.0288649,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,"Applied Physiology, Nutrition, and Metabolism",Economic burden of low muscle strength in Canadian adults,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0371?af=R,Economic burden of low muscle strength in Canadian adults.,The economic cost associated with low muscle strength in Canadian adults is unknown. The total annual economic burden of low muscle strength in Canadian adults represents 2.2% of the overall burde...,"Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Jean-Philippe Chaput, Ian Janssen, Hugues Sampasa-Kanyinga, Grant R Tomkinson, Justin J Lang",2023,,,,,,10.1139/apnm-2022-0371,,False
48,"Applied Physiology, Nutrition, and Metabolism",Development of the Canadian Food Intake Screener to assess alignment of adults’ dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2023-0019?af=R,Development of the Canadian Food Intake Screener to assess alignment of adults' dietary intake with the 2019 Canada's Food Guide healthy food choices recommendations.,The Canadian Food Intake Screener was developed to rapidly assess alignment of adults' dietary intake over the past month with the Food Guide's healthy food choices recommendations. The screener w...,"Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Joy M Hutchinson, Tabitha E Williams, Ailish M Westaway, Alexandra B&#xe9;dard, Camille Pitre, Simone Lemieux, Kevin W Dodd, Beno&#xee;t Lamarche, Patricia M Guenther, Jess Haines, Angela Wallace,...",2023,,,,,,10.1139/apnm-2023-0019,,True
49,"Applied Physiology, Nutrition, and Metabolism",Achievement of the ABC goal among Canadians with type 2 diabetes and the influence of physical activity: data from the Canadian Health Measures Survey,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0395?af=R,Achievement of the ABC goal among Canadians with type 2 diabetes and the influence of physical activity: data from the Canadian Health Measures Survey.,"Achieving the three therapeutics targets known as ABC (A1c&#xa0;&#x2264;&#xa0;7.0%, LDL-C&#xa0;&lt;&#xa0;2.0&#xa0;mmol/L, and resting BP&#xa0;&lt;&#xa0;130/80&#xa0;mmHg), limiting sedentary behavi...","Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Alexis Marcotte-Ch&#xe9;nard, Ren&#xe9; Mar&#xe9;chal, Ahmed Ghachem, Alan Cohen, El&#xe9;onor Riesco",2023,,,,,,10.1139/apnm-2022-0395,,False
50,"Applied Physiology, Nutrition, and Metabolism",Determinants of recreational screen time behavior following the COVID-19 pandemic among Canadian adults,https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0379?af=R,Determinants of recreational screen time behavior following the COVID-19 pandemic among Canadian adults.,"The objectives of our study were to examine recreational screen time behavior before and 2&#x2009;years following the COVID-19 pandemic lockdown, and explore whether components of the capability-o...","Applied physiology, nutrition, and metabolism = Physiologie appliquee, nutrition et metabolisme","Sam Liu, Rebecca Coulter, Wuyou Sui, Kayla Nuss, Ryan E Rhodes",2023,,,,,,10.1139/apnm-2022-0379,,False


In [114]:
feed_df_dict[iteration_id].columns

Index(['journal', 'title', 'url', 'pubmed_title', 'abstract', 'publication',
       'authors', 'year', 'month', 'pub_volume', 'pub_issue', 'start_page',
       'end_page', 'doi', 'section', 'flag_title'],
      dtype='object')

In [120]:
filter = {'journal': ['PLOS One']}
filter_df_any_condition(feed_df_dict[iteration_id], filter)

Results where any condition is met:
	DataFrame indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
	DataFrame shape: (30, 16)


Unnamed: 0,journal,title,url,pubmed_title,abstract,publication,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,section,flag_title
0,PLOS One,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288725,Online classified adverts reflect the broader United Kingdom trade in turtles and tortoises rather than drive it.,"Online sales are increasingly a route by which exotic animals are sold in the global pet trade. There are numerous types of online platforms and transaction types, and dedicated classified adverti...",PloS one,"Jon Bielby, Andy Ferguson, Matthew Rendle, Kirsten M McMillan",2023,,18,7,e0288725,,10.1371/journal.pone.0288725,,False
1,PLOS One,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model",https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288659,"Biomarkers related to gas embolism: Gas score, pathology, and gene expression in a gas bubble disease model.","Fish exposed to water supersaturated with dissolved gas experience gas embolism similar to decompression sickness (DCS), known as gas bubble disease (GBD) in fish. GBD has been postulated as an al...",PloS one,"Alicia Vel&#xe1;zquez-Wallraf, Maria Jos&#xe9; Caballero, Antonio Fern&#xe1;ndez, M&#xf3;nica B Betancor, Pedro Saavedra, Holden W Hemingway, Yara Bernaldo de Quir&#xf3;s",2023,,18,7,e0288659,,10.1371/journal.pone.0288659,,False
2,PLOS One,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288658,Feature interaction network based on hierarchical decoupled convolution for 3D medical image segmentation.,Manual image segmentation consumes time. An automatic and accurate method to segment multimodal brain tumors using context information rich three-dimensional medical images that can be used for cl...,PloS one,"Longfeng Shen, Yingjie Zhang, Qiong Wang, Fenglan Qin, Dengdi Sun, Hai Min, Qianqian Meng, Chengzhen Xu, Wei Zhao, Xin Song",2023,,18,7,e0288658,,10.1371/journal.pone.0288658,,False
3,PLOS One,Development of a neural network model to predict the presence of fentanyl in community drug samples,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288656,Development of a neural network model to predict the presence of fentanyl in community drug samples.,"Increasingly, Fourier-transform infrared (FTIR) spectroscopy is being used as a harm reduction tool to provide people who use drugs real-time information about the contents of their substances. Ho...",PloS one,"Lianping Ti, Cameron J Grant, Samuel Tobias, Dennis K Hore, Richard Laing, Brandon D L Marshall",2023,,18,7,e0288656,,10.1371/journal.pone.0288656,,False
4,PLOS One,Validity and reliability of upper body push and pull tests to determine one-repetition maximum,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288649,Validity and reliability of upper body push and pull tests to determine one-repetition maximum.,The purpose of this study was to explore the validity and reliability of three different strength testing approaches to determine one-repetition maximum (1RM) in the bench press and prone bench pull.,PloS one,"Eirik Sigvaldsen, Irineu Loturco, Fredrik Larsen, Jo Bruusgaard, John Magne Kalhovde, Thomas Haugen",2023,,18,7,e0288649,,10.1371/journal.pone.0288649,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,PLOS One,Informal ready-to-eat food vending governance in urban Nigeria: Formal and informal lenses guiding the practice,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288499,Informal ready-to-eat food vending governance in urban Nigeria: Formal and informal lenses guiding the practice.,"Informal ready-to-eat food vending is an important, cheap, convenient, accessible and readily available urban food supply sector that has become an increasingly important part of the diets of peop...",PloS one,"Kehinde Paul Adeosun, Peter Oosterveer, Mary Greene",2023,,18,7,e0288499,,10.1371/journal.pone.0288499,,False
26,PLOS One,Global phylogeographical distribution of <i>Gloeoporus dichrous</i>,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288498,Global phylogeographical distribution of Gloeoporus dichrous.,"Phylogeographic analyses are efficient in ecological and evolutionary studies to discover the origin of a lineage, its dispersal routes, and the divergence of ancestral traits. Studies on widespre...",PloS one,"Yoonhee Cho, Chang Wan Seo, Paul Eunil Jung, Young Woon Lim",2023,,18,7,e0288498,,10.1371/journal.pone.0288498,,True
27,PLOS One,A novel co-culture model for investigation of the effects of LPS-induced macrophage-derived cytokines on brain endothelial cells,https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288497,A novel co-culture model for investigation of the effects of LPS-induced macrophage-derived cytokines on brain endothelial cells.,"In order to study effects of macrophage-derived inflammatory mediators associated with systemic inflammation on brain endothelial cells, we have established a co-culture system consisting of bEnd....",PloS one,"Junling Yang, Yinchuan Li, Ambuj Bhalla, Mark Maienschein-Cline, Ken-Ichiro Fukuchi",2023,,18,7,e0288497,,10.1371/journal.pone.0288497,,False
28,PLOS One,"Exposure to the non-phthalate plasticizer di-heptyl succinate is less disruptive to C57bl/6N mouse recovery from a myocardial infarction than DEHP, TOTM or related di-octyl succinate",https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0288491,"Exposure to the non-phthalate plasticizer di-heptyl succinate is less disruptive to C57bl/6N mouse recovery from a myocardial infarction than DEHP, TOTM or related di-octyl succinate.","Phthalate plasticizers are incorporated into plastics to make them soft and malleable, but are known to leach out of the final product into their surroundings with potential detrimental effects to...",PloS one,"Adam Schwendt, Joey-Bahige Chammas, Milan Maric, Jim A Nicell, Richard Leask, Lorraine E Chalifour",2023,,18,7,e0288491,,10.1371/journal.pone.0288491,,False


# 3 update Feed class and bulk_append

In [None]:
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
import re
import os
import string
import pandas as pd
import requests
from article_processing import create_text_dict_from_folder
api_key = os.getenv('api_ncbi')

def initialize_text_df(folder_path, encoding='ISO-8859-1', subset=None):
    """
    Create a DataFrame from a folder containing text files.

    Parameters:
    - folder_path (str): Path to folder containing text files.
    - encoding (str): Encoding of the text files.
    - subset (int): Number of text files to be read. If None, read all files.

    Returns:
    DataFrame containing the text files.
    """
    text_dict = create_text_dict_from_folder(folder_path, encoding, subset)
    text_df = pd.Series(text_dict, index=text_dict.keys())
    return text_df

def parse_fulltext(folder_path, title_pattern=r'^(.*)\n*.+', encoding='ISO-8859-1', subset=None):
    # Initialize empty lists to store the captured groups
    titles = []
    bodies = []
    
    text_df = initialize_text_df(folder_path, encoding, subset)
    # Iterate over each element in the series
    for text in text_df:
        # print(text)
        # Apply the regular expression pattern
        title_match = re.search(title_pattern, text)
        
        # Extract the capture groups and append them to the lists
        if title_match:
            titles.append(title_match.group(1))
            body = re.sub(title_pattern, '', text)
            bodies.append(body.strip())
            
        else:
            titles.append(None)
            bodies.append(None)
    
    # Create a new DataFrame from the captured groups
    df = pd.DataFrame({ 'title': titles, 'text': bodies })
    
    return df

def search_article(title, api_key, verbose=False):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    title_without_not = re.sub(r'not', '', title)
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'term': title_without_not,
        'field': 'title',
        'retmax': 5,
        'retmode': 'json'
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    cleaned_title = re.sub(r'</?[ib]>', '', title) # remove bold and italic html tags
    cleaned_title = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_title).lower().strip()
    # cleaned_title = re.sub(rf'[{string.punctuation}]', '', cleaned_title).lower().strip()
    cleaned_title = re.sub(r"\u2010", '', cleaned_title)

    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            for index in range(len(id_list)):
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                cleaned_result = re.sub(r'[^a-zA-Z0-9 <>/]', '', result).lower().strip() 
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
                result_title_match = re.search(r'<articletitle>(.*?)</articletitle>', cleaned_result)
                if result_title_match:
                    result_title = result_title_match.group(1)
                    cleaned_result_title = re.sub(r'</?[ib]>', '', result_title)
                    cleaned_result_title = re.sub(r'/(?![^<>]*>)', '', cleaned_result_title) # Remove any / that is not within html tag
                    cleaned_result_title = re.sub(r'[^a-zA-Z0-9 <>/]', '', cleaned_result_title).lower().strip()
                else:
                    cleaned_result_title = cleaned_result
                if cleaned_title == cleaned_result_title:
                    if verbose:
                        print(f'Match found for {title}: PMID = {article_id}.')
                        return result
                else:
                    continue
            if cleaned_title != cleaned_result_title:
                print(f'Warning: Article title not found in PMIDs.')
                print(f'Check these PMIDs: {id_list}')
                print(f'\tInput title: {title.lower().strip()}')
                print(f'\tResult title: {result_title if result_title else cleaned_result}')
                print(f'\tCleaned input title: {cleaned_title}')
                print(f'\tCleaned result title: {cleaned_result_title}\n')
            return result     
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return id_list 
    
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract = re.search(r'<AbstractText.*?>(.*?)</AbstractText>', record_string)
    abstract = abstract.group(1) if abstract else ''

    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'publication': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
    }


def pubmed_details_by_title(title, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

def add_pubmed_details(text_df, api_key, section=None):
    """
    Add the article metadata to a DataFrame containing article title and text.

    Parameters:
    - text_df (pd.DataFrame): DataFrame containing article title and text.
    - api_key (str): NCBI API key

    Returns:
    DataFrame with added PubMed details for each article.
    """
    article_details_list = []
    for article in text_df['title']:
        article_details = pubmed_details_by_title(article, api_key)
        if article_details:
            article_details_list.append(article_details)
        else:
            article_details_list.append({
                'pubmed_title': article,
                'abstract': '',
                'publication': '',
                'authors': '',
                'year': '',
                'month': '',
                'pub_volume': '',
                'pub_issue': '',
                'start_page': '',
                'end_page': '',
                'doi': '',
            })
    article_details_df = pd.DataFrame(article_details_list)
    article_details_df['section'] = pd.Series(section, index=article_details_df.index, dtype=str)
    return pd.concat([text_df.reset_index(drop=True), article_details_df], axis=1)

def compare_columns(df, col1='title', col2='pubmed_title'):
    """
    Compare two columns in a DataFrame. Drop the second column if the two columns are identical.
    Otherwise, return the dataframe with new column with the comparison results, 
    where `True` indicates a mismatch.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the two columns to be compared.
    - col1 (str): Name of the first column to be compared.
    - col2 (str): Name of the second column to be compared.

    Returns:
    DataFrame with added column containing the comparison results.
    """
    # Remove punctuation and special characters
    remove_punct = lambda text: re.sub(f'[{string.punctuation}]', '', text)
    col1 = df[col1].apply(remove_punct)
    col2 = df[col2].apply(remove_punct)

    # Convert to lowercase and remove white spaces
    clean_text = lambda text: text.lower().strip()
    col1 = col1.apply(clean_text)
    col2 = col2.apply(clean_text)

    # Perform the comparison
    comparison = col1 != col2
    if sum(comparison) == 0:
        df = df.drop(columns=['pubmed_title'])
    else:
        df['flag_title'] = comparison
    flagged_indices = df[df['flag_title'] == True].index
    for index in flagged_indices:
        print(f'Flagged: ')
        print(f'\tArticle title: {df.loc[index, "title"]}')
        print(f'\tPubMed title: {df.loc[index, "pubmed_title"]}')
        print()
    
    return df

def create_sources_table(text_df, col1='title', col2='pubmed_title', section=None):
    references_df = add_pubmed_details(text_df, api_key, section=section)

    references_df = compare_columns(references_df, col1=col1, col2=col2)
    return references_df

def create_feed_table(article_dict, col1='title', col2='pubmed_title', section=None):
    text_df = pd.DataFrame(article_dict).transpose()
    feed_df = add_pubmed_details(text_df, api_key, section=section)

    feed_df = compare_columns(feed_df, col1=col1, col2=col2)
    return feed_df


import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")

from db_session import *
from sqlalchemy.orm import declarative_base
from sqlalchemy import text
from sqlalchemy import Column, ForeignKey, Integer, String, Text, TIMESTAMP, Numeric
from sqlalchemy.dialects.postgresql import UUID
import uuid
import pandas as pd
# from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship



Base = declarative_base()

class Feed(Base):
    __tablename__ = 'feed'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    url = mapped_column(String(255))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))

class Sources(Base):
    __tablename__ = 'sources'
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    text = mapped_column(Text)
    abstract = mapped_column(Text)
    publication = mapped_column(String(100))
    authors = mapped_column(String(300))
    year = mapped_column(Integer)
    month = mapped_column(String(10))
    pub_volume = mapped_column(String(10))
    pub_issue = mapped_column(String(10))
    start_page = mapped_column(String(10))
    end_page = mapped_column(String(10))
    doi = mapped_column(String(50))
    section = mapped_column(String(100))
    summaries = relationship('Summaries', back_populates='sources')

class Prompts(Base):
    __tablename__ = 'prompts'
    id = mapped_column(Integer, primary_key=True)
    full_template = mapped_column(Text)
    system_role = mapped_column(String(300))
    prep_steps = mapped_column(Text)
    task = mapped_column(Text)
    edit_steps = mapped_column(Text)
    simplify_steps = mapped_column(Text)
    audience = mapped_column(String(200))
    format_steps = mapped_column(Text)

    summaries = relationship('Summaries', back_populates='prompts')
    
class Summaries(Base):
    __tablename__ = 'summaries'
    id = mapped_column(Integer, primary_key=True)
    timestamp = mapped_column(TIMESTAMP(timezone=True))
    original_summary = mapped_column(Text)
    rating_original_content = mapped_column(Integer) 
    simple_summary = mapped_column(Text)
    rating_simple_content = mapped_column(Integer) 
    original_headline = mapped_column(String(255))
    prompt_id = mapped_column(Integer, ForeignKey('prompts.id'), autoincrement=False)
    reference_id = mapped_column(Integer, ForeignKey('sources.id'), autoincrement=False)
    choice = mapped_column(Integer)
    model = mapped_column(String(70))
    temperature = mapped_column(Numeric)

    prompts = relationship('Prompts', back_populates='summaries')
    sources = relationship('Sources', back_populates='summaries')

@remote_sql_session
def get_table(session, query='SELECT *', table='publications', limit=None, order_by='id', order='ASC'):
    """
    Return a database table as a pandas dataframe.
    """
    query_statement = f'{query} from {table}'
    if order_by:
        query_statement += f' ORDER BY {order_by} {order}'
    if limit:
        query_statement += f' LIMIT {limit}'
    print(f'Query: {query_statement}')
    q = session.execute(text(query_statement))
    df = pd.DataFrame(q.fetchall())
    return df


def bulk_append(input_df, table='summaries', engine=None):
    """
    Add articles to the `sources` table in the database from a dataframe containing article text and metadata.
    
    Parameters:
    - references_df: pandas dataframe containing article text and metadata.

    Returns: None
    """
    @remote_sql_session
    def insert_rows(session):
        try:
            print(f'Adding {len(input_df)} rows to the database...')
            def insert_row(row):
                if table == 'sources':
                    data = Sources(
                        title=row['title'],
                        text=row['text'],
                        abstract=row['abstract'],
                        publication=row['publication'],
                        authors=row['authors'],
                        year=row['year'],
                        month=row['month'],
                        pub_volume=row['pub_volume'],
                        pub_issue=row['pub_issue'],
                        start_page=row['start_page'],
                        end_page=row['end_page'],
                        doi=row['doi'],
                        section=row['section'] 
                    )
                    session.add(data)
                    print(f'\t{row["title"]}')
                elif table == 'summaries':
                    prompt = session.query(Prompts).filter_by(
                        full_template=row['full_summarize_task'],
                        system_role=row['system_role'],
                    ).first()
                    if prompt:
                        prompt_id = prompt.id
                    else:
                        prompt = Prompts(
                            full_template=row['full_summarize_task'],
                            prep_steps=row['prep_step'],
                            task=row['summarize_task'],
                            edit_steps=row['edit_task'],
                            audience=row['simplify_audience'],
                            simplify_steps=row['simplify_task'],
                            format_steps=row['format_task'],
                            system_role=row['system_role']
                        )
                        session.add(prompt)
                        session.flush()
                        prompt_id = prompt.id

                    summary = Summaries(
                        timestamp=row['timestamp'],
                        original_summary=row['summary'],
                        simple_summary=row['simple_summary'],
                        original_headline=row['headline'],
                        prompt_id=prompt_id,
                        reference_id=row['reference_id'],
                        choice=row['choice'],
                        model=row['model'],
                        temperature=row['temperature']
                    )
                    session.add(summary)
                    print(f'\tReference #{row["reference_id"]}: {row["headline"]}')
                elif table == 'feed':
                    source = session.query(Feed).filter_by(
                        title=row['title'],
                        journal=row['journal'],
                        doi=row['doi']
                    ).first()
                    if source:
                        print(f'\tAlready exists in the database: {row["title"]}.')

            input_df.apply(insert_row, axis=1)

            session.commit()
            print("Data added successfully!")
        except Exception as e:
            session.rollback()
            print(f"Error adding data to the database: {str(e)}")
        finally:
            session.close()

    return insert_rows()

iteration_id = 2.41
# partial_article_dict[iteration_id] = create_partial_article_dict(article_dict, n_articles=2, journals='all')
# feed_df_dict[iteration_id] = create_feed_table(partial_article_dict[iteration_id], col1='title', col2='pubmed_title')

# ## Create feed table
# feed_df_dict[iteration_id] = create_feed_table(article_dict, col1='title', col2='pubmed_title')
# feed_df_dict[iteration_id]

# article_limit = None
# get_table(table='sources', limit=article_limit)

# # Add rows from results to summaries and prompts table
# bulk_append(qna_dict[iteration_id])


## Save dataframe

In [123]:
from file_functions import save_output
help(save_output)

Help on function save_output in module file_functions:

save_output(df, filename=None, description=None, append_version=True, iteration_id=None, index=False, csv_path='C:\\Users\\silvh\\OneDrive\\lighthouse\\Ginkgo coding\\content-summarization\\output\\CSV', pickle_path='C:\\Users\\silvh\\OneDrive\\lighthouse\\Ginkgo coding\\content-summarization\\output\\pickles')
    Save an Python object as both pickle and CSV. Automatically create filename using date and time 
    if not provided.



In [124]:
description = 'feed_dataframe_2023-07-13'
path = r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\output\pickles'
save_output(feed_df_dict[iteration_id], description=description, csv_path=None, pickle_path=path, append_version=False)


File saved:  C:/Users/silvh/OneDrive/lighthouse/Ginkgo coding/content-summarization/output/pickles/feed_dataframe_2023-07-13_2023-07-14_0912.sav
	Time completed: 2023-07-14 09:12:21.991148
	Object saved as pickle


In [125]:
description = 'feed_dataframe_2023-07-13'
path = r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\output\pickles'
save_output(feed_df_dict[iteration_id], description=description, csv_path=None, pickle_path=path, append_version=0)


File saved:  C:/Users/silvh/OneDrive/lighthouse/Ginkgo coding/content-summarization/output/pickles/feed_dataframe_2023-07-13_2023-07-14_0922.sav
	Time completed: 2023-07-14 09:22:22.890473
	Object saved as pickle


In [126]:
description = 'feed_dataframe_2023-07-13'
path = r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\output\pickles'
save_output(feed_df_dict[iteration_id], description=description, csv_path=None, pickle_path=path, append_version=False)


File saved:  C:/Users/silvh/OneDrive/lighthouse/Ginkgo coding/content-summarization/output/pickles/feed_dataframe_2023-07-13_2023-07-14_0922.sav
	Time completed: 2023-07-14 09:22:47.375076
	Object saved as pickle


# *End of Page*