# Title
[]()
Copied code from https://github.com/mukulpatnaik/researchgpt/blob/main/main.py

In [2]:
import pickle
from datetime import datetime
def savepickle(model,filename, ext='sav', path=None,append_version=False):
    """
    Export object as a pickle.
    Parameters:
    - model: Model variable name.
    - filename: Root of the filename.
    - extension: Extension to append (do not include dot as it will be added)
    - filepath (raw string): Use the format r'<path>'. If None, file is saved in same director.
    - append_version (bool): If true, append date and time to end of filename.
    """
    if path:
        path = f'{path}/'.replace('\\','/')
    if append_version == True:
        filename+=datetime.now().strftime('%Y-%m-%d_%H%M')
    with open (path+filename+'.'+ext, 'wb') as fh:
        pickle.dump(model, fh)
    print('File saved: ',path+filename+'.'+ext)
    print('Time completed:', datetime.now())


In [1]:
# from flask import Flask, request, render_template
from io import BytesIO
import pandas as pd
import openai
import os
import requests
from PyPDF2 import PdfReader
from openai.embeddings_utils import get_embedding, cosine_similarity
import re
# from flask_cors import CORS
# from _md5 import md5
# from google.cloud import storage


# Iteration 1

In [5]:
class Chatbot():
    
    def extract_text(self, pdf):
        print("Parsing paper")
        number_of_pages = len(pdf.pages)
        print(f"Total number of pages: {number_of_pages}")
        paper_text = []
        for i in range(number_of_pages):
            page_text = pdf.pages[i].extract_text().strip().replace('\x03', '')
            paper_text.append({
                'text': page_text,
                'page': i
                })
        return paper_text

    def create_df(self, pdf):
        print('Creating dataframe')
        filtered_pdf= []
        for row in pdf:
            if len(row['text']) < 30:
                continue
            filtered_pdf.append(row)
        df = pd.DataFrame(filtered_pdf)
        # print(df.shape)
        # remove elements with identical df[text] and df[page] values
        df = df.drop_duplicates(subset=['text', 'page'], keep='first')
        df['length'] = df['text'].apply(lambda x: len(x))
        print('Done creating dataframe')
        return df

    def embeddings(self, df):
        print('Calculating embeddings')
        openai.api_key = os.getenv('api_openai')
        embedding_model = "text-embedding-ada-002"
        embeddings = df.text.apply([lambda x: get_embedding(x, engine=embedding_model)])
        df["embeddings"] = embeddings
        print('Done calculating embeddings')
        return df

    def search(self, df, query, n=3, pprint=True):
        query_embedding = get_embedding(
            query,
            engine="text-embedding-ada-002"
        )
        df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding))
        
        results = df.sort_values("similarity", ascending=False, ignore_index=True)
        # make a dictionary of the the first three results with the page number as the key and the text as the value. The page number is a column in the dataframe.
        results = results.head(n)
        global sources 
        sources = []
        for i in range(n):
            # append the page number and the text as a dict to the sources list
            sources.append({'Page '+str(results.iloc[i]['page']): results.iloc[i]['text'][:150]+'...'})
        print(sources)
        return results.head(n)
    
    def create_prompt(self, df, user_input):
        result = self.search(df, user_input, n=3)
        print(result)
        system_role = """whose expertise is reading and summarizing scientific papers. You are given a query, 
        a series of text embeddings and the title from a paper in order of their cosine similarity to the query. 
        You must take the given embeddings and return a very detailed summary of the paper in the languange of the query: 
            
        Here is the question: """+ user_input + """
            
        and here are the embeddings: 
            
            1.""" + str(result.iloc[0]['text']) + """
            2.""" + str(result.iloc[1]['text']) + """
            3.""" + str(result.iloc[2]['text']) + """
        """

        user_content = f"""Given the question: "{str(user_input)}". Return a detailed answer based on the paper:"""

        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_content},]

        print('Done creating prompt')
        return messages

    def gpt(self, messages):
        print('Sending request to GPT-3')
        openai.api_key = os.getenv('api_openai')
        r = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, temperature=0.7, max_tokens=1500)
        answer = r.choices[0]["message"]["content"]
        print('Done sending request to GPT-3')
        response = {'answer': answer, 'sources': sources}
        return response

In [6]:
# Custom

def process_pdf(pdf_filename, path, filename=None,
    save_path=r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\output'
    ):
    """
    Extract the text from the saved PDF and create the embeddings using OpenAI.
    Parameters:
        - pdf_filename (str): Filename of PDF.
        - path (raw string): File path copied from File Explorer.
    Returns:
        - DataFrame containing parsed text and word embeddings.
        - PdfReader(filename) class object.
        - Chatbot() class object.
    """
    print("Processing PDF")
    filename = f'{path}/'.replace('\\','/')+pdf_filename

    pdf = PdfReader(filename)
    chatbot = Chatbot()
    paper_text = chatbot.extract_text(pdf)
    df = chatbot.create_df(paper_text)
    df = chatbot.embeddings(df)
    if filename:
        try:
            filename = filename if (type(filename) == str) else (pdf_filename)
            savepickle(chatbot, filename='chatgbt_embeddings_'+filename, path=save_path)
            savepickle(df, filename='embeddings_df_'+filename, path=save_path)
        except:
            print('Unable to save outputs')
    
    return df, pdf, chatbot

pdf_filename = "Hypohydration but not menstrual phase influences pain perception in healthy women.pdf"
path = r"C:\Users\silvh\OneDrive\Ginkgo\knowledge library references\journal articles"

df, pdf, chatbot = process_pdf(pdf_filename, path, filename=True)

Processing PDF
Parsing paper
Total number of pages: 27
Creating dataframe
Done creating dataframe
Calculating embeddings
Done calculating embeddings
Unable to save outputs


In [9]:
df

Unnamed: 0,text,page,length,embeddings
0,Hypohydration but not Menstrual Phase Influenc...,0,832,"[-0.012927941046655178, -0.00770729873329401, ..."
1,ABSTRACT (239 words) 22 Chronic pain is a per...,1,2286,"[-0.022682175040245056, -0.007203123066574335,..."
2,INTRODUCTION 50 Pain is recognized as a public...,2,4064,"[-0.01526166032999754, -0.007876557298004627, ..."
3,confounded previous research on effects of the...,3,1001,"[-0.02810894139111042, -0.004133280366659164, ..."
4,METHODS 105 Ethical Approval 106 The study was...,4,3597,"[-0.025527486577630043, -0.006997073534876108,..."
5,(see Maximal Handgrip Strength Assessment) to ...,5,3750,"[-0.03222351893782616, 0.0027854228392243385, ..."
6,"Twenty-four hours prior to each trial, partici...",6,4069,"[-0.011756571009755135, 0.0026844630483537912,..."
7,"intensity and unpleasantness again, to verify ...",7,3324,"[-0.017357170581817627, -0.01086043007671833, ..."
8,were assessed to ensure they were normally dis...,8,1565,"[-0.014132192358374596, 0.014319879934191704, ..."
9,RESULTS 292 Following analysis of ovarian horm...,9,3641,"[-0.020886564627289772, -0.008561731316149235,..."


In [10]:
chatbot

<__main__.Chatbot at 0x1b1b2f397f0>

In [8]:
filename = True
save_path = r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\output'
if filename:
    try:
        filename = filename if (type(filename) == str) else (pdf_filename)
        savepickle(chatbot, filename='chatgbt_embeddings_'+filename, path=save_path)
        savepickle(df, filename='embeddings_df_'+filename, path=save_path)
    except:
        print('Unable to save outputs')

File saved:  C:/Users/silvh/OneDrive/lighthouse/Ginkgo coding/content-summarization/output/chatgbt_embeddings_Hypohydration but not menstrual phase influences pain perception in healthy women.pdf.sav
Time completed: 2023-04-04 18:52:40.940784
File saved:  C:/Users/silvh/OneDrive/lighthouse/Ginkgo coding/content-summarization/output/embeddings_df_Hypohydration but not menstrual phase influences pain perception in healthy women.pdf.sav
Time completed: 2023-04-04 18:52:40.945782


In [11]:
def loadpickle(filename,filepath):
    """
    Load a pickled model using specified file path copied from windows file explorer.
    Back slashes in file path will be converted to forward slashes.
    Arguments:
    - filepath (raw string): Use the format r'<path>'.
    - filename (string).
    
    Returns saved object.
    """
    filename = f'{filepath}/'.replace('\\','/')+filename
    loaded_model = pickle.load(open(filename, 'rb'))
    print('Time completed:', datetime.now())
    return loaded_model

In [12]:
loadpickle(
    'embeddings_df_Hypohydration but not menstrual phase influences pain perception in healthy women.pdf.sav',
    save_path
)

Time completed: 2023-04-04 18:54:26.078177


Unnamed: 0,text,page,length,embeddings
0,Hypohydration but not Menstrual Phase Influenc...,0,832,"[-0.012927941046655178, -0.00770729873329401, ..."
1,ABSTRACT (239 words) 22 Chronic pain is a per...,1,2286,"[-0.022682175040245056, -0.007203123066574335,..."
2,INTRODUCTION 50 Pain is recognized as a public...,2,4064,"[-0.01526166032999754, -0.007876557298004627, ..."
3,confounded previous research on effects of the...,3,1001,"[-0.02810894139111042, -0.004133280366659164, ..."
4,METHODS 105 Ethical Approval 106 The study was...,4,3597,"[-0.025527486577630043, -0.006997073534876108,..."
5,(see Maximal Handgrip Strength Assessment) to ...,5,3750,"[-0.03222351893782616, 0.0027854228392243385, ..."
6,"Twenty-four hours prior to each trial, partici...",6,4069,"[-0.011756571009755135, 0.0026844630483537912,..."
7,"intensity and unpleasantness again, to verify ...",7,3324,"[-0.017357170581817627, -0.01086043007671833, ..."
8,were assessed to ensure they were normally dis...,8,1565,"[-0.014132192358374596, 0.014319879934191704, ..."
9,RESULTS 292 Following analysis of ovarian horm...,9,3641,"[-0.020886564627289772, -0.008561731316149235,..."


In [14]:
loadpickle(
    'chatgbt_embeddings_Hypohydration but not menstrual phase influences pain perception in healthy women.pdf.sav',
    save_path
)

Time completed: 2023-04-04 18:55:16.793347


<__main__.Chatbot at 0x1b1d6a2f250>

In [76]:
def reply(df, query):
    chatbot = Chatbot()
    prompt = chatbot.create_prompt(df, query)
    try:
        response = chatbot.gpt(prompt)
        return response
    except:
        print('**Unable to get response**')
        return prompt

query2 = """
How does menstrual cycle affect pain?
"""

reply4_2 = reply(df4, query2)
reply4_2

[{'Page 16': 'REFERENCES  521 1. Goldberg DS & McGee SJ. (2011). Pain as a global public health priority. BMC Public 522 Health 11, 770. 523 2. Treede RD, Rief W, B...'}, {'Page 19': '55. Straneva PA, Maixner W, Light KC, Pedersen CA, Costello NL & Girdler SS. (2002). 657 Menstrual cycle, beta-endorphins, and pain sensitivity in pre...'}, {'Page 12': 'Menstrual Phase and Experimental Pain Sensitivity  411 Menstrual phase did not modulate the hyperalgesic effect of hyp ohydration, nor did it 412 inde...'}]
                                                text  page  length  \
0  REFERENCES  521 1. Goldberg DS & McGee SJ. (20...    16    3414   
1  55. Straneva PA, Maixner W, Light KC, Pedersen...    19    2953   
2  Menstrual Phase and Experimental Pain Sensitiv...    12    4164   

                                          embeddings  similarity  
0  [-0.005879143252968788, -0.005777269136160612,...    0.856388  
1  [-0.02594314143061638, -0.012814132496714592, ...    0.856333  
2  [-0.

[{'role': 'system',
  'content': "whose expertise is reading and summarizing scientific papers. You are given a query, \n        a series of text embeddings and the title from a paper in order of their cosine similarity to the query. \n        You must take the given embeddings and return a very detailed summary of the paper in the languange of the query: \n            \n        Here is the question: \nHow does menstrual cycle affect pain?\n\n            \n        and here are the embeddings: \n            \n            1.REFERENCES  521 1. Goldberg DS & McGee SJ. (2011). Pain as a global public health priority. BMC Public 522 Health 11, 770. 523 2. Treede RD, Rief W, Barke A, Aziz Q, Bennett MI, Benoliel R, Coh en M, Evers S, Finnerup 524 NB, First MB, Giamberardino MA, Kaasa S, Kosek E, Lavand'homme P, Nicholas M, Perrot 525 S, Scholz J, Schug S, Smith BH, Svensson P, Vlaeyen JWS & Wang SJ. (2015). A 526 classification of chronic pain for ICD-11. Pain 156, 1003-1007. 527 3. Breivik H

## Iteration 2

In [None]:
env_name = 'api_openai'

class Chatbot():
    
    def extract_text(self, pdf):
        print("Parsing paper")
        number_of_pages = len(pdf.pages)
        print(f"Total number of pages: {number_of_pages}")
        paper_text = []
        for i in range(number_of_pages):
            page_text = pdf.pages[i].extract_text().strip().replace('\x03', '')
            paper_text.append({
                'text': page_text,
                'page': i
                })
        return paper_text

    def create_df(self, pdf):
        print('Creating dataframe')
        filtered_pdf= []
        for row in pdf:
            if len(row['text']) < 30:
                continue
            filtered_pdf.append(row)
        df = pd.DataFrame(filtered_pdf)
        df = df.drop_duplicates(subset=['text', 'page'], keep='first')
        df['length'] = df['text'].apply(lambda x: len(x))
        print('Done creating dataframe')
        return df

    def embeddings(self, df):
        print('Calculating embeddings')
        openai.api_key = os.getenv(env_name)
        embedding_model = "text-embedding-ada-002"
        embeddings = df.text.apply([lambda x: get_embedding(x, engine=embedding_model)])
        df["embeddings"] = embeddings
        print('Done calculating embeddings')
        return df

    def search(self, df, query, n=3, pprint=True):
        query_embedding = get_embedding(
            query,
            engine="text-embedding-ada-002"
        )
        df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding))
        
        results = df.sort_values("similarity", ascending=False, ignore_index=True)
        # make a dictionary of the the first three results with the page number as the key and the text as the value. The page number is a column in the dataframe.
        results = results.head(n)
        global sources 
        sources = []
        for i in range(n):
            # append the page number and the text as a dict to the sources list
            sources.append({'Page '+str(results.iloc[i]['page']): results.iloc[i]['text'][:150]+'...'})
        print(sources)
        return results.head(n)
    
    def create_prompt(self, df, user_input):
        result = self.search(df, user_input, n=3)
        print(result)
        system_role = """whose expertise is reading and summarizing scientific papers. You are given a query, 
        a series of text embeddings and the title from a paper in order of their cosine similarity to the query. 
        You must take the given embeddings and return a very detailed summary of the paper in the languange of the query: 
            
        Here is the question: """+ user_input + """
            
        and here are the embeddings: 
            
            1.""" + str(result.iloc[0]['text']) + """
            2.""" + str(result.iloc[1]['text']) + """
            3.""" + str(result.iloc[2]['text']) + """
        """

        user_content = f"""Given the question: "{str(user_input)}". Return a detailed answer based on the paper:"""

        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_content},]

        print('Done creating prompt')
        return messages


In [23]:
import re
def process_pdf(pdf_filename, path, filename=None,
    save_path=r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\output'
    ):
    """
    Extract the text from the saved PDF and create the embeddings using OpenAI.
    Parameters:
        - pdf_filename (str): Filename of PDF.
        - path (raw string): File path copied from File Explorer.
    Returns:
        - DataFrame containing parsed text and word embeddings.
        - PdfReader(filename) class object.
        - Chatbot() class object.
    """
    print("Processing PDF")
    filename = f'{path}/'.replace('\\','/')+pdf_filename

    pdf = PdfReader(filename)
    chatbot = Chatbot()
    paper_text = chatbot.extract_text(pdf)
    df = chatbot.create_df(paper_text)
    df = chatbot.embeddings(df)
    if filename:
        try:
            filename = filename if (type(filename) == str) else (re.sub(r'(.*).pdf', r'\1', pdf_filename))
            savepickle(chatbot, filename='chatgpt_embeddings_'+pdf_filename, path=save_path)
            savepickle(df, filename='embeddings_df_'+pdf_filename, path=save_path)
        except:
            print('Unable to save outputs')
    
    return df, pdf, chatbot

pdf_filename = "Hypohydration but not menstrual phase influences pain perception in healthy women.pdf"
path = r"C:\Users\silvh\OneDrive\Ginkgo\knowledge library references\journal articles"

df2, pdf2, chatbot2 = process_pdf(pdf_filename, path, filename=True)

Processing PDF
Parsing paper
Total number of pages: 27
Creating dataframe
Done creating dataframe
Calculating embeddings
Done calculating embeddings
File saved:  C:/Users/silvh/OneDrive/lighthouse/Ginkgo coding/content-summarization/output/chatgpt_embeddings_Hypohydration but not menstrual phase influences pain perception in healthy women.pdf.sav
Time completed: 2023-04-04 19:05:28.284192
File saved:  C:/Users/silvh/OneDrive/lighthouse/Ginkgo coding/content-summarization/output/embeddings_df_Hypohydration but not menstrual phase influences pain perception in healthy women.pdf.sav
Time completed: 2023-04-04 19:05:28.288188


In [47]:
pdf2.pages[0].extract_text()

'Hypohydration but not Menstrual Phase Influences Pain Perceptio n in Healthy Women 1 Beverly Tan\n1, Michael C. Philipp2, Ahmad Munir Che Muhamed3, Toby Mündel1* 2 \n1 Massey University, School of Sport Exercise and Nutrition, Palmerston North, New Zealand 3 \n2 Massey University, School of Psychology, Palmerston North, New Zealand 4 \n3 Advanced Medical and Dental Institute, Universiti Sains Malaysi a, Pulau Pinang, Malaysia 5  6 \n*Corresponding Author 7 Toby Mündel: T.Mundel@massey.ac.nz \n8 School of Sport Exercise and Nutrition (PN621) 9 Massey University  10 Private Bag 11 222 11 Palmerston North 4442 12 New Zealand 13  14 ORCID:  \n15 Ahmad Munir Che Muhamed: 0000-0002-0254-6838 16 Michael Philipp: 0000-0001-8203-8018 17 Toby Mündel: 0000-0002-4214-8543 18 \n 19 Running title:  \n20 Hydration, Menstrual Phase and Pain\n  21 '

In [27]:
for i in range (5):
    print(len(df2.loc[0,'embeddings']))

1536
1536
1536
1536
1536


In [28]:
df2

Unnamed: 0,text,page,length,embeddings
0,Hypohydration but not Menstrual Phase Influenc...,0,832,"[-0.012890724465250969, -0.007611474953591824,..."
1,ABSTRACT (239 words) 22 Chronic pain is a per...,1,2286,"[-0.022682175040245056, -0.007203123066574335,..."
2,INTRODUCTION 50 Pain is recognized as a public...,2,4064,"[-0.01526166032999754, -0.007876557298004627, ..."
3,confounded previous research on effects of the...,3,1001,"[-0.02810894139111042, -0.004133280366659164, ..."
4,METHODS 105 Ethical Approval 106 The study was...,4,3597,"[-0.025527486577630043, -0.006997073534876108,..."
5,(see Maximal Handgrip Strength Assessment) to ...,5,3750,"[-0.03222351893782616, 0.0027854228392243385, ..."
6,"Twenty-four hours prior to each trial, partici...",6,4069,"[-0.011679299175739288, 0.0027374434284865856,..."
7,"intensity and unpleasantness again, to verify ...",7,3324,"[-0.017357170581817627, -0.01086043007671833, ..."
8,were assessed to ensure they were normally dis...,8,1565,"[-0.014132192358374596, 0.014319879934191704, ..."
9,RESULTS 292 Following analysis of ovarian horm...,9,3641,"[-0.020886564627289772, -0.008561731316149235,..."


In [30]:
pdf_filename_b = "Daily energy expenditure through the human life course.pdf"
path = r"C:\Users\silvh\OneDrive\Ginkgo\knowledge library references\journal articles"

df2b, pdf2b, chatbot_2b = process_pdf(pdf_filename_b, path, filename=True)

Processing PDF
Parsing paper
Total number of pages: 14
Creating dataframe
Done creating dataframe
Calculating embeddings
Done calculating embeddings
File saved:  C:/Users/silvh/OneDrive/lighthouse/Ginkgo coding/content-summarization/output/chatgpt_embeddings_Daily energy expenditure through the human life course.pdf.sav
Time completed: 2023-04-04 19:45:50.931441
File saved:  C:/Users/silvh/OneDrive/lighthouse/Ginkgo coding/content-summarization/output/embeddings_df_Daily energy expenditure through the human life course.pdf.sav
Time completed: 2023-04-04 19:45:50.934440


In [31]:
df2b

Unnamed: 0,text,page,length,embeddings
0,Daily Energy Expenditure through the Human Lif...,0,2983,"[0.01744239404797554, 0.001282965182326734, 0...."
1,activity ( 11–13) and age-related changes in t...,1,3653,"[0.012214376591145992, -0.005045928526669741, ..."
2,Juveniles (1 to 20 y):\nTotal and basal expend...,2,3501,"[0.009753364138305187, -0.010705556720495224, ..."
3,"absolute and size-adjusted values, then accele...",3,3713,"[0.006110848393291235, -0.014762049540877342, ..."
4,tissue-specific metabolism in early life may b...,4,3301,"[-0.004913498647511005, -0.020166708156466484,..."
5,"R. Westerterp61,†, William W. Wong18,†, John R...",5,3499,"[-0.0063083297573029995, -0.004717382602393627..."
6,"of Anthropology, Northwestern University, Evan...",6,3397,"[0.0033165293280035257, -1.1859740880026948e-0..."
7,Data Availability\nAll data used in these anal...,7,3568,"[0.0026076557114720345, -0.007029188331216574,..."
8,"19. Alderman H, Headey D, The timing of growth...",8,4204,"[-0.0005258726887404919, -0.015454488806426525..."
9,"42. Muggeo VMR, . Segmented: an R package to f...",9,2437,"[0.0022293475922197104, -0.007832749746739864,..."


In [35]:
df2b.loc[1, 'text'][-200:]

' ~50% elevated compared to adults (Figure 2).Pontzer et al. Page 2\nScience . Author manuscript; available in PMC 2022 August 13.\nAuthor Manuscript Author Manuscript Author Manuscript Author Manuscript'

## Test out prompts

In [36]:


def reply(df, user_input):
    chatbot = Chatbot()
    prompt = chatbot.create_prompt(df, user_input)
    try:
        response = chatbot.gpt(prompt)
        return response
    except:
        print('**Unable to get response**')
        return prompt

prompt = """
Can you rephrase that for a 12 year old
"""
reply2b = reply(df2b, prompt)


[{'Page 1': 'activity ( 11–13) and age-related changes in tissue-specific metabolic rates, as have been reported for the brain (\n14), are unclear. Total and basal ...'}, {'Page 11': 'Figure 2. \nFat free mass- and fat mass-adjusted expenditures over the life course. Individual subjects and age-sex cohort mean ± SD are shown. For bot...'}, {'Page 10': 'Figure 1. \nA. Total expenditure (TEE) increases with fat free mass in a power-law manner, but age groups cluster about the trend line differently. B. ...'}]
                                                text  page  length  \
0  activity ( 11–13) and age-related changes in t...     1    3653   
1  Figure 2. \nFat free mass- and fat mass-adjust...    11    1041   
2  Figure 1. \nA. Total expenditure (TEE) increas...    10     795   

                                          embeddings  similarity  
0  [0.012214376591145992, -0.005045928526669741, ...    0.722839  
1  [0.016398467123508453, -0.011595568619668484, ...    0.721423  
2  [0

In [38]:
reply2b['answer']

'This study looked at how our bodies use energy at different ages. They found that the more muscle someone has, the more energy they use. This is why kids who are growing and going through puberty use more energy than adults. They also found that as people get older and lose muscle mass, they use less energy. The study used a big group of people from lots of different countries to figure this out.'

In [39]:
prompt = """
TLDR
"""
reply2b_2 = reply(df2b, prompt)

[{'Page 9': '42. Muggeo VMR, . Segmented: an R package to fit regression models with broken-line relationships. R News8/1, 20–25 (2008).\n43. Elia M, in Physiology,...'}, {'Page 0': 'Daily Energy Expenditure through the Human Life Course\nA full list of authors and affiliations appears at the end of the article.\nAbstract\nTotal daily...'}, {'Page 6': 'of Anthropology, Northwestern University, Evanston, IL, USA 35Imperial College London Diabetes Centre, Abu Dhabi, United Arab Emirates and Imperial Co...'}]
                                                text  page  length  \
0  42. Muggeo VMR, . Segmented: an R package to f...     9    2437   
1  Daily Energy Expenditure through the Human Lif...     0    2983   
2  of Anthropology, Northwestern University, Evan...     6    3397   

                                          embeddings  similarity  
0  [0.0022293475922197104, -0.007832749746739864,...    0.724719  
1  [0.01744239404797554, 0.001282965182326734, 0....    0.720568  
2  [0.

In [40]:
reply2b_2['answer']

'I\'m sorry, but I cannot provide a detailed answer based on the paper as there is no clear connection between the paper and the given query "TLDR". The paper is about daily energy expenditure through the human life course, while the query "TLDR" is a request for a brief summary. Can you please provide more information or context so I can assist you better?'

In [None]:
prompt = """
TLDR
"""
reply2b_2 = reply(df2b, prompt)

[{'Page 9': '42. Muggeo VMR, . Segmented: an R package to fit regression models with broken-line relationships. R News8/1, 20–25 (2008).\n43. Elia M, in Physiology,...'}, {'Page 0': 'Daily Energy Expenditure through the Human Life Course\nA full list of authors and affiliations appears at the end of the article.\nAbstract\nTotal daily...'}, {'Page 6': 'of Anthropology, Northwestern University, Evanston, IL, USA 35Imperial College London Diabetes Centre, Abu Dhabi, United Arab Emirates and Imperial Co...'}]
                                                text  page  length  \
0  42. Muggeo VMR, . Segmented: an R package to f...     9    2437   
1  Daily Energy Expenditure through the Human Lif...     0    2983   
2  of Anthropology, Northwestern University, Evan...     6    3397   

                                          embeddings  similarity  
0  [0.0022293475922197104, -0.007832749746739864,...    0.724719  
1  [0.01744239404797554, 0.001282965182326734, 0....    0.720568  
2  [0.

In [41]:
prompt = """
Summarize the research in under 300 characters
"""
reply2b_3 = reply(df2b, prompt)

[{'Page 6': 'of Anthropology, Northwestern University, Evanston, IL, USA 35Imperial College London Diabetes Centre, Abu Dhabi, United Arab Emirates and Imperial Co...'}, {'Page 5': 'R. Westerterp61,†, William W. Wong18,†, John R. Speakman62,27,28,63,*,†, IAEA DLW database consortium\n#\nAffiliations\n1.Evolutionary Anthropology, Duke...'}, {'Page 9': '42. Muggeo VMR, . Segmented: an R package to fit regression models with broken-line relationships. R News8/1, 20–25 (2008).\n43. Elia M, in Physiology,...'}]
                                                text  page  length  \
0  of Anthropology, Northwestern University, Evan...     6    3397   
1  R. Westerterp61,†, William W. Wong18,†, John R...     5    3499   
2  42. Muggeo VMR, . Segmented: an R package to f...     9    2437   

                                          embeddings  similarity  
0  [0.0033165293280035257, -1.1859740880026948e-0...    0.760887  
1  [-0.0063083297573029995, -0.004717382602393627...    0.752493  
2  [0.

In [42]:
reply2b_3['answer']

"I'm sorry, I cannot provide a summary of the research as the text embeddings provided do not seem to belong to a specific paper or study. They include a list of authors, affiliations, and references to various scientific papers on topics such as anthropology, nutrition, and physical activity. Without a specific paper to reference, I cannot provide a summary of its findings."

In [43]:
prompt = """
At what age does daily expenditure decline in adults?
"""
reply2b_4 = reply(df2b, prompt)
reply2b_4['answer']

[{'Page 11': 'Figure 2. \nFat free mass- and fat mass-adjusted expenditures over the life course. Individual subjects and age-sex cohort mean ± SD are shown. For bot...'}, {'Page 2': 'Juveniles (1 to 20 y):\nTotal and basal expenditure, along with fat free mass, continued to increase with age throughout childhood and adolescence (Fig...'}, {'Page 0': 'Daily Energy Expenditure through the Human Life Course\nA full list of authors and affiliations appears at the end of the article.\nAbstract\nTotal daily...'}]
                                                text  page  length  \
0  Figure 2. \nFat free mass- and fat mass-adjust...    11    1041   
1  Juveniles (1 to 20 y):\nTotal and basal expend...     2    3501   
2  Daily Energy Expenditure through the Human Lif...     0    2983   

                                          embeddings  similarity  
0  [0.016398467123508453, -0.011595568619668484, ...    0.852639  
1  [0.009753364138305187, -0.010705556720495224, ...    0.850212  
2  [

'According to the paper "Daily Energy Expenditure through the Human Life Course", daily expenditure in adults declines after the age of 60. The study analyzed a large, globally diverse database of total expenditure measured by the doubly labeled water method for males and females aged 8 days to 95 years. The results showed that fat free mass-adjusted daily expenditure remains stable in adulthood (20-60 years) even during pregnancy, but declines in older adults (60+ years). The decline in expenditure in older adults is not only a function of reduced fat free mass and fat mass but is also evident in adjusted total expenditure and adjusted basal expenditure. Segmented regression analysis identified a break point at 63.0 years (95% CI: 60.1, 65.9), after which adjusted TEE begins to decline. This break point was somewhat earlier for adjusted basal expenditure (46.5, 95% CI: 40.6, 52.4). Therefore, the paper suggests that daily expenditure declines in adults after the age of 60.'

In [44]:
prompt = """
Summarize the research in under 300 characters
"""
reply2b_5 = reply(df2b, prompt)

reply2b_5['answer']

[{'Page 6': 'of Anthropology, Northwestern University, Evanston, IL, USA 35Imperial College London Diabetes Centre, Abu Dhabi, United Arab Emirates and Imperial Co...'}, {'Page 5': 'R. Westerterp61,†, William W. Wong18,†, John R. Speakman62,27,28,63,*,†, IAEA DLW database consortium\n#\nAffiliations\n1.Evolutionary Anthropology, Duke...'}, {'Page 9': '42. Muggeo VMR, . Segmented: an R package to fit regression models with broken-line relationships. R News8/1, 20–25 (2008).\n43. Elia M, in Physiology,...'}]
                                                text  page  length  \
0  of Anthropology, Northwestern University, Evan...     6    3397   
1  R. Westerterp61,†, William W. Wong18,†, John R...     5    3499   
2  42. Muggeo VMR, . Segmented: an R package to f...     9    2437   

                                          embeddings  similarity  
0  [0.0033165293280035257, -1.1859740880026948e-0...    0.760887  
1  [-0.0063083297573029995, -0.004717382602393627...    0.752493  
2  [0.

"I'm sorry, but I cannot provide a detailed answer based on the paper as the embeddings provided do not seem to be related to a specific paper or research topic. The embeddings consist of author affiliations, acknowledgments, and references to other papers. Can you please provide more information or a specific research topic?"

# Iteration 3: Improve text processing
Previous iterations gave poor results to queries

In [48]:
env_name = 'api_openai'

class Chatbot():
    
    def extract_text(self, pdf):
        print("Parsing paper")
        number_of_pages = len(pdf.pages)
        print(f"Total number of pages: {number_of_pages}")
        paper_text = []
        for i in range(number_of_pages):
            page = pdf.pages[i]
            page_text = []

            def visitor_body(text, cm, tm, fontDict, fontSize):
                x = tm[4]
                y = tm[5]
                # ignore header/footer
                if (y > 50 and y < 720) and (len(text.strip()) > 1):
                    page_text.append({
                    'fontsize': fontSize,
                    'text': text.strip().replace('\x03', ''),
                    'x': x,
                    'y': y
                    })

            _ = page.extract_text(visitor_text=visitor_body)

            blob_font_size = None
            blob_text = ''
            processed_text = []

            for t in page_text:
                if t['fontsize'] == blob_font_size:
                    blob_text += f" {t['text']}"
                    if len(blob_text) >= 2000:
                        processed_text.append({
                            'fontsize': blob_font_size,
                            'text': blob_text,
                            'page': i
                        })
                        blob_font_size = None
                        blob_text = ''
                else:
                    if blob_font_size is not None and len(blob_text) >= 1:
                        processed_text.append({
                            'fontsize': blob_font_size,
                            'text': blob_text,
                            'page': i
                        })
                    blob_font_size = t['fontsize']
                    blob_text = t['text']
                paper_text += processed_text
        print("Done parsing paper")
        # print(paper_text)
        return paper_text

    def create_df(self, pdf):
        print('Creating dataframe')
        filtered_pdf= []
        for row in pdf:
            if len(row['text']) < 30:
                continue
            filtered_pdf.append(row)
        df = pd.DataFrame(filtered_pdf)
        # print(df.shape)
        # remove elements with identical df[text] and df[page] values
        df = df.drop_duplicates(subset=['text', 'page'], keep='first')
        df['length'] = df['text'].apply(lambda x: len(x))
        print('Done creating dataframe')
        return df
    
    def embeddings(self, df):
        print('Calculating embeddings')
        openai.api_key = os.getenv(env_name)
        embedding_model = "text-embedding-ada-002"
        embeddings = df.text.apply([lambda x: get_embedding(x, engine=embedding_model)])
        df["embeddings"] = embeddings
        print('Done calculating embeddings')
        return df

    def search(self, df, query, n=3, pprint=True):
        query_embedding = get_embedding(
            query,
            engine="text-embedding-ada-002"
        )
        df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding))
        
        results = df.sort_values("similarity", ascending=False, ignore_index=True)
        # make a dictionary of the the first three results with the page number as the key and the text as the value. The page number is a column in the dataframe.
        results = results.head(n)
        global sources 
        sources = []
        for i in range(n):
            # append the page number and the text as a dict to the sources list
            sources.append({'Page '+str(results.iloc[i]['page']): results.iloc[i]['text'][:150]+'...'})
        print(sources)
        return results.head(n)
    
    def create_prompt(self, df, user_input,
        system_role):
        result = self.search(df, user_input, n=3)
        print(result)
        system_role = """whose expertise is reading and summarizing scientific papers. You are given a query, 
        a series of text embeddings and the title from a paper in order of their cosine similarity to the query. 
        You must take the given embeddings and return a very detailed summary of the paper in the language of the query: 
            
        Here is the question: """+ user_input + """
            
        and here are the embeddings: 
            
            1.""" + str(result.iloc[0]['text']) + """
            2.""" + str(result.iloc[1]['text']) + """
            3.""" + str(result.iloc[2]['text']) + """
        """

        user_content = f"""Given the question: "{str(user_input)}". Return a detailed answer based on the paper:"""

        messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": user_content},]

        print('Done creating prompt')
        return messages

def process_pdf(pdf_filename, path, filename=None,
    save_path=r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\output'
    ):
    """
    Extract the text from the saved PDF and create the embeddings using OpenAI.
    Parameters:
        - pdf_filename (str): Filename of PDF.
        - path (raw string): File path copied from File Explorer.
    Returns:
        - DataFrame containing parsed text and word embeddings.
        - PdfReader(filename) class object.
        - Chatbot() class object.
    """
    print("Processing PDF")
    filename = f'{path}/'.replace('\\','/')+pdf_filename

    pdf = PdfReader(filename)
    chatbot = Chatbot()
    paper_text = chatbot.extract_text(pdf)
    df = chatbot.create_df(paper_text)
    df = chatbot.embeddings(df)
    if filename:
        try:
            filename = filename if (type(filename) == str) else (re.sub(r'(.*).pdf', r'\1', pdf_filename))
            savepickle(chatbot, filename='chatgpt_embeddings_'+pdf_filename, path=save_path)
            savepickle(df, filename='embeddings_df_'+pdf_filename, path=save_path)
        except:
            print('Unable to save outputs')
    
    return df, pdf, chatbot

pdf_filename = "Hypohydration but not menstrual phase influences pain perception in healthy women.pdf"
path = r"C:\Users\silvh\OneDrive\Ginkgo\knowledge library references\journal articles"

df1_i3, pdf1_i3, chatbot1_i3 = process_pdf(pdf_filename, path, filename=None)

Processing PDF
Parsing paper
Total number of pages: 27


TypeError: extract_text() got an unexpected keyword argument 'visitor_text'

In [57]:
pdf1_i3

NameError: name 'pdf1_i3' is not defined

In [None]:


def reply(df, user_input):
    chatbot = Chatbot()
    prompt = chatbot.create_prompt(df, user_input)
    try:
        response = chatbot.gpt(prompt)
        return response
    except:
        print('**Unable to get response**')
        return prompt


# *End of Page*