This is meant to take the papers and create a training dataset of 
1) research goal prompt
2) research hypothesis

In [4]:
import os
import pandas as pd
from llm import complete_text_openai

# Folder path and maximum characters per file
logs_folder_path = 'autoscious_logs/'
MAX_CHARS_PER_PAPER = 50000 # 12500 tokens. GPT-3.5-Turbo-1106 has max context window of 16K, max output tokens of 4K.
MIN_CHARS_PER_PAPER = 500 # There's a decent amount of just errors in the text files, so normal and meaningful papers will be above this

# List to store file contents
file_contents = []

# Iterate through each file in the folder
for paper_file in os.listdir(logs_folder_path):
    file_path = os.path.join(logs_folder_path, paper_file)

    # Check if it's a file
    if os.path.isfile(file_path):
        # Read the file's content
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Check if the content is long enough
        if len(content) < MIN_CHARS_PER_PAPER:
            continue

        # Truncate the content if it exceeds the limit
        if len(content) > MAX_CHARS_PER_PAPER:
            content = content[:MAX_CHARS_PER_PAPER]

        # Append the content to the list
        file_contents.append(content)

# Create a DataFrame
df = pd.DataFrame(file_contents, columns=['paper_full_text'])

# Save the DataFrame to a CSV file
df.to_csv('autoscious_logs_full-text.csv', index=False)
print("Number of papers: ", len(df))

In [17]:
# Create research goal for each paper
def create_research_goal_prompt(full_paper_text):
    try:
        return complete_text_openai(f'''You will help me create a training dataset for generating a research goal prompt based on a research paper text. Try your best to determine what the starting research goal was before the project approach was determined and executed on.

Research paper text: {full_paper_text}

Return only the research goal prompt in the format "Propose a reasonable hypothesis about how to <insert research goal>". Do not respond with any conversation or explanation. If you cannot write a research goal prompt because there doesn't seem to be a research goal, just respond with "N/A".''')
    except Exception as e:
        print("Exception: ", e)
        return 'N/A'
    
# Read the existing DataFrame
df = pd.read_csv('autoscious_logs_full-text.csv')

# Before applying, add mechanism in case it fails to pick back up
# Add a column for status if it doesn't exist
if 'Status' not in df.columns:
    df['Status'] = 'Not Processed'

# Add a column for the research goal if it doesn't exist
if 'research_goal' not in df.columns:
    df['research_goal'] = None

# Apply the function to each row in the 'paper_full_text' column
for index, row in df.iterrows():
    if row['Status'] == 'Not Processed':
        try:
            # Print the current index
            print(f'Processing index: {index}')

            # Apply the function
            result = create_research_goal_prompt(row['paper_full_text'])
            df.at[index, 'research_goal'] = result
            df.at[index, 'Status'] = 'Processed'

        except Exception as e:
            print(f'Error at index {index}: {e}')
            df.at[index, 'Status'] = 'Failed'

        # Save progress intermittently
        if index % 20 == 0:
            df.to_csv('autoscious_logs_progress_full-text_goal.csv', index=False)

# Save the updated DataFrame back to CSV
df.drop(columns=['Status'], inplace=True)
df.to_csv('autoscious_logs_complete_full-text_goal.csv', index=False)

Processing index: 0
Processing index: 1
Processing index: 2
Processing index: 3
Processing index: 4
Processing index: 5
Processing index: 6
Processing index: 7
Processing index: 8
Processing index: 9
Processing index: 10
Processing index: 11
Processing index: 12
Processing index: 13
Processing index: 14
Processing index: 15
Processing index: 16
Processing index: 17
Processing index: 18
Processing index: 19
Processing index: 20
Exception:  Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, you requested 16461 tokens (14461 in the messages, 2000 in the completion). Please reduce the length of the messages or completion.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Processing index: 21
Processing index: 22
Processing index: 23
Processing index: 24
Processing index: 25
Processing index: 26
Processing index: 27
Processing index: 28
Processing index: 29
Processing index: 30
Exception:  Error co

In [32]:
df.head()

Unnamed: 0,paper_full_text,research_goal,research_hypothesis
0,Affinity Improvement of a Therapeutic Antibody...,Propose a reasonable hypothesis about how to o...,
1,We use essential cookies to make sure the site...,Propose a reasonable hypothesis about how to i...,
2,An Immunogenic Personal Neoantigen Vaccine for...,Propose a reasonable hypothesis about how to g...,
3,1\nSCieNTifiC REPORTS | (2018) 8:2260 | DO...,Propose a reasonable hypothesis about how to i...,
4,Application of PBPK modeling to predict monocl...,Propose a reasonable hypothesis about how to p...,


In [33]:
# Create research hypothesis and proposal for each paper and research goal
def create_research_hypothesis_proposal_prompt(full_paper_text, research_goal):
    try:
        return complete_text_openai(f'''You will help me create a training dataset for generating a research hypothesis and proposal based on a research paper text and its corresponding research goal. Try your best to determine what the initial research hypothesis and proposal was after the research goal was determined, but before the project approach was determined.

Research goal: {research_goal}

Research paper text: {full_paper_text}

Return only the research hypothesis and proposal. Do not respond with any conversation. If you cannot write a research hypothesis and proposal because there doesn't seem to be a research goal or research paper text, just respond with "N/A".''')
    except Exception as e:
        print("Exception: ", e)
        return 'N/A'
    
# Read the existing DataFrame
df = pd.read_csv('autoscious_logs_complete_full-text_goal.csv')

# Before applying, add mechanism in case it fails to pick back up
# Add a column for status if it doesn't exist
if 'Status' not in df.columns:
    df['Status'] = 'Not Processed'

# Add a column for the research goal if it doesn't exist
if 'research_hypothesis' not in df.columns:
    df['research_hypothesis'] = None

# Apply the function to each row in the 'paper_full_text' column
for index, row in df.iterrows():
    if row['Status'] == 'Not Processed':
        try:
            # Print the current index
            print(f'Processing index: {index}')

            # Apply the function
            result = create_research_hypothesis_proposal_prompt(row['paper_full_text'], row['research_goal'])
            df.at[index, 'research_hypothesis'] = result
            df.at[index, 'Status'] = 'Processed'

        except Exception as e:
            print(f'Error at index {index}: {e}')
            df.at[index, 'Status'] = 'Failed'

        # Save progress intermittently
        if index % 20 == 0:
            df.to_csv('autoscious_logs_progress_full-text_goal_hypothesis.csv', index=False)

# Save the updated DataFrame back to CSV
df.drop(columns=['Status'], inplace=True)
df.to_csv('autoscious_logs_complete_full-text_goal_hypothesis.csv', index=False)

Processing index: 0
Processing index: 1
Processing index: 2
Processing index: 3
Processing index: 4
Processing index: 5
Processing index: 6
Processing index: 7
Processing index: 8
Processing index: 9
Processing index: 10
Processing index: 11
Processing index: 12
Processing index: 13
Processing index: 14
Processing index: 15
Processing index: 16
Processing index: 17
Processing index: 18
Processing index: 19
Processing index: 20
Exception:  Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, you requested 16467 tokens (14467 in the messages, 2000 in the completion). Please reduce the length of the messages or completion.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Processing index: 21
Processing index: 22
Processing index: 23
Processing index: 24
Processing index: 25
Processing index: 26
Processing index: 27
Processing index: 28
Processing index: 29
Processing index: 30
Exception:  Error co

In [36]:
# Drop any rows with N/A before fine-tuning
import numpy as np

# Read the existing DataFrame
df = pd.read_csv('autoscious_logs_complete_full-text_goal_hypothesis.csv')

# Replace 'N/A' strings with NaN
df = df.replace('N/A', np.nan)

# Drop any rows with NaN
df = df.dropna()

df.to_csv('autoscious_logs_complete_cleaned.csv', index=False)

In [38]:
df.head()

Unnamed: 0,paper_full_text,research_goal,research_hypothesis
0,Affinity Improvement of a Therapeutic Antibody...,Propose a reasonable hypothesis about how to o...,Research Hypothesis: Introducing charged resid...
2,An Immunogenic Personal Neoantigen Vaccine for...,Propose a reasonable hypothesis about how to g...,Research hypothesis: Vaccination with neoantig...
3,1\nSCieNTifiC REPORTS | (2018) 8:2260 | DO...,Propose a reasonable hypothesis about how to i...,Research Hypothesis:\nThe affinity maturation ...
4,Application of PBPK modeling to predict monocl...,Propose a reasonable hypothesis about how to p...,Research Hypothesis:\nThe disposition of monoc...
5,Nucleic acids can not only hybridize to one an...,Propose a reasonable hypothesis about how to d...,Research hypothesis: We hypothesize that by us...
