Top 3 sections: Abstract, Introduction, Conclusion

In [None]:
import pandas as pd
# Load the preprocessed dataset; later cells read its 'Abstract', 'Introduction'
# and 'Conclusion' columns.
df = pd.read_csv("dataset/df_data_preprocessed.csv")

In [None]:
!pip3 -q install transformers

In [None]:
from transformers import BertTokenizer
import pandas as pd
# Tokenizer used by truncate_row_to_limit_sentence to count tokens per sentence.
# NOTE(review): BERT token counts are presumably only a proxy for the GPT
# model's own tokenizer — confirm the budget still holds for the target model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Cap the combined token count of the selected sections at `max_tokens`
# (default 12,000; GPT-4 can take 128k tokens, but long prompts are costly).
def truncate_row_to_limit_sentence(row, columns, max_tokens=12000, tokenize=None):
    """Trim a row's text columns so that, together, they fit a token budget.

    Columns are consumed in the order given. Each column is split into
    sentences on '.', and whole sentences are kept until the next sentence
    would push the running total above ``max_tokens``. From that point on the
    current column is cut at the last sentence that fit and every remaining
    column is emptied, so the returned row never exceeds the budget.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) holding the text columns.
        columns: Column names to process, in priority order.
        max_tokens: Maximum total number of tokens across all ``columns``.
        tokenize: Optional callable mapping a string to a sequence of tokens.
            Defaults to the ``tokenize`` method of the module-level ``tokenizer``.

    Returns:
        A copy of ``row`` with the processed columns truncated; missing values
        in those columns are replaced by ``""``. The input row is not modified.

    Note:
        Splitting on '.' also breaks decimals and abbreviations ("3.5" becomes
        "3. 5."); this matches the previous behaviour of the function.
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize  # module-level tokenizer defined above

    # Work on a copy: mutating the object handed out by DataFrame.apply is unsupported.
    row = row.copy()
    remaining = max_tokens
    budget_exhausted = False

    for column in columns:
        text = row[column]
        # Once the budget is spent, every later column must be emptied too;
        # otherwise the row would still exceed max_tokens.
        if budget_exhausted or pd.isna(text):
            row[column] = ""
            continue

        kept_sentences = []
        for fragment in str(text).split('.'):
            fragment = fragment.strip()
            if not fragment:
                continue
            sentence = fragment + '.'
            cost = len(tokenize(sentence))
            if cost > remaining:
                # This sentence does not fit: stop here and drop everything after it.
                budget_exhausted = True
                break
            kept_sentences.append(sentence)
            remaining -= cost

        row[column] = ' '.join(kept_sentences)

    return row

# Sections that share the token budget; earlier entries consume it first.
columns_to_process = ['Abstract', 'Introduction', 'Conclusion']
df = df.apply(
    truncate_row_to_limit_sentence,
    axis=1,
    args=(columns_to_process,),
)


In [None]:
# Build the prompt body for each paper: one labelled line per section,
# terminated by a newline.
df['response_string'] = df.apply(
    lambda row: (
        f"Abstract: {row['Abstract']}\n"
        f"Introduction: {row['Introduction']}\n"
        f"Conclusion: {row['Conclusion']}\n"
    ),
    axis=1,
)

In [None]:
!pip3 -q install openai

RAG + GPT-3.5 Turbo

In [None]:
!pip install -q llama-index
!pip install -q openai
!pip install -q transformers
!pip install -q accelerate
!pip -q install llama-index-core
!pip -q install llama-index-llms-openai
!pip -q install llama-index-llms-replicate
!pip -q install llama-index-embeddings-huggingface

In [None]:
import os

import pandas as pd

# Directory that holds the retrieval corpus written below.
RAG_CORPUS_DIR = 'dataset/future_work_rag'
# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(RAG_CORPUS_DIR, exist_ok=True)

# Sample 100 random rows from df for the training (retrieval) set;
# the fixed random_state keeps the split reproducible.
df_rag_train = df.sample(n=100, random_state=42)

# The testing set is every row of df that was not sampled above.
df_rag_test = df.drop(df_rag_train.index)

# df_rag_train now holds the 100 sampled rows and df_rag_test holds the rest.
df_rag_train.to_csv(f'{RAG_CORPUS_DIR}/df_rag_train_fw.csv', index=False)
df_rag_test.to_csv('dataset/df_rag_test_fw.csv', index=False)

In [None]:
from llama_index.core.llms import LLM
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import Settings
from IPython.display import Markdown, display
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import ServiceContext, set_global_service_context
from llama_index.llms import openai
import os
import logging
import sys

import getpass

# Never overwrite an already-configured key with a blank value (and never
# hardcode one in the notebook): prompt only when the variable is missing.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

# Build a vector index over every file in the retrieval-corpus directory.
documents = SimpleDirectoryReader("dataset/future_work_rag").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# Renumber the test rows 0..n-1 and keep the original labels in an 'index' column.
# Re-running this cell adds another index column, so run it once per kernel.
df_rag_test = df_rag_test.reset_index(drop=False)

# Prompt body per test paper: one labelled line per section.
df_rag_test['response_string'] = df_rag_test.apply(lambda row: f"""Abstract: {row['Abstract']}
Introduction: {row['Introduction']}
Conclusion: {row['Conclusion']}
""", axis=1)

# Persist the index and load it back from the same, explicitly named directory.
PERSIST_DIR = "./storage"
index.storage_context.persist(persist_dir=PERSIST_DIR)

storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context=storage_context)

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# A single stdout handler on the root logger: adding a second StreamHandler on
# top of basicConfig prints every record twice. force=True replaces any
# handlers the environment installed earlier.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Global llama-index settings, assigned before the index is rebuilt on the
# last line of this group.
Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
# NOTE(review): 3900 is far below the 12,000-token cap applied to the section
# text earlier; presumably this is the source of the negative-context-size
# ValueError handled in the generation loop — confirm.
Settings.context_window = 3900
# Rebuild the index from the already-loaded documents; this replaces the
# `index` created in the previous cell.
index = VectorStoreIndex.from_documents(documents)

# Instruction text prepended to every paper's 'response_string' when querying.
# The indentation and trailing newline inside the literal are part of the prompt.
system_prompt = """You are an AI trained to analyze scientific research and suggest future directions based on the content of a paper.
    Below, you will find sections from a scientific article including the 'Abstract', 'Introduction', 'Conclusion' of a scientific paper.
    Based on these details, please generate comprehensive and plausible future work suggestions that could extend the research findings,
    address limitations, and propose new avenues for exploration.
    Generate a future work based on these texts. Future work should be within 100 words. \n"""

import io
import sys

# One entry per row of df_rag_test, in row order. None marks a row whose query
# failed, so the list always stays aligned with the DataFrame.
generated_limitations = []

# The engine does not depend on the row being processed, so build it once.
query_engine = index.as_query_engine(
    similarity_top_k=3,
    streaming=True,
)

for paper_text in df_rag_test['response_string']:
    try:
        response = query_engine.query(system_prompt + ": " + paper_text)
        # Drain the token stream directly instead of printing it and capturing
        # sys.stdout, which would also swallow any unrelated output.
        limitation_text = "".join(response.response_gen)
    except ValueError as e:
        message = str(e)
        # Match the error by its fixed wording, not by one specific size value.
        if "Calculated available context size" in message and "was not non-negative" in message:
            print("Caught a ValueError due to negative context size: ", message)
            # Record a placeholder; skipping the row would shift every later
            # result onto the wrong paper.
            generated_limitations.append(None)
            continue
        raise

    generated_limitations.append(limitation_text.strip())

In [None]:
# Each generated text must correspond to exactly one test row. Fail loudly on a
# mismatch rather than letting index alignment pad or mis-pair the results.
if len(generated_limitations) != len(df_rag_test):
    raise ValueError(
        f"Expected {len(df_rag_test)} generated texts, got {len(generated_limitations)}; "
        "results would be misaligned with df_rag_test."
    )

df_generated_future_work = pd.DataFrame(generated_limitations, columns=['Future_Work'])
# Assign by position so the pairing does not depend on df_rag_test's index labels.
df_rag_test['generated_future_work'] = df_generated_future_work['Future_Work'].to_numpy()

In [None]:
# Write the test rows together with their generated future-work text.
# NOTE(review): the filename says "gpt_4o", but Settings.llm above is configured
# with "gpt-3.5-turbo" — confirm which model produced these results.
df_rag_test.to_csv('df_rag_future_work_gpt_4o_3_imp_sections.csv', index=False)