In [None]:
import pandas as pd
# Load the preprocessed dataset into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="dataset/df_data_preprocessed.csv")

In [None]:
# Average of the 'Word Count' column.
# Values that cannot be parsed as numbers become NaN (errors='coerce'),
# and Series.mean() skips NaN by default.
word_counts = pd.to_numeric(df['Word Count'], errors='coerce')
df['Word Count'] = word_counts
average_word_count = word_counts.mean()
print(f"Average Word Count: {average_word_count}")


Average Word Count: 67.6675


In [None]:
!pip3 -q install transformers

In [None]:
from transformers import BertTokenizer
import pandas as pd
# Load the pretrained 'bert-base-uncased' tokenizer; the truncation helper below
# calls tokenizer.tokenize() to count the tokens of each sentence.
# NOTE(review): these are BERT token counts, which presumably differ from the
# counts of the OpenAI model queried later — treat the limit as approximate; confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Cap the combined text of one row at a token budget (12,000 tokens by default)
# so that the prompt built from it stays affordable.
def truncate_row_to_limit_sentence(row, columns, max_tokens=12000, tokenize=None):
    """Trim a row's text columns, sentence by sentence, to a shared token budget.

    The columns are processed in the order given and all draw from the same
    budget. Whole sentences are kept while the running token total stays within
    ``max_tokens``. As soon as a sentence would exceed the budget, the current
    column is cut before that sentence and every later column is emptied, so
    the total across ``columns`` never exceeds ``max_tokens``.

    Args:
        row: Mapping-like record (e.g. a ``pandas.Series`` handed over by
            ``DataFrame.apply(..., axis=1)``) holding the text columns.
        columns: Sequence of column names to process, in priority order.
        max_tokens: Maximum number of tokens allowed across all ``columns``.
        tokenize: Optional callable mapping a string to a sequence of tokens.
            When omitted, the notebook-level ``tokenizer.tokenize`` is used.

    Returns:
        A copy of ``row`` with the listed columns truncated. Missing values in
        those columns are replaced by empty strings. The input row itself is
        left untouched.

    Note:
        Sentences are obtained by splitting on '.', so every '.' (including
        those in abbreviations or decimal numbers) is treated as a sentence
        end, and the kept sentences are re-joined with single spaces.
    """
    if tokenize is None:
        # Default to the tokenizer created at notebook level.
        tokenize = tokenizer.tokenize

    # Work on a copy: pandas does not support functions that mutate the object
    # passed in by apply().
    row = row.copy()
    used_tokens = 0

    for position, column in enumerate(columns):
        text = row[column]
        if pd.isna(text):
            row[column] = ""
            continue

        # Split into non-empty sentences, restoring the trailing period.
        sentences = [part.strip() + '.' for part in text.split('.') if part.strip()]

        kept = []
        for sentence in sentences:
            token_count = len(tokenize(sentence))
            if used_tokens + token_count > max_tokens:
                # Budget exhausted: keep what fits of this column and drop the
                # remaining columns entirely, otherwise their untouched text
                # would push the row over the limit.
                row[column] = ' '.join(kept)
                for later_column in columns[position + 1:]:
                    row[later_column] = ""
                return row
            kept.append(sentence)
            used_tokens += token_count

        # Every sentence of this column fits within the budget.
        row[column] = ' '.join(kept)

    return row

# Text sections that share the token budget; earlier entries are filled first.
columns_to_process = ['Abstract', 'Introduction', 'Conclusion']
# `args` forwards the column list as the second positional argument for every row.
df = df.apply(truncate_row_to_limit_sentence, axis=1, args=(columns_to_process,))




In [None]:
def _format_sections(row):
    """Render the three text sections of one row as a labelled, newline-separated block."""
    return (
        f"Abstract: {row['Abstract']}\n"
        f"Introduction: {row['Introduction']}\n"
        f"Conclusion: {row['Conclusion']}\n"
    )


# One formatted text block per paper, later appended to the model prompt.
df['response_string'] = df.apply(_format_sections, axis=1)


In [None]:
!pip3 -q install openai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/386.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.9/386.9 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/78.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/325.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.2/325.2 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import getpass
import os
from openai import OpenAI

# Do not hardcode the API key, and do not overwrite one that is already
# configured: read OPENAI_API_KEY from the environment and only prompt for it
# (without echoing) when it is missing or empty.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Instruction text shared by every request; defined once because it does not
# change between papers. The leading spaces on the continuation lines are part
# of the prompt text and are kept as they were.
PROMPT_HEADER = """You are an AI trained to analyze scientific research and suggest future directions based on the content of a paper.
    Below, you will find sections from a scientific article including the 'Abstract', 'Introduction', 'Conclusion' of a scientific paper.
    Based on these details, please generate comprehensive and plausible future work suggestions that could extend the research findings,
    address limitations, and propose new avenues for exploration.
    Generate a future work based on these texts. Future work should be within 100 words. \n"""

# One entry per row of `df`, in row order; each entry is a one-element list so
# the collection can be turned into a single-column DataFrame afterwards.
generated_text = []

# Iterate over the values directly so the loop does not depend on the
# DataFrame's index labels being 0..n-1.
for response_string in df['response_string']:
    prompt = PROMPT_HEADER + response_string

    stream = client.chat.completions.create(
        model="gpt-3.5-turbo",
        # model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        stream=True,
        temperature=0.2  # Adjust the temperature as needed, max_tokens=150
    )

    # Concatenate the streamed pieces; chunks without choices or without
    # content contribute nothing.
    summary_text = "".join(
        chunk.choices[0].delta.content or ""
        for chunk in stream
        if chunk.choices
    )
    generated_text.append([summary_text])

In [None]:
# The generation loop collects its results in `generated_text`
# (the previously referenced `generated_summary` is never defined).
df_generated_future_work = pd.DataFrame(generated_text, columns=['Future_Work'])
# Assign by position: one generated text per row of `df`, in row order. A length
# mismatch raises instead of silently producing NaN through index alignment.
df['Future_Work'] = df_generated_future_work['Future_Work'].to_numpy()

In [None]:
# Persist the DataFrame, including the generated 'Future_Work' column.
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='dataset/gpt3.5_3_imp_sections.csv', index=False)