## GPT API calls for Type 1 Constraint generation

In [None]:
import openai
from openai import OpenAI
import pandas as pd
from datetime import datetime
import os
import openpyxl
from dotenv import load_dotenv
from anthropic import Anthropic

In [None]:
# from google.colab import drive
# from google.colab import files
# drive.mount('/content/drive')

In [None]:
# path = "<Enter path to CSV file containing stories and their corresponding instruction.>"
# output_path = "<Enter path to CSV file where constraints are to be stored.>"

In [None]:
# df = pd.read_csv(path)
# df.head()
# # Ensure indexing of df starts at 0

In [None]:
# Call load_dotenv() (python-dotenv) before reading the key, then build the
# OpenAI client that chat_fn uses for every completion request.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')  # None when the variable is not set

client = OpenAI(api_key=api_key)

In [None]:
# Build the Anthropic client from the CLAUDE_API_KEY environment variable.
claude_api_key = os.getenv('CLAUDE_API_KEY')
claude_client = Anthropic(
    api_key=claude_api_key
)
# Never echo the secret itself: cell output is stored inside the notebook file
# and is easily committed or shared. Report only whether a key was found.
print("CLAUDE_API_KEY loaded:", claude_api_key is not None)

In [None]:
# # system_prompt = """You are a writing expert. I am going to give you a blog as an input. You can assume that a large language model (LLM) generated the blog.
# # Your task is to generate a set of 39 free-form constraints from the blog that you think might have been given to the LLM to generate the blog.
# # DO NOT REPEAT CONSTRAINTS. I am looking for a variety of creative, free-form constraints pertaining to the content of the blog. These constraints need to be atomic. An atomic constraint refers to a single, indivisible constraint/condition. If a constraint encompasses multiple conditions or constraints and can be decomposed further, please do so.
# # Avoid using proper nouns in your constraints. Try to generate constraints that satisfy/drive at least a few sentences in the blog (do not write very specific constraints that will be satisfied by only one line of the blog).
# # Read the input blog carefully, and come up with 39 content-related constraints. These should strictly pertain to the ideas, arguments, or narrative direction of the blog and impact how the blog develops.
# # If (and only if) you cannot write 39 atomic, content-based constraints, give me stylistic constraints based on how the blog is written. These could talk about the tone of the language, use of examples, voice (first/third person), mood, formatting, etc.
# # Write all the constraints in the form of instructions. For example, if you want to write "The blog should include practical tips.", phrase it as "The blog should include practical tips."
# # """

# prompt_examples = """ Here is an example -

# Input -
# Working from home has become the new normal for millions of professionals worldwide. While it offers flexibility and eliminates commutes, it also presents unique challenges that can impact both productivity and well-being.

# To optimize your home workspace, start by creating a dedicated area free from distractions. This space should have good lighting, comfortable seating, and all necessary equipment within reach. Many experts recommend facing a window for natural light, which can boost mood and energy levels.

# Establish clear boundaries between work and personal time. Set specific work hours and stick to them, just as you would in an office. Communicate these boundaries to family members or housemates to minimize interruptions during work hours.

# Take regular breaks throughout the day. The Pomodoro Technique, which involves 25-minute focused work sessions followed by 5-minute breaks, can help maintain concentration and prevent burnout. Use break time to stretch, hydrate, or take a short walk.

# Stay connected with colleagues through regular video calls and instant messaging. This helps maintain team cohesion and prevents feelings of isolation. Schedule virtual coffee breaks or team-building activities to foster relationships.

# Finally, prioritize your physical and mental health. Maintain a regular exercise routine, eat nutritious meals, and get adequate sleep. Consider meditation or mindfulness practices to manage stress and maintain focus.


# Output -
# 1. Require the setting of defined working hours.
# 2. Explain the risks of isolation if connection practices are neglected.
# 3. Warn about the risk of burnout without intentional self-care.
# 4. Explain how the removal of commuting affects time use and daily rhythm.
# 5. Suggest strategies for maintaining healthy eating while at home.
# 6. Emphasize reducing environmental distractions in that space.
# 7. Argue for the necessity of regular breaks during the workday.
# 8. Encourage informal online gatherings to maintain rapport.
# 9. Recommend mindfulness or meditation as stress-management tools.
# 10. Stress the value of adhering consistently to those hours.
# 11. Show how workspace ergonomics (chair, desk) influence long-term health.
# 12. Conclude with a call to action urging readers to adopt concrete changes immediately.
# 13. Link exercise directly to improved cognitive performance and focus.
# 14. Establish remote work as a global trend that has transformed professional life.
# 15. Connect emotional well-being to overall job performance and satisfaction.
# 16. Explain how balanced nutrition influences concentration and resilience.
# 17. Emphasize communicating work schedules to others in the household.
# 18. Identify productivity as a central theme in remote work discussions.
# 19. Recommend scheduled video calls to replicate face-to-face connection.
# 20. Recommend creating a physically separate space for work at home.
# 21. Stress the need for essential tools and equipment to be easily accessible.
# 22. Recommend light physical movement or stretching during pauses.
# 23. Integrate workspace, scheduling, health, and social practices into a unified remote-work
# strategy.
# 24. Stress the importance of sleep in sustaining energy and productivity.
# 25. Highlight the role of hydration and snacks in sustaining energy across breaks.
# 26. Suggest environmental cues (like décor or layout) that reinforce the sense of a work
# zone.
# 27. Describe the importance of adequate lighting for focus and energy.
# 28. Introduce one structured time-management method, such as work intervals.
# 29. Show how mental health practices support long-term work sustainability.
# 30. Show how shared rituals (e.g., virtual coffee breaks) strengthen belonging.
# 31. Highlight well-being as equally important alongside productivity.
# 32. Warn about the risk of personal time erosion without such boundaries.
# 33. Contrast the flexibility of remote work with the new challenges it creates.
# 34. Suggest instant messaging as a tool for quick, ongoing collaboration.
# 35. Highlight how a clear boundary between workspace and leisure areas aids focus.
# 36. Show how enforcing those boundaries prevents interruptions.
# 37. Recommend establishing a routine for daily physical exercise.
# 38. Explain how breaks counteract mental fatigue and sustain performance.
# 39. Stress that remote work requires deliberate maintenance of social contact.
# 40. Show how movement offsets the risks of prolonged sitting.
# """



# user_input = """
# Input - {}
# Output -
# """

In [None]:
# System prompt for constraint extraction. It asks the model for (1) a
# one-sentence main task and (2) 39 atomic, content-focused constraints, and
# embeds one worked example. The "Main Task:" and "Constraints:" labels shown in
# the example output are the markers that extract_main_task searches for, so
# keep them in sync with its regular expressions if the wording changes.
system_prompt = """You are a writing expert. I am going to give you a blog as an input. 
You can assume that a large language model (LLM) generated the blog.

Your task has two parts:
1. Identify the main task of the blog in one sentence. 
   - For example: "The main task is to write a blog about strategies for successful remote working."
   - Phrase the main task as an instruction.
2. Generate a set of 39 free-form constraints that you think might have been given to the LLM to generate the blog.
   - DO NOT REPEAT CONSTRAINTS.
   - Constraints must be atomic (a single indivisible condition). If a constraint can be broken into smaller constraints, do so.
   - Avoid proper nouns in your constraints.
   - Constraints should drive at least a few sentences in the blog (do not write constraints that map to only one line).
   - Constraints must strictly pertain to the content, ideas, arguments, or narrative direction of the blog and should influence how the blog develops.
   - If (and only if) you cannot write 39 atomic, content-based constraints, give stylistic constraints based on how the blog is written (tone, use of examples, formatting, etc.).
   - Write all constraints in the form of instructions. For example: "The blog should include practical tips."
   - Do not write constraints in the same order or phrasing as the blog text. Randomize the order of the constraints.

Here is a worked example to guide you:

Input Blog:
Working from home has become the new normal for millions of professionals worldwide. While it offers flexibility and eliminates commutes, it also presents unique challenges that can impact both productivity and well-being.

To optimize your home workspace, start by creating a dedicated area free from distractions. This space should have good lighting, comfortable seating, and all necessary equipment within reach. Many experts recommend facing a window for natural light, which can boost mood and energy levels.

Establish clear boundaries between work and personal time. Set specific work hours and stick to them, just as you would in an office. Communicate these boundaries to family members or housemates to minimize interruptions during work hours.

Take regular breaks throughout the day. The Pomodoro Technique, which involves 25-minute focused work sessions followed by 5-minute breaks, can help maintain concentration and prevent burnout. Use break time to stretch, hydrate, or take a short walk.

Stay connected with colleagues through regular video calls and instant messaging. This helps maintain team cohesion and prevents feelings of isolation. Schedule virtual coffee breaks or team-building activities to foster relationships.

Finally, prioritize your physical and mental health. Maintain a regular exercise routine, eat nutritious meals, and get adequate sleep. Consider meditation or mindfulness practices to manage stress and maintain focus.

Output:
Main Task: Write a blog about strategies for successful remote working.

Constraints:
1. Require the setting of defined working hours.
2. Explain the risks of isolation if connection practices are neglected.
3. Warn about the risk of burnout without intentional self-care.
4. Explain how the removal of commuting affects time use and daily rhythm.
5. Suggest strategies for maintaining healthy eating while at home.
6. Emphasize reducing environmental distractions in that space.
7. Argue for the necessity of regular breaks during the workday.
8. Encourage informal online gatherings to maintain rapport.
9. Recommend mindfulness or meditation as stress-management tools.
10. Stress the value of adhering consistently to those hours.
11. Show how workspace ergonomics (chair, desk) influence long-term health.
12. Conclude with a call to action urging readers to adopt concrete changes immediately.
13. Link exercise directly to improved cognitive performance and focus.
14. Establish remote work as a global trend that has transformed professional life.
15. Connect emotional well-being to overall job performance and satisfaction.
16. Explain how balanced nutrition influences concentration and resilience.
17. Emphasize communicating work schedules to others in the household.
18. Identify productivity as a central theme in remote work discussions.
19. Recommend scheduled video calls to replicate face-to-face connection.
20. Recommend creating a physically separate space for work at home.
21. Stress the need for essential tools and equipment to be easily accessible.
22. Recommend light physical movement or stretching during pauses.
23. Integrate workspace, scheduling, health, and social practices into a unified remote-work strategy.
24. Stress the importance of sleep in sustaining energy and productivity.
25. Highlight the role of hydration and snacks in sustaining energy across breaks.
26. Suggest environmental cues (like décor or layout) that reinforce the sense of a work zone.
27. Describe the importance of adequate lighting for focus and energy.
28. Introduce one structured time-management method, such as work intervals.
29. Show how mental health practices support long-term work sustainability.
30. Show how shared rituals (e.g., virtual coffee breaks) strengthen belonging.
31. Highlight well-being as equally important alongside productivity.
32. Warn about the risk of personal time erosion without such boundaries.
33. Contrast the flexibility of remote work with the new challenges it creates.
34. Suggest instant messaging as a tool for quick, ongoing collaboration.
35. Highlight how a clear boundary between workspace and leisure areas aids focus.
36. Show how enforcing those boundaries prevents interruptions.
37. Recommend establishing a routine for daily physical exercise.
38. Explain how breaks counteract mental fatigue and sustain performance.
39. Stress that remote work requires deliberate maintenance of social contact.

Now use the same approach for the next input blog.
"""


In [None]:
# Load the merged blog dataset from CSV and preview its first rows.
blog_data_path = "../data/merged_blogs.csv"
blog_dataset = pd.read_csv(blog_data_path)
blog_dataset.head()

In [None]:
# Load the "similar" merged blogs from CSV and preview the first rows.
# This frame is cleaned in the data-cleaning cell and later passed to
# generate_constraints.
similar_blog_data_path = "../data/50_run1/merged_similar_blogs.csv"
blog_dataset_similar = pd.read_csv(similar_blog_data_path)
blog_dataset_similar.head()


In [None]:
# Load the "dissimilar" merged blogs from CSV and preview the first rows.
# NOTE(review): unlike the "similar" frame, no cell in this notebook drops
# null rows from this frame before it is passed to generate_constraints.
dissimilar_blog_data_path = "../data/50_run1/merged_dissimilar_blogs.csv"
blog_dataset_dissimilar = pd.read_csv(dissimilar_blog_data_path)
blog_dataset_dissimilar.head()

In [None]:
# Data cleaning for the "similar" blog dataset.

# Word-count filtering (currently disabled):
# Remove blogs with word count less than 1000
# blog_dataset = blog_dataset[blog_dataset['word_count'] >= 1000]

# #Remove blogs with word count greater than 2000
# blog_dataset = blog_dataset[blog_dataset['word_count'] <= 2000]

# Report null values per column. A bare expression in the middle of a cell is
# evaluated and then discarded, so the counts must be printed explicitly.
print(blog_dataset_similar.isnull().sum())

# Drop every row that has a null in any column.
# NOTE(review): dropna() keeps the original index labels, so the index may have
# gaps afterwards; generate_constraints derives "Instruction Number" from those
# labels. Confirm whether a reset_index(drop=True) is wanted here.
blog_dataset_similar = blog_dataset_similar.dropna()


# Check count of blogs after cleaning
print(len(blog_dataset_similar))

# #Check Unique Provenance
# len(blog_dataset['provenance'].unique())

# Extract only required columns
# blog_dataset = blog_dataset[['text', "text_length"]]

# Fetch 5 random blogs
# sample_blogs = blog_dataset.sample(5)

# print(sample_blogs['text'])
print(blog_dataset_similar['Merged Blog'])

In [None]:
# Functions to track API call cost. 

def log_usage(tokens):
    """Append one usage record to ``api_usage.txt`` in the working directory.

    Each record is a single line of the form
    ``MM-DD-YYYY HH:MM:SS : <tokens>``.

    Args:
        tokens: Token count to record; it is interpolated into the line as-is.
    """
    stamp = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
    record = f"{stamp} : {tokens}\n"

    # Append mode keeps earlier records so total_usage can sum over all of them.
    with open("api_usage.txt", "a") as usage_log:
        usage_log.write(record)


def total_usage(cost_per_1k_tokens=0.0015):
    """Print the total tokens recorded in ``api_usage.txt`` and an estimated cost.

    The file is expected to hold one ``<timestamp> : <tokens>`` record per line,
    as written by ``log_usage``. Lines that do not split into exactly two parts
    on ``" : "``, or whose token field is not an integer, are ignored. A missing
    file is treated as "nothing logged yet" and reported as zero usage.

    Args:
        cost_per_1k_tokens: Price in dollars per 1000 tokens used for the cost
            estimate. Defaults to 0.0015; pass the rate of the model actually
            used to get a meaningful figure.

    Returns:
        None. The totals are printed to standard output.
    """
    try:
        with open("api_usage.txt", "r") as file:
            lines = file.readlines()
    except FileNotFoundError:
        # No API call has been logged yet; report zero instead of crashing.
        lines = []

    total_tokens = 0
    for line in lines:
        # The timestamp contains ':' but never ' : ', so this split is unambiguous.
        parts = line.split(" : ")
        if len(parts) != 2:
            continue
        try:
            total_tokens += int(parts[1])
        except ValueError:
            # Hand-edited or truncated record: skip it rather than abort the sum.
            continue

    cost = (total_tokens * cost_per_1k_tokens) / 1000
    print("Total tokens used so far: ", total_tokens)
    print(f"Total cost so far: {cost}$")
    return

In [None]:
# def get_stories():
#     stories = [list(row.values()) for row in df.to_dict(orient='records')]
#     return stories # Returns one list. Each list element is a list of different numbers of constraints.

In [None]:
def chat_fn(instruction, model="gpt-4.1-mini", system_prompt=system_prompt, log=False):
    """Send one blog to the chat-completions API and return the raw response.

    The user message is built by formatting the module-level ``user_input``
    template with ``instruction``; that template is looked up when the function
    is called, so it must be defined before the first call.

    Args:
        instruction: Blog text inserted into the ``user_input`` template.
        model: Model identifier passed to the API.
        system_prompt: Content of the system message.
        log: When True, also print the token count of this call.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input.format(instruction)},
    ]
    response = client.chat.completions.create(model=model, messages=conversation)

    # Token usage is always appended to the usage log; `log` only controls
    # whether the count is echoed to the console as well.
    log_usage(tokens=response.usage.total_tokens)
    if log:
        print("Total tokens used: ", response.usage.total_tokens)

    return response

In [None]:
# r = chat(df[0], model="gpt-4", system_prompt=system_prompt, log=False)

In [None]:
import pandas as pd
import logging
from time import sleep

def generate_constraints(
    df: pd.DataFrame,
    chat_fn,
    system_prompt: str,
    model: str = "gpt-4.1-mini",
    output_path: str = "../data/constraints.csv",
    retry_attempts: int = 3,
    delay: float = 1.0
):
    """
    Generates constraint text for each blog entry in `df` using an LLM chat function
    and saves the results as a CSV.

    Rows whose 'Merged Blog' value is missing (NaN/None) or blank are not sent to
    the model; they are recorded with an empty 'Constraints' value. A row whose
    generation fails on every attempt is also recorded with empty constraints.

    Parameters:
        df (pd.DataFrame): Input DataFrame with a 'Merged Blog' column. The
            'Instruction Number' is computed as index label + 1, so the index
            labels must be integers.
        chat_fn (callable): Function for LLM chat, called as
            `chat_fn(prompt, model=..., system_prompt=..., log=True)`; its result
            must expose `.choices[0].message.content`.
        system_prompt (str): Context or few-shot prompt for the model.
        model (str): LLM model identifier.
        output_path (str): Output CSV file path. Missing parent directories are created.
        retry_attempts (int): Number of attempts per blog before giving up.
        delay (float): Delay (seconds) between attempts.

    Returns:
        pd.DataFrame: One row per input row with the columns
        'Instruction Number', 'Instruction' and 'Constraints'.

    Raises:
        ValueError: If `df` has no 'Merged Blog' column.
    """
    if 'Merged Blog' not in df.columns:
        raise ValueError("Input DataFrame must contain a 'Merged Blog' column.")

    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

    # Create the destination directory up front so that an unusable path fails
    # before any API call is made rather than after all of them.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    constraints_list = []
    for idx, row in df.iterrows():
        instruction = row['Merged Blog']
        instruction_num = idx + 1
        logging.info("Processing instruction #%s", instruction_num)

        constraints_text = ""

        # A missing value would otherwise be formatted into the prompt as the
        # literal text "nan" and spend an API call on meaningless input.
        if pd.isna(instruction) or not str(instruction).strip():
            logging.warning(
                "Skipping instruction %s: blog text is missing or blank.", instruction_num
            )
            retries = 0
        else:
            retries = retry_attempts

        for attempt in range(1, retries + 1):
            try:
                response = chat_fn(instruction, model=model, system_prompt=system_prompt, log=True)
                constraints_text = response.choices[0].message.content.strip()
                break
            except Exception as e:
                # Broad on purpose: one failing row must not abort the whole batch.
                logging.warning(
                    "Attempt %s/%s failed for instruction %s: %s",
                    attempt, retry_attempts, instruction_num, e,
                )
                if attempt < retry_attempts:
                    sleep(delay)
                else:
                    logging.error(
                        "Failed to generate constraints for instruction %s after %s attempts.",
                        instruction_num, retry_attempts,
                    )

        constraints_list.append({
            "Instruction Number": instruction_num,
            "Instruction": instruction,
            "Constraints": constraints_text
        })

    result_df = pd.DataFrame(constraints_list)
    result_df.to_csv(output_path, index=False)
    logging.info("Constraints saved to %s", output_path)

    return result_df


In [None]:
# User-message template that chat_fn reads at call time; `{}` is replaced by
# the blog text via str.format.
user_input = """
Input - {}
Output -
"""
# Generate constraints for the "similar" blogs and write them to CSV.
generate_constraints(df=blog_dataset_similar, output_path="../data/50_run1/constraints_similar.csv", chat_fn=chat_fn, system_prompt=system_prompt)


In [None]:
# User-message template that chat_fn reads at call time; `{}` is replaced by
# the blog text via str.format. Redefined here so this cell can run on its own.
user_input = """
Input - {}
Output -
"""
# Generate constraints for the "dissimilar" blogs and write them to CSV.
generate_constraints(df=blog_dataset_dissimilar, output_path="../data/50_run1/constraints_dissimilar.csv", chat_fn=chat_fn, system_prompt=system_prompt)


In [None]:
import pandas as pd
import re
import os

def extract_main_task(input_csv, output_csv, constraint_col="Constraints"):
    """
    Splits the raw model output stored in `constraint_col` into two new columns
    and saves the result as a new CSV:
    - 'Main Task': text following 'Main Task:' up to 'Constraints:' (or the end)
    - 'Constraint': text following 'Constraints:'

    Cells in which a marker is not found get an empty string for that column.
    Values are converted with `astype(str)` first, so an empty CSV cell (read
    back as NaN) becomes the text "nan", matches neither marker and yields two
    empty strings.

    Parameters:
        input_csv (str): Path of the CSV produced by the generation step.
        output_csv (str): Path of the CSV to write. Missing parent directories
            are created; a bare filename writes to the working directory.
        constraint_col (str): Name of the column holding the raw model output.
    """
    # Compiled once instead of once per row. re.S lets '.' span the newlines
    # between numbered constraints.
    main_pattern = re.compile(r"Main Task\s*:\s*(.*?)\s*(?=Constraints\s*:|$)", flags=re.S)
    constraint_pattern = re.compile(r"Constraints\s*:\s*(.*)", flags=re.S)

    df = pd.read_csv(input_csv)
    main_tasks, refined_constraints = [], []

    for text in df[constraint_col].astype(str):
        main_match = main_pattern.search(text)
        constraint_match = constraint_pattern.search(text)

        main_tasks.append(main_match.group(1).strip() if main_match else "")
        refined_constraints.append(constraint_match.group(1).strip() if constraint_match else "")

    df["Main Task"] = main_tasks
    df["Constraint"] = refined_constraints

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"Extracted Main Task and Constraint → saved to {output_csv}")


In [None]:
# Split the raw model output for the "similar" blogs into 'Main Task' and 'Constraint' columns.
extract_main_task(input_csv="../data/50_run1/constraints_similar.csv", output_csv="../data/50_run1/constraints_similar_split.csv")

In [None]:
# Split the raw model output for the "dissimilar" blogs into 'Main Task' and 'Constraint' columns.
extract_main_task(input_csv="../data/50_run1/constraints_dissimilar.csv", output_csv="../data/50_run1/constraints_dissimilar_split.csv")