In [None]:
import openai
from openai import OpenAI
import pandas as pd
from datetime import datetime
import os
import openpyxl
from dotenv import load_dotenv
from anthropic import Anthropic

In [None]:
# Load variables from a local .env file (if one exists) so API keys are not hardcoded in the notebook.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')


# Shared OpenAI client; chat_fn() sends every chat-completion request through it.
client = OpenAI(api_key=api_key)

In [None]:
# Load the base blog table (path is relative to the notebook's working directory) and print it.
base_blog_df = pd.read_csv('../data/base_blog.csv')
print(base_blog_df)

In [None]:
# Anthropic client, keyed from the CLAUDE_API_KEY environment variable.
# NOTE(review): claude_client is not referenced by any cell visible in this notebook — confirm it is still needed.
claude_api_key = os.getenv('CLAUDE_API_KEY')
claude_client = Anthropic(
    api_key=claude_api_key
)

In [None]:
# Input table for the "similar" run; expand_constraints_subsets() below requires it to have a 'Constraint' column.
base_blog_similar_df = pd.read_csv('../data/50_run1/base_blog_similar.csv')
print(base_blog_similar_df.head())

In [None]:
# Input table for the "dissimilar" run; expand_constraints_subsets() below requires it to have a 'Constraint' column.
base_blog_dissimilar_df = pd.read_csv('../data/50_run1/base_blog_dissimilar.csv')
print(base_blog_dissimilar_df.head())

In [None]:
import pandas as pd
import re
import logging

def expand_constraints_subsets(
    df: pd.DataFrame,
    subset_sizes: list = [7, 15, 23, 31, 39],
    output_path: str = "../data/trial_prompt2/constraints_bucket_blog.csv"
):
    """
    Expand each input row into one row per entry of `subset_sizes`, each carrying
    the first N constraints of that row, re-numbered from 1.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame containing a 'Constraint' column. Each value is expected
        to be text with an optional leading "Constraints:" line followed by
        numbered items (1., 2., ...), one per line. A missing or non-text value
        is treated as an empty constraint list (a warning is logged).
    subset_sizes : list
        Constraint counts to include for each generated subset. The default
        list is only read, never mutated.
    output_path : str
        Path for saving the expanded DataFrame as a CSV.

    Returns
    -------
    pd.DataFrame
        Expanded DataFrame with a new 'selected_constraints' column (the
        re-numbered subset, one item per line) and a 'subset_size' column
        (the number of constraints actually included).

    Raises
    ------
    ValueError
        If `df` has no 'Constraint' column.

    Notes
    -----
    A requested size larger than the number of constraints found is capped at
    that number, so a row with few constraints can yield several identical
    output rows.
    """

    if "Constraint" not in df.columns:
        raise ValueError("Input DataFrame must contain a 'Constraint' column.")

    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
    expanded_rows = []

    # The 1-based row position is the fallback label; unlike `idx + 1` it cannot
    # fail when the DataFrame index is not an integer.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        instruction_num = row.get("Instruction Number", position)

        raw_constraints = row["Constraint"]
        if isinstance(raw_constraints, str):
            constraints_text = raw_constraints
        else:
            # Empty CSV cells arrive as NaN (a float), which has no .strip().
            logging.warning(
                f"Instruction #{instruction_num}: 'Constraint' is missing or not text; treating it as empty"
            )
            constraints_text = ""

        # Remove "Constraints:" prefix if present
        constraints_text = re.sub(r"^Constraints:\s*", "", constraints_text.strip(), flags=re.IGNORECASE)

        # Split on numeric list markers that start a line (e.g., "\n2. ")
        constraints_list = [
            piece.strip()
            for piece in re.split(r'\n\s*\d+\.\s*', constraints_text)
            if piece.strip()
        ]

        # The first item keeps its "1." marker after the split (no newline precedes it),
        # so strip any leading numbering once here rather than once per subset size.
        cleaned_constraints = [re.sub(r'^\d+\.\s*', '', c).strip() for c in constraints_list]

        total_constraints = len(cleaned_constraints)
        logging.info(f"Instruction #{instruction_num}: Found {total_constraints} constraints")

        for size in subset_sizes:
            capped_size = min(size, total_constraints)
            subset = cleaned_constraints[:capped_size]
            selected_text = "\n".join(
                f"{number}. {text}" for number, text in enumerate(subset, start=1)
            )

            new_row = row.copy()
            new_row["selected_constraints"] = selected_text
            new_row["subset_size"] = capped_size
            expanded_rows.append(new_row)

    expanded_df = pd.DataFrame(expanded_rows)
    expanded_df.to_csv(output_path, index=False)
    logging.info(f"Expanded dataset with constraint subsets saved to {output_path}")

    return expanded_df


In [None]:
# Expand the "similar" table with the default subset sizes and save it under 50_run1.
expand_constraints_subsets(df=base_blog_similar_df, output_path="../data/50_run1/constraints_bucket_blog_similar.csv")

In [None]:
# Expand the "dissimilar" table with the default subset sizes and save it under 50_run1.
expand_constraints_subsets(df=base_blog_dissimilar_df, output_path="../data/50_run1/constraints_bucket_blog_dissimilar.csv")

In [None]:
# system_prompt = """You are a blog writer.
# Revise or rewrite the provided base blog so it satisfies all listed constraints.
# Maintain realism, tone, and narrative flow. Integrate constraints naturally.
# Generate three distinctly different internal drafts (in your reasoning only),
# then produce one final cohesive blog that reads as a single-author piece.
# Only output the final merged blog. 
# """

# user_input = """
# Base Blog - {}
# Constraints - {}
# """

In [None]:
# System prompt for the constraint-fitting rewrite; it is passed to fit_blogs_to_constraints()
# in the run cells below and is also captured as chat_fn's default when chat_fn is defined.
# NOTE: a later cell rebinds `system_prompt` to the summarization prompt, so re-running the
# fitting cells after that cell has executed would send the wrong prompt.
system_prompt = """
You are an expert creative writer trained to satisfy complex multi-constraint objectives.
Your goal is to rewrite or expand the given base blog so that it fulfills *all* listed constraints.

However, each constraint must be addressed through *distinct, independent content*.
You must not reuse the same sentence or paragraph to satisfy multiple constraints.
Your challenge is to maintain coherence and natural flow while ensuring every constraint
is uniquely expressed through its own textual evidence.

Follow this structured reasoning and generation protocol:

Step 1 — Interpret the Task:
Read the base blog and the list of constraints carefully.
Identify conceptual clusters among the constraints.
Within each cluster, ensure that constraints are still treated as separate creative targets.

Step 2 — Creative Planning:
For each constraint:
- Write one unique paragraph or at least one complete sentence addressing it explicitly.
- Vary style, sentence rhythm, and imagery between constraints.
- Avoid repeating phrases or reusing identical syntactic structures.
- Use transitions that preserve narrative flow but separate ideas logically.

Step 3 — Expansion and Integration:
Once all constraints have been addressed separately,
weave them into a coherent blog post that reads as single-author text.
Use natural transitions, metaphors, and connective logic to link ideas smoothly,
but do not merge or collapse paragraphs that serve different constraints.

Step 4 — Verification:
Before finalizing, verify that:
- Each constraint corresponds to unique textual evidence.
- No two constraints rely on identical or nearly identical wording.
- The final text feels organic, creative, and contextually unified.

Output only the final blog post.
Do not include explanations, lists, or reasoning steps.
"""
# Two-slot template (base blog, constraints).
# NOTE(review): not referenced by any cell visible here — fit_blogs_to_constraints() builds its own user prompt.
user_input = """
Base Blog - {}
Constraints - {}
"""

In [None]:
# Functions to track API call cost. 

def log_usage(tokens):
    """Append one timestamped token-count record to ``api_usage.txt``.

    The record is a single line of the form ``MM-DD-YYYY HH:MM:SS : <tokens>``,
    appended to ``api_usage.txt`` in the current working directory (the file is
    created if it does not exist yet).
    """
    stamp = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
    record = f"{stamp} : {tokens}\n"

    with open("api_usage.txt", "a") as usage_log:
        usage_log.write(record)


def total_usage(cost_per_1k_tokens=0.0015):
    """Print the total tokens recorded in ``api_usage.txt`` and their estimated cost.

    Reads the ``<timestamp> : <tokens>`` lines that log_usage() appends to
    ``api_usage.txt`` in the current working directory and sums the token counts.

    Parameters
    ----------
    cost_per_1k_tokens : float
        Price in dollars per 1,000 tokens used for the estimate. The default
        (0.0015) is the rate this notebook has always used; pass the rate of
        the model actually being called for an accurate figure.

    Returns
    -------
    None
        The totals are printed, not returned.
    """
    total_tokens = 0
    try:
        with open("api_usage.txt", "r") as file:
            for line in file:
                # Split the line into date-time and tokens
                parts = line.split(" : ")
                if len(parts) != 2:
                    continue
                try:
                    total_tokens += int(parts[1])
                except ValueError:
                    # Not a token count (hand-edited or truncated line): skip it,
                    # the same way lines without a " : " separator are skipped.
                    continue
    except FileNotFoundError:
        # Nothing has been logged yet, so usage is zero rather than an error.
        pass

    cost = (total_tokens * cost_per_1k_tokens) / 1000
    print("Total tokens used so far: ", total_tokens)
    print(f"Total cost so far: {cost}$")
    return

In [None]:
def chat_fn(instruction, model="gpt-4.1-mini", system_prompt=system_prompt, log=False):
    """Send one system + user exchange to the chat-completions API and return the raw response.

    Parameters
    ----------
    instruction : str
        Content of the user message.
    model : str
        Model identifier passed to the API.
    system_prompt : str
        Content of the system message. The default is the value the global
        `system_prompt` held when this function was defined.
    log : bool
        When True, also print the number of tokens used.

    Returns
    -------
    The response object returned by `client.chat.completions.create`.

    Every call records `response.usage.total_tokens` through log_usage().
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": instruction},
    ]
    response = client.chat.completions.create(model=model, messages=conversation)

    tokens_used = response.usage.total_tokens
    log_usage(tokens=tokens_used)
    if log:
        print("Tokens used:", tokens_used)

    return response


In [None]:
import pandas as pd
import logging
from time import sleep

def fit_blogs_to_constraints(
    df: pd.DataFrame,
    chat_fn,
    system_prompt: str,
    model: str = "gpt-4.1-mini",
    output_path: str = "../data/trial_prompt2/fitted_blogs_summarized.csv",
    retry_attempts: int = 3,
    delay: float = 1.0
):
    """
    Rewrite every row's base blog against that row's selected constraints with
    an LLM, store the results in a new 'fitted_blog' column, and save the table.

    Parameters
    ----------
    df : pd.DataFrame
        Table with 'base_blog' and 'selected_constraints' columns. It is
        modified in place: the 'fitted_blog' column is added to it.
    chat_fn : callable
        Chat function called as chat_fn(prompt, model=..., system_prompt=..., log=True);
        its result must expose `.choices[0].message.content`.
    system_prompt : str
        System prompt forwarded to `chat_fn`.
    model : str
        Model identifier forwarded to `chat_fn`.
    output_path : str
        CSV path the updated table is written to.
    retry_attempts : int
        Maximum number of calls per row before giving up.
    delay : float
        Seconds to sleep between failed attempts.

    Returns
    -------
    pd.DataFrame
        The same `df` object, now with a 'fitted_blog' column. A row whose
        attempts all failed gets an empty string.

    Raises
    ------
    ValueError
        If a required column is missing.
    """

    required_cols = {"base_blog", "selected_constraints"}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"Input DataFrame must contain {required_cols} columns.")

    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

    def _rewrite_row(idx, row):
        """Return the rewritten blog for one row, or "" when every attempt fails."""
        base_blog = row["base_blog"]
        selected_constraints = row["selected_constraints"]
        instruction_num = row.get("Instruction Number", idx + 1)
        logging.info(f"Processing Instruction #{instruction_num} with subset size {row.get('subset_size', 'N/A')}")

        for attempt in range(1, retry_attempts + 1):
            try:
                user_prompt = (
                    f"Base Blog:\n{base_blog}\n\n"
                    f"Constraints to Satisfy:\n{selected_constraints}\n\n"
                )
                response = chat_fn(user_prompt, model=model, system_prompt=system_prompt, log=True)
                # First success wins; no further attempts are made for this row.
                return response.choices[0].message.content.strip()
            except Exception as e:
                logging.warning(f"Attempt {attempt}/{retry_attempts} failed for Instruction #{instruction_num}: {e}")
                if attempt >= retry_attempts:
                    logging.error(f"Failed to generate fitted blog for Instruction #{instruction_num} after {retry_attempts} attempts.")
                else:
                    sleep(delay)

        return ""

    # All rows are processed before the column is attached to the frame.
    df["fitted_blog"] = [_rewrite_row(idx, row) for idx, row in df.iterrows()]

    df.to_csv(output_path, index=False)
    logging.info(f"Fitted blogs saved to {output_path}")

    return df


In [None]:
# Reload the expanded "similar" constraint table from disk.
constraints_bucket_similar_df = pd.read_csv('../data/50_run1/constraints_bucket_blog_similar.csv')
# Print the first row as a quick sanity check of the columns.
print(constraints_bucket_similar_df.iloc[0])


In [None]:
# Reload the expanded "dissimilar" constraint table from disk.
constraints_bucket_dissimilar_df = pd.read_csv('../data/50_run1/constraints_bucket_blog_dissimilar.csv')
# Print the first row as a quick sanity check of the columns.
print(constraints_bucket_dissimilar_df.iloc[0])

In [None]:
# Show the prompt that is about to be sent, then fit the "similar" rows.
# This makes one chat call per row (more on retries) and adds 'fitted_blog' to the DataFrame in place.
print(system_prompt)
fit_blogs_to_constraints(df=constraints_bucket_similar_df, output_path="../data/50_run1/fit_blog_similar.csv", chat_fn=chat_fn, system_prompt=system_prompt)

In [None]:
# Fit the "dissimilar" rows; one chat call per row (more on retries), result saved under 50_run1.
fit_blogs_to_constraints(df=constraints_bucket_dissimilar_df, output_path="../data/50_run1/fit_blog_dissimilar.csv", chat_fn=chat_fn, system_prompt=system_prompt)

In [None]:
# System prompt for summarizing the fitted blogs to roughly a quarter of their length.
# NOTE: this rebinds the global `system_prompt` that the fitting cells above pass to
# fit_blogs_to_constraints(); once this cell has run, re-running those cells would send
# this summarization prompt instead of the rewriting prompt.
system_prompt = """Given the blog post, rewrite a summarized version of the blog that is approximately 25% of the original length. 
Preserve the logical flow, tone, and all key insights. 
Ensure no critical data or arguments are omitted, but remove redundancies, filler sentences, and excessive examples.
"""

def summarize_blog(df: pd.DataFrame, chat_fn, system_prompt: str, model: str = "gpt-4.1-mini", output_path: str = "../data/trial_prompt2/25_percent_summarized_blogs.csv", retry_attempts: int = 3, delay: float = 1.0):
    """
    Summarizes each fitted blog in `df` using an LLM chat function
    and saves the results as a CSV.

    Parameters:
    df (pd.DataFrame): Input DataFrame with a 'fitted_blog' column. It is modified
        in place: a 'fitted_summarized_blog' column is added to it.
    chat_fn (callable): Function for LLM chat, called as
        chat_fn(prompt, model=..., system_prompt=..., log=True); its result must
        expose `.choices[0].message.content`.
    system_prompt (str): System-level prompt defining model behavior.
    model (str): LLM model identifier, default 'gpt-4.1-mini'.
    output_path (str): Path for saving the new CSV.
    retry_attempts (int): Maximum number of calls per row before giving up.
    delay (float): Seconds to sleep between failed attempts.

    Returns:
    pd.DataFrame: The same `df` object with the 'fitted_summarized_blog' column.
        Rows whose blog is missing/empty, or whose attempts all failed, get "".

    Raises:
    ValueError: If `df` has no 'fitted_blog' column.
    """
    if 'fitted_blog' not in df.columns:
        raise ValueError("Input DataFrame must contain a 'fitted_blog' column.")

    summarized_blogs = []

    # The 1-based row position is the fallback label used in log messages.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        fitted_blog = row['fitted_blog']
        instruction_num = row.get("Instruction Number", position)

        # Reset per row so a failed row can never inherit the previous row's summary.
        summarized_blog = ""

        # A blog that failed to generate is saved as "" and comes back from CSV as NaN;
        # there is nothing to summarize, so do not spend an API call on it.
        if not isinstance(fitted_blog, str) or not fitted_blog.strip():
            logging.warning(f"Skipping instruction {instruction_num}: 'fitted_blog' is missing or empty.")
            summarized_blogs.append(summarized_blog)
            continue

        for attempt in range(1, retry_attempts + 1):
            try:
                response = chat_fn(fitted_blog, model=model, system_prompt=system_prompt, log=True)
                summarized_blog = response.choices[0].message.content.strip()
                break
            except Exception as e:
                logging.warning(f"Attempt {attempt}/{retry_attempts} failed for instruction {instruction_num}: {e}")
                if attempt < retry_attempts:
                    sleep(delay)
                else:
                    logging.error(f"Failed to summarize blog for instruction {instruction_num} after {retry_attempts} attempts.")

        summarized_blogs.append(summarized_blog)

    df['fitted_summarized_blog'] = summarized_blogs

    df.to_csv(output_path, index=False)
    logging.info(f"Summarized blogs saved to {output_path}")

    return df


In [None]:
# Reload the fitted "similar" blogs (this reuses the name previously bound to the expanded constraint table).
constraints_bucket_similar_df = pd.read_csv("../data/50_run1/fit_blog_similar.csv")
# No output_path is passed, so the summaries are written to summarize_blog's default output_path,
# not back into the 50_run1 directory.
summarize_blog(df=constraints_bucket_similar_df, chat_fn=chat_fn, system_prompt=system_prompt)

In [None]:
import pandas as pd

# The summarize_blog(...) call above passes no output_path, so the table that contains the
# 'fitted_summarized_blog' column is in summarize_blog's default output file. The fitted-only
# file (../data/50_run1/fit_blog_similar.csv) does not have that column.
df = pd.read_csv("../data/trial_prompt2/25_percent_summarized_blogs.csv")

# Column names expected by the evaluation script.
evaluation_columns = {
    "fitted_summarized_blog": "FinalGeneratedStory",
    "selected_constraints": "SelectedConstraints",
    "subset_size": "Number_of_Constraints"
}

# DataFrame.rename silently ignores labels that are not present, which would produce an
# evaluation file without the story column; fail loudly instead.
missing_columns = sorted(set(evaluation_columns) - set(df.columns))
if missing_columns:
    raise KeyError(f"Cannot build the evaluation file; missing columns: {missing_columns}")

# Rename columns to match evaluation script
df = df.rename(columns=evaluation_columns)

# Save to a new file for evaluation
df.to_csv("../data/50_run1/for_evaluation_fit_blog_similar.csv", index=False)