<a href="https://colab.research.google.com/github/Joshua-Chiu/CPSC532V-LLM-SSE-Project/blob/master/Generate_Documents_using_OpenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from typing import List
import numpy as np
import csv
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster
import openai
import math
import random

# API key handed to the OpenAI client. It is left blank in this file.
OPENAI_API_KEY = ""
# The client is stored as an attribute on the imported `openai` module, so the
# generation functions in this file reach it as `openai.client`.
openai.client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Minimal logger: writes tagged lines to standard output via print
def log(message):
    """Print *message* on its own line, prefixed with the "[LOG]: " tag."""
    line = "[LOG]: {}".format(message)
    print(line)

# Persist generated documents by appending them to a CSV file
def append_to_csv(filename, documents):
    """Append each entry of *documents* to *filename* as a one-column CSV row.

    The file is opened in append mode with UTF-8 encoding, so rows already
    present are kept and the file is created when it does not exist.
    """
    with open(filename, mode='a', newline='', encoding='utf-8') as csv_file:
        # Each document becomes a single-cell row.
        csv.writer(csv_file).writerows([text] for text in documents)

def _cluster_documents(documents: List[str]) -> dict:
    """Group *documents* by textual similarity and return {cluster id: [documents]}."""
    # Hierarchical clustering needs at least two observations (scipy's
    # linkage rejects an empty distance matrix), so a lone document simply
    # forms its own cluster.
    if len(documents) < 2:
        return {1: list(documents)}

    # Step 1: Vectorize text
    log("Vectorizing input documents...")
    vectorizer = TfidfVectorizer(stop_words='english', max_features=300)
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Step 2: Compute distances & perform clustering.
    # Ward linkage is defined on Euclidean distances, hence the metric.
    log("Computing Euclidean distance matrix and performing clustering...")
    distance_matrix = pdist(tfidf_matrix.toarray(), metric='euclidean')
    linkage_matrix = linkage(distance_matrix, method='ward')

    # Step 3: Cut the tree at the 65th percentile of the merge distances
    merge_distances = linkage_matrix[:, 2]
    distance_threshold = np.percentile(merge_distances, 65)
    clusters = fcluster(linkage_matrix, distance_threshold, criterion='distance')
    log(f"Assigned {len(set(clusters))} clusters.")

    cluster_dict = {}
    for doc, cluster in zip(documents, clusters):
        cluster_dict.setdefault(cluster, []).append(doc)
    return cluster_dict


def _request_generated_texts(prompt: str, retries: int = 3) -> List[str]:
    """Send *prompt* to the chat model and return the non-empty '###'-separated texts.

    Returns an empty list when every attempt fails or yields no usable text.
    """
    for attempt in range(retries):
        try:
            response = openai.client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "system", "content": "You are an AI that generates text similar to given examples."},
                          {"role": "user", "content": prompt}],
                max_tokens=500
            )
            message = response.choices[0].message.content
        except Exception as e:
            # Best-effort boundary: any failure of the API call is logged and
            # retried with exponential backoff instead of aborting the run.
            log(f"Error during OpenAI API call: {e}")
            if attempt < retries - 1:
                wait_time = 2 ** attempt  # Exponential backoff
                log(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            continue

        if message:
            # Drop surrounding whitespace and the empty fragments that appear
            # when the reply starts or ends with the '###' separator.
            texts = [piece.strip() for piece in message.split('###')]
            texts = [piece for piece in texts if piece]
            if texts:
                return texts
        log("No valid choices received from OpenAI response.")
    return []


def generate_documents(documents: List[str], target: int, output_csv: str) -> List[str]:
    """
    Generate more documents in the same style as an input list of documents.

    The reference documents are clustered on their TF-IDF vectors, and the
    chat model is asked, cluster by cluster, to write new texts resembling up
    to three examples from that cluster. Accepted texts are appended to
    output_csv as soon as they arrive.

    Keyword arguments:
    documents -- the list of reference documents
    target -- the target number of documents including the list given
    output_csv -- file path to save generated documents incrementally

    Returns only the newly generated documents, at most
    target - len(documents) of them. Fewer are returned when the API stops
    producing usable output. When documents is empty or already satisfies
    target, the input list itself is returned unchanged.

    Errors raised while writing output_csv are propagated to the caller.
    """

    if not documents or target <= len(documents):
        log("No new documents need to be generated.")
        return documents  # No need to generate new ones

    log("Starting document generation process...")

    cluster_dict = _cluster_documents(documents)

    # Step 4: Generate new documents from each cluster
    new_documents = []
    fixed_count = target - len(documents)
    message_per_cluster = math.ceil(fixed_count / len(cluster_dict))

    while len(new_documents) < fixed_count:
        missing_count = fixed_count - len(new_documents)
        log(f"New documents generated: {len(new_documents)}, missing count: {missing_count}")
        count_before_pass = len(new_documents)

        for selected_cluster in sorted(cluster_dict):  # Process clusters sequentially
            examples = cluster_dict[selected_cluster][:3]  # First (up to) three documents of the cluster
            # Never ask for more texts than are still missing.
            request_count = min(message_per_cluster, missing_count)

            log(f"Generating {request_count} new texts based on cluster {selected_cluster}...")

            # Generate new text using GPT
            prompt = f"Generate {request_count} new texts similar to the following examples:\n\n"
            prompt += '\n\n'.join(f'Example {i+1}: {doc}' for i, doc in enumerate(examples))
            prompt += "\n\nReturn only the response as a separate entry and put exactly '###' between them so i can parse them. Do not generate any extra messages."

            # Keep only what is still needed so the CSV matches the returned list.
            generated_texts = _request_generated_texts(prompt)[:missing_count]
            if not generated_texts:
                continue

            new_documents.extend(generated_texts)
            missing_count = fixed_count - len(new_documents)
            log(f"Generated {len(generated_texts)} new documents.")

            # Append progress to CSV after each response
            append_to_csv(output_csv, generated_texts)
            log(f"Appended progress to {output_csv}.")

            if missing_count <= 0:
                log(f"New documents generated: {len(new_documents)}, missing count: {missing_count}, so stoping now")
                break

        # A whole pass without a single new document means the API is failing
        # persistently; stop instead of looping forever.
        if len(new_documents) == count_before_pass:
            log("No documents were generated in a full pass over the clusters; stopping early.")
            break

    log("Document generation process completed.")
    return new_documents[:fixed_count]  # Ensure we don't exceed the target


In [None]:
import random
def generate_documentsV2(documents: List[str], target: int, output_csv: str) -> List[str]:
    """
    Generate more documents in the same style based on an input list of documents.

    Each request shows the chat model up to three randomly sampled reference
    documents and asks for a batch of similar texts. Accepted texts are
    appended to output_csv as soon as they arrive.

    Keyword arguments:
    documents -- the list of reference documents
    target -- the target number of documents including the list given
    output_csv -- file path to save generated documents incrementally

    Returns only the newly generated documents (not the argument), at most
    target - len(documents) of them. Fewer are returned when a request fails
    on every retry, and an empty list is returned when there is nothing to
    generate or no reference documents are given.

    Errors raised while writing output_csv are propagated to the caller.
    """
    new_documents = []
    fixed_count = target - len(documents)
    if fixed_count <= 0:
        return new_documents
    if not documents:
        # Without examples there is no style to imitate.
        log("No reference documents were provided; nothing to generate.")
        return new_documents

    message_per_cluster = 10  # Upper bound on texts requested per API call
    retries = 3

    while len(new_documents) < fixed_count:
        missing_count = fixed_count - len(new_documents)
        log(f"New documents generated: {len(new_documents)}, missing count: {missing_count}")

        examples = random.sample(documents, min(3, len(documents)))
        # Never ask for more texts than are still missing.
        request_count = min(message_per_cluster, missing_count)
        prompt = f"Generate {request_count} new texts similar to the following examples:\n\n"
        prompt += '\n\n'.join(f'Example {i+1}: {doc}' for i, doc in enumerate(examples))
        prompt += "\n\nReturn only the response as a separate entry and put exactly '###' between them so i can parse them. Do not generate any extra messages."

        generated_texts = []
        for attempt in range(retries):
            try:
                response = openai.client.chat.completions.create(
                    model="gpt-4o",
                    messages=[{"role": "system", "content": "You are an AI that generates text similar to given examples."},
                              {"role": "user", "content": prompt}],
                    max_tokens=500
                )
                message = response.choices[0].message.content
            except Exception as e:
                # Best-effort boundary: any failure of the API call is logged
                # and retried with exponential backoff.
                log(f"Error during OpenAI API call: {e}")
                if attempt < retries - 1:
                    wait_time = 2 ** attempt  # Exponential backoff
                    log(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                continue

            if message:
                # Drop surrounding whitespace and the empty fragments that
                # appear when the reply starts or ends with '###'.
                pieces = [piece.strip() for piece in message.split('###')]
                generated_texts = [piece for piece in pieces if piece]
            if generated_texts:
                break  # Exit retry loop on success
            log("No valid choices received from OpenAI response.")

        if not generated_texts:
            # Every retry failed; stop instead of looping forever.
            log("No documents could be generated after all retries; stopping early.")
            break

        # Keep only what is still needed so the CSV matches the returned list.
        generated_texts = generated_texts[:missing_count]
        new_documents.extend(generated_texts)
        log(f"Generated {len(generated_texts)} new documents.")

        # Append progress to CSV after each response
        append_to_csv(output_csv, generated_texts)
        log(f"Appended progress to {output_csv}.")

    return new_documents[:fixed_count]  # Ensure we don't exceed the target

In [None]:
# Reference documents whose style the generator should imitate.
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
    "To be or not to be, that is the question.",
    "All that glitters is not gold."
]

target = 100  # Target of 100 documents in total, counting the reference documents above
# New texts are also appended to sample.csv as they are generated.
generated_documents = generate_documentsV2(documents, target, 'sample.csv')
print(generated_documents)

[LOG]: New documents generated: 0, missing count: 100
['To be or not to be, that is the question.', 'The quick brown fox jumps over the lazy dog.', 'All that glitters is not gold.']
[LOG]: Generated 10 new documents.
[LOG]: Appended progress to sample.csv.
[LOG]: New documents generated: 10, missing count: 90
['The quick brown fox jumps over the lazy dog.', 'A journey of a thousand miles begins with a single step.', 'All that glitters is not gold.']
[LOG]: Generated 10 new documents.
[LOG]: Appended progress to sample.csv.
[LOG]: New documents generated: 20, missing count: 80
['A journey of a thousand miles begins with a single step.', 'The quick brown fox jumps over the lazy dog.', 'All that glitters is not gold.']
[LOG]: Generated 10 new documents.
[LOG]: Appended progress to sample.csv.
[LOG]: New documents generated: 30, missing count: 70
['The quick brown fox jumps over the lazy dog.', 'All that glitters is not gold.', 'To be or not to be, that is the question.']


KeyboardInterrupt: 