In [136]:
from openai import OpenAI
import json

In [137]:
import os
import sys
import pandas as pd
import random

In [138]:
# Expert-annotated example pools, one CSV per source dataset.
# The names are unpacked in the same order as the paths listed below.
df_DR, df_dreaddit, df_Irf, df_MultiWD, df_SAD = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/expert_data/DR.csv',
        '../data/expert_data/dreaddit.csv',
        '../data/expert_data/Irf.csv',
        '../data/expert_data/MultiWD.csv',
        '../data/expert_data/SAD.csv',
    )
)

In [139]:
# Training split for the DR task; its 'query' column supplies the target questions.
DR_train = pd.read_csv(
    '../data/released_data_mentalllama/train_data/instruction_data/DR/train.csv'
)

In [140]:
# Read the API key from the environment instead of embedding it in the notebook.
# A missing variable raises KeyError here, rather than failing later with an
# authentication error. NOTE: the key that was previously committed in this cell
# should be treated as leaked and revoked.
client = OpenAI(
    api_key=os.environ["DEEPSEEK_API_KEY"],
    base_url="https://api.deepseek.com",
)

In [141]:
# instructions
# DR: depression-detection prompt.
# The example count is intentionally not stated in the text: the number of
# few-shot examples is decided by PromptBuilder (examples per dataset x number
# of datasets), so a hard-coded "twenty" contradicted the prompt actually sent.
# The stray leading ": " from the original template has also been removed.
instruction = '''You will be presented with a post and an assigned label to identify whether the poster
shows symptoms of depression. Consider the emotions expressed from post to explain the reasoning of the label step by step.
Here are some examples:'''

In [142]:
import json

class PromptBuilder:
    """Assemble few-shot prompts from expert examples and a rolling window of training queries.

    Attributes:
        expert_dfs: iterable of DataFrames to sample examples from. Each one is
            read through its 'query' and 'gpt-3.5-turbo' columns.
        train_df: DataFrame whose 'query' column supplies the target questions.
        num_examples_per_dataset: rows sampled from each expert DataFrame per prompt.
        train_pointer: positional index in ``train_df`` of the next question to
            hand out; wraps around at the end of the frame.
    """

    def __init__(self, expert_dfs, train_df, num_examples_per_dataset=2):
        self.expert_dfs = expert_dfs
        self.train_df = train_df
        self.num_examples_per_dataset = num_examples_per_dataset
        self.train_pointer = 0

    def sample_expert_examples(self):
        """Return one DataFrame with ``num_examples_per_dataset`` random rows from each expert frame.

        A fresh seed is drawn from the ``random`` module for every frame, so
        seeding ``random`` makes the selection reproducible.
        """
        sampled_dfs = [
            df.sample(n=self.num_examples_per_dataset, random_state=random.randint(0, 10000))
            for df in self.expert_dfs
        ]
        return pd.concat(sampled_dfs, ignore_index=True)

    def get_next_training_questions(self, n=5):
        """Return the next ``n`` training queries and advance the pointer.

        Args:
            n: number of target questions each prompt has.

        Returns:
            A list of exactly ``n`` query strings. When the end of ``train_df``
            is reached the selection wraps around to the start, as many times
            as needed.

        Raises:
            ValueError: if ``train_df`` has no rows.
        """
        total = len(self.train_df)
        if total == 0:
            raise ValueError("train_df is empty; there are no training questions to draw from")

        # Modular positions wrap correctly even when n is larger than the frame.
        # A single wrap-around slice would be clamped and return fewer than n items.
        positions = [(self.train_pointer + offset) % total for offset in range(n)]
        questions = self.train_df['query'].iloc[positions].tolist()
        self.train_pointer = (self.train_pointer + n) % total
        return questions

    def build_prompt(self, instruction, num_questions=5):
        """Build one prompt: instruction, sampled expert examples, then numbered target queries.

        Args:
            instruction: task description placed at the top (surrounding whitespace is stripped).
            num_questions: how many training queries to append as Q1..Qn.

        Returns:
            ``(prompt, target_questions)`` where ``target_questions`` is the list
            of queries embedded in the prompt, in order.
        """
        examples_df = self.sample_expert_examples()
        target_questions = self.get_next_training_questions(num_questions)

        parts = [instruction.strip() + "\n\n", "### Expert-written examples:\n"]

        # One JSON object per line; the expert answer lives in the 'gpt-3.5-turbo' column.
        for _, row in examples_df.iterrows():
            example_json = {
                "query": row['query'],
                "answer": row['gpt-3.5-turbo']
            }
            parts.append(json.dumps(example_json, ensure_ascii=False) + "\n")

        parts.append("\n### Now answer the following queries in the following format: {\"response\": \"...\", \"reasoning\": \"...\"}\n")
        for i, q in enumerate(target_questions, 1):
            parts.append(f"Q{i}: {q}\n")

        return "".join(parts), target_questions


In [143]:
def generate_mental_health_case(builder, instruction, model="deepseek-chat", max_tokens=1500):
    """Build one prompt with ``builder`` and send it to the chat-completions endpoint.

    Args:
        builder: object exposing ``build_prompt(instruction)`` that returns
            ``(prompt, queries)``.
        instruction: task description forwarded to ``builder.build_prompt``.
        model: model identifier passed to the API.
        max_tokens: completion length cap passed to the API.

    Returns:
        ``(text, queries)`` on success, where ``text`` is the stripped reply and
        ``queries`` are the questions embedded in the prompt; ``(None, None)``
        when the request fails or the reply carries no text.

    Note:
        The prompt is built before the request is attempted, so the queries
        consumed from ``builder`` are not handed out again if the call fails.
    """
    prompt, queries = builder.build_prompt(instruction)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
        )
        content = response.choices[0].message.content
        if content is None:
            # Report an empty reply explicitly rather than via an AttributeError on .strip().
            print("Generation failed: model returned no text content")
            return None, None
        return content.strip(), queries
    except Exception as e:
        # Best-effort by design: a failed batch is reported and skipped by the caller.
        print(f"Generation failed: {str(e)}")
        return None, None


In [144]:
import re
def extract_response_reasoning_plain_text(result_text):
    """Pull every ``"response": "...", "reasoning": "..."`` pair out of raw model text.

    The text is scanned with a regex rather than parsed as JSON, so it may
    contain several objects, surrounding prose, or line breaks between fields.

    Args:
        result_text: raw text returned by the model.

    Returns:
        A list of ``(response, reasoning)`` string tuples in order of
        appearance; empty when nothing matches.
    """
    # Body of a JSON string: ordinary characters or backslash escapes. A plain
    # [^"]+ would stop at the first escaped quote (\") and silently truncate the value.
    string_body = r'((?:[^"\\]|\\.)+)'
    pattern = r'response":\s*"' + string_body + r'"\s*,\s*"reasoning":\s*"' + string_body + r'"'

    def _unescape(raw):
        # Decode JSON escapes (\" \n \uXXXX ...); keep the raw text if it is not valid JSON.
        try:
            return json.loads('"' + raw + '"', strict=False)
        except ValueError:
            return raw

    matches = re.findall(pattern, result_text, flags=re.DOTALL)
    return [(_unescape(response), _unescape(reasoning)) for response, reasoning in matches]

In [148]:
# num_batches: number of iteration of prompts
def generate_pipeline(builder, instruction, output_file="generated_dataset.csv", num_batches=1):
    """Generate ``num_batches`` prompts, parse the replies, and save the results as CSV.

    Args:
        builder: prompt builder forwarded to ``generate_mental_health_case``.
        instruction: task description forwarded to ``generate_mental_health_case``.
        output_file: path of the CSV to write (overwritten if it exists).
        num_batches: number of prompt/response iterations to run.

    A batch is dropped, with a printed warning, when the request fails or when
    the number of parsed (response, reasoning) pairs differs from the number of
    queries in the prompt, since the pairs could not be aligned reliably.
    """
    columns = ["query", "response", "reasoning"]
    rows = []

    for i in range(num_batches):
        print(f"Generating batch {i+1}/{num_batches}...")
        result_text, queries = generate_mental_health_case(builder, instruction)

        if result_text and queries:
            pairs = extract_response_reasoning_plain_text(result_text)
            if len(pairs) != len(queries):
                print(f"Warning: Mismatch between number of queries ({len(queries)}) and responses ({len(pairs)})")
                continue
            for query, (response, reasoning) in zip(queries, pairs):
                rows.append({
                    "query": query,
                    "response": response,
                    "reasoning": reasoning
                })

        else:
            print("Skipping this batch due to API or parsing error.")

    # Fix the columns explicitly so a run that collects no rows still writes a
    # header line; otherwise the file cannot be read back with pd.read_csv.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_file, index=False)
    print(f"\nSaved {len(df)} rows to {output_file}")



In [146]:
# Expert example pools handed to the prompt builder, in sampling order.
datasets = [df_DR, df_dreaddit, df_Irf, df_MultiWD, df_SAD]

# Pair the expert pools with the DR training queries and run the pipeline once.
builder = PromptBuilder(expert_dfs=datasets, train_df=DR_train)
generate_pipeline(builder, instruction, output_file="generated_dataset.csv")

Generating batch 1/1...

Saved 5 rows to generated_dataset.csv


In [147]:
# Reload the saved file to inspect exactly what was written to disk.
df = pd.read_csv(filepath_or_buffer="generated_dataset.csv")
df

Unnamed: 0,query,response,reasoning
0,"Consider this post: ""How to avoid a relapse? I...","Yes, the poster suffers from depression.",The post describes a series of traumatic event...
1,"Consider this post: ""This is how I feel and I ...","Yes, the poster suffers from depression.",The post contains multiple symptoms of depress...
2,"Consider this post: ""Advice for getting back i...","Yes, the poster suffers from depression.",The post reflects a loss of interest in previo...
3,"Consider this post: ""Having a really bad stret...","Yes, the poster suffers from depression.",The post details a long history of depression ...
4,"Consider this post: ""I dont understand how peo...","Yes, the poster suffers from depression.",The post indicates a lack of interest in most ...
