In [2]:
!pip install cohere

Collecting cohere
  Downloading cohere-5.15.0-py3-none-any.whl.metadata (3.4 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.11.1-cp312-cp312-macosx_10_13_universal2.whl.metadata (5.7 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.4.20250611-py3-none-any.whl.metadata (2.1 kB)
Downloading cohere-5.15.0-py3-none-any.whl (259 kB)
Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading fastavro-1.11.1-cp312-cp312-macosx_10_13_universal2.whl (939 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.6/939.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading types_requests-2.32.4.20250611-py3-none-any.whl (20 kB)
Installing collected packages: types-requests, httpx-sse, fastavro, cohere
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/

In [None]:
import json
import os
import re
import time

import cohere
import pandas as pd

# === CONFIGURATION ===
# Prefer the COHERE_API_KEY environment variable so a real secret never has to
# be saved inside the notebook; the placeholder remains as a fallback.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "YOUR_API_KEY")
INPUT_CSV = "psychforums_dataset_cleaned_3.csv"         # dataset read by pd.read_csv
OUTPUT_CSV = "psychforums_dataset_cleaned_final_2.csv"  # destination of the labeled frame
TEXT_COLUMN = "text"    # column holding the text sent to the model
ID_COLUMN = "id"        # column used to match model answers back to rows
LABEL_COLUMN = "label"  # NOTE(review): not referenced by the cells visible here
BATCH_SIZE = 8          # number of rows sent per chat request
DELAY_BETWEEN_BATCHES = 5  # seconds slept after every batch

# === SETUP ===
co = cohere.ClientV2(COHERE_API_KEY)

# === PROMPT FUNCTION ===
def build_batch_prompt(id_text_pairs):
    """Build the multi-label classification prompt for one batch of texts.

    Args:
        id_text_pairs: Iterable of ``(id, text)`` pairs. Both elements are
            converted with ``str()`` before being embedded in the prompt.

    Returns:
        str: The fixed instructions (label legend and example JSON), followed
        by one ``"id": "text"`` line per pair and the closing
        ``Return JSON:`` cue.
    """
    instructions = """You are a multi-label text classifier. Each input text can belong to multiple emotional states. Choose **all applicable labels** from the following list:

0 -> neutral  
1 -> low mood / depressed  
2 -> anxious / worried  
3 -> stressed / overwhelmed  

Return a JSON dictionary where **each key is the ID (string)** and **the value is a list of one or more labels** (e.g., [1, 2]). Only return the raw JSON — no explanations.

Examples:
{
  "1": [0],
  "2": [1, 2],
  "3": [2, 3],
  "4": [3]
}

Now classify the following:
"""
    # json.dumps escapes embedded quotes, backslashes and newlines, so a text
    # can never spill out of its own line or close its quotes early.
    # ensure_ascii=False keeps non-ASCII characters readable for the model.
    entries = [
        f"{json.dumps(str(id_), ensure_ascii=False)}: "
        f"{json.dumps(str(text), ensure_ascii=False)}"
        for id_, text in id_text_pairs
    ]

    return instructions + '\n'.join(entries) + '\n\nReturn JSON:'

# === LOAD DATA ===
df = pd.read_csv(INPUT_CSV)

# Fail fast with a clear message instead of a KeyError in the middle of the
# labeling loop when the CSV does not have the configured columns.
missing_columns = [col for col in (ID_COLUMN, TEXT_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(f"{INPUT_CSV} is missing required column(s): {missing_columns}")

In [2]:
# === PROCESS IN BATCHES ===
NUM_LABELS = 4


def _extract_json_payload(raw_text):
    """Return the JSON object text of a model reply, without fences or chatter."""
    # Unwrap a Markdown code fence (optionally tagged "json") if one is present.
    cleaned = re.sub(
        r"```(?:json)?[ \t\r]*\n(.*?)```", r"\1", raw_text.strip(), flags=re.DOTALL
    ).strip()
    # Keep only the outermost {...} span so stray prose before or after the
    # object does not make json.loads fail.
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if start != -1 and end > start:
        cleaned = cleaned[start:end + 1]
    return cleaned


def _to_binary_labels(label_list, num_labels):
    """Turn label ids into a 0/1 indicator list of length ``num_labels``."""
    if not isinstance(label_list, (list, tuple)):
        label_list = [label_list]  # tolerate a bare scalar such as 2
    binary_labels = [0] * num_labels
    for label_id in label_list:
        try:
            label_id = int(label_id)  # tolerate "2" or 2.0 from the model
        except (TypeError, ValueError):
            continue  # skip values that are not label ids at all
        if 0 <= label_id < num_labels:  # safety check
            binary_labels[label_id] = 1
    return binary_labels


# Map every stringified ID to the index label of its first row once, instead
# of re-casting and scanning the whole ID column for each returned ID.
id_to_row = {}
for row_label, row_id in zip(df.index, df[ID_COLUMN].astype(str)):
    id_to_row.setdefault(row_id, row_label)

for batch_start in range(0, len(df), BATCH_SIZE):
    batch = df.iloc[batch_start:batch_start + BATCH_SIZE]
    id_text_pairs = list(zip(batch[ID_COLUMN].astype(str), batch[TEXT_COLUMN]))

    prompt = build_batch_prompt(id_text_pairs)

    try:
        response = co.chat(
            model="command-a-03-2025",
            messages=[{"role": "user", "content": prompt}]
        )

        output_text = "".join(part.text for part in response.message.content)
        output_json = json.loads(_extract_json_payload(output_text))

        for id_str, label_list in output_json.items():
            if id_str not in id_to_row:
                # The original referenced an undefined name here, which raised
                # NameError and aborted the remainder of the batch.
                print(f"Warning: ID {id_str} not found in DataFrame.")
                continue

            binary_labels = _to_binary_labels(label_list, NUM_LABELS)
            # Store labels in separate columns. The loop variable must not
            # reuse the batch index name, or the error message below would
            # report the wrong batch start.
            for label_idx, lab_val in enumerate(binary_labels):
                df.at[id_to_row[id_str], f"label_{label_idx}"] = lab_val
            print(f"Labeled ID {id_str}: {label_list}")

    except Exception as e:
        # Best effort: report the failed batch and carry on with the next one.
        print(f"❌ Error at batch starting index {batch_start}: {e}")

    time.sleep(DELAY_BETWEEN_BATCHES)

Labeled ID 1: [2]
Labeled ID 2: [0]
Labeled ID 3: [1, 2]
Labeled ID 4: [2, 3]
Labeled ID 5: [1, 2, 3]
Labeled ID 6: [0]
Labeled ID 7: [0]
Labeled ID 8: [2]
Labeled ID 9: [2]
Labeled ID 10: [0]
Labeled ID 11: [2, 3]
Labeled ID 12: [2]
Labeled ID 13: [0]
Labeled ID 14: [0]
Labeled ID 15: [2]
Labeled ID 16: [2]
Labeled ID 17: [2, 3]
Labeled ID 18: [2]
Labeled ID 19: [2]
Labeled ID 20: [2]
Labeled ID 21: [2]
Labeled ID 22: [2]
Labeled ID 23: [2]
Labeled ID 24: [2]
Labeled ID 25: [2]
Labeled ID 26: [2]
Labeled ID 27: [2]
Labeled ID 28: [2]
Labeled ID 29: [2]
Labeled ID 30: [2]
Labeled ID 31: [2]
Labeled ID 32: [2]
Labeled ID 33: [2]
Labeled ID 34: [2, 3]
Labeled ID 35: [1, 2]
Labeled ID 36: [2, 3]
Labeled ID 37: [2, 3]
Labeled ID 38: [3]
Labeled ID 39: [3]
Labeled ID 40: [3]
Labeled ID 41: [1, 2, 3]
Labeled ID 42: [1, 2, 3]
Labeled ID 43: [1, 2]
Labeled ID 44: [0]
Labeled ID 45: [1]
Labeled ID 46: [2]
Labeled ID 47: [2, 3]
Labeled ID 48: [1, 2, 3]
Labeled ID 49: [2]
Labeled ID 50: [2, 3]
La

In [3]:
# === SAVE OUTPUT ===
# Write the frame to OUTPUT_CSV without the pandas index column, then confirm
# the destination on stdout.
df.to_csv(path_or_buf=OUTPUT_CSV, index=False)
print("Labeled data saved to: {}".format(OUTPUT_CSV))

Labeled data saved to: psychforums_dataset_cleaned_final_2.csv
