In [1]:
pip install google-generativeai pandas

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import time
import google.generativeai as genai
import json

# === CONFIGURATION ===
API_KEY = "YOUR_API_KEY"
INPUT_CSV = "psychforums_dataset_cleaned_3.csv"
OUTPUT_CSV = "psychforums_dataset_cleaned_final_1.csv"
TEXT_COLUMN = "text"
ID_COLUMN = "id"
LABEL_COLUMN = "label"
BATCH_SIZE = 3
DELAY_BETWEEN_BATCHES = 5  # seconds

# === SETUP ===
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("models/gemini-2.5-flash-lite-preview-06-17")

# === PROMPT FUNCTION ===
def build_batch_prompt(id_text_pairs):
    examples = """You are a multi-label text classifier. Each input text can belong to multiple emotional states. Choose **all applicable labels** from the following list:

0 -> neutral  
1 -> low mood / depressed  
2 -> anxious / worried
3 -> stressed / overwhelmed  

Return a JSON dictionary where **each key is the ID (string)** and **the value is a list of one or more labels** (e.g., [1, 2]). Only return the raw JSON — no explanations.

Examples:
{
  "1": [0],
  "2": [1, 2],
  "3": [2, 3],
  "4": [3]
}

Now classify the following:
""" + '\n'.join([f'"{id_}": "{text}"' for id_, text in id_text_pairs]) + '\n\nReturn JSON:'

    return examples

# === LOAD DATA ===
df = pd.read_csv(INPUT_CSV)

In [2]:
# === PROCESS IN BATCHES ===
NUM_LABELS = 4

for i in range(0, len(df), BATCH_SIZE):
    batch = df.iloc[i:i + BATCH_SIZE]
    id_text_pairs = list(zip(batch[ID_COLUMN].astype(str), batch[TEXT_COLUMN]))

    prompt = build_batch_prompt(id_text_pairs)

    try:
        response = model.generate_content(prompt)
        json_text = response.text.strip()

        # Extract valid JSON from text
        json_start = json_text.find("{")
        json_end = json_text.rfind("}") + 1
        json_str = json_text[json_start:json_end]

        label_dict = json.loads(json_str)

        for id_str, label_list in label_dict.items():
            idx = df[df[ID_COLUMN].astype(str) == id_str].index
            if len(idx) > 0:
                # Create all-zero label row
                binary_labels = [0] * NUM_LABELS
                 # Mark the appropriate columns as 1
                for label_id in label_list:
                    if 0 <= label_id < NUM_LABELS:  # safety check
                        binary_labels[label_id] = 1
                 # Store labels in separate columns
                for i, lab_val in enumerate(binary_labels):
                    df.at[idx[0], f"label_{i}"] = lab_val
                
                print(f"Labeled ID {id_str}: {label_list}")
            else:
                print(f"Warning: ID {id_str} not found in DataFrame")
    except Exception as e:
        print(f"❌ Error during batch: {e}")
        for id_str, _ in id_text_pairs:
            idx = df[df[ID_COLUMN].astype(str) == id_str].index
            if len(idx) > 0:
                df.at[idx[0], LABEL_COLUMN] = "ERROR"

    time.sleep(DELAY_BETWEEN_BATCHES)

Labeled ID 1: [1, 2, 3]
Labeled ID 2: [0]
Labeled ID 3: [1, 2]
Labeled ID 4: [2, 3]
Labeled ID 5: [1, 2]
Labeled ID 6: [0]
Labeled ID 7: [0]
Labeled ID 8: [1, 2, 3]
Labeled ID 9: [0, 2]
Labeled ID 10: [0]
Labeled ID 11: [0, 1]
Labeled ID 12: [2]
Labeled ID 13: [0]
Labeled ID 14: [0]
Labeled ID 15: [0, 2]
Labeled ID 16: [2]
Labeled ID 17: [2]
Labeled ID 18: [2]
Labeled ID 19: [2]
Labeled ID 20: [2, 3]
Labeled ID 21: [2]
Labeled ID 22: [2, 3]
Labeled ID 23: [2, 3]
Labeled ID 24: [2]
Labeled ID 25: [2, 3]
Labeled ID 26: [1, 2]
Labeled ID 27: [1, 2]
Labeled ID 28: [2]
Labeled ID 29: [2, 3]
Labeled ID 30: [2]
Labeled ID 31: [2, 3]
Labeled ID 32: [2]
Labeled ID 33: [1, 2]
❌ Error during batch: '<=' not supported between instances of 'int' and 'str'
Labeled ID 37: [2, 3]
Labeled ID 38: [2, 3]
Labeled ID 39: [2, 3]
❌ Error during batch: '<=' not supported between instances of 'int' and 'str'
Labeled ID 43: [2, 3]
Labeled ID 44: [0]
Labeled ID 45: [1]
Labeled ID 46: [0, 1, 2]
Labeled ID 47: [1,

In [None]:
logs = """

"""

import re, ast
# Step 2: Parse into a dictionary
parsed_labels = {}
for line in logs.strip().split("\n"):
    match = re.match(r"Labeled ID (\d+): (\[.*\])", line.strip())
    if match:
        id_ = int(match.group(1))
        label_list = ast.literal_eval(match.group(2))
        # Convert all labels to integers
        label_list = [int(label) for label in label_list]
        parsed_labels[id_] = label_list

# Update only the rows whose IDs are in the parsed_labels dict
df['label'] = df.apply(
    lambda row: parsed_labels[row['id']] if row['id'] in parsed_labels else row['label'],
    axis=1
)

# ✅ Done: df['label'] is now updated only for IDs in the logs
print(df.loc[df['id'] == 1200, 'label'].values)

In [8]:
df.head(2)

Unnamed: 0,text,id,label_0,label_1,label_2,label_3,label
0,how are you feeling today? i had a fight with ...,1,1.0,2.0,3.0,,
1,how are you feeling today? snaga wrote: a stic...,2,0.0,2.0,,,


In [12]:
# === SAVE OUTPUT ===
df.to_csv('psychforums_dataset_cleaned_final_1', index=False)
print(f"Saved labeled data to {OUTPUT_CSV}")

Saved labeled data to Datasets/journal_reddit_posts_3.csv
