In [None]:
# --------------------------------------------------------------------
# 1. Imports and Config
# --------------------------------------------------------------------
import pandas as pd
import os
from tqdm import tqdm
import time
from huggingface_hub import InferenceClient
import re

# API client used by every LLM call in this notebook.
# The key is read from the HF_TOKEN environment variable when it is set, so it
# does not have to be pasted into (and saved with) the notebook. The original
# placeholder is kept as a fallback for anyone who still edits it in place.
client = InferenceClient(
    provider="together",
    api_key=os.environ.get("HF_TOKEN", "REPLACE WITH YOUR KEY"),
)

LLAMA_MODEL = "meta-llama/Llama-3.3-70B-Instruct"

# Sampling defaults.
# NOTE(review): the API helpers visible in this notebook pass their own literal
# values (temperature=0, top_p=1, n=1, max_tokens=1000/500) and do not read
# these four constants -- confirm whether they are still needed.
TEMPERATURE = 1
TOP_P = 1
MAX_TOKENS = 800
N = 1

# One row per transcription; later cells read the columns subject_id, question,
# transcription_new, question_text, correct_answer and lured_answer.
data_long = pd.read_csv('../Data/data_long.csv')

# NOTE(review): the chunking and scoring cells each reassign output_path before
# using it, so this value is not written to by the cells visible here.
output_path = '../Output/llm_chunking_scoring.csv'

In [10]:
# System prompt for the chunking step: tells the model to split one think-aloud
# transcription into "Chunk N:" blocks while keeping the original wording.
# The "Chunk N:" header format matters downstream: the scoring cell splits the
# stored reply on the regex `Chunk \d+:`.
chunking_prompt = """
You are an expert in cognitive psychology and verbal protocol analysis.

You are given a transcription of a participant thinking aloud while solving a problem.

Your task is to segment this transcription into meaningful chunks.

A chunk should correspond to a coherent idea, thought, or step in the participant's reasoning or verbal expression — including hesitations, repetitions, or meta-comments. The goal is not only to segment explicit reasoning steps but to preserve the full structure of the verbalization.

Guidelines:
- Do not remove or suppress any part of the original text.
- Do not segment based on arbitrary word count or length.
- Split only when the participant clearly moves to another distinct thought, idea, or reasoning step (e.g., shifting from generating an answer to justifying it, or reflecting on their uncertainty).
- Be conservative in splitting: avoid unnecessary fragmentation.
- Preserve the original wording exactly in each chunk.

Provide your output strictly in the following structure:

Chunk 1:
[exact text of chunk 1]

Chunk 2:
[exact text of chunk 2]

Chunk 3:
[exact text of chunk 3]

Be exhaustive.
"""

In [21]:
# --------------------------------------------------------------------
# Define API Call
# --------------------------------------------------------------------
def chunk_transcription(transcription, *, model=None, max_tokens=1000, temperature=0, top_p=1):
    """Ask the LLM to segment one think-aloud transcription into chunks.

    Args:
        transcription: Transcription text; interpolated verbatim into the user
            message.
        model: Model identifier. ``None`` (default) uses the notebook-level
            ``LLAMA_MODEL``.
        max_tokens: Completion budget. The reply has to repeat the whole
            transcription, so raise this for long transcriptions.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.

    Returns:
        The model's reply with leading/trailing whitespace removed. Per
        ``chunking_prompt`` it is expected to be a series of "Chunk N:" blocks.
    """
    user_prompt = f"Here is the transcription to chunk:\n\n{transcription}"
    response = client.chat.completions.create(
        model=LLAMA_MODEL if model is None else model,
        messages=[
            {"role": "system", "content": chunking_prompt},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        n=1,
    )
    choice = response.choices[0]

    # A reply cut off at the token limit silently drops the tail of the
    # transcription; surface that instead of storing a partial chunking unnoticed.
    if getattr(choice, "finish_reason", None) == "length":
        print(f"Warning: chunking output hit the {max_tokens}-token limit and may be truncated.")

    return choice.message.content.strip()

# --------------------------------------------------------------------
# Run Chunking Loop (Skip Already Chunked)
# --------------------------------------------------------------------
output_path = '../Output/chunked_transcriptions.csv'
os.makedirs('../Output', exist_ok=True)

# Load existing results if available, so re-running the cell resumes instead of
# re-chunking transcriptions that are already in the output file.
if os.path.exists(output_path):
    existing_df = pd.read_csv(output_path)
    already_chunked = set(zip(existing_df['subject_id'], existing_df['question']))
    results = existing_df.to_dict(orient='records')
    print(f"Loaded {len(existing_df)} already chunked transcriptions.")
else:
    already_chunked = set()
    results = []

CHUNK_SAVE_EVERY = 20        # checkpoint after this many newly chunked transcriptions
CHUNK_MAX_ATTEMPTS = 2       # first try + one retry after the back-off sleep
CHUNK_BACKOFF_SECONDS = 60   # pause after a failed API call

new_since_save = 0

# Loop over all transcriptions
for idx, row in tqdm(data_long.iterrows(), total=len(data_long), desc="Chunking transcriptions"):
    key = (row['subject_id'], row['question'])

    if key in already_chunked:
        continue  # Skip already chunked

    transcription = row['transcription_new']
    # A missing value (NaN) would otherwise be formatted into the prompt as the
    # literal text "nan" and produce a meaningless chunking.
    if not isinstance(transcription, str) or not transcription.strip():
        print(f"Skipping idx {idx}: empty or missing transcription")
        continue

    chunks = None
    for attempt in range(1, CHUNK_MAX_ATTEMPTS + 1):
        try:
            chunks = chunk_transcription(transcription)
            break
        except Exception as e:
            print(f"Error at idx {idx} (attempt {attempt}/{CHUNK_MAX_ATTEMPTS}): {e}")
            if attempt < CHUNK_MAX_ATTEMPTS:
                # Back off, then retry the same row rather than dropping it.
                time.sleep(CHUNK_BACKOFF_SECONDS)

    if chunks is None:
        # Still failing: leave the row unrecorded so the next run picks it up.
        continue

    results.append({
        'subject_id': row['subject_id'],
        'question': row['question'],
        'transcription_new': transcription,
        'chunks': chunks
    })

    # Checkpoint on the number of rows actually added, so a failure on one
    # particular row can never cause a scheduled save to be skipped.
    new_since_save += 1
    if new_since_save % CHUNK_SAVE_EVERY == 0:
        print(f"Saving progress at idx {idx + 1}...")
        pd.DataFrame(results).to_csv(output_path, index=False)

# Final save
pd.DataFrame(results).to_csv(output_path, index=False)
print(f"Chunking completed and saved to {output_path}")

Loaded 1020 already chunked transcriptions.


Chunking transcriptions: 100%|██████████| 1020/1020 [00:00<00:00, 45663.74it/s]

Chunking completed and saved to ../Output/chunked_transcriptions.csv





In [65]:
# System prompt for the scoring step: defines four deliberation functions and
# asks for a 0-100 rating of each, one "Name: score" line per function,
# optionally followed by a free-text label for any other function.
# The model's reply to this prompt is what the scoring loop passes to parse_scores.
scoring_prompt = """
You are an expert in cognitive psychology.

You are given a short chunk of a participant's think-aloud transcription during a reasoning task.

Your task is to rate how strongly this chunk expresses each of the following deliberation functions.

Definitions of the deliberation functions:

- Response Control: Inhibiting, rejecting, or resisting an obvious or intuitive response that first comes to mind. Typical signs include expressions of doubt, suppression of initial answers, hesitation, or stopping oneself from blurting out an impulsive response.

- Response Generation: Actively searching for new possible answers, alternatives, or hypotheses. This includes exploring options, mentally simulating scenarios, considering possibilities, or applying step-by-step logical reasoning.

- Response Justification: Providing explicit reasons, arguments, or explanations to support a response that is currently being considered (whether intuitive or not). This includes defending a choice, explaining why an answer makes sense, or making an argument.

- Response Regulation: Reflecting on one's own reasoning process, monitoring one's performance, allocating effort, expressing uncertainty, or deciding whether to continue thinking or stop. This includes metacognitive monitoring or strategic regulation of effort.

Important Instructions:

- Each score should reflect the extent to which the chunk expresses the function (even partially), using a continuous scale from 0 (not at all present) to 100 (very strongly present).
- These functions are not mutually exclusive — a chunk may score highly on multiple functions if they co-occur.
- If the chunk contains no trace of any of these 4 functions, assign 0 to all functions. This is perfectly acceptable.
- If the chunk expresses a completely different kind of function (not captured by the 4 above), mention it below using a very broad and generic label (e.g., "Reading Aloud", "Task Repetition", "Social Comment", etc.). This should only happen rarely and only if clearly justified by the content of the chunk.
- Be conservative: If you are unsure whether a function is expressed, prefer giving a low score (0-10).
- Do not explain or justify the scores unless the chunk clearly expresses a different kind of function.

Output strictly in this structure (and nothing else):

Response Control: [score between 0 and 100]
Response Generation: [score between 0 and 100]
Response Justification: [score between 0 and 100]
Response Regulation: [score between 0 and 100]

[Optional broad label for a different function — only if clearly needed]
"""

In [67]:
# --------------------------------------------------------------------
# Define API Call for Scoring
# --------------------------------------------------------------------
def score_chunk(chunk_text, previous_chunks, question_text, correct_answer, lured_answer):
    """Request deliberation-function ratings for a single chunk.

    The user message contains the question, the intuitive-but-incorrect answer,
    the correct answer, the earlier chunks (only when ``previous_chunks`` is
    non-empty) and finally the chunk to rate. ``scoring_prompt`` is sent as the
    system message.

    Returns:
        The model's raw reply with surrounding whitespace stripped.
    """
    # The context section is included only once there is something to show.
    if previous_chunks:
        context = "\n\nPrevious chunks (for context):\n" + previous_chunks
    else:
        context = ""

    user_prompt = f"""Here is the question the participant was solving:

{question_text}

The most obvious or intuitive (but incorrect) answer is: {lured_answer}

The correct answer is: {correct_answer}

{context}

Here is the current chunk of the transcription to score:

{chunk_text}
"""

    conversation = [
        {"role": "system", "content": scoring_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model=LLAMA_MODEL,
        messages=conversation,
        max_tokens=500,
        temperature=0,
        top_p=1,
        n=1,
    )
    reply = completion.choices[0].message.content
    return reply.strip()


# --------------------------------------------------------------------
# Reload Chunked Transcriptions
# --------------------------------------------------------------------
chunked_df = pd.read_csv('../Output/chunked_transcriptions.csv')

# Attach the question text and the two reference answers that score_chunk
# needs to build its prompt.
chunked_df = chunked_df.merge(
    data_long[['subject_id', 'question', 'question_text', 'correct_answer', 'lured_answer']],
    on=['subject_id', 'question'],
    how='left'
)

output_path = '../Output/scored_chunks.csv'

# Resume support: a transcription counts as done when its (subject_id, question)
# key already appears in the output file.
if os.path.exists(output_path):
    existing_df = pd.read_csv(output_path)
    already_scored = set(zip(existing_df['subject_id'], existing_df['question']))
    results = existing_df.to_dict(orient='records')
    print(f"Loaded {len(existing_df)} already scored chunks.")
else:
    already_scored = set()
    results = []

#chunked_df = chunked_df.head(10)  # For testing

# NOTE(review): parse_scores is called below but is not defined in the cells
# visible here. It must exist before this cell runs and, judging from how its
# result is used, return a (scores, comment) pair where scores is either None
# or a dict with the four keys listed in SCORE_KEYS -- confirm.
SCORE_KEYS = (
    'response_control',
    'response_generation',
    'response_justification',
    'response_regulation',
)
SCORE_SAVE_EVERY = 20     # checkpoint after this many newly scored transcriptions
SCORE_PARSE_ATTEMPTS = 3  # API calls per chunk before falling back to NaN scores

scored_since_save = 0

# --------------------------------------------------------------------
# Main Scoring Loop
# --------------------------------------------------------------------
for idx, row in tqdm(chunked_df.iterrows(), total=len(chunked_df), desc="Scoring chunks"):
    key = (row['subject_id'], row['question'])

    if key in already_scored:
        continue

    chunks = row['chunks']
    # A missing chunking (NaN after the CSV round-trip) is not a string; skip it
    # here instead of letting re.split raise and trigger the 60 s error sleep.
    if not isinstance(chunks, str):
        print(f"Skipping idx {idx}: no chunk text to score")
        continue

    try:
        chunk_texts = [c.strip() for c in re.split(r'Chunk \d+:', chunks) if c.strip()]

        previous_chunks = ""
        # Rows are buffered per transcription and committed only once every chunk
        # has been scored. Otherwise an exception part-way through would leave a
        # partial set in `results`, and the resume check above would later treat
        # that transcription as fully scored.
        transcription_rows = []

        for chunk_id, chunk_text in enumerate(chunk_texts, start=1):
            scores, comment = None, None

            for attempt in range(SCORE_PARSE_ATTEMPTS):
                raw_output = score_chunk(
                    chunk_text,
                    previous_chunks,
                    row['question_text'],
                    row['correct_answer'],
                    row['lured_answer']
                )
                scores, comment = parse_scores(raw_output)

                if scores is not None:
                    break
                print(f"Retry {attempt+1} for chunk {chunk_id}...")
                if attempt + 1 < SCORE_PARSE_ATTEMPTS:
                    time.sleep(5)

            if scores is None:
                # Unparseable after all attempts: keep the chunk with NaN scores.
                # float("nan") is used because numpy is not imported in this notebook.
                scores = dict.fromkeys(SCORE_KEYS, float("nan"))

            transcription_rows.append({
                'subject_id': row['subject_id'],
                'question': row['question'],
                'chunk_id': chunk_id,
                'chunk_text': chunk_text,
                'response_control': scores['response_control'],
                'response_generation': scores['response_generation'],
                'response_justification': scores['response_justification'],
                'response_regulation': scores['response_regulation'],
                'llm_comment': comment
            })

            previous_chunks += chunk_text + " "  # Update context

        results.extend(transcription_rows)

    except Exception as e:
        print(f"Error at idx {idx}: {e}")
        time.sleep(60)
        continue  # nothing was committed for this transcription; the next run retries it

    # Checkpoint on the number of transcriptions actually scored, so a failure
    # on one particular row can never cause a scheduled save to be skipped.
    scored_since_save += 1
    if scored_since_save % SCORE_SAVE_EVERY == 0:
        print(f"Saving progress at idx {idx + 1}...")
        pd.DataFrame(results).to_csv(output_path, index=False)

pd.DataFrame(results).to_csv(output_path, index=False)
print(f"Scoring completed and saved to {output_path}")


Scoring chunks:   2%|▏         | 20/1020 [00:32<34:39,  2.08s/it]

Saving progress at idx 20...


Scoring chunks:   4%|▍         | 40/1020 [01:38<41:51,  2.56s/it]  

Saving progress at idx 40...


Scoring chunks:   6%|▌         | 60/1020 [02:32<33:49,  2.11s/it]  

Saving progress at idx 60...


Scoring chunks:   8%|▊         | 80/1020 [03:11<39:06,  2.50s/it]

Saving progress at idx 80...


Scoring chunks:  10%|▉         | 100/1020 [04:06<26:36,  1.74s/it] 

Saving progress at idx 100...


Scoring chunks:  12%|█▏        | 120/1020 [05:04<32:44,  2.18s/it]  

Saving progress at idx 120...


Scoring chunks:  14%|█▎        | 140/1020 [05:25<07:51,  1.87it/s]

Saving progress at idx 140...


Scoring chunks:  16%|█▌        | 160/1020 [06:11<09:00,  1.59it/s]  

Saving progress at idx 160...


Scoring chunks:  18%|█▊        | 180/1020 [07:09<34:30,  2.47s/it]  

Saving progress at idx 180...


Scoring chunks:  20%|█▉        | 200/1020 [08:14<58:48,  4.30s/it]  

Saving progress at idx 200...


Scoring chunks:  22%|██▏       | 220/1020 [09:26<39:26,  2.96s/it]  

Saving progress at idx 220...


Scoring chunks:  24%|██▎       | 240/1020 [09:44<08:19,  1.56it/s]

Saving progress at idx 240...


Scoring chunks:  25%|██▌       | 260/1020 [10:17<30:19,  2.39s/it]

Saving progress at idx 260...


Scoring chunks:  27%|██▋       | 280/1020 [10:56<25:49,  2.09s/it]

Saving progress at idx 280...


Scoring chunks:  29%|██▉       | 300/1020 [11:54<23:38,  1.97s/it]

Saving progress at idx 300...


Scoring chunks:  31%|███▏      | 320/1020 [13:10<35:15,  3.02s/it]  

Saving progress at idx 320...


Scoring chunks:  33%|███▎      | 340/1020 [14:59<46:05,  4.07s/it]  

Saving progress at idx 340...


Scoring chunks:  35%|███▌      | 360/1020 [16:36<44:20,  4.03s/it]  

Saving progress at idx 360...


Scoring chunks:  37%|███▋      | 380/1020 [17:48<47:10,  4.42s/it]

Saving progress at idx 380...


Scoring chunks:  39%|███▉      | 400/1020 [19:01<31:21,  3.03s/it]  

Saving progress at idx 400...


Scoring chunks:  41%|████      | 420/1020 [20:28<1:14:05,  7.41s/it]

Saving progress at idx 420...


Scoring chunks:  43%|████▎     | 440/1020 [21:04<07:54,  1.22it/s]  

Saving progress at idx 440...


Scoring chunks:  45%|████▌     | 460/1020 [21:48<41:11,  4.41s/it]

Saving progress at idx 460...


Scoring chunks:  47%|████▋     | 480/1020 [23:08<28:28,  3.16s/it]  

Saving progress at idx 480...


Scoring chunks:  49%|████▉     | 500/1020 [24:33<23:02,  2.66s/it]

Saving progress at idx 500...


Scoring chunks:  51%|█████     | 520/1020 [25:25<21:45,  2.61s/it]

Saving progress at idx 520...


Scoring chunks:  53%|█████▎    | 540/1020 [26:11<16:38,  2.08s/it]

Saving progress at idx 540...


Scoring chunks:  55%|█████▍    | 560/1020 [27:04<22:09,  2.89s/it]

Saving progress at idx 560...


Scoring chunks:  57%|█████▋    | 580/1020 [28:08<09:49,  1.34s/it]

Saving progress at idx 580...


Scoring chunks:  59%|█████▉    | 600/1020 [29:23<27:42,  3.96s/it]

Saving progress at idx 600...


Scoring chunks:  61%|██████    | 620/1020 [30:57<28:22,  4.26s/it]

Saving progress at idx 620...


Scoring chunks:  63%|██████▎   | 640/1020 [32:17<20:39,  3.26s/it]

Saving progress at idx 640...


Scoring chunks:  65%|██████▍   | 660/1020 [33:49<23:02,  3.84s/it]

Saving progress at idx 660...


Scoring chunks:  67%|██████▋   | 680/1020 [34:44<13:22,  2.36s/it]

Saving progress at idx 680...


Scoring chunks:  69%|██████▊   | 700/1020 [36:10<19:43,  3.70s/it]

Saving progress at idx 700...


Scoring chunks:  71%|███████   | 720/1020 [37:54<22:01,  4.41s/it]

Saving progress at idx 720...


Scoring chunks:  73%|███████▎  | 740/1020 [38:28<04:02,  1.16it/s]

Saving progress at idx 740...


Scoring chunks:  75%|███████▍  | 760/1020 [39:28<15:31,  3.58s/it]

Saving progress at idx 760...


Scoring chunks:  76%|███████▋  | 780/1020 [40:46<09:23,  2.35s/it]

Saving progress at idx 780...


Scoring chunks:  78%|███████▊  | 800/1020 [41:49<13:31,  3.69s/it]

Saving progress at idx 800...


Scoring chunks:  80%|████████  | 820/1020 [42:38<14:57,  4.49s/it]

Saving progress at idx 820...


Scoring chunks:  82%|████████▏ | 840/1020 [43:41<05:04,  1.69s/it]

Saving progress at idx 840...


Scoring chunks:  84%|████████▍ | 860/1020 [44:52<06:18,  2.36s/it]

Saving progress at idx 860...


Scoring chunks:  86%|████████▋ | 880/1020 [46:30<18:49,  8.07s/it]

Saving progress at idx 880...


Scoring chunks:  88%|████████▊ | 900/1020 [47:21<04:28,  2.24s/it]

Saving progress at idx 900...


Scoring chunks:  90%|█████████ | 920/1020 [48:06<01:08,  1.46it/s]

Saving progress at idx 920...


Scoring chunks:  92%|█████████▏| 940/1020 [48:48<01:31,  1.15s/it]

Saving progress at idx 940...


Scoring chunks:  94%|█████████▍| 960/1020 [49:45<02:19,  2.32s/it]

Saving progress at idx 960...


Scoring chunks:  96%|█████████▌| 980/1020 [50:43<03:00,  4.51s/it]

Saving progress at idx 980...


Scoring chunks:  98%|█████████▊| 1000/1020 [51:49<00:54,  2.71s/it]

Saving progress at idx 1000...


Scoring chunks: 100%|██████████| 1020/1020 [53:08<00:00,  3.13s/it]

Saving progress at idx 1020...
Scoring completed and saved to ../Output/scored_chunks.csv



