# Instructions on how to run this template #

## 1) Upload the consolidated master sheet as a CSV file and rename it to master.csv.
## 2) Execute each cell by pressing "Shift + Enter" or the play button on the top command bar. A cell has finished executing when it displays a number on its left-hand side (for example "[1]"). If it shows "[*]", the cell is still running. The execution cell can take up to 30 minutes to run, depending on the number of inputs. Wait until a cell has finished running before running other cells.



In [None]:
# importing packages

from langchain_dartmouth.llms import ChatDartmouthCloud
import time
import json
import pandas as pd

In [None]:
from dotenv import find_dotenv, load_dotenv

# Look up a .env file first, then hand its path to the loader so that any
# settings stored there become available to the cells below.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

In [None]:
# Find your Dartmouth API key at
# https://rcweb.dartmouth.edu/~d20964h/2024-12-11-dartmouth-chat-api/api_key/
#
# Either paste the key in place of the placeholder below, or leave the
# placeholder untouched and store the key in your .env file as
# DARTMOUTH_CHAT_API_KEY (the .env file is loaded in the cell above).

import os

_KEY_PLACEHOLDER = "ENTER KEY HERE"

key = _KEY_PLACEHOLDER  # <- replace with your key, or keep as-is to use the .env file

if not key or key == _KEY_PLACEHOLDER:
    # No key was typed in: fall back to the environment.
    key = os.environ.get("DARTMOUTH_CHAT_API_KEY")

if not key:
    # Fail here with a clear message instead of letting every batch fail later
    # with an authentication error.
    raise ValueError(
        "No Dartmouth Chat API key found. Paste it into `key` above or set "
        "DARTMOUTH_CHAT_API_KEY in your .env file."
    )

# NOTE(review): temperature=1 makes the coding non-deterministic between runs;
# consider a lower value if reproducible codes are required.
llm = ChatDartmouthCloud(model_name="openai.gpt-4.1-mini-2025-04-14", dartmouth_chat_api_key=key, temperature=1)

In [None]:
# Make sure to upload the consolidated master sheet as a CSV and rename it
# master.csv so that it can be found by the line below.

df = pd.read_csv('master.csv')

# Segment to the rows you want to code.
# `rows_to_code` is used as df[rows_to_code], so it can be:
#   slice(None)                 -> every row (default)
#   slice(0, 50)                -> the first 50 rows
#   df["Artifact"] == "essay1"  -> rows matching a condition (boolean mask)
rows_to_code = slice(None)
df = df[rows_to_code]

# Keep only the columns needed for coding and give them short names.
df = df[["Artifact","Original Citation","Consensus Code"]]
df = df.rename(columns={"Artifact" : "tag", "Original Citation" : "text", "Consensus Code" : "label"})
df

In [None]:
# Cast both columns to strings; tags are additionally lower-cased and
# stripped of surrounding whitespace.
citation_text = df["text"].astype(str)
normalized_tags = df["tag"].astype(str).str.lower().str.strip()

df["text"] = citation_text
df["tag"] = normalized_tags

### CODING

In [None]:
def build_prompt_batch(segments, codebook=None, thesis=None):
    """Build one LLM prompt asking for a coding of every text in *segments*.

    Args:
        segments: Iterable of text segments; they are numbered "Text 1",
            "Text 2", ... in the order given.
        codebook: Codebook text embedded in the prompt. When omitted, a
            module-level variable named ``codebook`` is used.
        thesis: Optional central thesis. When given, it is inserted as a
            "Thesis: ..." paragraph; when omitted, the prompt text is left
            exactly as before.

    Returns:
        The complete prompt as a single string.

    Raises:
        NameError: If no codebook is passed and no global ``codebook`` exists.
    """
    if codebook is None:
        # Resolve the notebook-level codebook at call time so the function can
        # be defined before the codebook cell has been run.
        codebook = globals().get("codebook")
        if codebook is None:
            raise NameError(
                "No codebook available: define a `codebook` variable or pass "
                "codebook=... to build_prompt_batch()."
            )

    # An empty block keeps the original blank line between the two paragraphs.
    thesis_block = "" if thesis is None else f"\nThesis: {thesis}\n"

    header = f"""
You are a citation coding assistant. Use the following codebook as your reference for classification:
{codebook}

For coding type (1) — *contextual information* — base your decisions on the central thesis provided below in the column "Thesis" of master.csv.
{thesis_block}
To determine whether a segment should be tagged as context, assess its relevance to the thesis using both the codebook definitions and any thematic alignment.

Now, code the following text segments accordingly:

"""
    numbered_segments = "".join(
        f"Text {number}: {segment}\n"
        for number, segment in enumerate(segments, start=1)
    )
    output_format = (
        "\nReturn ONLY a valid JSON list of dictionaries like this (no explanation):\n"
        "[\n"
        "  {\"coding\": 1, \"justification\": \"Provides historical context.\"},\n"
        "  {\"coding\": 3, \"justification\": \"Supports the author's argument.\"}\n"
        "]"
    )

    return header + numbered_segments + output_format

def _parse_json_reply(text):
    """Parse a model reply as JSON, tolerating a surrounding Markdown code fence."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        _, _, cleaned = cleaned.partition("\n")
        cleaned = cleaned.rsplit("```", 1)[0]
    return json.loads(cleaned)


def call_llm(prompt, model=None, max_attempts=3, retry_delay=5):
    """Send *prompt* to the model and return its reply parsed as JSON.

    Args:
        prompt: Prompt text passed to ``model.invoke``.
        model: Object exposing ``invoke(prompt)`` whose result has a
            ``content`` string. Defaults to the notebook-level ``llm``,
            looked up at call time so re-running the ``llm`` cell takes effect.
        max_attempts: How many times the request is tried before giving up.
        retry_delay: Seconds to pause after each failed attempt.

    Returns:
        The decoded JSON value, or ``None`` when every attempt failed (request
        error or a reply that is not valid JSON).
    """
    if model is None:
        model = llm

    for attempt in range(1, max_attempts + 1):
        try:
            response = model.invoke(prompt)
            return _parse_json_reply(response.content)
        except Exception as e:
            # Deliberately broad: this is the boundary to a remote service and
            # a single bad batch must not abort the whole run.
            print(f"Error (attempt {attempt}/{max_attempts}):", e)
            time.sleep(retry_delay)
    return None

In [None]:
# Actual execution: send the citations to the LLM in batches and collect
# exactly one result per row.
# NOTE: the %%time cell magic was replaced by an explicit timer because IPython
# only accepts a cell magic on the very first line of a cell.

batch_size = 10
results = []
started_at = time.perf_counter()

# Prefer a pre-cleaned column when one exists; otherwise use the citation text
# column ("Original Citation", renamed to "text" when master.csv is loaded).
text_column = "cleaned_sentence" if "cleaned_sentence" in df.columns else "text"
texts = df[text_column].tolist()

for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    prompt = build_prompt_batch(batch)
    batch_result = call_llm(prompt)

    # Accept a reply only when it has one entry per input text; a shorter or
    # longer list would shift every later result onto the wrong row.
    if isinstance(batch_result, list) and len(batch_result) == len(batch):
        results.extend(batch_result)
    else:
        print(f"Batch {i}-{i+batch_size} failed or returned invalid JSON")
        results.extend([{"coding": None, "justification": "Error"} for _ in batch])

print(f"Wall time: {time.perf_counter() - started_at:.1f} s")

In [None]:
# Show the results: place the LLM codings next to the rows they belong to.

df_result = pd.DataFrame(results)

# Renumber the source rows from 0 so they line up with df_result's index.
source_rows = df.reset_index(drop=True)
df_final = pd.concat([source_rows, df_result], axis=1)
df_final

In [None]:
# Save the results as a CSV file (res.csv).

output_path = "res.csv"
df_final.to_csv(output_path, encoding="utf-8-sig")