How running CMR1 was done

In [None]:
import json 

def CMR1_predict_sample(row,client,model):
    """
    The function provided is designed to assess whether two given texts belong to the same causal variable. 
    Below is a summary of the function's key actions and inputs:
      Generating Completion: It uses the LLM model through the client to generate a completion for the given task.
      Extracting Response: The function extracts the response from the generated completion and converts it into a JSON object.
      Adding Additional Information: It adds additional information to the response, including the prediction model, whether the generated and input causal variables are the same, the data generation model, the generated variable name, and the domain.
      Returning Response: Finally, the function returns the response as a JSON object.
    """
  

    First_text=row['Text1']
    Second_text=row['Text2']
    domain=row['domain']
    completion = client.chat.completions.create(
                model=model,
                messages=[
                  {"role": "system", 
                  "content":f"""
                  You are an expert in causality and {domain}. Your task is to help users model their domain knowledge by identifying if two texts describe the same causal variable. 
                  Texts that describe different values or the same value of a causal variable should be indicated.
                  """},
                  {"role": "user", "content": f"""  
                  Your task is to assess if the following two texts belong to the same causal variable. 
                        - If the two texts belong to the same causal variable, provide the variable name.
                        - If the two texts are similar but do not belong to the same variable set the variable name to '', provide your explanation.
                    Structure your answer as a JSON object including string 'Text1', string 'Text2', boolean 'Predicted Same Causal Variable', string 'Predicted Variable Name', and string 'Explanation'.
                    
                    First text: ```{First_text}```
                    Second text: ```{Second_text}```       
                    """}
                  ],
                  response_format={ "type": "json_object" }
                  ,
                temperature=0, ## no creativity here 
                max_tokens=4096
              )
    response=completion.choices[0].message.content
    try:
      response=json.loads(response)
        
    except:
       with open('to_check.txt','w') as f:
            f.write(str(response)+'\n')
            f.write('Prediction Model: '+str(model)+'\n')
            f.write('Generated Interaction value: '+str(row['Same Causal Variable'])+'\n')
            f.write('Data Generation Model: '+str(row['model Name'])+'\n')
            f.write('Domain: '+str(domain)+'\n')
       
    response['Prediction Model']=model
    response['Generated Same Causal Variable']=row['Same Causal Variable']
    response['Data Generation Model']=row['model Name']
    response['Generated Variable Name']=row['Variable Name']
    response['Domain']=domain
    return response

generating training data, make sure prompt is the same

In [2]:
import pandas as pd
import json
import os
os.chdir("/home/feline/Evaluating-Large-Language-Models-for-Causal-Modeling/finetuning")

def generate_messages(row):
    """
    Generate the list of messages for the training example.
    This uses the exact prompt from your non-finetuned test.
    """
    domain = row['domain']
    first_text = row['Text1']
    second_text = row['Text2']
    
    system_message = {
        "role": "system",
        "content": f"""You are an expert in causality and {domain}. Your task is to help users model their domain knowledge by identifying if two texts describe the same causal variable. 
Texts that describe different values or the same value of a causal variable should be indicated."""
    }
    
    user_message = {
        "role": "user",
        "content": f"""Your task is to assess if the following two texts belong to the same causal variable. 
- If the two texts belong to the same causal variable, provide the variable name.
- If the two texts are similar but do not belong to the same variable set the variable name to '', provide your explanation.
Structure your answer as a JSON object including string 'Text1', string 'Text2', boolean 'Predicted Same Causal Variable', string 'Predicted Variable Name', and string 'Explanation'.

First text: ```{first_text}```
Second text: ```{second_text}```"""
    }
    
    return [system_message, user_message]

def generate_ground_truth(row):
    """
    Generate the ground truth JSON object for the assistant message.
    This object mirrors the non-finetuned output format.
    """
    # Convert "Same Causal Variable" to a boolean if needed.
    same_causal = row["Same Causal Variable"]
    if isinstance(same_causal, str):
        same_causal = same_causal.strip().lower() == 'true'
    
    ground_truth = {
        "Text1": row["Text1"],
        "Text2": row["Text2"],
        "Predicted Same Causal Variable": same_causal,
        "Predicted Variable Name": row["Variable Name"],
        "Domain": row["domain"],
        "Explanation": "None"
    }
    return ground_truth

def main(csv_path, jsonl_output_path, target_domains=None):
    """
    Reads the CSV file, optionally filters rows by a list of target domains,
    and writes a JSONL file with training examples in the desired format.
    """
    df = pd.read_csv(csv_path)
    
    if target_domains is not None:
        # Filter rows whose 'domain' is in the provided list of target domains.
        df = df[df["domain"].isin(target_domains)]
    
    with open(jsonl_output_path, "w", encoding="utf-8") as f_out:
        for _, row in df.iterrows():
            messages = generate_messages(row)
            ground_truth = generate_ground_truth(row)
            # Build the training example with three messages:
            # system message, user message, and the assistant (ground truth) message.
            example = {
                "messages": [
                    messages[0],
                    messages[1],
                    {"role": "assistant", "content": json.dumps(ground_truth, ensure_ascii=False)}
                ]
            }
            f_out.write(json.dumps(example, ensure_ascii=False) + "\n")

if __name__ == '__main__':
    # Set the path to your CSV file
    csv_path = "sampled_data_set_large.csv"  # Replace with your CSV file path
    # Set the desired output JSONL file path
    jsonl_output_path = "training4.jsonl"
    # Specify the target domain to fine-tune on (adjust as needed)
    target_domain = ["physics", "computer science"]
    main(csv_path, jsonl_output_path, target_domain)


In [None]:

import os
os.chdir("/home/feline/Evaluating-Large-Language-Models-for-Causal-Modeling/finetuning")



!accelerate launch -m axolotl.cli.train config_70b_en.yaml

The following values were not passed to `accelerate launch` and had defaults used instead:
	`--num_processes` was set to a value of `1`
	`--num_machines` was set to a value of `1`
	`--mixed_precision` was set to a value of `'no'`
	`--dynamo_backend` was set to a value of `'no'`
/usr/bin/ld: skipping incompatible /usr/lib32/libgcc_s.so.1 when searching for libgcc_s.so.1
/usr/bin/ld: skipping incompatible /usr/lib32/libgcc_s.so.1 when searching for libgcc_s.so.1
[2025-03-17 11:53:20,541] [INFO] [datasets.<module>:54] [PID:3163210] PyTorch version 2.6.0 available.
[2025-03-17 11:53:22,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-03-17 11:53:23,145] [INFO] [root.spawn:60] [PID:3163210] gcc -pthread -B /home/feline/miniconda3/envs/axolotl/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/feline/miniconda3/envs/axolotl/include -fPIC -O2 -isystem /home/feline/miniconda3/envs/axolotl/inclu

code to run trained model now

In [1]:
import os
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
from tqdm import tqdm
import gc

os.chdir("/home/feline/Evaluating-Large-Language-Models-for-Causal-Modeling/finetuning")

def load_model_and_tokenizer(checkpoint_path, load_in_8bit=True, load_in_4bit=False):
    """
    Loads the model and tokenizer from a checkpoint.
    You can adjust quantization parameters (8-bit or 4-bit) as needed.
    """
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

    model = AutoModelForCausalLM.from_pretrained(
        checkpoint_path,
        device_map='auto',
        quantization_config=bnb_config,
    )

    generator = pipeline(
        'text-generation',
        model=model,
        tokenizer=tokenizer,
        device_map='auto',
        return_full_text=False  # so that only new tokens are returned
    )
    return tokenizer, model, generator

def construct_chat_prompt(row):
    """
    Constructs the chat prompt using the exact text from your non-finetuned tests.
    This prompt will be used for every evaluation sample, regardless of domain.
    """
    domain = row["domain"]
    first_text = row["Text1"]
    second_text = row["Text2"]

    # System message
    system_message = (
        f"You are an expert in causality and {domain}. Your task is to help users model their domain knowledge by identifying if two texts describe the same causal variable. "
        "Texts that describe different values or the same value of a causal variable should be indicated."
    )

    # User message – note the exact wording as in your non-finetuned tests.
    user_message = (
        "Your task is to assess if the following two texts belong to the same causal variable. \n"
        "- If the two texts belong to the same causal variable, provide the variable name.\n"
        "- If the two texts are similar but do not belong to the same variable set the variable name to '', provide your explanation.\n"
        "Structure your answer as a JSON object including string 'Text1', string 'Text2', boolean 'Predicted Same Causal Variable', "
        "string 'Predicted Variable Name', and string 'Explanation'.\n\n"
        f"First text: ```{first_text}```\n"
        f"Second text: ```{second_text}```"
    )
    
    # Combine messages in a chat-like format (using special tokens) so the prompt is identical to the non-finetuned tests.
    prompt = f"<|system|> {system_message}\n<|user|> {user_message}\n<|assistant|> "
    return prompt

def main():
    # === Configuration ===
    # Path to your trained model checkpoint
    CHECKPOINT_PATH = "lora_70b_en_v0_alt_v2"  # UPDATE with your checkpoint path
    # Input CSV file with evaluation examples. This file should have at least the columns:
    # Text1, Text2, Same Causal Variable, Variable Name, model Name, domain
    INPUT_CSV = "sampled_data_set_small.csv"  # UPDATE with your evaluation CSV path
    # Output CSV file for saving results
    OUTPUT_CSV = "evaluation_results_alt.csv"
    # Batch size (adjust according to your GPU memory)
    BATCH_SIZE = 1
    # Maximum number of new tokens to generate
    MAX_NEW_TOKENS = 200

    # Load model, tokenizer, and generator pipeline
    tokenizer, model, generator = load_model_and_tokenizer(
        CHECKPOINT_PATH, load_in_8bit=False, load_in_4bit=True
    )
    prediction_model_name = os.path.basename(CHECKPOINT_PATH)

    # Load the evaluation CSV (all domains will be processed)
    df = pd.read_csv(INPUT_CSV)

    output_rows = []
    num_samples = len(df)
    num_batches = (num_samples + BATCH_SIZE - 1) // BATCH_SIZE

    print(f"Processing {num_samples} samples in {num_batches} batches...")

    for batch_idx in tqdm(range(num_batches), desc="Evaluating"):
        start_idx = batch_idx * BATCH_SIZE
        end_idx = min(start_idx + BATCH_SIZE, num_samples)
        batch_df = df.iloc[start_idx:end_idx]

        # Create a prompt for each row using the same non-finetuned format
        prompts = [construct_chat_prompt(row) for _, row in batch_df.iterrows()]

        try:
            responses = generator(
                prompts,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,        # Use deterministic generation for fair comparison
                temperature=0.0,
                num_return_sequences=1
            )
        except Exception as e:
            print(f"Error during generation: {e}")
            responses = ["" for _ in prompts]

        for i, response in enumerate(responses):
            # The response may be returned as a list/dict or a string, so handle accordingly.
            if isinstance(response, list) and response:
                generated_text = response[0].get('generated_text', '')
            elif isinstance(response, dict):
                generated_text = response.get('generated_text', '')
            elif isinstance(response, str):
                generated_text = response
            else:
                generated_text = ""

            # Remove the prompt part if it is present in the output.
            prompt_text = prompts[i]
            answer_text = generated_text[len(prompt_text):].strip() if generated_text.startswith(prompt_text) else generated_text.strip()

            # Try to parse the answer as a JSON object.
            try:
                parsed_answer = json.loads(answer_text)
                predicted_same_causal = parsed_answer.get("Predicted Same Causal Variable", "")
                predicted_variable_name = parsed_answer.get("Predicted Variable Name", "")
                explanation = parsed_answer.get("Explanation", "")
            except Exception as e:
                print(f"Failed to parse JSON for sample index {start_idx + i}: {e}")
                predicted_same_causal = ""
                predicted_variable_name = ""
                explanation = answer_text  # Save the raw output in case JSON parsing fails

            # Save results for each sample with the required columns.
            row_input = batch_df.iloc[i]
            output_rows.append({
                "Text1": row_input["Text1"],
                "Text2": row_input["Text2"],
                "Generated Same Causal Variable": row_input["Same Causal Variable"],
                "Predicted Same Causal Variable": predicted_same_causal,
                "Generated Variable Name": row_input["Variable Name"],
                "Predicted Variable Name": predicted_variable_name,
                "Data Generation Model": row_input["model Name"],
                "Prediction Model": prediction_model_name,
                "Domain": row_input["domain"],
                "Explanation": explanation
            })

    # Save the aggregated results to CSV in the requested format.
    output_df = pd.DataFrame(output_rows)
    output_df.to_csv(OUTPUT_CSV, index=False, sep=",", encoding="utf-8")
    print(f"Evaluation results saved to {OUTPUT_CSV}")

    # Cleanup: unload the model to free up GPU memory.
    del model, tokenizer, generator
    torch.cuda.empty_cache()
    gc.collect()

if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Device set to use cuda:0


Processing 19 samples in 19 batches...


Evaluating:  11%|█         | 2/19 [00:34<05:24, 19.06s/it]

Failed to parse JSON for sample index 1: Extra data: line 1 column 183 (char 182)


Evaluating:  16%|█▌        | 3/19 [01:01<06:01, 22.60s/it]

Failed to parse JSON for sample index 2: Extra data: line 1 column 185 (char 184)


Evaluating:  26%|██▋       | 5/19 [01:36<04:46, 20.43s/it]

Failed to parse JSON for sample index 4: Extra data: line 1 column 185 (char 184)


Evaluating:  37%|███▋      | 7/19 [02:11<03:56, 19.69s/it]

Failed to parse JSON for sample index 6: Extra data: line 1 column 191 (char 190)


Evaluating:  42%|████▏     | 8/19 [02:38<04:01, 21.92s/it]

Failed to parse JSON for sample index 7: Extra data: line 1 column 196 (char 195)


Evaluating:  53%|█████▎    | 10/19 [03:12<03:03, 20.41s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Failed to parse JSON for sample index 9: Extra data: line 1 column 194 (char 193)


Evaluating:  58%|█████▊    | 11/19 [03:39<02:58, 22.33s/it]

Failed to parse JSON for sample index 10: Extra data: line 1 column 189 (char 188)


Evaluating:  63%|██████▎   | 12/19 [04:06<02:45, 23.66s/it]

Failed to parse JSON for sample index 11: Extra data: line 1 column 183 (char 182)


Evaluating:  68%|██████▊   | 13/19 [04:32<02:27, 24.57s/it]

Failed to parse JSON for sample index 12: Extra data: line 1 column 186 (char 185)


Evaluating:  74%|███████▎  | 14/19 [04:59<02:06, 25.21s/it]

Failed to parse JSON for sample index 13: Extra data: line 1 column 193 (char 192)


Evaluating:  79%|███████▉  | 15/19 [05:26<01:42, 25.66s/it]

Failed to parse JSON for sample index 14: Extra data: line 1 column 181 (char 180)


Evaluating:  84%|████████▍ | 16/19 [05:52<01:17, 25.97s/it]

Failed to parse JSON for sample index 15: Extra data: line 1 column 196 (char 195)


Evaluating:  89%|████████▉ | 17/19 [06:19<00:52, 26.18s/it]

Failed to parse JSON for sample index 16: Extra data: line 1 column 189 (char 188)


Evaluating:  95%|█████████▍| 18/19 [06:46<00:26, 26.34s/it]

Failed to parse JSON for sample index 17: Extra data: line 1 column 180 (char 179)


Evaluating: 100%|██████████| 19/19 [07:13<00:00, 22.79s/it]

Failed to parse JSON for sample index 18: Extra data: line 1 column 189 (char 188)
Evaluation results saved to evaluation_results_alt.csv





v2

In [1]:
import os
import json
import re
import gc
import torch
from tqdm import tqdm
import pandas as pd
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    pipeline,
    BitsAndBytesConfig, 
    StoppingCriteria, 
    StoppingCriteriaList
)
from datasets import load_dataset

# Change working directory if needed.
os.chdir("/home/feline/Evaluating-Large-Language-Models-for-Causal-Modeling/finetuning")

# --- Custom batched stopping criteria ---
class StopOnCompleteJSON(StoppingCriteria):
    """
    Stops generation once each sequence in the batch has a balanced JSON object.
    For each sequence, it looks for the first '{' and then counts opening and closing braces.
    Generation stops only when all sequences have a complete JSON object.
    """
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs):
        finished = []
        for seq in input_ids:
            decoded = self.tokenizer.decode(seq, skip_special_tokens=True)
            try:
                start = decoded.index("{")
                json_part = decoded[start:]
                count = 0
                complete = False
                for char in json_part:
                    if char == "{":
                        count += 1
                    elif char == "}":
                        count -= 1
                    if count == 0 and char == "}":
                        complete = True
                        break
                finished.append(complete)
            except ValueError:
                finished.append(False)
        # Only stop when all sequences in the batch are complete.
        return all(finished)

# --- Helper function to fix invalid JSON values ---
def fix_invalid_json(text):
    # Replace standalone NaN with null.
    text = re.sub(r'\bNaN\b', 'null', text)
    # Replace True/False (outside of quotes) with true/false.
    text = re.sub(r'(?<!")\bTrue\b(?!")', 'true', text)
    text = re.sub(r'(?<!")\bFalse\b(?!")', 'false', text)
    return text

# --- Load model and tokenizer ---
def load_model_and_tokenizer(checkpoint_path, load_in_8bit=True, load_in_4bit=False):
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint_path,
        device_map='auto',
        quantization_config=bnb_config,
    )
    generator = pipeline(
        'text-generation',
        model=model,
        tokenizer=tokenizer,
        device_map='auto',
        return_full_text=False  # Only new tokens are returned.
    )
    return tokenizer, model, generator

# --- Construct chat prompt (exactly as before) ---
def construct_chat_prompt(row):
    # Use the "domain" field; it will be renamed later.
    domain = row["domain"] if "domain" in row else row["Domain"]
    first_text = row["Text1"]
    second_text = row["Text2"]
    system_message = (
        f"You are an expert in causality and {domain}. Your task is to help users model their domain knowledge by identifying if two texts describe the same causal variable. "
        "Texts that describe different values or the same value of a causal variable should be indicated."
    )
    user_message = (
        "Your task is to assess if the following two texts belong to the same causal variable. \n"
        "- If the two texts belong to the same causal variable, provide the variable name.\n"
        "- If the two texts are similar but do not belong to the same variable set the variable name to '', provide your explanation.\n"
        "Structure your answer as a JSON object including string 'Text1', string 'Text2', boolean 'Predicted Same Causal Variable', "
        "string 'Predicted Variable Name', and string 'Explanation'.\n\n"
        f"First text: ```{first_text}```\n"
        f"Second text: ```{second_text}```"
    )
    prompt = f"<|system|> {system_message}\n<|user|> {user_message}\n<|assistant|> "
    return prompt

# --- Process a batch of samples ---
def process_batch(batch):
    prompts = []
    # Construct prompts for each example in the batch.
    for i in range(len(batch["Text1"])):
        row = {
            "Text1": batch["Text1"][i],
            "Text2": batch["Text2"][i],
            "domain": batch["domain"][i] if "domain" in batch else batch["Domain"][i]
        }
        prompts.append(construct_chat_prompt(row))
    
    responses = generator(
        prompts,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
        temperature=0.0,
        num_return_sequences=1,
        stopping_criteria=StoppingCriteriaList([StopOnCompleteJSON(tokenizer)])
    )
    
    predicted_same_causal_list = []
    predicted_variable_name_list = []
    explanation_list = []
    
    for i, response in enumerate(responses):
        if isinstance(response, list) and response:
            generated_text = response[0].get('generated_text', '')
        elif isinstance(response, dict):
            generated_text = response.get('generated_text', '')
        elif isinstance(response, str):
            generated_text = response
        else:
            generated_text = ""
        
        prompt = prompts[i]
        answer_text = generated_text[len(prompt):].strip() if generated_text.startswith(prompt) else generated_text.strip()
        answer_text = fix_invalid_json(answer_text)
        try:
            parsed_answer = json.loads(answer_text)
            predicted_same_causal = parsed_answer.get("Predicted Same Causal Variable", "")
            predicted_variable_name = parsed_answer.get("Predicted Variable Name", "")
            explanation = parsed_answer.get("Explanation", "")
        except Exception as e:
            print(f"Failed to parse JSON for sample with Text1: {batch['Text1'][i]} and Text2: {batch['Text2'][i]}: {e}")
            predicted_same_causal = ""
            predicted_variable_name = ""
            explanation = answer_text
        
        predicted_same_causal_list.append(predicted_same_causal)
        predicted_variable_name_list.append(predicted_variable_name)
        explanation_list.append(explanation)
    
    batch["Predicted Same Causal Variable"] = predicted_same_causal_list
    batch["Predicted Variable Name"] = predicted_variable_name_list
    batch["Prediction Model"] = [prediction_model_name] * len(batch["Text1"])
    batch["Explanation"] = explanation_list
    return batch

# --- Main processing function ---
def main():
    global tokenizer, generator, prediction_model_name, MAX_NEW_TOKENS
    CHECKPOINT_PATH = "lora_70b_en_v0_alt_v2"  # UPDATE with your checkpoint path.
    INPUT_CSV = "sampled_data_set_small.csv"     # UPDATE with your evaluation CSV path.
    OUTPUT_CSV = "evaluation_results_alt.csv"
    BATCH_SIZE = 8  # Increase batch size for better GPU utilization.
    MAX_NEW_TOKENS = 200

    tokenizer, model, generator = load_model_and_tokenizer(
        CHECKPOINT_PATH, load_in_8bit=False, load_in_4bit=True
    )
    prediction_model_name = os.path.basename(CHECKPOINT_PATH)

    # Load CSV using Hugging Face Datasets.
    dataset = load_dataset("csv", data_files=INPUT_CSV)["train"]

    # Process the dataset in batches.
    dataset = dataset.map(process_batch, batched=True, batch_size=BATCH_SIZE)

    # Convert to pandas DataFrame.
    df = dataset.to_pandas()

    # Rename columns to match the required output.
    df.rename(columns={
        "Same Causal Variable": "Generated Same Causal Variable",
        "Variable Name": "Generated Variable Name",
        "model Name": "Data Generation Model",
        "domain": "Domain"
    }, inplace=True)
    df["Prediction Model"] = prediction_model_name

    # Reorder columns exactly as specified.
    ordered_columns = [
        "Text1",
        "Text2",
        "Generated Same Causal Variable",
        "Predicted Same Causal Variable",
        "Generated Variable Name",
        "Predicted Variable Name",
        "Data Generation Model",
        "Prediction Model",
        "Domain",
        "Explanation"
    ]
    df = df[ordered_columns]

    # Save the results.
    df.to_csv(OUTPUT_CSV, index=False, sep=",", encoding="utf-8")
    print(f"Evaluation results saved to {OUTPUT_CSV}")

    # Cleanup.
    del model, tokenizer, generator
    torch.cuda.empty_cache()
    gc.collect()

if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Device set to use cuda:0


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]



Evaluation results saved to evaluation_results_alt.csv


just some code to get IAA

In [4]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import os
os.chdir("/home/feline/Evaluating-Large-Language-Models-for-Causal-Modeling/finetuning")

def compute_kappa_by_domain(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Convert the columns to booleans (in case they are read as strings)
    df['Generated Same Causal Variable'] = df['Generated Same Causal Variable']\
        .astype(str).str.strip().str.lower() == 'true'
    df['Predicted Same Causal Variable'] = df['Predicted Same Causal Variable']\
        .astype(str).str.strip().str.lower() == 'true'
    
    # Compute Cohen's kappa for each Domain
    kappa_results = []
    for domain, group in df.groupby('Domain'):
        kappa = cohen_kappa_score(group['Generated Same Causal Variable'],
                                  group['Predicted Same Causal Variable'])
        kappa_results.append({'Domain': domain, 'Kappa': kappa})
    
    return pd.DataFrame(kappa_results)

def main():
    # Specify your CSV file paths
    file1_path = 'llama3-70b_model_prediction_large.csv'
    file2_path = 'llama3-70b_model_epoch4_v1_v2.csv'
    
    # Compute Kappa by Domain for each file
    kappa_file1 = compute_kappa_by_domain(file1_path)
    kappa_file2 = compute_kappa_by_domain(file2_path)
    
    print("Cohen's Kappa by Domain for File 1:")
    print(kappa_file1)
    print("\nCohen's Kappa by Domain for File 2:")
    print(kappa_file2)
    
    # Optionally, merge the results to compare which file has higher kappa per Domain
    comparison = pd.merge(kappa_file1, kappa_file2, on='Domain', suffixes=('_file1', '_file2'))
    comparison['better'] = comparison.apply(
        lambda row: 'file1' if row['Kappa_file1'] > row['Kappa_file2'] 
                    else ('file2' if row['Kappa_file1'] < row['Kappa_file2'] else 'equal'), axis=1)
    
    print("\nComparison of Cohen's Kappa by Domain:")
    print(comparison)

if __name__ == "__main__":
    main()


Cohen's Kappa by Domain for File 1:
                        Domain     Kappa
0             computer science  0.142857
1                      finance  0.457143
2                       health  0.585714
3                      physics  0.114286
4                   psychology  0.428571
5  semiconductor manufacturing  0.542857
6                    sociology  0.414286
7                urban studies  0.428571

Cohen's Kappa by Domain for File 2:
                        Domain     Kappa
0             computer science  0.957143
1                      finance  0.642857
2                       health  0.857143
3                      physics  0.914286
4                   psychology  0.700000
5  semiconductor manufacturing  0.785714
6                    sociology  0.871429
7                urban studies  0.728571

Comparison of Cohen's Kappa by Domain:
                        Domain  Kappa_file1  Kappa_file2 better
0             computer science     0.142857     0.957143  file2
1                    