In [1]:
import os
import requests
import json
import graphviz as gv
import networkx as nx
import pandas as pd
#import jsonlines

url = "http://localhost:11434/api/generate"

In [17]:
def llama3(prompt:str, sample:int = 0.3):
    
    data = {
        "model": "llama3.1:8b-instruct-q8_0",
        "prompt": prompt,
        "system": "You are a highly capable causal inference expert. Your primary task is to identify and analyze causal relationships between variables, focusing on direct cause-and-effect connections between those variables. Act as a sceptical causal reasoning agent, systematically solving causality problems using logical analysis and evidence-based reasoning",
        "stream": False,
        "format": "json",
        "options": {
            "top_p": 0 #sample # (0-1) 0 is no sampling // api default is 0.9 // testing seems good with 0.3
        }
    }

    response = requests.post(url, json=data)
    return(response.json()["response"])

### Helper-Functions

In [3]:
def connect(var:str, target_trees:list, leaves:bool, strict: bool = False):
    #tries to connect a var to a list of trees based on strictness

    #TODO handle same concept and direct relations better ->> can be done here because direction finder is here also
    
    #print(f"running connect with: -- var:{var} -- targets: {targets}")
    
    connections = []
    target0 = target_trees[0]["nodes"].copy()
    target1 = target_trees[1]["nodes"].copy()
    target1.remove(target_trees[0]["root"])
    
    #for efficency, by avoiding redundant leave connections in batch execution
    if not leaves:
        for leave in target_trees[0]["leaves"]:
            target0.remove(leave)
        for leave in target_trees[1]["leaves"]:
            target1.remove(leave)

    targets = [target0, target1]

    for i in range(0,2):
        if len(targets[i]) == 0:
            break
        #print(f"running connect with: -- var:{var} -- tree: {targets[i]}")
        response = connect_prompt(var, targets[i])

        for r in response:
            connection = {
                "variable": var,
                "target": r["list_variable"],
                "varTree": "",
                "tarTree": target_trees[i]["id"],
                "struct" : "",
                "edge": ()
            }

            if r["relationship"] == "Unrelated":
                connections = connections
            elif r["relationship"] == "Directly Related":
                if strict:
                    connections = connections
                else:
                    #connections.add((var, r["list_variable"]))
                    connections.append(connection)
            elif r["relationship"] == "Same Concept":
                    #connections.add((var, r["list_variable"]))
                    connections.append(connection)
            else:
                print("------------------wrong relationship in connect method--------------------", r["relationship"])

        #if len(connections) > 0:
            #break

    output = []
    for conn in connections:
        link = direction_finder(conn)
        if link['cause'] != "NONE" and link["effect"] != "NONE":
            conn["edge"] = (link['cause'], link['effect'])
            output.append(conn)

    #return is list of connection *structs* edge  like (var1,var2), (cause,efect),
    return output

In [None]:
def direction_finder(connection):
    #solves directionality question for connection
    #print(f"running direction_finder with: -- var1:{var[0]} -- var1:{var[1]}")
    
    var1 = connection[0]
    var2 = connection[1]

    #prompt = f"For two provided variables, that are causaly related, identify wich variable causes the other. You should tag the given variables accordingly with 'cause' or 'effect. The two provided Variables are: {[var1, var2]} . Format the output as JSON using the following template: {json_template}"
    #prompt = f"For the two provided variables, which are causally related, identify which variable is the cause and which is the effect. You must use the exact variables as provided without introducing or modifying any variables. Clearly tag one as 'cause' and the other as 'effect.'Ensure the output strictly adheres to the following JSON format: {json_template}. The two provided variables are: '{var1}' and '{var2}'. If you are uncertain, make the best determination based on logical reasoning but avoid altering the input or deviating from the JSON format."

    json_template = '{"cause": "var", "effect: "var"}'
    prompt_start = f'Given two provided variables that are causally related, identify which variable is the cause and which is the effect. You must use the exact variables as provided without shortening, summarizing or modifying any variables. Clearly label one variable as "cause" and the other as "effect" based strictly on causal reasoning. Dont change the formulation or spelling of the variables but ensure correct syntax'
    prompt_end = f'Ensure the output strictly adheres to the following JSON format: {json_template}. The first Variable is:"{var1}". The second Variable is:"{var2}".' #If the two variables are too loosly connected to form any kind auf cause and effect relationship, simply replace both variable names with 'NONE' in the output json."
    prompt = prompt_start + prompt_end

    response = llama3(prompt, sample=0)

    #print(prompt)

    with open("log_dump/direction.txt", "a") as f:
        f.write("\n--------------------------------------------------------------\n")
        tmp = f"var1: {var1} var2: {var2} = \n"
        f.write(tmp)
        f.write(response)

    rep = json.loads(response)

    if rep["cause"] == var1 and rep["effect"] == var2:
        out = "->"
    elif rep["effect"] == var1 and rep["cause"] == var2:
        out = "<-"
    else:
        if var1 != rep["cause"] and var1 != rep["effect"]:
            print("-----------------------------------var1 wrong")
            print(var1)
            print(rep["cause"])
            print(rep["effect"])
            print("-----------------------------------")
        elif var2 != rep["cause"] and var2 != rep["effect"]:
            print("var2 wrong", var2)
            print("-----------------------------------var2 wrong")
            print(var2)
            print(rep["cause"])
            print(rep["effect"])
            print("-----------------------------------")
        else:
            print("both wrong")
        out = ""


    return out

In [5]:
def safe(filename: str, input):
    #simple write for later use

    with open(filename + ".json", 'w') as file:
        file.write(json.dumps(input))

In [6]:
def load(path:str, old:bool = False):
    #simple load with backwards compatability

    if old == False:
        with open(path) as file:
            got = json.loads(file.read())
    else:
        log = []
        with open(path) as file:
            for line in file:
                tmp = line.replace("'", "\"")
                tmp2 = json.loads(tmp)
                log.append(tmp2)
            
        got = {
            "example": ["no", "no"],
            "final_graph": log
        }
    
    return got

### MAIN

### Control Area

In [73]:
# example Backlog

own_examples = [ 
    ["High Chocolate Consumption", "Nobel Laureate Density"],
    ["Season", "Rain"],
    ["Forest Fires", "Ice Cream Sales"],
    ["Bad Weather", "Movie Sales"],
    ["Mount Everest Climbs", "Electric Car Sales"],
    #--5--
    ["Solar Flares", "Recurring Nightmares"],       #unrelated?
    ["GMO use", "Pirate attacks"],                  #spurious correlations
    ["Solarpower generation", "Internet Acess"],    #spurious correlations
    ["Mount Everest climbs", "HotDogs consumed"],   #spurious correlations
    ["Rice Consumption", "Headaches"],              #spurious correlations
    #--10--
    ["Lost City Found", "Bird Flu Outbreak"],       #random words
    ["Meteor storm", "Australian Elections"],       #random words
    ["Bruising", "Videogame Highscore"],            #random words
    ["Person sleeps lot", "good grades"],
    ["alarm clock", "getting fired"],
    #--15--
    ["Gravity","Latitude"]
            ]

tubing = load("manual_safes/datasets/tubing_clean.json")
tubing_examples = []

for i in tubing:
    #print(tubing[i])
    var1 = tubing[i]["var1"].replace(","," -")      #replaces , for - because llm mistakes it as sepereate elements otherwise
    var2 = tubing[i]["var2"].replace(","," -")
    tubing_examples.append([var1, var2, i])
#pair 52 not workable: x and y are both 4-dimensional variables for day
#pair 53,54 vars are list of things

crab_load = load("manual_safes/datasets/crab/crab_clean.json")
crab_examples = []
for c in crab_load:
    var1 = c["event_a"]
    var2 = c["event_b"]
    crab_examples.append([var1, var2, c["pair_id"]])


print("amount own example", len(own_examples))
print("amount tubing example", len(tubing_examples))
print("amount crab example", len(crab_examples))

amount own example 16
amount tubing example 91
amount crab example 80


In [6]:
#generate clean crab dataset for testing ease
filepath = "manual_safes/datasets/crab/pairwise_causality.jsonl"
jsonObj = pd.read_json(path_or_buf=filepath, lines=True)

with open(filepath) as f:
    crab_raw = [json.loads(line) for line in f]

crab_data = []
for line in crab_raw:
    if line["score"] > 80:
        entry = {
            "topic_id": line["topic_id"],
            "event_a": line["event_a"],
            "event_order_a": line["event_order_a"],
            "event_b": line["event_b"],
            "event_order_b": line["event_order_b"],
            "score": line["score"],
            "class": line["class"],
        }
        crab_data.append(entry)

ordered_crab = []
for entry in crab_data:
    found = 0
    for o in ordered_crab:
        if entry["topic_id"] == o["id"]:
            o["entries"].append(entry)
            found = 1
    if found == 0:        
        ordered_crab.append({"id": entry["topic_id"], "entries": [entry]})

final_crab = []
for o in ordered_crab:
    tmp = [{"score":0},{"score":0},{"score":0},{"score":0}]

    for e in o["entries"]:
        for t in range(4):
            if e["score"] > tmp[t]["score"]:
                tmp[t] = e
                break
    
    for t in range(4):
        new_entry = {
            "pair_id": f"{tmp[t]['topic_id']}{t}".zfill(4),
            "event_a": tmp[t]["event_a"].rstrip(" ").rstrip(".").replace("\"", "'"),
            "event_b": tmp[t]["event_b"].rstrip(" ").rstrip(".").replace("\"", "'"),
            "direction": "->"
                     }
        
        if f"{tmp[t]['topic_id']}{t}".zfill(4) == "0352":
            new_entry = {"pair_id": "0352", "event_a": "The volumes declined as the prices rose", "event_b": "The trading range narrowed", "direction": "->"}

        final_crab.append(new_entry)


    #.append(tmp)


In [8]:
for i in ordered_crab:
    print(i)

NameError: name 'ordered_crab' is not defined

In [None]:
#safe the dataset
#safe("crab_clean", final_crab)

In [74]:
#parameters
run_automation = 1
examples = crab_load

chosen_example =  17     #to use without automation
run_from = 1          #to use with automation           (last tubing 1,108,5)
run_to =  80           #applied like ls[run_from-1:run_to] 
nth_element = 1        #runs only every Nth element from the above selected


In [75]:
#execute direct crab test
open("log_dump/direction.txt", "w").close()

if run_automation:
    example_sublist = examples[run_from-1:run_to] 
    example_sublist = example_sublist[0::nth_element]
    num_runs = len(example_sublist)
    run_id = 0

    num_correct = 0

    for example in example_sublist:
        start_var = example["event_a"]
        end_var = example["event_b"]
        dir = example["direction"]
        pid = example["pair_id"]

        direct = direction_finder([start_var, end_var])               #returns "->"

        #print(start_var, "---", end_var)
        if dir == direct:
            print(pid,"correct")
            num_correct += 1
        else:
            print(pid,"false", dir, direct)

        run_id += 1
        

    print("------------End of Execution------------")
    print(f"got correct: {num_correct}/{num_runs}")

0010 false -> <-
0011 correct
0012 false -> <-
0013 correct
0030 false -> <-
0031 correct
0032 false -> <-
0033 correct
0040 false -> <-
0041 correct
0042 false -> <-
0043 false -> <-
0050 correct
0051 false -> <-
0052 correct
0053 false -> <-
0060 false -> <-
0061 correct
0062 false -> <-
0063 correct
0070 correct
0071 false -> <-
0072 correct
0073 false -> <-
0080 false -> <-
0081 false -> <-
0082 false -> <-
0083 false -> <-
0090 correct
0091 false -> <-
0092 correct
0093 false -> <-
0110 correct
0111 correct
0112 correct
0113 correct
0130 correct
0131 correct
0132 correct
0133 false -> <-
0210 correct
0211 correct
0212 correct
0213 false -> <-
0220 correct
0221 correct
0222 correct
0223 correct
0270 correct
0271 correct
0272 correct
0273 correct
0350 false -> <-
0351 correct
0352 correct
0353 correct
0480 correct
0481 correct
0482 correct
0483 correct
0530 false -> <-
0531 false -> <-
0532 false -> <-
0533 correct
0590 correct
0591 false -> <-
0592 correct
0593 correct
0660 false -

In [72]:
if run_automation:
    example_sublist = examples[run_from-1:run_to] 
    example_sublist = example_sublist[0::nth_element]
    num_runs = len(example_sublist)
    run_id = 0

    num_correct = 0

    for example in example_sublist:
        start_var = example["event_a"]
        end_var = example["event_b"]
        dir = example["direction"]
        pid = example["pair_id"]


        if pid != "0352":
            continue

        print(start_var)
        print(end_var)

        start_var = "The volumes declined and the prices rose"


        direct = direction_finder([start_var, end_var])               #returns "->"

        #print(start_var, "---", end_var)
        if dir == direct:
            print(pid,"correct")
            num_correct += 1
        else:
            print(pid,"false", dir, direct)

        run_id += 1
        

    print("------------End of Execution------------")
    print(f"got correct: {num_correct}/{num_runs}")

The volumes declined as the prices rose
The trading range narrowed
0352 correct
------------End of Execution------------
got correct: 1/80
