### Generate and save data

In [None]:
import json
import jsonlines
import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint shared by the tokenizer and the sequence-classification model.
MODEL_NAME = "microsoft/deberta-large-mnli"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Move the model to `device` rather than calling .cuda() unconditionally:
# .cuda() raises on machines without CUDA and ignored the device chosen above.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)


In [2]:
# Path of the input file; the loading cell opens it and parses lines with json.loads.
input_name = "test_5_legal_actions.jsonl"

In [3]:
# Read the input file and keep the JSON value of every line whose 0-based
# index is even; lines that fail to parse are reported and skipped.
# NOTE(review): odd-indexed lines are ignored by the index test below; what
# they contain is not visible from this notebook -- confirm this is intended.
with open(input_name, 'r', encoding='utf-8') as json_file:
    solutions = []
    for line_index, raw_line in enumerate(json_file):
        if line_index % 2 != 0:
            continue
        try:
            parsed_line = json.loads(raw_line.strip())
        except json.JSONDecodeError as e:
            # Report the 1-based line number so it matches what an editor shows.
            print(f"Error decoding JSON on line {line_index + 1}: {e}")
        else:
            solutions.append(parsed_line)


In [None]:
def deberta_pred(text_1, text_2, report=False):
    """Classify a pair of texts in both orders with the global NLI model.

    The two texts are joined as ``text_1 + ' [SEP] ' + text_2`` and as
    ``text_2 + ' [SEP] ' + text_1``; each joined string is encoded with the
    module-level ``tokenizer`` and scored by the module-level ``model``.

    Args:
        text_1: First text.
        text_2: Second text.
        report: Unused; kept so existing callers keep working.

    Returns:
        0 if the arg-max class of either order is index 0, otherwise 1.
        NOTE(review): index 0 is presumably the "contradiction" label of the
        checkpoint -- confirm against ``model.config.id2label``.
    """
    # Build inputs on whatever device the model lives on instead of a
    # hard-coded 'cuda', so the function also works on CPU-only machines.
    model_device = next(model.parameters()).device

    def _predicted_label(first, second):
        """Return the arg-max class index tensor for ``first [SEP] second``."""
        joined_text = first + ' [SEP] ' + second
        token_ids = tokenizer.encode(joined_text, padding=True)
        batch = torch.tensor([token_ids], device=model_device)
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            logits = model(batch)['logits']
        return torch.argmax(logits, dim=1)

    predicted_label = _predicted_label(text_1, text_2)
    reverse_predicted_label = _predicted_label(text_2, text_1)

    if 0 in predicted_label or 0 in reverse_predicted_label:
        return 0
    return 1

In [21]:
# Organize data
# Group the loaded records as question -> partial solution -> candidate actions,
# then flatten that mapping into the list-of-dicts layout stored in new_sim_data.
similarity_data = {}
next_question_id = 0
for solution_group in solutions:
    for sol in solution_group:
        question_entry = similarity_data.get(sol["question"])
        if question_entry is None:
            question_entry = {"question_id": next_question_id, "partial_sol": {}}
            similarity_data[sol["question"]] = question_entry
            next_question_id += 1

        # Keep the segment that follows the first assistant-turn marker.
        previous_steps = sol["input_prefix"].split("<|im_end|>\n<|im_start|>assistant\n")[1]

        # Sanity check: the number of "\n\n"-separated steps should be depth - 1.
        previous_step_list = previous_steps.split("\n\n")[:-1]
        if not len(previous_step_list)==(sol["depth"]-1):
            print("Error: number of steps not matching depth.")
            print(f"Step number: {len(previous_step_list)}")
            depth = sol["depth"]
            print(f"Depth: {depth-1}")

        # Strip the trailing step separator only when it is actually there;
        # slicing off two characters unconditionally would cut real text from
        # a prefix that does not end with "\n\n".
        if previous_steps.endswith("\n\n"):
            previous_steps = previous_steps[:-2]

        partial_sols = question_entry["partial_sol"]
        if previous_steps not in partial_sols:
            partial_sols[previous_steps] = {
                # A dict is used as an insertion-ordered set: actions are
                # de-duplicated but keep first-seen order, so the indices used
                # by cluster_list / adjacency_mat are reproducible across runs
                # (iterating a set of strings is not).
                "action_list": {},
                "cluster_list": [[]],  # place-holder
                "adjacency_mat": [[]],  # place-holder
            }

        action_text = sol["action"].replace("\n\n", "")
        partial_sols[previous_steps]["action_list"][action_text] = None

new_sim_data = []

for question_text, question_prop in similarity_data.items():
    partial_sol_list = [
        {
            "partial_sol_text": partial_sol_text,
            "action_list": list(partial_sol_dict["action_list"]),
            "cluster_list": partial_sol_dict["cluster_list"],
            "adjacency_mat": partial_sol_dict["adjacency_mat"],
        }
        for partial_sol_text, partial_sol_dict in question_prop["partial_sol"].items()
    ]
    new_sim_data.append({"question": question_text, "partial_sol": partial_sol_list})


# The intermediate mapping has been flattened into new_sim_data; reset it.
similarity_data = {}


In [22]:
# Compute similarities
# For every partial solution, compare each unordered pair of actions with
# deberta_pred, record the result in a symmetric 0/1 adjacency matrix, and merge
# the clusters of any pair for which deberta_pred returns 1.
for question_entry in new_sim_data:
    for partial_entry in question_entry["partial_sol"]:
        question = question_entry["question"]
        prefix = partial_entry["partial_sol_text"]
        context = f"{question}\n{prefix}"

        actions = partial_entry["action_list"]
        n_actions = len(actions)
        adjacency = [[0] * n_actions for _ in range(n_actions)]
        # Start with one singleton cluster per action index.
        clusters = [[index] for index in range(n_actions)]

        for first in range(n_actions):
            for second in range(first + 1, n_actions):
                text_first = f"{context}\n{actions[first]}"
                text_second = f"{context}\n{actions[second]}"

                if deberta_pred(text_first, text_second) != 1:
                    continue

                adjacency[first][second] = 1
                adjacency[second][first] = 1

                # Merge the cluster holding `second` into the one holding
                # `first`; the merged cluster keeps the position of the latter.
                owner_first = next(group for group in clusters if first in group)
                if second in owner_first:
                    continue
                owner_second = next(group for group in clusters if second in group)
                owner_first.extend(owner_second)
                clusters.remove(owner_second)

        for group in clusters:
            group.sort()
        partial_entry["adjacency_mat"] = adjacency
        partial_entry["cluster_list"] = clusters

*   Save data

In [23]:
# Write one record per question through the jsonlines writer.
output_name = "test_5_legal_actions_similarity.json"
with jsonlines.open(output_name, mode='w') as writer:
    for question_record in new_sim_data:
        writer.write(question_record)

### Load and inspect data

In [None]:
import json
import jsonlines

*   Load data

In [24]:
# Read the saved file back, parsing each line as its own JSON document.
input_name = "test_5_legal_actions_similarity.json"
with open(input_name, 'r', encoding='utf-8') as json_file:
    new_sim_data_2 = []
    for raw_line in json_file:
        new_sim_data_2.append(json.loads(raw_line))

*   Inspect data

```Python
new_sim_data = [{"question": str, 
                 "partial_sol": [{"partial_sol_text": str, 
                                  "action_list": [str],
                                  "cluster_list": [[int]],
                                  "adjacency_mat": [[int]]
                                }]
                }]
```

*   View partial solutions

In [28]:
idx = 0  # index of the question to display

# Print the selected question followed by its numbered partial solutions.
selected_question = new_sim_data_2[idx]
question = selected_question["question"]
print(f"Question: {question}\n\nPartial solutions:\n")
for position, partial_sol in enumerate(selected_question["partial_sol"]):
    text = partial_sol["partial_sol_text"]
    print(f"{position}. {text}\n")

Question: Suppose $\sin D = 0.7$ in the diagram below. What is $DE$? [asy]
pair D,E,F;
F = (0,0);
D = (sqrt(51),7);
E = (0,7);
draw(D--E--F--D);
draw(rightanglemark(D,E,F,15));
label("$D$",D,NE);
label("$E$",E,NW);
label("$F$",F,SW);
label("$7$",(E+F)/2,W);
[/asy]

Partial solutions:

0. 

1. To find the length of \(DE\) in the given right triangle \(DEF\), we start by identifying the given information and the relationships in the triangle. We know that \(\sin D = 0.7\) and that \(EF = 7\). In a right triangle, the sine of an angle is the ratio of the length of the opposite side to the length of the hypotenuse. Here, \(\sin D = \frac{EF}{DE}\).

2. To find the length of \(DE\) in the given right triangle \(DEF\), we start by identifying the given information and the relationships in the triangle.

3. To determine the length of \(DE\), we start by analyzing the given right triangle \(DEF\). We know that \(\sin D = 0.7\) and that \(EF = 7\). Since \(\sin D\) is the ratio of the opposite 

*   View actions and clusters

In [32]:
idx = 1  # index of the question to display
partial_sol_idx = 1  # index of the partial solution within that question

# Print the question, the chosen partial solution, its numbered actions,
# and the stored clustering of those actions.
selected_question = new_sim_data_2[idx]
selected_partial = selected_question["partial_sol"][partial_sol_idx]

question = selected_question["question"]
partial_sol_text = selected_partial["partial_sol_text"]
print(f"Question: {question}\n\nPartial solution: {partial_sol_text}\n\nActions:\n")
for position, action in enumerate(selected_partial["action_list"]):
    print(f"{position}. {action}\n")
cluster_list = selected_partial["cluster_list"]
print(f"Clustering: {cluster_list}")

Question: Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\theta),$ where $r > 0$ and $0 \le \theta < 2 \pi.$

Partial solution: To convert the point \((0,3)\) from rectangular coordinates to polar coordinates, we need to find the values of \(r\) and \(\theta\). The formulas for converting from rectangular coordinates \((x, y)\) to polar coordinates \((r, \theta)\) are:

Actions:

0. \[
r = \sqrt{x^2 + y^2}
\]

1. \[
r = \sqrt{x^2 + y^2}
\]
\[
\theta = \tan^{-1}\left(\frac{y}{x}\right)
\]

2. \[ r = \sqrt{x^2 + y^2} \]
\[ \theta = \tan^{-1}\left(\frac{y}{x}\right) \]

Clustering: [[0], [1], [2]]
