In [1]:
import networkx as nx
import os
import numpy as np
from tqdm import tqdm_notebook
from io import StringIO
import pandas as pd
import json
import re
import torch
import plotly.graph_objects as go

In [2]:
def parser(path):
    with open(path) as file:
        data = json.load(file)

    df = pd.json_normalize(data, record_path=['data'])
    return data, df

path = 'problems_2023_01_30/problems MoonBoard 2016 .json'

data, df = parser(path)

In [3]:
grade_df = df['grade']

In [4]:
import networkx as nx
from itertools import combinations
import numpy as np
import pandas as pd

# Define the create_graph function
def create_graph(moves, max_distance):
    G = nx.Graph()

    pos = {}  # Positions dictionary for nodes
    nodes = []
    for move in moves:
        node = move['description']
        x = ord(node[0]) - ord('A') + 1
        y = 19 - int(node[1:])
        pos[node] = (x, y)
        nodes.append(node)
        G.add_node(node, isStart=move['isStart'], isEnd=move['isEnd'])
    
    # Calculate all pair distances
    distances = {node: {} for node in nodes}
    for n1, n2 in combinations(nodes, 2):
        dist = np.linalg.norm(np.array(pos[n1]) - np.array(pos[n2]))
        distances[n1][n2] = dist
        distances[n2][n1] = dist
    
    # Connect each node to its two closest nodes
    for node in nodes:
        sorted_neighbors = sorted(distances[node].items(), key=lambda item: item[1])
        closest_neighbors = [neighbor for neighbor, dist in sorted_neighbors[:2]]
        for neighbor in closest_neighbors:
            G.add_edge(node, neighbor, weight=distances[node][neighbor])
    
    # Optionally, add extra edges based on max_distance
    for (n1, n2) in combinations(nodes, 2):
        if n2 not in G[n1] and distances[n1][n2] <= max_distance:
            G.add_edge(n1, n2, weight=distances[n1][n2])
    
    return G, pos

# Assume df is your DataFrame containing the 'moves' column
# Define max_distance
max_distance = 5

# Iterate through each row in the DataFrame and create a graph for each set of moves
graphs_and_pos = []
for moves in df['moves']:
    graph, pos = create_graph(moves, max_distance)
    graphs_and_pos.append(graph)

In [5]:
graphs_and_pos

[<networkx.classes.graph.Graph at 0x165d0fb50>,
 <networkx.classes.graph.Graph at 0x165dbc510>,
 <networkx.classes.graph.Graph at 0x31b3a0b10>,
 <networkx.classes.graph.Graph at 0x31b3a1350>,
 <networkx.classes.graph.Graph at 0x31b3a2150>,
 <networkx.classes.graph.Graph at 0x31b3a24d0>,
 <networkx.classes.graph.Graph at 0x31b3a2e50>,
 <networkx.classes.graph.Graph at 0x31b3a3c50>,
 <networkx.classes.graph.Graph at 0x31b3c0050>,
 <networkx.classes.graph.Graph at 0x31b3a2a90>,
 <networkx.classes.graph.Graph at 0x31b3c1410>,
 <networkx.classes.graph.Graph at 0x31b3c1850>,
 <networkx.classes.graph.Graph at 0x31b3c20d0>,
 <networkx.classes.graph.Graph at 0x31b3c27d0>,
 <networkx.classes.graph.Graph at 0x31b3c2ed0>,
 <networkx.classes.graph.Graph at 0x31b3c36d0>,
 <networkx.classes.graph.Graph at 0x31b3c3cd0>,
 <networkx.classes.graph.Graph at 0x31b3ab890>,
 <networkx.classes.graph.Graph at 0x31b3ab950>,
 <networkx.classes.graph.Graph at 0x31b3ab790>,
 <networkx.classes.graph.Graph at 0x31b3

In [6]:
graph = graphs_and_pos[0]

In [7]:
for i in graph.nodes():
    print('target:', i, ",", 'context:', list(graph.neighbors(i)))

target: E6 , context: ['E8', 'C5']
target: C5 , context: ['E6', 'E8']
target: E8 , context: ['E6', 'C5', 'F11']
target: F11 , context: ['E8', 'C13', 'D15']
target: C13 , context: ['F11', 'D15', 'D18']
target: D15 , context: ['C13', 'D18', 'F11']
target: D18 , context: ['D15', 'C13']


In [8]:
graph.edges('G6')

EdgeDataView([])

In [9]:
import gensim
from gensim.models import Word2Vec

def train_directed_embedding(graph):
    sentences = []
    for graph in graphs_and_pos:
        for node in graph.nodes():
            if graph.degree(node) > 0:
                context = [node] + list(graph.neighbors(node))
                sentences.append(context)

    model = Word2Vec(sentences, vector_size=3, window=1, min_count=1, sg=1, workers=4, epochs=10, max_final_vocab=198)

    return model

model = train_directed_embedding(graph)

In [10]:
node = 'K5'
if node in model.wv:
    node_vector = model.wv[node]
    print("Embedding for node", node, ":", node_vector)
else:
    print("Node", node, "not found in the model.")

Embedding for node K5 : [-0.10749207  1.848286   -0.9963151 ]


Embedding for node K5 : [ 0.74462616 -1.9113648  -0.13629538]

In [11]:
vocabulary = list(model.wv.key_to_index.keys())  # Get all unique nodes/words in the model's dictionary


In [12]:
x = []
for node in vocabulary:
    x.append((node, model.wv[node]))

In [13]:
embeddings = x

In [15]:
avg_dif = pd.read_csv('avg_difficulty')

In [None]:
avg_diff = avg_dif

In [None]:
labels = [label for label, _ in embeddings]
x_coords = [coords[0] for _, coords in embeddings]
y_coords = [coords[1] for _, coords in embeddings]
z_coords = [coords[2] for _, coords in embeddings]

# Prepare the DataFrame for joining by ensuring matching cases
avg_diff['hold'] = avg_diff['hold'].str.upper()  # Ensure matching is case-insensitive

# Map grades to embeddings using the labels
grades = [avg_diff.loc[avg_diff['hold'] == label, 'grade'].values[0] if not avg_diff[avg_diff['hold'] == label].empty else float('nan') for label in labels]

# Create a Plotly figure
fig = go.Figure(data=[go.Scatter3d(
    x=x_coords,
    y=y_coords,
    z=z_coords,
    text=labels,
    mode='markers+text',  # Combine markers and text
    marker=dict(
        size=5,
        color=grades,  # Use node grades as the marker colors
        colorscale='Viridis',  # Use a color scale which is perceptually uniform
        opacity=0.8,
        colorbar=dict(title='Node Difficulty')
    ),
    textposition='top center'
)])

# Update layout for a better visualization
fig.update_layout(
    title='3D Visualization of Node Embeddings with Difficulty Scaling',
    scene=dict(
        xaxis_title='Dimension 1',
        yaxis_title='Dimension 2',
        zaxis_title='Dimension 3'
    ),
    margin=dict(l=0, r=0, b=0, t=0)  # Minimal margin for full use of space
)

# Show the figure
fig.show()

In [None]:
grade_mapping = {
    '6B+': 0, '6C': 1, '6C+': 2, '7A': 3, '7A+': 4,
    '7B': 5, '7B+': 6, '7C': 7, '7C+': 8, '8A': 9,
    '8A+': 10, '8B': 11, '8B+': 12
}

In [None]:
def encode_grades(df, grade_mapping):
    df['grade_encoded'] = df['grade'].apply(lambda x: grade_mapping.get(x, -1))
    return df

df_encoded = encode_grades(df, grade_mapping)

In [None]:
df_encoded['moves_length'] = df_encoded['moves'].apply(lambda x: len(x))

In [None]:
df_encoded = df_encoded[['moves', 'moves_length', 'grade_encoded']]

In [None]:
df_encoded['moves'][0]

[{'problemId': 19215, 'description': 'E6', 'isStart': True, 'isEnd': False},
 {'problemId': 19215, 'description': 'C5', 'isStart': True, 'isEnd': False},
 {'problemId': 19215, 'description': 'E8', 'isStart': False, 'isEnd': False},
 {'problemId': 19215, 'description': 'F11', 'isStart': False, 'isEnd': False},
 {'problemId': 19215, 'description': 'C13', 'isStart': False, 'isEnd': False},
 {'problemId': 19215, 'description': 'D15', 'isStart': False, 'isEnd': False},
 {'problemId': 19215, 'description': 'D18', 'isStart': False, 'isEnd': True}]

In [None]:
embeddings_dict = {desc: np.array(vec) for desc, vec in embeddings}


In [None]:
def aggregate_embeddings(moves_list, embeddings, embedding_dim=3):
    embeddings_list = [embeddings.get(move['description'], None) for move in moves_list if move['description'] in embeddings]
    embeddings_list = [emb for emb in embeddings_list if emb is not None]  # Filter out None values
    
    if embeddings_list:
        # Calculate mean embedding for the observation
        aggregated_embedding = np.mean(np.array(embeddings_list), axis=0)
        return aggregated_embedding
    else:
        # Return a zero vector if no valid embeddings are found
        return np.zeros(embedding_dim)
    


In [None]:
df_encoded['aggregated_embeddings'] = df_encoded['moves'].apply(lambda x: aggregate_embeddings(x, embeddings_dict, 100))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
df_encoded = df_encoded[['aggregated_embeddings', 'moves_length', 'grade_encoded']]

In [None]:
final_df = df_encoded.dropna()

In [None]:
final_df

Unnamed: 0,aggregated_embeddings,moves_length,grade_encoded
0,"[-1.0153769, 0.29040357, 0.16770588]",7,0
1,"[-0.118511915, 0.9861211, 0.20696987]",7,0
2,"[-0.42990097, 0.7141531, 0.12121254]",7,0
3,"[-0.69993865, 0.61210394, -0.005958572]",8,3
4,"[-0.8657431, 0.40325475, 0.28957334]",8,3
...,...,...,...
59501,"[-0.78409356, 0.71407926, -0.12703161]",10,0
59502,"[-0.41040096, 0.9432128, -0.20007917]",10,0
59503,"[-0.3484845, 1.0457246, -0.24870378]",11,0
59504,"[-0.4694314, 0.8197418, -0.076410204]",8,0


In [None]:
lim = len(final_df['aggregated_embeddings'][0])

for i in range(lim):
    final_df[f'emb_dim{i+1}'] = None

# Populate the embedding dimensions from the 'aggregated_embeddings' column
for i in range(lim):
    final_df[f'emb_dim{i+1}'] = final_df['aggregated_embeddings'].apply(lambda x: x[i])


In [None]:
final_df.drop(['aggregated_embeddings'], axis=1, inplace=True)

In [None]:
final_df.to_csv('embeddings_1')

In [None]:
final_df

Unnamed: 0,moves_length,grade_encoded,emb_dim1,emb_dim2,emb_dim3
0,7,0,-1.015377,0.290404,0.167706
1,7,0,-0.118512,0.986121,0.206970
2,7,0,-0.429901,0.714153,0.121213
3,8,3,-0.699939,0.612104,-0.005959
4,8,3,-0.865743,0.403255,0.289573
...,...,...,...,...,...
59501,10,0,-0.784094,0.714079,-0.127032
59502,10,0,-0.410401,0.943213,-0.200079
59503,11,0,-0.348484,1.045725,-0.248704
59504,8,0,-0.469431,0.819742,-0.076410


# Check embeddings with RF model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
X = final_df[[f'emb_dim{i}' for i in range(1, lim+1)]]  # Assuming you have columns named emb_dim1 to emb_dim100
y = final_df['grade_encoded']  

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
classifier = RandomForestClassifier(n_estimators=100, random_state=42)


In [None]:
classifier.fit(X_train, y_train)

In [None]:
predictions = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy of the RandomForest model: {accuracy:.2%}")

Accuracy of the RandomForest model: 30.34%


# Grid Search for optimal embedding params

In [None]:
dist_list = [15, 20, 25, 30, 35, 40]
emb_list = [50, 100, 200]
window_list = [1,2]

In [None]:
for dist in dist_list:
    for emb in emb_list:
        for w in window_list:
            max_distance = dist
            graphs_and_pos = [create_graph(moves, max_distance) for moves in df['moves']]
            graphs = [graph for graph, _ in graphs_and_pos]
            #model = Word2Vec(sentences, vector_size=3, window=1, min_count=1, sg=1, workers=4, epochs=10, max_final_vocab=198)

            model = train_embedding(graphs, v_size=emb, window=w)

            df_encoded['aggregated_embeddings'] = df_encoded['moves'].apply(lambda x: aggregate_embeddings(x, model.wv))
            X = pd.DataFrame(df_encoded['aggregated_embeddings'].tolist())
            y = df_encoded['grade_encoded']

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            classifier = RandomForestClassifier(n_estimators=100, random_state=42)
            classifier.fit(X_train, y_train)

            predictions = classifier.predict(X_test)
            accuracy = accuracy_score(y_test, predictions)
            print(f"Distance: {dist}, Emb: {emb}, Window: {w}, Accuracy: {accuracy:.2%}")