In [1]:
import sys
import os
import json
import pandas as pd


# The project root is assumed to be the parent of the current working directory
# (the directory containing 'src'); adjust this path if the notebook lives elsewhere.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Put the root at the front of the import path, but only once, so that
# re-running this cell does not keep adding duplicate entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [2]:
from src.core.data_access.dgraph_client import DgraphClient
# Shared client instance; the evaluation cells below call client.vector_search(...) on it.
client = DgraphClient()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the evaluation queries from target_queries.json in the current working directory.
# Later cells read each entry as q['query'] and q['expected_results']['contract_uids'].
target_queries_path = os.path.join(os.getcwd(), "target_queries.json")
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend on
# the platform's default locale encoding.
with open(target_queries_path, "r", encoding="utf-8") as f:
    target_queries = json.load(f)["queries"]

In [4]:
# Test 1
# Precision

# Steps
# 1. There is a query -> contracts mapping
# 2. Search the query using the Dgraph Client
# 3. Get the results
# 4. For every contract in the results, check:
#    - If the contract is in the query mapping, then it is True Positive
#    - If the contract is not in the query mapping, then it is False Positive
# 5. Calculate the precision

# Test functions
def get_contract_uids_from_results(results):
    """Collect the 'uid' value of every search result that has a 'uid' key.

    Results without a 'uid' key are skipped and the order of the remaining
    results is preserved. Adjust the key if the Dgraph result structure differs.
    """
    contract_uids = []
    for result in results:
        if 'uid' in result:
            contract_uids.append(result.get('uid'))
    return contract_uids

def calculate_precision(expected_uids, retrieved_uids):
    """Compute precision of the retrieved UIDs against the expected UIDs.

    Both inputs are treated as sets, so a UID that is retrieved more than once
    counts once in the numerator *and* in the denominator. This guarantees
    len(true_positives) + len(false_positives) == denominator.

    Args:
        expected_uids: iterable of UIDs considered relevant for the query.
        retrieved_uids: iterable of UIDs returned by the search.

    Returns:
        A tuple (precision, true_positives, false_positives) where precision is
        a float in [0, 1] and the other two items are sets of UIDs. When nothing
        was retrieved the result is (0.0, set(), set()).
    """
    retrieved = set(retrieved_uids)
    if not retrieved:
        return 0.0, set(), set()
    expected = set(expected_uids)
    true_positives = retrieved & expected
    false_positives = retrieved - expected
    # Divide by the number of distinct retrieved UIDs, matching the set-based numerator.
    precision = len(true_positives) / len(retrieved)
    return precision, true_positives, false_positives

# Test Execution

all_test_queries = target_queries

# One record per query; turned into a DataFrame after the loop.
results_list = []

for q in all_test_queries:
    query_text, expected_uids = q['query'], q['expected_results']['contract_uids']

    # Top-20 vector search for the query text.
    results = client.vector_search(query_text, limit=20)
    retrieved_uids = get_contract_uids_from_results(results)

    # Per-query trace: what was retrieved versus what was expected.
    print(
        f"Query: {query_text}",
        f"Retrieved UIDs: {retrieved_uids}",
        f"Expected UIDs: {expected_uids}",
        '-' * 40,
        sep="\n",
    )

    precision, tp, fp = calculate_precision(expected_uids, retrieved_uids)
    results_list.append(
        {
            "Query": query_text,
            "Expected UIDs": expected_uids,
            "Actual UIDs": retrieved_uids,
            "TP": list(tp),
            "FP": list(fp),
            "Precision": precision,
        }
    )

# Tabulate and show the per-query results.
df = pd.DataFrame(results_list)
display(df)

# Summary statistics over all queries.
precision_column = df["Precision"]
overall_precision = precision_column.mean()
print(f"Overall average precision: {overall_precision:.3f}")
print(f"Total queries: {len(df)}")
print(f"Queries with perfect precision: {(precision_column == 1.0).sum()}")
print(f"Queries with zero precision: {(precision_column == 0.0).sum()}")


2025-07-24 15:27:01,307 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-24 15:27:01,487 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 20 similar contracts for query: library contracts
Query: library contracts
Retrieved UIDs: ['0x192d5ee', '0x198f094', '0x19bfddc', '0x1a39ee7', '0x1a5258f', '0x1a6ac24', '0x1a8329a', '0x1ab4006', '0x1acc621', '0x1acc683', '0x1acc6a4', '0x1ae4d45', '0x1afd403', '0x1b15a19', '0x1b467d4', '0x1b5edc6', '0x1b8fbad', '0x1b8fbd1', '0x1ba8219', '0x1c2234f']
Expected UIDs: ['0x19bfddc', '0x198f094', '0x1acc6a4', '0x1b15a19', '0x1a6ac24', '0x1afd403', '0x1b467d4', '0x1ae4d45', '0x19a771d', '0x1a5258b', '0x1a5258f', '0x1ab4021', '0x1c09ce8']
----------------------------------------
2025-07-24 15:27:01,513 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-24 15:27:01,569 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 20 similar contracts for query: proxy cont

Unnamed: 0,Query,Expected UIDs,Actual UIDs,TP,FP,Precision
0,library contracts,"[0x19bfddc, 0x198f094, 0x1acc6a4, 0x1b15a19, 0...","[0x192d5ee, 0x198f094, 0x19bfddc, 0x1a39ee7, 0...","[0x19bfddc, 0x198f094, 0x1acc6a4, 0x1ae4d45, 0...","[0x1b5edc6, 0x192d5ee, 0x1a39ee7, 0x1b8fbad, 0...",0.45
1,proxy contracts,"[0x1a6ac23, 0x1ba824d, 0x19bfdca, 0x1a524b1, 0...","[0x192d5c5, 0x1945c70, 0x19bfdca, 0x19f0a36, 0...","[0x1ba824d, 0x1a524b1, 0x192d5c5, 0x19bfdca]","[0x1945c70, 0x1a39ee7, 0x1b8fbad, 0x19f0af3, 0...",0.2
2,token contracts erc-1155,"[0x1a2182e, 0x1b15a33, 0x1b2e136, 0x1b8fbd1, 0...","[0x195e313, 0x195e342, 0x1a09192, 0x1a217ff, 0...","[0x1a2182e, 0x1b2e136, 0x1a2182f, 0x1b8fbd1, 0...","[0x1b159fe, 0x1b5ede4, 0x1b4679e, 0x1b5ee4e, 0...",0.3
3,token contracts erc-20,"[0x1945c4a, 0x195e313, 0x195e358, 0x19769de, 0...","[0x192d5ee, 0x195e342, 0x19a7658, 0x19f0ac1, 0...","[0x1b159fe, 0x1ba8246, 0x1a09192, 0x195e342, 0...",[0x1b15a33],0.95
4,token contracts erc-721,"[0x1b15a5c, 0x192d5d5, 0x1945bf9, 0x1945c70, 0...","[0x1945c70, 0x195e342, 0x19769e1, 0x1a09192, 0...","[0x1b5ede4, 0x1b7747e, 0x1b15a4e, 0x1bc085e, 0...","[0x1a09192, 0x19769e1, 0x195e342, 0x1a2182f, 0...",0.6
5,defi,"[0x198f094, 0x1acc6a4, 0x1a6ac23, 0x1945c4a, 0...","[0x192d5ee, 0x1945c4a, 0x198f094, 0x19a7658, 0...","[0x198f094, 0x1945c4a, 0x1acc6a4, 0x1c22363, 0...","[0x192d5ee, 0x1a39ee7, 0x1a6ab53, 0x1b15a4e, 0...",0.25
6,identity verification,"[0x1a6ac24, 0x1afd403, 0x1b467d4]","[0x192d5d5, 0x195e342, 0x19a771d, 0x19bfddc, 0...","[0x1a6ac24, 0x1b467d4, 0x1afd403]","[0x19bfddc, 0x1a5258b, 0x1a39ee7, 0x1b5ee28, 0...",0.15
7,defi decentralized exchanges,"[0x1ba824d, 0x192d5ee, 0x19a7658, 0x19bfd16, 0...","[0x192d5ee, 0x1945c4a, 0x195e313, 0x198f094, 0...","[0x1ba824d, 0x192d5ee, 0x1b8fb3b, 0x1ba8246, 0...","[0x198f094, 0x1acc6a4, 0x1945c4a, 0x195e313, 0...",0.5
8,nft marketplaces collectibles,"[0x1ae4d45, 0x19bfdca, 0x1a524b1, 0x19f0af3, 0...","[0x192d5d5, 0x1945c70, 0x19bfdca, 0x19d83fe, 0...","[0x1a39ee7, 0x1b7747e, 0x19bfdca, 0x1bc085e, 0...",[0x1b15a5c],0.95
9,gaming,"[0x1a2182e, 0x1b15a33, 0x1b2e136, 0x1b8fbd1, 0...","[0x19bfd16, 0x19bfd53, 0x1a2182e, 0x1a21853, 0...","[0x1b15a5c, 0x1a2182e, 0x1b2e136, 0x1b15a33, 0...","[0x1c09ca1, 0x1b5ede4, 0x19bfd53, 0x1a39ee7, 0...",0.25


Overall average precision: 0.435
Total queries: 20
Queries with perfect precision: 0
Queries with zero precision: 1


In [28]:
# Test 2
# Semantic Expressiveness Score

# Steps
# Approach 1: Negative log-likelihood
# 1. Use a query to get the search results
# 2. For each result, calculate the negative log-likelihood of the query
# 3. Calculate the average of the negative log-likelihood
# 4. The higher the score, the more semantically expressive the description is

# Baseline
# Interpreting the Score (for a model like GPT-2)
# Low (Bad): < 3.5
# Medium (Okay): 3.5 - 5.0
# High (Good): > 5.0

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. Setup for Negative Log-Likelihood (NLL) Score
# Load the pretrained 'gpt2' tokenizer and language-model head that
# calculate_nll_score uses to score text.
nll_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
nll_model = GPT2LMHeadModel.from_pretrained('gpt2')
# Switch to evaluation mode: the model is only used for scoring here, not training.
nll_model.eval()

def calculate_nll_score(text: str) -> float:
    """Return the language-model loss (negative log-likelihood) of `text` under GPT-2.

    Higher = more information. The score is the `loss` reported by the model
    when the token ids are passed as both input and labels.

    Args:
        text: the text to score.

    Returns:
        The loss as a Python float, or 0.0 when `text` tokenizes to fewer than
        two tokens.
    """
    with torch.no_grad():
        tokenized_input = nll_tokenizer(text, return_tensors='pt')
        input_ids = tokenized_input.input_ids
        # A causal LM predicts token i+1 from tokens <= i, so at least two tokens
        # are needed to have any prediction target. With zero or one token the
        # loss is an average over nothing (NaN), which would poison downstream means.
        if input_ids.size(1) < 2:
            return 0.0
        outputs = nll_model(input_ids, labels=input_ids)
        return outputs.loss.item()

# 2. Setup for Semantic Distance Score
# Sentence-embedding model; calculate_distance_score calls dist_model.encode(...) on it.
dist_model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_distance_score(text: str, baseline_text: str) -> float:
    """Score how far `text` is from `baseline_text` in embedding space.

    Both strings are embedded together with `dist_model`; the result is
    (1 - cosine similarity) * 100, so a larger value means the text is
    semantically further from the baseline.
    """
    text_vec, baseline_vec = dist_model.encode([text, baseline_text])
    # cosine_similarity works on 2-D inputs, hence the (1, dim) reshape.
    similarity = cosine_similarity(
        text_vec.reshape(1, -1),
        baseline_vec.reshape(1, -1),
    )[0][0]
    return (1 - similarity) * 100

# 3. Generic baseline sentence that every query is measured against
baseline_query = "This is a query."

# 4. Score each query with both metrics (NLL first, then distance from the baseline)
expressiveness_results = []
for q in target_queries:
    query_text = q['query']
    expressiveness_results.append(
        {
            "Query": query_text,
            "NLL Score": calculate_nll_score(query_text),
            "Distance Score": calculate_distance_score(query_text, baseline_query),
        }
    )

# 5. Tabulate and show the per-query scores
df_express = pd.DataFrame(expressiveness_results)
display(df_express)

# Overall averages across all queries
mean_nll_score = df_express['NLL Score'].mean()
mean_distance_score = df_express['Distance Score'].mean()
print(f"Average NLL Score: {mean_nll_score:.4f}")
print(f"Average Distance Score: {mean_distance_score:.2f}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


: 

In [4]:
# Test 2
# Semantic Expressiveness Score

# Steps
# Approach 2: Semantic distance from generic baseline
# 1. Use a query to get the search results
# 2. For each result, calculate the semantic distance from the generic baseline
# 3. Calculate the average of the semantic distance
# 4. The higher the distance, the more semantically expressive the description is

# Baseline
# Interpreting the Score
# Low (Bad): < 30
# Medium (Okay): 30 - 60
# High (Good): > 60

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Load the sentence-embedding model; calculate_distance_score calls dist_model.encode(...) on it.
dist_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generic baseline sentence for each contract attribute that gets scored.
baselines = dict(
    description="This is a description.",
    functionality_classification="This is a classification.",
    application_domain="This is an application domain.",
    security_risks_description="This describes security risks.",
)

# Attributes to compare, in scoring order (adjust as needed). Every attribute
# listed here must have an entry in `baselines`.
compared_attributes = list(baselines)

def calculate_distance_score(text: str, baseline_text: str) -> float:
    """Score how far `text` is from `baseline_text` in embedding space.

    Both strings are embedded together with `dist_model`; the result is
    (1 - cosine similarity) * 100, so a larger value means the text is
    semantically further from the baseline.
    """
    text_vec, baseline_vec = dist_model.encode([text, baseline_text])
    # cosine_similarity works on 2-D inputs, hence the (1, dim) reshape.
    similarity = cosine_similarity(
        text_vec.reshape(1, -1),
        baseline_vec.reshape(1, -1),
    )[0][0]
    return (1 - similarity) * 100

# Prepare results: one record per query with its average semantic distance.
distance_results = []

for q in target_queries[:10]:
    query_text = q['query']
    # Get search results from Dgraph client
    results = client.vector_search(query_text, limit=10)
    per_result_averages = []
    for res in results:
        attr_distances = []
        for attr in compared_attributes:
            # The attribute may be stored under its bare name or under the
            # "ContractDeployment." prefix, depending on the result structure.
            value = res.get(attr) or res.get(f"ContractDeployment.{attr}")
            # Skip missing or blank attributes. Scoring an empty string against
            # the baseline would yield a large distance and wrongly inflate the
            # "expressiveness" of results that carry no text at all.
            if not value or not str(value).strip():
                continue
            attr_distances.append(calculate_distance_score(str(value), baselines[attr]))
        # Average distance for this result (only if it had any scorable attribute)
        if attr_distances:
            per_result_averages.append(np.mean(attr_distances))
    # Average distance for this query (across all results with scorable text)
    avg_distance = np.mean(per_result_averages) if per_result_averages else 0.0
    distance_results.append({
        "Query": query_text,
        "Average Semantic Distance": avg_distance
    })

# Create DataFrame
df_semantic = pd.DataFrame(distance_results)
display(df_semantic)

# Print overall average
print(f"Overall average semantic distance: {df_semantic['Average Semantic Distance'].mean():.2f}")


2025-07-23 23:50:00,058 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-23 23:50:00,244 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 10 similar contracts for query: library contracts
2025-07-23 23:50:03,449 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-23 23:50:03,556 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 10 similar contracts for query: proxy contracts
2025-07-23 23:50:04,986 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-23 23:50:05,072 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 10 similar contracts for query: token contracts erc-1155
2025-07-23 23:50:05,945 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-23 23:50:05,998 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 10 similar contracts for query: token contracts erc-20
2025-07-23 23:50:06,611 - Smar

Unnamed: 0,Query,Average Semantic Distance
0,library contracts,82.448452
1,proxy contracts,81.770174
2,token contracts erc-1155,81.432973
3,token contracts erc-20,82.692409
4,token contracts erc-721,81.096787
5,defi,82.010959
6,identity verification,81.874055
7,defi decentralized exchanges,82.007813
8,nft marketplaces collectibles,81.785843
9,gaming,82.216317


Overall average semantic distance: 81.93


In [None]:

# Test 3
# Semantic Similarity Score: Average Pairwise Cosine Distance

# Steps
# 1. Use a query to get the search results
# 2. Get the embeddings of the results
# 3. Calculate the average pairwise cosine distance between the embeddings: the mean of (1 - cos(e_i, e_j)) over all unique pairs i < j

# Baseline
# 0.0 - 0.3	Very High Similarity	Results are semantically very close, likely redundant or rephrasings.
# 0.3 - 0.6	Moderate Similarity	Results are on-topic but discuss different aspects or sub-topics.
# 0.6 - 0.8	Low Similarity	Results are loosely related or touch upon the topic from very different angles.
# > 0.8	Very Low / No Similarity	Results are likely off-topic, irrelevant, or contradictory.

from sklearn.metrics.pairwise import cosine_distances
import numpy as np
import pandas as pd

# One record per query with the average pairwise cosine distance of its results.
similarity_results = []

for q in target_queries:
    query_text = q['query']
    results = client.vector_search(query_text, limit=10)
    # Extract embeddings. (The former debug print of results[0].keys() was removed:
    # it raised IndexError whenever a search returned no results and flooded the output.)
    embeddings = []
    for res in results:
        # The vector may be stored under the bare or the prefixed key.
        emb = (
            res.get("embeddings")
            or res.get("ContractDeployment.embeddings")
        )
        # Keep only non-empty, purely numeric lists.
        if emb and isinstance(emb, list) and all(isinstance(x, (float, int)) for x in emb):
            embeddings.append(emb)
    # Calculate average pairwise cosine distance
    if len(embeddings) > 1:
        emb_matrix = np.array(embeddings)
        # Compute pairwise cosine distances (not similarities)
        dists = cosine_distances(emb_matrix)
        # Take the upper triangle (excluding diagonal) to get all unique pairs
        triu_indices = np.triu_indices_from(dists, k=1)
        avg_distance = dists[triu_indices].mean()
    else:
        avg_distance = np.nan  # Not enough embeddings to compare
    similarity_results.append({
        "Query": query_text,
        "Average Pairwise Cosine Distance": avg_distance
    })

# Create DataFrame
df_similarity = pd.DataFrame(similarity_results)
display(df_similarity)

# Print overall average (pandas' mean skips NaN entries by default)
overall_avg = df_similarity["Average Pairwise Cosine Distance"].mean()
print(f"Overall average pairwise cosine distance: {overall_avg:.4f}")

2025-07-23 23:50:18,561 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-23 23:50:18,653 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 10 similar contracts for query: library contracts
dict_keys(['uid', 'ContractDeployment.storage_protocol', 'ContractDeployment.storage_address', 'ContractDeployment.experimental', 'ContractDeployment.solc_version', 'ContractDeployment.verified_source', 'ContractDeployment.verified_source_code', 'ContractDeployment.name', 'ContractDeployment.description', 'ContractDeployment.functionality_classification', 'ContractDeployment.application_domain', 'ContractDeployment.security_risks_description', 'ContractDeployment.embeddings'])
2025-07-23 23:50:18,702 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-23 23:50:18,772 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 10 similar contracts for query: proxy contracts
dict_keys(['uid', 'ContractDeployment.st

Unnamed: 0,Query,Average Pairwise Cosine Distance
0,library contracts,0.189284
1,proxy contracts,0.179425
2,token contracts erc-1155,0.182775
3,token contracts erc-20,0.182536
4,token contracts erc-721,0.181076
5,defi,0.222002
6,identity verification,0.200918
7,defi decentralized exchanges,0.196053
8,nft marketplaces collectibles,0.173649
9,gaming,0.203347


Overall average pairwise cosine distance: 0.1888


In [6]:

# Test 4
# Jaccard Index on K-highest search results

# Steps
# 1. Define two queries with similar semantics
# 2. Get the search results for both queries
# 3. Calculate the Jaccard Index on the K-highest search results
# 4. The higher the score, the more consistent the search results are

# Baseline
# Excellent / Good: > 0.70
# Moderate / Needs Improvement: 0.40 - 0.70
# Poor / Bad: < 0.40

def get_top_k_uids(query, k=10):
    """Search for `query` with a limit of `k` and return the UIDs of the hits as a set.

    Hits whose 'uid' is missing or falsy are ignored. Adjust the key if the
    result structure differs.
    """
    hits = client.vector_search(query, limit=k)
    uids = set()
    for hit in hits:
        uid = hit.get('uid')
        if uid:
            uids.add(uid)
    return uids

def jaccard_index(set1, set2):
    """Return |set1 & set2| / |set1 | set2| for two sets.

    Two empty sets are defined to overlap perfectly (1.0).
    """
    if not (set1 or set2):
        return 1.0  # Both empty, define as perfect overlap
    union_size = len(set1 | set2)
    if union_size == 0:
        return 0.0
    return len(set1 & set2) / union_size

# Test cases: each name maps to a pair of queries expected to mean the same thing.
tests = {
    "defi_vs_decentralized_finance": ["defi", "decentralized finance"],
    "nft_vs_non_fungible_token": ["nft", "non fungible token"],
    "erc20_vs_token_contracts": ["erc20", "token contracts erc-20"],
    "dex_vs_decentralized_exchange": ["dex", "decentralized exchange"],
    "governance_vs_voting": ["governance", "voting"],
}

k = 10  # Top K results to compare
jaccard_results = []

for test_name, queries in tests.items():
    # A comparison needs two queries; skip incomplete test cases.
    if len(queries) >= 2:
        uids_1 = get_top_k_uids(queries[0], k)
        uids_2 = get_top_k_uids(queries[1], k)
        jaccard = jaccard_index(uids_1, uids_2)
        jaccard_results.append(
            {
                "Test": test_name,
                "Query 1": queries[0],
                "Query 2": queries[1],
                "Jaccard Index": jaccard,
                "Intersection": list(uids_1 & uids_2),
                "Union Size": len(uids_1 | uids_2),
            }
        )

df_jaccard = pd.DataFrame(jaccard_results)
display(df_jaccard)

# Overall average Jaccard index (only meaningful when at least one test ran)
if df_jaccard.empty:
    print("No Jaccard results to display.")
else:
    print(f"Average Jaccard Index: {df_jaccard['Jaccard Index'].mean():.3f}")

2025-07-23 23:58:51,433 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-23 23:58:51,639 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 10 similar contracts for query: defi
2025-07-23 23:58:51,712 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-23 23:58:51,816 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 10 similar contracts for query: decentralized finance
2025-07-23 23:58:51,862 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-23 23:58:51,958 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 10 similar contracts for query: nft
2025-07-23 23:58:51,990 - SmartContractDiscovery.DgraphClient - INFO - Query embedding length: 384
2025-07-23 23:58:52,112 - SmartContractDiscovery.DgraphClient - INFO - Vector search found 10 similar contracts for query: non fungible token
2025-07-23 23:58:52,171 - SmartContractDiscovery.DgraphClient 

Unnamed: 0,Test,Query 1,Query 2,Jaccard Index,Intersection,Union Size
0,defi_vs_decentralized_finance,defi,decentralized finance,0.052632,[0x192d5ee],19
1,nft_vs_non_fungible_token,nft,non fungible token,0.333333,"[0x1ab3fef, 0x1b7747e, 0x1ab4019, 0x1bd8f5f, 0...",15
2,erc20_vs_token_contracts,erc20,token contracts erc-20,0.428571,"[0x1a217ff, 0x195e342, 0x19f0ac1, 0x1b159fe, 0...",14
3,dex_vs_decentralized_exchange,dex,decentralized exchange,0.0,[],20
4,governance_vs_voting,governance,voting,0.176471,"[0x1ab4021, 0x1a5258f, 0x1b8fbd1]",17


Average Jaccard Index: 0.198


In [None]:
# Test 4 Approach 2
# Weighted Jaccard Index

# Steps
# 1. Use a query to get the search results
# 2. Get the embeddings of the results
# 3. Calculate the weighted Jaccard Index of the results
# 4. The higher the score, the more semantically similar the results are