# Similarity

In [37]:
###########
# Imports #
##############################################################################

import io
import base64

import numpy as np
import matplotlib.pyplot as plt
from numpy.lib.stride_tricks import sliding_window_view

##############################################################################


In [38]:
import os
import json

from rag.db.postgresql_db import PostgreSQLDB
from rag.ai.pattern_ai import PatternAI

from IPython.display import Image, display

import base64

# check_plot

In [49]:
# Query configuration shared by the cells of this section.
symbol = "ADAUSDT"             # value compared against the `symbol` column
table_name = "market_pattern"  # table every SQL statement reads from
time_column = "time"           # column interpolated into SELECT / WHERE
vector_column = "embedding"    # column compared with the `<=>` operator

top_k = 10                     # LIMIT of the neighbour query

In [50]:
# Open the database connection whose cursor the query cells below use.
db = PostgreSQLDB()
db.set_connection()

# Keyword arguments of the pattern model, gathered in one place.
pattern_ai_kwargs = {
    "symbol": symbol,
    "timeframe": "15m",
    "model": "MOCKUP",
    "vector_window": 60,
}
ai = PatternAI(**pattern_ai_kwargs)

2026-01-02 16:50:08,372 - PostgreSQLDB - INFO - Initialize database with db_name: prod || user: postgres


Creating UTC logger with level: 20
Creating UTC logger with level: 20
Creating UTC logger with level: INFO
Creating UTC logger with level: INFO


In [None]:
# Pick one stored embedding at random to act as the query pattern.
# Rows without an embedding are excluded so the neighbour query never
# receives a NULL vector.
# NOTE(review): ORDER BY random() makes this cell non-reproducible, and the
# pick is not restricted to `symbol` although the neighbour search is --
# confirm both are intended.
raw_sql = f"""
    SELECT {vector_column}
    FROM {table_name}
    WHERE {vector_column} IS NOT NULL
    order by random()
    limit 1
"""
db.cursor.execute(raw_sql)
row = db.cursor.fetchone()
if row is None:
    # No row came back: fail with a clear message instead of a TypeError
    # on row[0].
    raise RuntimeError(
        f"No row with a non-NULL {vector_column} found in {table_name}"
    )
target_vec = row[0]

# Values exposing .tolist() are converted to a plain Python list.
if hasattr(target_vec, 'tolist'):
    target_vec = target_vec.tolist()

In [52]:
# When the embedding is a string, decode it with json.loads; any other value
# is used unchanged.
actual_vector = json.loads(target_vec) if isinstance(target_vec, str) else target_vec
actual_vector

[0.6535628,
 -0.9280643,
 1.3115144,
 0.5197404,
 0.3876526,
 -0.39962593,
 -0.13725077,
 1.5654603,
 -0.66047615,
 -1.1855297,
 -0.13725077,
 -0.53098565,
 -1.3205262,
 -0.13725077,
 2.3571372,
 -1.0546283,
 0.38719392,
 0.7790596,
 -1.5780058,
 -0.26845747,
 0.25625464,
 -0.39954948,
 -0.13725077,
 0.51820964,
 -0.7927112,
 -0.26845747,
 3.2617545,
 2.2009602,
 0.7687832,
 1.0249766,
 -0.7825621,
 -0.13725077,
 0.3790723,
 0.3784807,
 -1.9449022,
 0.12143004,
 -1.1728672,
 -1.8252261,
 0.38280082,
 0.12254987,
 -0.3970514,
 -1.438508,
 -0.006955614,
 0.123226434,
 -0.39772797,
 1.5531628,
 0.64080405,
 1.0273155,
 -1.1722718,
 -0.2667959,
 -0.78553635,
 0.8995584,
 -0.78497654,
 0.25149655,
 -1.1746573,
 0.12232513,
 0.38145235,
 -0.2668705,
 0.63990843,
 -0.5256626]

In [53]:
# Neighbour search: rows of `symbol` before the cutoff timestamp, ordered by
# their `<=>` distance to the query vector, closest `top_k` kept.
# NOTE(review): `<=>` is presumably pgvector's cosine distance -- confirm.
query_neighbors = f"""
    SELECT 
        {time_column}, 
        next_return, 
        next_slope_3, 
        next_slope_5,
        {vector_column} <=> '{actual_vector}' as distance,
        {vector_column}
    FROM {table_name}
    WHERE symbol = '{symbol}'
    AND {vector_column} IS NOT NULL
    AND {time_column} < '2026-01-01 00:00:00'::timestamptz
    ORDER BY distance ASC
    LIMIT {top_k}
"""

db.cursor.execute(query_neighbors)
neighbors = db.cursor.fetchall()

# One dict per returned row, in query order. Label columns that come back as
# None are stored as 0.0; similarity_score is 1 - distance.
matched_data = [
    {
        "time": str(record[0]),
        "next_return": 0.0 if record[1] is None else float(record[1]),
        "next_slope_3": 0.0 if record[2] is None else float(record[2]),
        "next_slope_5": 0.0 if record[3] is None else float(record[3]),
        "distance": float(record[4]),
        "similarity_score": 1 - float(record[4]),
        "embedding": record[5],
    }
    for record in neighbors
]

In [44]:
def get_min_max_distinct(items, key='distance', skip_nearest=1):
    """Pick the closest and the farthest entry of ``items`` ranked by ``key``.

    The entries are sorted ascending by ``item[key]`` and the first
    ``skip_nearest`` entries of that ranking are ignored. The default of 1
    reproduces the previously hard-coded ``sorted_items[1]``
    (NOTE(review): presumably meant to drop the query's own match -- confirm).

    Args:
        items: list of dicts; each needs ``key`` and a ``'time'`` entry.
        key: name of the dict entry used for ranking and for the averaged
            value.
        skip_nearest: how many of the lowest-ranked entries to ignore.

    Returns:
        ``[]`` when ``items`` is empty or nothing is left after skipping.
        ``[item]`` when the closest and farthest remaining entries share
        the same ``'time'``.
        Otherwise ``[min_item, max_item, average]`` where ``average`` is
        the mean of ``abs(min_item[key])`` and ``abs(max_item[key])`` --
        only these two entries, not the whole list.
        Callers that unpack three values must handle the shorter results.

    Raises:
        ValueError: if ``skip_nearest`` is negative.
    """
    if skip_nearest < 0:
        raise ValueError("skip_nearest must be >= 0")
    if not items:
        return []

    # Rank by the metric, then drop the leading entries. Slicing (instead of
    # indexing position 1 directly) keeps short lists from raising
    # IndexError.
    candidates = sorted(items, key=lambda entry: entry[key])[skip_nearest:]
    if not candidates:
        return []

    min_item, max_item = candidates[0], candidates[-1]

    # 'time' serves as the identity of an entry: equal timestamps mean only
    # one distinct entry is available.
    if min_item['time'] == max_item['time']:
        return [min_item]

    extremes = [abs(min_item[key]), abs(max_item[key])]
    average_return = sum(extremes) / len(extremes)

    return [min_item, max_item, average_return]

# Run the selection on the fetched neighbours. The unpacking needs the
# three-element result; the function's shorter results would fail here.
selection = get_min_max_distinct(matched_data)
min_distance, max_distance, average_return = selection

In [45]:
# Echo both selected distances next to the value derived from them.
(
    min_distance['distance'],
    max_distance['distance'],
    average_return,
)

(0.47703997642768836, 0.5327028737971025, 0.5048714251123954)

In [46]:
# Plot the query vector against the `min_distance` match and save the PNG
# as check_outputs/min/<distance>.png.
image_b64 = ai.plot_patterns_to_base64(
    current_vec=target_vec,
    rag_matches=[min_distance]
)

min_dist = min_distance['distance']
image_data = base64.b64decode(image_b64)
min_path = os.path.join('check_outputs', 'min', f"{min_dist}.png")
# Create the target folder first; open() alone raises FileNotFoundError
# when it does not exist yet.
os.makedirs(os.path.dirname(min_path), exist_ok=True)
with open(min_path, "wb") as file:
    file.write(image_data)

In [47]:
# Plot the query vector against the `max_distance` match and save the PNG
# as check_outputs/max/<distance>.png.
image_b64 = ai.plot_patterns_to_base64(
    current_vec=target_vec,
    rag_matches=[max_distance]
)

max_dist = max_distance['distance']
image_data = base64.b64decode(image_b64)
max_path = os.path.join('check_outputs', 'max', f"{max_dist}.png")
# Create the target folder first; open() alone raises FileNotFoundError
# when it does not exist yet.
os.makedirs(os.path.dirname(max_path), exist_ok=True)
with open(max_path, "wb") as file:
    file.write(image_data)

In [48]:
# Plot the query vector against ALL fetched matches and save the PNG as
# check_outputs/average/<average_return>.png.
image_b64 = ai.plot_patterns_to_base64(
    current_vec=target_vec,
    rag_matches=matched_data
)

average_dist = average_return
image_data = base64.b64decode(image_b64)
average_path = os.path.join('check_outputs', 'average', f"{average_dist}.png")
# Create the target folder first; open() alone raises FileNotFoundError
# when it does not exist yet.
os.makedirs(os.path.dirname(average_path), exist_ok=True)
with open(average_path, "wb") as file:
    file.write(image_data)

# Check filter distance

In [857]:
# Result accumulators for this section; the prediction cell appends one
# entry to each list per run. Re-running this cell resets them.
is_correct_3, is_correct_5, avg_distance_lst = [], [], []

In [858]:
# Query configuration shared by the cells of this section.
symbol = "ADAUSDT"             # value compared against the `symbol` column
table_name = "market_pattern"  # table every SQL statement reads from
time_column = "time"           # column interpolated into SELECT / WHERE
vector_column = "embedding"    # column compared with the `<=>` operator

top_k = 10                     # LIMIT of the neighbour query


In [None]:
# NOTE(review): every run of this cell opens a new PostgreSQLDB connection;
# nothing visible here closes the previous one.
db = PostgreSQLDB()
db.set_connection()

ai = PatternAI(
    symbol=symbol,
    timeframe="15m",
    model="MOCKUP",
    vector_window=60,
)

# Timestamp of the stored pattern used as the query vector. Change this
# value to evaluate a different point in time.
target_time = '2026-01-04 02:45:00+07:00'

# NOTE(review): the lookup filters on time only, not on `symbol`; if several
# rows share the timestamp, ORDER BY random() picks an arbitrary one --
# confirm that is intended.
raw_sql = f"""
    SELECT {vector_column}
    FROM {table_name}
    where time = '{target_time}'
    order by random()
    limit 1
"""
db.cursor.execute(raw_sql)
row = db.cursor.fetchone()
if row is None or row[0] is None:
    # No row at that timestamp, or no embedding stored in it: stop here
    # instead of failing later with a TypeError or a broken neighbour query.
    raise RuntimeError(
        f"No {vector_column} stored in {table_name} at time {target_time}"
    )
target_vec = row[0]

# Values exposing .tolist() are converted to a plain Python list.
if hasattr(target_vec, 'tolist'):
    target_vec = target_vec.tolist()

# A string embedding is decoded with json.loads; anything else is used as is.
if isinstance(target_vec, str):
    actual_vector = json.loads(target_vec)
else:
    actual_vector = target_vec

2026-01-04 03:00:36,989 - PostgreSQLDB - INFO - Initialize database with db_name: prod || user: postgres


Creating UTC logger with level: 20
Creating UTC logger with level: 20
Creating UTC logger with level: INFO
Creating UTC logger with level: INFO


In [960]:
# Neighbour search: rows of `symbol` before the cutoff timestamp, ordered by
# their `<=>` distance to the query vector, closest `top_k` kept.
# NOTE(review): `<=>` is presumably pgvector's cosine distance -- confirm.
query_neighbors = f"""
    SELECT 
        {time_column}, 
        next_return, 
        next_slope_3, 
        next_slope_5,
        {vector_column} <=> '{actual_vector}' as distance,
        {vector_column}
    FROM {table_name}
    WHERE symbol = '{symbol}'
    AND {vector_column} IS NOT NULL
    AND {time_column} < '2026-01-01 00:00:00'::timestamptz
    ORDER BY distance ASC
    LIMIT {top_k}
"""

db.cursor.execute(query_neighbors)
neighbors = db.cursor.fetchall()

# One dict per returned row, in query order. Label columns that come back as
# None are stored as 0.0; similarity_score is 1 - distance.
matched_data = [
    {
        "time": str(record[0]),
        "next_return": 0.0 if record[1] is None else float(record[1]),
        "next_slope_3": 0.0 if record[2] is None else float(record[2]),
        "next_slope_5": 0.0 if record[3] is None else float(record[3]),
        "distance": float(record[4]),
        "similarity_score": 1 - float(record[4]),
        "embedding": record[5],
    }
    for record in neighbors
]

In [961]:
# Select the extreme neighbours, then echo the three results as cell output.
selection = get_min_max_distinct(matched_data)
min_distance, max_distance, average_distance = selection
(min_distance, max_distance, average_distance)

({'time': '2024-06-18 13:45:00+00:00',
  'next_return': 0.00976933514246945,
  'next_slope_3': 0.003224939532383786,
  'next_slope_5': -0.00171996775060468,
  'distance': 0.4692307841471971,
  'similarity_score': 0.5307692158528029,
  'embedding': '[0.0017431087,-0.08664275,0.045388903,0.06740042,0.023113178,-0.17661744,-0.5572805,0.64681864,-0.2676788,-1.5584463,-6.5835943,2.6290898,-0.10555186,1.394226,0.015508159,0.6053549,-0.07828348,0.27502188,0.55638605,0.25073126,0.15706047,0.32060015,0.20366475,-0.053120516,0.39039597,0.25011602,0.087104276,-0.28649354,0.41399348,-0.07649653,-0.21771608,0.063222066,0.32139203,0.27420002,0.32069877,0.040198874,0.3438284,-0.053120516,0.5068461,0.15693761,-0.052678708,-0.38081127,0.20399803,-0.12449168,-0.38491616,0.3933363,0.15723465,-0.03144681,-0.29232755,-0.1514715,0.13366595,0.65571916,-0.4836373,-0.22443613,-0.009980995,0.8020354,-0.128307,-0.29669437,0.2773517,-0.32214037]'},
 {'time': '2023-08-15 06:15:00+00:00',
  'next_return': 0.0010377

In [962]:
# Split the neighbours' forward slopes by sign for the vote in the next cell.
#
# The first match is left out through a NEW name. The previous
# `matched_data = matched_data[1:]` shortened the list again on every re-run
# of this cell, which then broke the length assertion that follows.
vote_pool = matched_data[1:]

# Strict comparisons: a slope of exactly 0.0 (including missing labels that
# were stored as 0.0) lands in neither list.
minus_slope_3 = [m['next_slope_3'] for m in vote_pool if m['next_slope_3'] < 0.0]
plus_slope_3 = [m['next_slope_3'] for m in vote_pool if m['next_slope_3'] > 0.0]

minus_slope_5 = [m['next_slope_5'] for m in vote_pool if m['next_slope_5'] < 0.0]
plus_slope_5 = [m['next_slope_5'] for m in vote_pool if m['next_slope_5'] > 0.0]

# NOTE(review): the "actual" slopes come from the first remaining neighbour,
# which is itself part of the vote lists above -- confirm this is the
# intended ground truth.
actual_slope_3 = vote_pool[0]['next_slope_3']
actual_slope_5 = vote_pool[0]['next_slope_5']

minus_slope_3, minus_slope_5 , plus_slope_3, plus_slope_5

([-0.0007784120394396016, -0.0013659128851115675, -0.0033167495854063605],
 [-0.00171996775060468, -0.0020665901262916318],
 [0.003224939532383786,
  0.002579535683576969,
  0.0011829652996845216,
  0.00029137529137526026,
  0.0033302497687326977,
  0.0006910850034554449],
 [0.00010378827192526093,
  0.0005918955835483234,
  0.001741186586414456,
  0.0007097791798107131,
  0.0006118881118880931,
  0.0008510638297872428,
  6.910850034553495e-05])

In [963]:
# Sanity check: all neighbours but one must have landed in a sign bucket.
# A slope of exactly 0.0 is in neither bucket and makes this fail.
expected_votes = top_k - 1
assert len(minus_slope_3) + len(plus_slope_3) == expected_votes
assert len(minus_slope_5) + len(plus_slope_5) == expected_votes

In [964]:
# Majority vote per horizon: the prediction is "positive" only when there are
# strictly more positive than negative slopes (a tie counts as negative).
votes_up_3, votes_down_3 = len(plus_slope_3), len(minus_slope_3)
votes_up_5, votes_down_5 = len(plus_slope_5), len(minus_slope_5)
pred_3_is_positive = votes_up_3 > votes_down_3
pred_5_is_positive = votes_up_5 > votes_down_5

# Sign of the reference slopes (exactly 0 counts as not positive).
actual_3_is_positive = actual_slope_3 > 0
actual_5_is_positive = actual_slope_5 > 0

# A prediction is correct when its sign flag equals the reference flag.
# Each run of this cell appends one more entry to the accumulators.
is_correct_3.append(pred_3_is_positive == actual_3_is_positive)
is_correct_5.append(pred_5_is_positive == actual_5_is_positive)
avg_distance_lst.append(average_distance)

In [965]:
import pandas as pd

# One row per accumulated run: its distance value and both correctness flags.
results_by_column = {
    "distance_avg": avg_distance_lst,
    "is_correct_3": is_correct_3,
    "is_correct_5": is_correct_5,
}
df = pd.DataFrame(results_by_column)

In [966]:
# Accuracy is the mean of a boolean column, i.e. the share of True values.
correct_3 = df['is_correct_3']
correct_5 = df['is_correct_5']
accuracy_3 = correct_3.mean()
accuracy_5 = correct_5.mean()

print(f"Window 3 Accuracy: {accuracy_3:.2%}")
print(f"Window 5 Accuracy: {accuracy_5:.2%}")

Window 3 Accuracy: 81.82%
Window 5 Accuracy: 36.36%


In [967]:
# Compare the distance statistics of correct vs. incorrect predictions,
# keeping only the count / mean / std columns of describe().
summary_columns = ['count', 'mean', 'std']

print("\n--- Impact of Distance on Window 3 ---")
stats_3 = df.groupby('is_correct_3')['distance_avg'].describe()
print(stats_3[summary_columns])

print("\n--- Impact of Distance on Window 5 ---")
stats_5 = df.groupby('is_correct_5')['distance_avg'].describe()
print(stats_5[summary_columns])


--- Impact of Distance on Window 3 ---
              count      mean       std
is_correct_3                           
False           2.0  0.490728  0.010391
True            9.0  0.489115  0.008195

--- Impact of Distance on Window 5 ---
              count      mean       std
is_correct_5                           
False           7.0  0.490698  0.008956
True            4.0  0.487152  0.006739


In [968]:
# Split distance_avg into 5 equal-width intervals spanning its observed
# range (not fixed edges such as 0.0-0.2), to compare accuracy across
# distance levels.
df['distance_bucket'] = pd.cut(df['distance_avg'], bins=5)

# Accuracy per bucket. `observed=False` is spelled out because grouping on a
# categorical column without it emits a pandas FutureWarning; False keeps
# the existing behaviour of listing every bucket, including empty ones.
analysis = df.groupby('distance_bucket', observed=False)[['is_correct_3', 'is_correct_5']].mean()
print(analysis)

                 is_correct_3  is_correct_5
distance_bucket                            
(0.481, 0.486]            0.8           0.4
(0.486, 0.491]            1.0           0.0
(0.491, 0.496]            1.0           1.0
(0.496, 0.501]            0.0           0.0
(0.501, 0.507]            1.0           0.0


  analysis = df.groupby('distance_bucket')[['is_correct_3', 'is_correct_5']].mean()
