# Similarity

In [37]:
###########
# Imports #
##############################################################################

import io
import base64

import numpy as np
import matplotlib.pyplot as plt
from numpy.lib.stride_tricks import sliding_window_view

##############################################################################


In [38]:
import os
import json

from rag.db.postgresql_db import PostgreSQLDB
from rag.ai.pattern_ai import PatternAI

from IPython.display import Image, display

import base64

# check_plot

In [49]:
symbol = "ADAUSDT"
time_column="time"
vector_column="embedding"
table_name="market_pattern"

top_k=10

In [50]:
db = PostgreSQLDB()
db.set_connection()

ai = PatternAI(
    symbol=symbol,
    timeframe="15m",
    model="MOCKUP",
    vector_window=60,
)

2026-01-02 16:50:08,372 - PostgreSQLDB - INFO - Initialize database with db_name: prod || user: postgres


Creating UTC logger with level: 20
Creating UTC logger with level: 20
Creating UTC logger with level: INFO
Creating UTC logger with level: INFO


In [51]:
raw_sql = f"""
    SELECT {vector_column} 
    FROM {table_name} 
    order by random()
    limit 1
"""
db.cursor.execute(raw_sql)
row = db.cursor.fetchone()
target_vec = row[0]

if hasattr(target_vec, 'tolist'):
    target_vec = target_vec.tolist()

In [52]:
if isinstance(target_vec, str):
    actual_vector = json.loads(target_vec)
else:
    actual_vector = target_vec
actual_vector

[0.6535628,
 -0.9280643,
 1.3115144,
 0.5197404,
 0.3876526,
 -0.39962593,
 -0.13725077,
 1.5654603,
 -0.66047615,
 -1.1855297,
 -0.13725077,
 -0.53098565,
 -1.3205262,
 -0.13725077,
 2.3571372,
 -1.0546283,
 0.38719392,
 0.7790596,
 -1.5780058,
 -0.26845747,
 0.25625464,
 -0.39954948,
 -0.13725077,
 0.51820964,
 -0.7927112,
 -0.26845747,
 3.2617545,
 2.2009602,
 0.7687832,
 1.0249766,
 -0.7825621,
 -0.13725077,
 0.3790723,
 0.3784807,
 -1.9449022,
 0.12143004,
 -1.1728672,
 -1.8252261,
 0.38280082,
 0.12254987,
 -0.3970514,
 -1.438508,
 -0.006955614,
 0.123226434,
 -0.39772797,
 1.5531628,
 0.64080405,
 1.0273155,
 -1.1722718,
 -0.2667959,
 -0.78553635,
 0.8995584,
 -0.78497654,
 0.25149655,
 -1.1746573,
 0.12232513,
 0.38145235,
 -0.2668705,
 0.63990843,
 -0.5256626]

In [53]:
query_neighbors = f"""
    SELECT 
        {time_column}, 
        next_return, 
        next_slope_3, 
        next_slope_5,
        {vector_column} <=> '{actual_vector}' as distance,
        {vector_column}
    FROM {table_name}
    WHERE symbol = '{symbol}'
    AND {vector_column} IS NOT NULL
    AND {time_column} < '2026-01-01 00:00:00'::timestamptz
    ORDER BY distance ASC
    LIMIT {top_k}
"""

db.cursor.execute(query_neighbors)

neighbors = db.cursor.fetchall()
matched_data = []
for n in neighbors:
    matched_data.append(
        {
            "time": str(n[0]),
            # Handle None values for recent data that might not have labels yet
            "next_return": float(n[1]) if n[1] is not None else 0.0,
            "next_slope_3": float(n[2]) if n[2] is not None else 0.0,
            "next_slope_5": float(n[3]) if n[3] is not None else 0.0,
            "distance": float(n[4]),
            "similarity_score": 1 - float(n[4]),
            "embedding": n[5]
        }
    )

In [44]:
def get_min_max_distinct(items, key='distance'):
    if not items:
        return []
    
    # 1. Sort the list by the chosen metric (e.g., distance)
    sorted_items = sorted(items, key=lambda x: x[key])
    
    # 2. Extract Min and Max
    min_item = sorted_items[1]
    max_item = sorted_items[-1]
    
    # 3. Ensure they are distinct (handle lists with 1 item or identical values)
    # We compare the 'time' or unique ID to ensure we don't return the same object twice if the list is short
    if min_item['time'] == max_item['time']:
        return [min_item]
    
    selected = {min_item['time']: min_item, max_item['time']: max_item}
    distinct_elements = list(selected.values())
    
    returns = [abs(item['distance']) for item in distinct_elements]
    average_return = sum((returns)) / len(returns)
    
    return [min_item, max_item, average_return]

# Execution
min_distance, max_distance, average_return = get_min_max_distinct(matched_data)

In [45]:
min_distance['distance'], max_distance['distance'], average_return

(0.47703997642768836, 0.5327028737971025, 0.5048714251123954)

In [46]:
image_b64 = ai.plot_patterns_to_base64(
    current_vec=target_vec,
    rag_matches=[min_distance]
)

min_dist = min_distance['distance']
image_data = base64.b64decode(image_b64)
min_path = os.path.join('check_outputs', 'min', f"{str(min_dist)}.png")
with open(min_path, "wb") as file:
    file.write(image_data)

In [47]:
image_b64 = ai.plot_patterns_to_base64(
    current_vec=target_vec,
    rag_matches=[max_distance]
)

max_dist = max_distance['distance']
image_data = base64.b64decode(image_b64)
max_path = os.path.join('check_outputs', 'max', f"{str(max_dist)}.png")
with open(max_path, "wb") as file:
    file.write(image_data)

In [48]:
image_b64 = ai.plot_patterns_to_base64(
    current_vec=target_vec,
    rag_matches=matched_data
)

average_dist = average_return
image_data = base64.b64decode(image_b64)
average_path = os.path.join('check_outputs', 'average', f"{str(average_dist)}.png")
with open(average_path, "wb") as file:
    file.write(image_data)

# Check filter distnace

In [216]:
is_correct_3 = []
is_correct_5 = []
avg_distance_lst = []

In [297]:
symbol = "ADAUSDT"
time_column="time"
vector_column="embedding"
table_name="market_pattern"

top_k=10

current_timestamp_tunc = "2025-11-20 05:30:00.000"

In [298]:
db = PostgreSQLDB()
db.set_connection()

ai = PatternAI(
    symbol=symbol,
    timeframe="15m",
    model="MOCKUP",
    vector_window=60,
)

raw_sql = f"""
    SELECT {vector_column} 
    FROM {table_name} 
    order by random()
    limit 1
"""
db.cursor.execute(raw_sql)
row = db.cursor.fetchone()
target_vec = row[0]

if hasattr(target_vec, 'tolist'):
    target_vec = target_vec.tolist()
    
if isinstance(target_vec, str):
    actual_vector = json.loads(target_vec)
else:
    actual_vector = target_vec

2026-01-03 05:49:06,632 - PostgreSQLDB - INFO - Initialize database with db_name: prod || user: postgres


Creating UTC logger with level: 20
Creating UTC logger with level: 20
Creating UTC logger with level: INFO
Creating UTC logger with level: INFO


In [299]:
query_neighbors = f"""
    SELECT 
        {time_column}, 
        next_return, 
        next_slope_3, 
        next_slope_5,
        {vector_column} <=> '{actual_vector}' as distance,
        {vector_column}
    FROM {table_name}
    WHERE symbol = '{symbol}'
    AND {vector_column} IS NOT NULL
    AND {time_column} < '2026-01-01 00:00:00'::timestamptz
    ORDER BY distance ASC
    LIMIT {top_k}
"""

db.cursor.execute(query_neighbors)

neighbors = db.cursor.fetchall()
matched_data = []
for n in neighbors:
    matched_data.append(
        {
            "time": str(n[0]),
            # Handle None values for recent data that might not have labels yet
            "next_return": float(n[1]) if n[1] is not None else 0.0,
            "next_slope_3": float(n[2]) if n[2] is not None else 0.0,
            "next_slope_5": float(n[3]) if n[3] is not None else 0.0,
            "distance": float(n[4]),
            "similarity_score": 1 - float(n[4]),
            "embedding": n[5]
        }
    )

In [300]:
min_distance, max_distance, average_distance = get_min_max_distinct(matched_data)
min_distance, max_distance, average_distance

({'time': '2023-03-27 12:00:00+00:00',
  'next_return': 0.004238485447866633,
  'next_slope_3': -0.0026730444569499335,
  'next_slope_5': -0.0013224535734383586,
  'distance': 0.4315322534320001,
  'similarity_score': 0.5684677465679999,
  'embedding': '[0.50431275,-0.14774773,0.8944211,1.9304249,0.2417064,-0.14690487,0.75922424,-0.5349706,0.37086755,0.111963235,-1.0531433,-0.14701419,0.75987923,-1.5732114,-0.7972701,-1.3201374,-1.1930192,-0.8028091,-1.4608198,-0.41183138,0.7711856,1.031946,0.11361795,0.89856386,-1.7198702,1.5543727,-0.540702,0.3751673,0.24413258,-1.4575388,0.3758373,-1.1980397,-0.8061704,0.2457003,-3.5816994,0.38001984,1.0406803,-1.0754402,0.7763929,-0.017379923,-0.14958048,0.5111948,-0.2815915,-1.2082105,0.37990564,1.4364102,0.5101359,-0.28106233,1.6938647,-0.14878874,-0.14882623,0.50818044,-0.411494,1.42606,-0.27948743,-0.5420427,0.11384178,1.0310506,0.7673804,1.678363]'},
 {'time': '2025-07-01 12:45:00+00:00',
  'next_return': -0.0048868778280544145,
  'next_slope_

In [301]:
minus_slope_3 = []
minus_slope_5 = []

plus_slope_3 = []
plus_slope_5 = []

matched_data = matched_data[1:]

for element in matched_data:
    
    # Slope 3
    if element['next_slope_3'] < 0.0:
        minus_slope_3.append(element['next_slope_3'])
    elif element['next_slope_3'] > 0.0:
        plus_slope_3.append(element['next_slope_3'])
        
    if element['next_slope_5'] < 0.0:
        minus_slope_5.append(element['next_slope_5'])
    elif element['next_slope_5'] > 0.0:
        plus_slope_5.append(element['next_slope_5'])
        
actual_slope_3 = matched_data[0]['next_slope_3']
actual_slope_5 = matched_data[0]['next_slope_5']
        
minus_slope_3, minus_slope_5 , plus_slope_3, plus_slope_5

([-0.0026730444569499335,
  -0.002277696793002867,
  -3.878959614448865e-19,
  -0.000687285223367622,
  -0.0007373422906766352,
  -0.0015460167333575146],
 [-0.0013224535734383586,
  -0.0012208454810495497,
  -0.0002915451895043613,
  -0.0004077669902912387,
  -0.00010795250089959208,
  -0.0005498281786941548,
  -0.0009011961330492935],
 [0.0031067961165048333, 0.00515251442704036, 0.0004498020870817348],
 [0.0030915086562242323, 0.000800291014914527])

In [302]:
assert len(minus_slope_3) + len(plus_slope_3) == top_k - 1
assert len(minus_slope_5) + len(plus_slope_5) == top_k - 1

In [303]:
# Determine Predictions (True if Positive, False if Negative)
pred_3_is_positive = len(plus_slope_3) > len(minus_slope_3)
pred_5_is_positive = len(plus_slope_5) > len(minus_slope_5)

# Determine Actuals
actual_3_is_positive = actual_slope_3 > 0
actual_5_is_positive = actual_slope_5 > 0

# Compare
# This returns True if both are True, or both are False.
is_correct_3.append(pred_3_is_positive == actual_3_is_positive)
is_correct_5.append(pred_5_is_positive == actual_5_is_positive)
avg_distance_lst.append(average_distance)

In [304]:
len(is_correct_3), len(is_correct_5), len(avg_distance_lst)

(9, 9, 9)

In [305]:
is_correct_3, is_correct_5, avg_distance_lst

([True, True, True, True, False, False, False, True, True],
 [True, True, True, True, True, False, True, True, True],
 [0.3399384301527475,
  0.47106368724630654,
  0.44980024998942597,
  0.35650426951901587,
  0.5109491076233104,
  0.47081963348474903,
  0.16583585371854498,
  0.4832909004546591,
  0.4790884642837866])

In [306]:
import pandas as pd

df = pd.DataFrame(
    data={
        "distance_avg": avg_distance_lst,
        "is_correct_3": is_correct_3,
        "is_correct_5": is_correct_5,
        
    }
)

In [307]:
df

Unnamed: 0,distance_avg,is_correct_3,is_correct_5
0,0.339938,True,True
1,0.471064,True,True
2,0.4498,True,True
3,0.356504,True,True
4,0.510949,False,True
5,0.47082,False,False
6,0.165836,False,True
7,0.483291,True,True
8,0.479088,True,True


In [308]:
# Calculate basic accuracy (percentage of True values)
accuracy_3 = df['is_correct_3'].mean()
accuracy_5 = df['is_correct_5'].mean()

print(f"Window 3 Accuracy: {accuracy_3:.2%}")
print(f"Window 5 Accuracy: {accuracy_5:.2%}")

Window 3 Accuracy: 66.67%
Window 5 Accuracy: 88.89%


In [309]:
# Group by correctness to see the average distance for Wins vs. Losses
print("\n--- Impact of Distance on Window 3 ---")
print(df.groupby('is_correct_3')['distance_avg'].describe()[['count', 'mean', 'std']])

print("\n--- Impact of Distance on Window 5 ---")
print(df.groupby('is_correct_5')['distance_avg'].describe()[['count', 'mean', 'std']])


--- Impact of Distance on Window 3 ---
              count      mean       std
is_correct_3                           
False           3.0  0.382535  0.188736
True            6.0  0.429948  0.064561

--- Impact of Distance on Window 5 ---
              count      mean       std
is_correct_5                           
False           1.0  0.470820       NaN
True            8.0  0.407059  0.115242


In [None]:
# Create "bins" for distance to see accuracy at different levels
# Example: 0.0-0.2, 0.2-0.4, etc.
df['distance_bucket'] = pd.cut(df['distance_avg'], bins=5)

# Check accuracy per bucket
analysis = df.groupby('distance_bucket')[['is_correct_3', 'is_correct_5']].mean()
print(analysis)