In [1]:
import pandas as pd
import numpy as np
import sqlite3

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors

In [2]:
conn = sqlite3.connect('database.db')

In [3]:
data_Parts = pd.read_sql("select Id, Name, HeadingId, CategoryId from Parts;", con=conn)
data_StructuresParts = pd.read_sql("select StructureId, PartId from StructuresParts;", con=conn)
data_Structures = pd.read_sql("select Id, StandardProjectId, TypeId from Structures;", con=conn)
data_StandardProjects = pd.read_sql("select Id, ImageIndex from StandardProjects;", con=conn)
data_Conductors = pd.read_sql("select PartId, TypeId, Diameter, CrossSection from Conductors;", con=conn)

In [4]:
df = data_Parts.merge(data_StructuresParts, left_on='Id', right_on='PartId', how='outer').drop('PartId', axis=1)

df = df.merge(data_Structures, left_on='StructureId', right_on='Id', how='outer').drop('Id_y', axis=1)

data_Conductors.rename(columns={'PartId': 'Id_x'}, inplace=True)

df = pd.concat([df, data_Conductors], axis=0)

df = df.merge(data_StandardProjects, left_on='StandardProjectId', right_on='Id', how='left').drop('Id', axis=1)

In [5]:
df

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex
0,___-12-2А G28,Зажим аппаратный,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3_(50-70 мм²)",132.0,support10,,,1.0
1,d10 (цинк),Сталь круглая оцинкованная по ГОСТ 9.307-89,10.0,10.0,"К10/0,38-1/0 (1хСК120-6)-3_(50-70 мм²)",132.0,support10,,,1.0
2,PA2870P,Зажим натяжной,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3_(50-70 мм²)",132.0,support10,,,1.0
3,PGA101,,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3_(50-70 мм²)",132.0,support10,,,1.0
4,PLDT2 R,Вязка односторонняя диэлектрическая,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3_(50-70 мм²)",132.0,support10,,,1.0
...,...,...,...,...,...,...,...,...,...,...
132075,ЦСБвШнг 3х95-6,,,,,,cabel10,,,
132076,Э-ДПС-06-08-Е4(3),,,,,,commCable,,,
132077,ЭВБВ 3х95-6,,,,,,cabel10,,,
132078,ЭСО-ДПС-06-20-Е4(тип3),,,,,,commCable,,,


In [6]:
df['StructureId'] = df['StructureId'].str.split('_').str[0]
df['ImageIndex'] = df['ImageIndex'] + 1
df

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex
0,___-12-2А G28,Зажим аппаратный,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3",132.0,support10,,,2.0
1,d10 (цинк),Сталь круглая оцинкованная по ГОСТ 9.307-89,10.0,10.0,"К10/0,38-1/0 (1хСК120-6)-3",132.0,support10,,,2.0
2,PA2870P,Зажим натяжной,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3",132.0,support10,,,2.0
3,PGA101,,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3",132.0,support10,,,2.0
4,PLDT2 R,Вязка односторонняя диэлектрическая,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3",132.0,support10,,,2.0
...,...,...,...,...,...,...,...,...,...,...
132075,ЦСБвШнг 3х95-6,,,,,,cabel10,,,
132076,Э-ДПС-06-08-Е4(3),,,,,,commCable,,,
132077,ЭВБВ 3х95-6,,,,,,cabel10,,,
132078,ЭСО-ДПС-06-20-Е4(тип3),,,,,,commCable,,,


In [7]:
encoder = LabelEncoder()
df['StructureId'] = encoder.fit_transform(df['StructureId'])
df['TypeId'] = encoder.fit_transform(df['TypeId'])
df = df.fillna(0)
df = df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex
0,___-12-2А G28,Зажим аппаратный,9.0,58.0,745,132.0,15,0.0000,0.000000,2.0
1,d10 (цинк),Сталь круглая оцинкованная по ГОСТ 9.307-89,10.0,10.0,745,132.0,15,0.0000,0.000000,2.0
2,PA2870P,Зажим натяжной,9.0,58.0,745,132.0,15,0.0000,0.000000,2.0
3,PGA101,,9.0,58.0,745,132.0,15,0.0000,0.000000,2.0
4,PLDT2 R,Вязка односторонняя диэлектрическая,9.0,58.0,745,132.0,15,0.0000,0.000000,2.0
...,...,...,...,...,...,...,...,...,...,...
115768,ЦСБвШнг 3х95-6,0,0.0,0.0,2267,0.0,5,0.0000,0.000000,0.0
115769,Э-ДПС-06-08-Е4(3),0,0.0,0.0,2267,0.0,6,0.0000,0.000000,0.0
115770,ЭВБВ 3х95-6,0,0.0,0.0,2267,0.0,5,0.0000,0.000000,0.0
115771,ЭСО-ДПС-06-20-Е4(тип3),0,0.0,0.0,2267,0.0,6,0.0000,0.000000,0.0


**NearestNeighbors**

In [8]:
n_neighbors = 35
neigh = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')

In [9]:
neigh.fit(df.drop(columns=['Id_x', 'Name'], axis=1))

In [10]:
query = df.loc[np.random.randint(df.shape[0], size=7)]
query

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex
91164,CCD 9-62,Хомут,9.0,58.0,585,572.0,14,0.0,0.0,3.0
16933,409113,Автоматический выключатель DX³ 10000 - 16 кА -...,17.0,20190823.0,2267,0.0,18,0.0,0.0,0.0
89511,СВ105-5,"Стойка железобетонная вибрированная, ТУ 5863-0...",7.0,114.0,427,454.0,15,0.0,0.0,2.0
19382,4405012,"Розетка с крышкой ""Ванильная дымка"", 2P E, с з...",15.0,20190823.0,2267,0.0,18,0.0,0.0,0.0
7258,047205,Шкаф Altis сборный металлический - IP 55 - IK ...,15.0,20190823.0,2267,0.0,18,0.0,0.0,0.0
73512,НІК 2303І АРК1Т 1820,Счетчик электрической энергии тарифный,17.0,110.0,2267,0.0,18,0.0,0.0,0.0
28683,CJH33.4234RC,"соединительные муфты на пластик, броня, 3x185-...",4.0,78.0,2267,0.0,18,0.0,0.0,0.0


In [11]:
results = pd.DataFrame()
for i in range(query.shape[0]):
    answer = neigh.kneighbors(query.iloc[i].to_frame().T.drop(columns=['Id_x', 'Name']))
    result = df.loc[answer[1][0]]#[['Id_x', 'Name']]
    result['distance'] = answer[0][0]
    try:
        result = result.drop(index=query.iloc[i].name, axis=1)
    except:
        result = result
    result = result.drop_duplicates(subset=['Id_x'])
    result = result.iloc[:int(np.ceil(25 / query.shape[0]))]
    results = pd.concat([results, result], axis=0)
results = results.drop_duplicates(subset=['Id_x']).sort_values(by='distance').head(25).sort_values(by='Name')
results

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex,distance
33821,CONTB630.3,"Адаптер параллельного подключения к CONT630, 1...",4.0,78.0,2267,0.0,18,0.0,0.0,0.0,0.0
33819,CONTB630.1,"Адаптер параллельного подключения к CONT630, 2...",4.0,78.0,2267,0.0,18,0.0,0.0,0.0,0.0
91177,TND 241,Зажим для повторных заземлений PEN проводника,9.0,58.0,585,572.0,14,0.0,0.0,3.0,0.0
91175,GUKo2,Зажим натяжной магистральный,9.0,58.0,585,572.0,14,0.0,0.0,3.0,0.0
91176,PSP 122 TRA,Зажим поддерживающий магистральный,9.0,58.0,585,572.0,14,0.0,0.0,3.0,0.0
91167,PSP 120,Зажим поддерживающий магистральный,9.0,58.0,585,572.0,14,0.0,0.0,3.0,0.0
35734,DFT 400 FT,КРЫШКА T-ОБРАЗНОГО СОЕДИНЕНИЯ,15.0,20190823.0,2267,0.0,18,0.0,0.0,0.0,1.110223e-16
33917,COT1.2404L,Концевая муфта наружной установки для кабеля с...,4.0,78.0,2267,0.0,18,0.0,0.0,0.0,0.0
35744,DFTM 100 V2A,Крышка Т-образного ответвления Magic 100мм,15.0,20190823.0,2267,0.0,18,0.0,0.0,0.0,1.110223e-16
35763,DFTM 500 FS,Крышка Т-образного ответвления Magic 500мм,15.0,20190823.0,2267,0.0,18,0.0,0.0,0.0,1.110223e-16


In [None]:
import networkx as nx
from sklearn.neighbors import NearestNeighbors

In [None]:
G = nx.Graph()

In [None]:
G.add_node(data_parts.loc[0, 'Id_x'], features=[data_parts.loc[0, 'HeadingId'], data_parts.loc[0, 'CategoryId']])
G.add_node(data_parts.loc[1, 'Id'], features=[data_parts.loc[1, 'HeadingId'], data_parts.loc[1, 'CategoryId']])
G.add_node(data_parts.loc[2, 'Id'], features=[data_parts.loc[2, 'HeadingId'], data_parts.loc[2, 'CategoryId']])

In [None]:
G.add_edge(data_parts.loc[0, 'Id'], data_parts.loc[1, 'Id'])
G.add_edge(data_parts.loc[0, 'Id'], data_parts.loc[2, 'Id'])
G.add_edge(data_parts.loc[1, 'Id'], data_parts.loc[2, 'Id'])

In [None]:
features = [G.nodes[node]['features'] for node in G.nodes]
node_indices = list(G.nodes)

In [None]:
features

In [None]:
n_neighbors = 15
neigh = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
neigh.fit(features)

In [None]:
features_query = [data_parts.loc[0, 'HeadingId'] + data_parts.loc[2, 'HeadingId'],
                  data_parts.loc[0, 'CategoryId'] + data_parts.loc[2, 'CategoryId']]
distances, indices = neigh.kneighbors([features_query], n_neighbors=n_neighbors)

In [None]:
# Вывод ближайших соседей
print("Ближайшие соседи:")
for i in range(n_neighbors):
    print(f"Узел {node_indices[indices[0][i]]}, расстояние {distances[0][i]}")

In [17]:
data_Parts

Unnamed: 0,Id,HeadingId,CategoryId
0,___-12-2А G28,9,58
1,-,16,20180331
2,.,0,20190820
3,"""STR Jaryq"" 80W, 80 Вт, 10800 лм, 5000 К, IP67",17,20210505
4,"""Анкара."" 10А",16,20200709
...,...,...,...
80436,ЯУОТ-31-110-25-IP54-У1,0,999
80437,ЯУОТ-33-211-25-54-УХЛ3.1,17,20210505
80438,ЯУР-3А-4,17,20210505
80439,Ящик секціювання ________,14,999


In [5]:
import networkx as nx
from sklearn.neighbors import NearestNeighbors

In [6]:
G = nx.Graph()

In [7]:
G.add_node(data_parts.loc[0, 'Id'], features=[data_parts.loc[0, 'HeadingId'], data_parts.loc[0, 'CategoryId']])
G.add_node(data_parts.loc[1, 'Id'], features=[data_parts.loc[1, 'HeadingId'], data_parts.loc[1, 'CategoryId']])
G.add_node(data_parts.loc[2, 'Id'], features=[data_parts.loc[2, 'HeadingId'], data_parts.loc[2, 'CategoryId']])

In [8]:
G.add_edge(data_parts.loc[0, 'Id'], data_parts.loc[1, 'Id'])
G.add_edge(data_parts.loc[0, 'Id'], data_parts.loc[2, 'Id'])
G.add_edge(data_parts.loc[1, 'Id'], data_parts.loc[2, 'Id'])

In [9]:
features = [G.nodes[node]['features'] for node in G.nodes]
node_indices = list(G.nodes)

In [10]:
features

[[9, 58], [16, 20180331], [0, 20190820]]

In [11]:
n_neighbors = 1
neigh = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
neigh.fit(features)

In [12]:
features_query = [data_parts.loc[0, 'HeadingId'] + data_parts.loc[2, 'HeadingId'],
                  data_parts.loc[0, 'CategoryId'] + data_parts.loc[2, 'CategoryId']]
distances, indices = neigh.kneighbors([features_query], n_neighbors=n_neighbors)

In [13]:
# Вывод ближайших соседей
print("Ближайшие соседи:")
for i in range(n_neighbors):
    print(f"Узел {node_indices[indices[0][i]]}, расстояние {distances[0][i]}")

Ближайшие соседи:
Узел -, расстояние 6.0285110237146e-14
