In [1]:
import pandas as pd
import numpy as np
import sqlite3

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [2]:
# Open the SQLite database that holds the parts catalogue.
# The relative path is resolved against the notebook's working directory.
DB_PATH = '../database.db'
conn = sqlite3.connect(DB_PATH)

In [3]:
# One label encoder per string column (StructureId, TypeId) and one scaler for the
# whole feature matrix; they are fitted further down and reused when querying.
encoder_struct, encoder_type = LabelEncoder(), LabelEncoder()
scaler = MinMaxScaler()

In [4]:
# Load only the columns that are used later from each table.

# Parts catalogue.
data_Parts = pd.read_sql(
    "select Id, Name, HeadingId, CategoryId from Parts;",
    con=conn,
)
# Link table between structures and parts.
data_StructuresParts = pd.read_sql(
    "select StructureId, PartId from StructuresParts;",
    con=conn,
)
# Structures, each pointing at a standard project and a type.
data_Structures = pd.read_sql(
    "select Id, StandardProjectId, TypeId from Structures;",
    con=conn,
)
# Standard projects with their image index.
data_StandardProjects = pd.read_sql(
    "select Id, ImageIndex from StandardProjects;",
    con=conn,
)
# Conductor-specific attributes keyed by part id.
data_Conductors = pd.read_sql(
    "select PartId, TypeId, Diameter, CrossSection from Conductors;",
    con=conn,
)

In [5]:
# Parts joined to the structure links; the outer join keeps unmatched rows from both sides.
df = data_Parts.merge(data_StructuresParts, how='outer', left_on='Id', right_on='PartId')
df = df.drop('PartId', axis=1)

# Attach structure attributes. Both frames carry an `Id` column, so pandas suffixes them:
# `Id_x` is the part id, `Id_y` is the structure's own id, which is dropped here.
df = df.merge(data_Structures, how='outer', left_on='StructureId', right_on='Id')
df = df.drop('Id_y', axis=1)

# Conductors are appended as additional rows (not joined); their key is renamed in place
# so that it lands in the same `Id_x` column as the part ids.
data_Conductors.rename(columns={'PartId': 'Id_x'}, inplace=True)
df = pd.concat([df, data_Conductors], axis=0)

# Bring in ImageIndex from the standard project and discard the duplicate key column.
df = df.merge(data_StandardProjects, how='left', left_on='StandardProjectId', right_on='Id')
df = df.drop('Id', axis=1)

In [6]:
# Spot-check five random rows of the merged frame (unseeded, so rows differ between runs).
df.sample(n=5)

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex
18208,417011,Контактор CTX³ Мини 3P 6A 1нз =24В,17.0,20190823.0,,,,,,
4404,024325,Osmoz рамка вставк. широкая белая без надписей,15.0,20190823.0,,,,,,
93577,Шайба 16 ГОСТ 11371-78,"Шайба, ГОСТ 11371-78",15.0,49.0,"А10/0,38-0/2(СК)_25-70 мм²",139.0,other,,,2.0
30926,CLP1M-B-8-20,Болт шестигранный М8х20,17.0,101.0,,,,,,
69702,К9РВВнг(А)-LS 1х10ок-1,Кабель силовой с изоляцией из этиленпропиленов...,3.0,50.0,,,,,,


In [7]:
# Keep only the part of StructureId that precedes the first underscore, and shift
# ImageIndex up by one.
# NOTE(review): the shift is applied again on every execution of this cell, so it must be
# run exactly once per kernel session. Presumably the +1 frees the value 0 for "missing"
# (the next cell fills NaN with 0) — confirm.
df = df.assign(
    StructureId=df['StructureId'].str.split('_').str[0],
    ImageIndex=df['ImageIndex'] + 1,
)
df.sample(5)

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex
87948,К-6,Колпачок ТУ-34-13-11232-87,9.0,14.0,ПВА10-1(2хСК120-6)-1,252.0,support10,,,2.0
65319,ZB3-8-Tr20-25,Зажим балочный 3-8мм под трубу 20 мм EKF PROxima,17.0,20190823.0,,,,,,
41694,HS-T2-11-827-E14,Лампа энергосберегающая HS-полуспираль 11W 270...,0.0,20190823.0,,,,,,
45433,LK2003HDZ,"Крышка на X-ответвитель, осн.200, R-300мм, гор...",15.0,20190823.0,,,,,,
87082,У 1,Кронштейн,8.0,58.0,КА10-1н(3хСВ105-5)-1,350.0,support10,,,2.0


In [8]:
text_cols = ['Name', 'StructureId', 'TypeId']
int_cols = ['HeadingId', 'CategoryId', 'StandardProjectId', 'ImageIndex']

# Missing text becomes an empty string, every other missing value becomes 0;
# exact duplicate rows are then removed and the index is rebuilt as 0..n-1.
df = (
    df.fillna({col: '' for col in text_cols})
      .fillna(0)
      .drop_duplicates()
      .reset_index(drop=True)
)

# With the NaNs gone the id-like columns can be stored as plain 32-bit integers.
df[int_cols] = df[int_cols].astype('int32')
df.sample(5)

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex
108240,Болт М24х220,"Болт М24х220, ГОСТ Р ИСО 4014-2013",15,49,А10-1Д,43,support10,0.0,0.0,2
37188,EPKT 2302-L12,Муфта концевая наружной установки для 3-х жиль...,4,20200613,,0,,0.0,0.0,0
75291,ПвВнг(А)-3х120-10,"Кабель силовой с медными жилами, с изоляцией и...",3,124,,0,,0.0,0.0,0
88554,GPE 7,Колпачок концевой,9,58,А2 (2хСВ95-2)-1с+ЛО.А+ВОЛЗ.А4,694,support,0.0,0.0,3
29308,CLM50D-KPS-41-41-02-HDZ,Подвес потолочный STRUT 41х41-200 HDZ IEK,17,101,,0,,0.0,0.0,0


**NearestNeighbors**

In [9]:
# Everything except the part identifier and its free-text name is used as a feature.
features = df.drop(columns=['Id_x', 'Name'])

# The two string columns are replaced by integer codes; the fitted encoders are kept
# so that query rows can be encoded the same way later.
for column, encoder in (('StructureId', encoder_struct), ('TypeId', encoder_type)):
    features[column] = encoder.fit_transform(features[column])

# Rescale every column to [0, 1]; from here on `features` is a NumPy array.
features = scaler.fit_transform(features)
features[:5]

array([[3.33333333e-01, 2.86840325e-06, 3.29069255e-01, 1.47157191e-01,
        8.88888889e-01, 0.00000000e+00, 0.00000000e+00, 5.00000000e-01],
       [3.70370370e-01, 4.94552284e-07, 3.29069255e-01, 1.47157191e-01,
        8.88888889e-01, 0.00000000e+00, 0.00000000e+00, 5.00000000e-01],
       [3.33333333e-01, 2.86840325e-06, 3.29069255e-01, 1.47157191e-01,
        8.88888889e-01, 0.00000000e+00, 0.00000000e+00, 5.00000000e-01],
       [3.33333333e-01, 2.86840325e-06, 3.29069255e-01, 1.47157191e-01,
        8.88888889e-01, 0.00000000e+00, 0.00000000e+00, 5.00000000e-01],
       [3.33333333e-01, 2.86840325e-06, 3.29069255e-01, 1.47157191e-01,
        8.88888889e-01, 0.00000000e+00, 0.00000000e+00, 5.00000000e-01]])

In [10]:
# Neighbour index over the scaled feature matrix using cosine distance;
# every query returns the `n_neighbors` closest rows.
n_neighbors = 100
model = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors).fit(features)

In [11]:
# All rows that belong to structure 'А11' form the query set for the recommendation step.
is_query_structure = df['StructureId'].eq('А11')
request = df[is_query_structure]
request

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex
99765,COT36.2,Бугель (скрепа) для ленты,9,28,А11,47,support,0.0,0.0,3
99766,COT37.2,"Лента бандажная стальная 20 мм x 0,70 мм x 50 ...",9,28,А11,47,support,0.0,0.0,3
99767,PER15,"Ремешок бандажный, L=300 мм, B=4,8 мм, D=80 мм",9,28,А11,47,support,0.0,0.0,3
99768,SH702R,Стяжка ж/б стойки типа СВ110,8,28,А11,47,support,0.0,0.0,3
99769,SL37.2,Зажим соединительный плашечный; магистраль: 6-...,9,28,А11,47,support,0.0,0.0,3
99770,SLIP22.1,Зажим влагозащищенный изолированный прокалываю...,9,28,А11,47,support,0.0,0.0,3
99771,SLIP22.127,Зажим влагозащищенный изолированный прокалываю...,9,28,А11,47,support,0.0,0.0,3
99772,SO250.01,Зажим натяжной клиновой для магистрали (50-70 ...,9,28,А11,47,support,0.0,0.0,3
99773,SOT29.10R,Крюк бандажный ø16 мм,9,72,А11,47,support,0.0,0.0,3
99774,ЗП6,Заземляющий проводник,8,25,А11,47,support,0.0,0.0,3


In [12]:
# Recommend parts for the structure in `request`: every requested part is looked up in the
# fitted neighbour index, and the closest parts that are not already part of the request
# (and were not already picked for an earlier requested part) are collected.
MAX_RESULTS = 25  # size of the final recommendation list

# Encode and scale all query rows in one pass, using the encoders/scaler fitted on `df`.
query = request.drop(columns=['Id_x', 'Name'])
query['StructureId'] = encoder_struct.transform(query['StructureId'])
query['TypeId'] = encoder_type.transform(query['TypeId'])

picked = []
if len(query):
    # Each requested part may contribute at most its share of the final list.
    per_part = int(np.ceil(MAX_RESULTS / len(query)))

    # One batched query: both arrays have shape (len(query), n_neighbors).
    distances, positions = model.kneighbors(scaler.transform(query))

    # Ids that must not be recommended: the request itself plus anything already picked.
    excluded_ids = set(request['Id_x'])
    for row_distances, row_positions in zip(distances, positions):
        # kneighbors returns row positions in the fitted matrix, hence iloc rather than loc.
        candidates = df.iloc[row_positions].assign(distance=row_distances)
        candidates = candidates[~candidates['Id_x'].isin(excluded_ids)]
        candidates = candidates.drop_duplicates(subset=['Id_x']).head(per_part)
        excluded_ids.update(candidates['Id_x'])
        picked.append(candidates)

if picked:
    results = pd.concat(picked, axis=0)
else:
    # Empty request: keep the expected columns so the sorting below still works.
    results = df.iloc[:0].assign(distance=np.nan)

# Keep the overall closest matches, then order them by name for display.
results = results.sort_values(by='distance').head(MAX_RESULTS).sort_values(by='Name')
results

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex,distance
87344,PA 1500,Анкерный клиновый зажим. Cечение жилы 50-70 мм2,9,95,А29,44,support,0.0,0.0,3,0.0002092735
106087,PD2.2,Гайка крюкообразная,9,72,А12,48,support,0.0,0.0,3,1.082371e-06
87343,KZP1,Зажим,9,26,А29,44,support,0.0,0.0,3,0.0002092735
94922,P 72,Зажим для подкл. абонента к изолир. магистраль...,9,95,А23,2,support,0.0,0.0,3,0.001084449
87346,ЗПВ,Зажим переходной прокалывающий,9,26,А29,44,support,0.0,0.0,3,0.0002092735
87349,ПС-1-1,Зажим плашечный,9,26,А29,44,support,0.0,0.0,3,0.0002092735
87372,ЗП1М,Заземляющий проводник,8,25,АО29,44,support,0.0,0.0,3,0.00145882
87345,ЗП2М,Заземляющий проводник,8,25,А29,44,support,0.0,0.0,3,0.0002128643
87342,KR 1,Кабельный ремешок,9,26,А29,44,support,0.0,0.0,3,0.0002092735
99781,PK99.2595,Колпачок защитный для провода 25-95(120) мм²,9,28,А11-к,47,support,0.0,0.0,3,7.049438e-08
