In [10]:
import joblib
import pandas as pd
import numpy as np
import sqlite3

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors

In [11]:
# Open the SQLite database that holds the source tables; the path is
# resolved relative to the notebook's working directory.
DB_PATH = '../database.db'
conn = sqlite3.connect(DB_PATH)

In [12]:
# Load each source table into its own DataFrame, selecting only the columns
# that the later cells use.
data_Parts = pd.read_sql(
    sql="select Id, Name, HeadingId, CategoryId from Parts;",
    con=conn,
)
data_StructuresParts = pd.read_sql(
    sql="select StructureId, PartId from StructuresParts;",
    con=conn,
)
data_Structures = pd.read_sql(
    sql="select Id, StandardProjectId, TypeId from Structures;",
    con=conn,
)
data_StandardProjects = pd.read_sql(
    sql="select Id, ImageIndex from StandardProjects;",
    con=conn,
)
data_Conductors = pd.read_sql(
    sql="select PartId, TypeId, Diameter, CrossSection from Conductors;",
    con=conn,
)

In [13]:
# Link parts to structures through the StructuresParts join table. Outer
# joins keep unmatched rows from both sides; the duplicated key columns
# (`PartId`, then the structures' `Id`, suffixed `Id_y`) are dropped.
df = (
    data_Parts
    .merge(data_StructuresParts, how='outer', left_on='Id', right_on='PartId')
    .drop(columns='PartId')
)
df = (
    df
    .merge(data_Structures, how='outer', left_on='StructureId', right_on='Id')
    .drop(columns='Id_y')
)

# Conductors are appended as additional rows (not joined), so their key is
# renamed to line up with the part identifier column `Id_x`.
data_Conductors = data_Conductors.rename(columns={'PartId': 'Id_x'})
df = pd.concat([df, data_Conductors])

# Attach ImageIndex via StandardProjectId; rows without a matching project
# keep a missing ImageIndex. The lookup key `Id` is dropped afterwards.
df = (
    df
    .merge(data_StandardProjects, how='left', left_on='StandardProjectId', right_on='Id')
    .drop(columns='Id')
)

In [14]:
# Inspect the combined frame before cleaning (the display is truncated to
# the first and last rows).
df

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex
0,___-12-2А G28,Зажим аппаратный,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3_(50-70 мм²)",132.0,support10,,,1.0
1,d10 (цинк),Сталь круглая оцинкованная по ГОСТ 9.307-89,10.0,10.0,"К10/0,38-1/0 (1хСК120-6)-3_(50-70 мм²)",132.0,support10,,,1.0
2,PA2870P,Зажим натяжной,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3_(50-70 мм²)",132.0,support10,,,1.0
3,PGA101,,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3_(50-70 мм²)",132.0,support10,,,1.0
4,PLDT2 R,Вязка односторонняя диэлектрическая,9.0,58.0,"К10/0,38-1/0 (1хСК120-6)-3_(50-70 мм²)",132.0,support10,,,1.0
...,...,...,...,...,...,...,...,...,...,...
132075,ЦСБвШнг 3х95-6,,,,,,cabel10,,,
132076,Э-ДПС-06-08-Е4(3),,,,,,commCable,,,
132077,ЭВБВ 3х95-6,,,,,,cabel10,,,
132078,ЭСО-ДПС-06-20-Е4(тип3),,,,,,commCable,,,


In [15]:
# Keep only the text before the first underscore in StructureId.
df['StructureId'] = df['StructureId'].str.split('_', n=1).str.get(0)

# Shift ImageIndex up by one.
# NOTE: not idempotent - re-running this cell shifts the values again.
df['ImageIndex'] = df['ImageIndex'].add(1)

In [16]:
# Separate label encoders for the StructureId and TypeId columns.
encoder_strId = LabelEncoder()
encoder_typeId = LabelEncoder()

# Replace missing StructureId values with an explicit "missing" label so the
# placeholder is still recognisable after the codes are decoded.
df['StructureId'] = df['StructureId'].fillna('missing')

# Map the string labels to integer codes.
df['StructureId'] = encoder_strId.fit_transform(df['StructureId'])
# NOTE(review): TypeId is encoded without a placeholder for missing values;
# presumably it has none at this point - verify.
df['TypeId'] = encoder_typeId.fit_transform(df['TypeId'])

# Fill the remaining gaps. Name gets the same "missing" label as above.
df['Name'] = df['Name'].fillna('missing')

# Numeric columns are filled with the number 0 so they keep a numeric dtype;
# filling them with the string '0' would turn them into mixed object columns
# that only work downstream through implicit string-to-float conversion.
# Any remaining non-numeric column keeps the '0' string placeholder.
numeric_cols = df.select_dtypes(include='number').columns
fill_values = {col: (0 if col in numeric_cols else '0') for col in df.columns}
df = df.fillna(fill_values)

# Drop rows that are exact duplicates and renumber the index.
df = df.drop_duplicates().reset_index(drop=True)

In [8]:
# Inspect the encoded, de-duplicated frame (the display is truncated to the
# first and last rows).
df

Unnamed: 0,Id_x,Name,HeadingId,CategoryId,StructureId,StandardProjectId,TypeId,Diameter,CrossSection,ImageIndex
0,___-12-2А G28,Зажим аппаратный,9.0,58.0,746,132.0,15,0.0000,0.000000,2.0
1,d10 (цинк),Сталь круглая оцинкованная по ГОСТ 9.307-89,10.0,10.0,746,132.0,15,0.0000,0.000000,2.0
2,PA2870P,Зажим натяжной,9.0,58.0,746,132.0,15,0.0000,0.000000,2.0
3,PGA101,,9.0,58.0,746,132.0,15,0.0000,0.000000,2.0
4,PLDT2 R,Вязка односторонняя диэлектрическая,9.0,58.0,746,132.0,15,0.0000,0.000000,2.0
...,...,...,...,...,...,...,...,...,...,...
115768,ЦСБвШнг 3х95-6,0,0.0,0.0,52,0.0,5,0.0000,0.000000,0.0
115769,Э-ДПС-06-08-Е4(3),0,0.0,0.0,52,0.0,6,0.0000,0.000000,0.0
115770,ЭВБВ 3х95-6,0,0.0,0.0,52,0.0,5,0.0000,0.000000,0.0
115771,ЭСО-ДПС-06-20-Е4(тип3),0,0.0,0.0,52,0.0,6,0.0000,0.000000,0.0


### Dump Data

In [9]:
# Write the processed table to CSV, leaving out the index column.
df.to_csv(path_or_buf="data_040923.csv", index=False)

# Persist the fitted StructureId encoder so its integer codes can be turned
# back into the original labels (inverse transform).
joblib.dump(value=encoder_strId, filename="decoder_strId_040923")

['decoder_strId_040923']

## NearestNeighbors

In [18]:
# Neighbour count the model is configured with.
n_neighbors = 35

# Unfitted nearest-neighbour model using the cosine metric.
knn_model = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors)

In [19]:
# Fit on every column except the part identifier and its name.
knn_features = df.drop(columns=['Id_x', 'Name'])
knn_model.fit(knn_features)

# Persist the fitted model.
joblib.dump(value=knn_model, filename="knn_model")

['knn_model']