Clean data and prepare samples

In [28]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd
import pickle

df_objects = pd.read_csv(r'data\raw_data\objectsRevaluated.csv', sep=',', low_memory=False)
df_objects = df_objects[['Name', 'Type', 'Family', 'Category', 'NBR_COD', 'NBR_DESC']]

df_train = df_objects[df_objects['NBR_COD'].notna()]

label_encoder = LabelEncoder()
df_train['NBR_ENCOD'] = label_encoder.fit_transform(df_train['NBR_COD'])

one_hot_encoder = OneHotEncoder(sparse_output=False)
y_train = one_hot_encoder.fit_transform(df_train['NBR_ENCOD'].values.reshape(-1,1))

with open('models/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['NBR_ENCOD'] = label_encoder.fit_transform(df_train['NBR_COD'])


(7255, 41)

Build model

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.api.models import Model
from keras.api.layers import Dense, Input, Concatenate
import pickle

tfidf_name = TfidfVectorizer()  
tfidf_type = TfidfVectorizer()  
tfidf_family = TfidfVectorizer()  
tfidf_category = TfidfVectorizer()  

x_train_name = tfidf_name.fit_transform(df_train['Name'].astype(str))
x_train_type = tfidf_type.fit_transform(df_train['Type'].astype(str))
x_train_family = tfidf_family.fit_transform(df_train['Family'].astype(str))
x_train_category = tfidf_category.fit_transform(df_train['Category'].astype(str))

with open('models/tfidf_name.pkl', 'wb') as f:
    pickle.dump(tfidf_name, f)

with open('models/tfidf_type.pkl', 'wb') as f:
    pickle.dump(tfidf_type, f)

with open('models/tfidf_family.pkl', 'wb') as f:
    pickle.dump(tfidf_family, f)

with open('models/tfidf_category.pkl', 'wb') as f:
    pickle.dump(tfidf_category, f)

# Individual inputs
name_input = Input(shape=(x_train_name.shape[1],))
type_input = Input(shape=(x_train_type.shape[1],))
family_input = Input(shape=(x_train_family.shape[1],))
category_input = Input(shape=(x_train_category.shape[1],))

# One layer for each input
name_layer = Dense(64, activation='relu')(name_input)
type_layer = Dense(64, activation='relu')(type_input)
family_layer = Dense(64, activation='relu')(family_input)
category_layer = Dense(64, activation='relu')(category_input)

# Concatenate layers outputs
combined = Concatenate()([name_layer,type_layer,family_layer,category_layer])

# Output layer
output = Dense(y_train.shape[1], activation='softmax')(combined)

# Create model
model = Model(inputs=[name_input,type_input,family_input,category_input], outputs=output)

# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit([x_train_name,x_train_type,x_train_family,x_train_category], y_train, epochs=20, batch_size=32)

# Save model
model.save(r'models/BIM_CLASSIFICATION_MODEL.keras')

Epoch 1/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.6505 - loss: 2.2574
Epoch 2/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9408 - loss: 0.2649
Epoch 3/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9619 - loss: 0.1229
Epoch 4/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9621 - loss: 0.0919
Epoch 5/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9673 - loss: 0.0752
Epoch 6/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9661 - loss: 0.0704
Epoch 7/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9588 - loss: 0.0721
Epoch 8/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9686 - loss: 0.0596
Epoch 9/20
[1m227/227[0m [32m━━━━━━━━

Test model

In [32]:
import pandas as pd
import numpy as np
from keras.api.models import load_model
import pickle

model = load_model(r'models/BIM_CLASSIFICATION_MODEL.keras')

df_objects = pd.read_csv(r'data/raw_data/objectsRevaluated.csv', sep=',', low_memory=False)
df_objects = df_objects[['Name', 'Type', 'Family', 'Category', 'NBR_COD', 'NBR_DESC']]

df_sample = df_objects.copy()

with open('models/tfidf_name.pkl', 'rb') as f:
    tfidf_name = pickle.load(f)

with open('models/tfidf_type.pkl', 'rb') as f:
    tfidf_type = pickle.load(f)

with open('models/tfidf_family.pkl', 'rb') as f:
    tfidf_family = pickle.load(f)

with open('models/tfidf_category.pkl', 'rb') as f:
    tfidf_category = pickle.load(f)
    
with open('models/label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)

x_test_name = tfidf_name.transform(df_sample['Name'].astype(str)).toarray()
x_test_type = tfidf_type.transform(df_sample['Type'].astype(str)).toarray()
x_test_family = tfidf_family.transform(df_sample['Family'].astype(str)).toarray()
x_test_category = tfidf_category.transform(df_sample['Category'].astype(str)).toarray()

y_pred = model.predict([x_test_name, x_test_type, x_test_family, x_test_category])
y_pred_labels = np.argmax(y_pred, axis=1)

df_sample['NBR_COD_PRED'] = label_encoder.inverse_transform(y_pred_labels)
df_sample = df_sample.drop_duplicates(subset=['Family','Category'])
df_sample.to_csv(r'predictions/NBR_predicted.csv', index=False)


[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
