Clean data and prepare samples

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

df_objects = pd.read_csv(r'data\objectsRevaluated.csv', sep=',', low_memory=False)
df_objects = df_objects[['Name', 'Type', 'Family', 'Category', 'NBR_COD', 'NBR_DESC']]

df_train = df_objects[df_objects['NBR_COD'].notna()]

label_encoder = LabelEncoder()
df_train['NBR_ENCOD'] = label_encoder.fit_transform(df_train['NBR_COD'])

one_hot_encoder = OneHotEncoder(sparse_output=False)
y_train = one_hot_encoder.fit_transform(df_train['NBR_ENCOD'].values.reshape(-1,1))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['NBR_ENCOD'] = label_encoder.fit_transform(df_train['NBR_COD'])


Build model

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.api.models import Sequential
from keras.api.layers import Dense, Dropout
from keras.api.optimizers import Adam
from scipy.sparse import hstack
import pickle

tfidf_name = TfidfVectorizer()  
tfidf_type = TfidfVectorizer()  
tfidf_family = TfidfVectorizer()  
tfidf_category = TfidfVectorizer()  

x_train_name = tfidf_name.fit_transform(df_train['Name'].astype(str))
x_train_type = tfidf_type.fit_transform(df_train['Type'].astype(str))
x_train_family = tfidf_family.fit_transform(df_train['Family'].astype(str))
x_train_category = tfidf_category.fit_transform(df_train['Category'].astype(str))

with open('models/tfidf_name.pkl', 'wb') as f:
    pickle.dump(tfidf_name, f)

with open('models/tfidf_type.pkl', 'wb') as f:
    pickle.dump(tfidf_type, f)

with open('models/tfidf_family.pkl', 'wb') as f:
    pickle.dump(tfidf_family, f)

with open('models/tfidf_category.pkl', 'wb') as f:
    pickle.dump(tfidf_category, f)

x_train_combined = hstack([x_train_name, x_train_type, x_train_family, x_train_category])

print(f"x_train_combined shape: {x_train_combined.shape}")
print(f"y_train shape: {y_train.shape}")

model = Sequential()
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(y_train.shape[1], activation='softmax'))  

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(x_train_combined.toarray(), y_train, epochs=20, batch_size=32)

model.save(r'models/BIM_CLASSIFICATION_MODEL.keras')

x_train_combined shape: (7255, 815)
y_train shape: (7255, 41)
Epoch 1/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5840 - loss: 2.0792
Epoch 2/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9180 - loss: 0.3330
Epoch 3/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9483 - loss: 0.1649
Epoch 4/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9527 - loss: 0.1275
Epoch 5/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9577 - loss: 0.1043
Epoch 6/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9602 - loss: 0.0908
Epoch 7/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9631 - loss: 0.0805
Epoch 8/20
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 

Test model

In [11]:
import pandas as pd
import numpy as np
from keras.api.models import load_model
import pickle

model = load_model(r'models/BIM_CLASSIFICATION_MODEL.keras')

df_objects = pd.read_csv(r'data/objectsRevaluated.csv', sep=',', low_memory=False)
df_objects = df_objects[['Name', 'Type', 'Family', 'Category', 'NBR_COD', 'NBR_DESC']]

df_sample = df_objects

with open('models/tfidf_name.pkl', 'rb') as f:
    tfidf_name = pickle.load(f)

with open('models/tfidf_type.pkl', 'rb') as f:
    tfidf_type = pickle.load(f)

with open('models/tfidf_family.pkl', 'rb') as f:
    tfidf_family = pickle.load(f)

with open('models/tfidf_category.pkl', 'rb') as f:
    tfidf_category = pickle.load(f)

x_test_name = tfidf_name.transform(df_sample['Name'].astype(str))
x_test_type = tfidf_type.transform(df_sample['Type'].astype(str))
x_test_family = tfidf_family.transform(df_sample['Family'].astype(str))
x_test_category = tfidf_category.transform(df_sample['Category'].astype(str))

x_test_combined = np.hstack([x_test_name.toarray(), x_test_type.toarray(), x_test_family.toarray(), x_test_category.toarray()])

y_pred = model.predict(x_test_combined)
y_pred_labels = np.argmax(y_pred, axis=1)
df_sample['NBR_COD_PRED'] = label_encoder.inverse_transform(y_pred_labels)
df_sample.to_csv(r'predictions\NBR_predicted.csv')


[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 834us/step
