In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from keras.layers import SimpleRNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import scipy.stats as stats

In [13]:
# Load the dataset from the Excel workbook.
dataset_text = pd.read_excel("../AAI_Logbook_v2/dataset/TOTAL_2.xlsx", engine='openpyxl')
# Lower-case the text column with the vectorised string accessor. Unlike a per-row
# lambda calling x.lower(), this leaves missing cells as NaN instead of raising
# AttributeError on the first non-string value.
dataset_text["Hellen Naam order"] = dataset_text["Hellen Naam order"].str.lower()

In [14]:
# Encode the string labels in the LABEL column as integer class ids (0-34).
# Labels that are not listed in the dict become NaN after .map().
# NOTE(review): '3 kW' / '3 KW' and '40 kW' / '40 KW' differ only in letter case
# yet receive separate ids — confirm this is intended.
label_to_id = {
    '7 kW - 2 WCD': 0,
    '10 kW': 1,
    '10 kW - 3 WCD': 2,
    '20 kW': 3,
    'N.V.T': 4,
    '30 kW': 5,
    '3 kW': 6,
    '40 kW': 7,
    '50 kW': 8,
    '60 kW': 9,
    '80 kW': 10,
    'Powerlock 130 kW': 11,
    'Powerlock 200 kW': 12,
    'WCD': 13,
    'Kabelmat': 14,
    'Kabelgoot': 15,
    'CEE-125A': 16,
    'CEE-16A': 17,
    'CEE-32A': 18,
    'CEE-63A': 19,
    'Pendel': 20,
    'NU': 21,
    'Expo Pendel': 22,
    'P': 23,
    'S': 24,
    'Vloerspot': 25,
    'Statief + 2 Vloerspot': 26,
    'Aarding': 27,
    '3 KW': 28,
    'Powerlock': 29,
    '40 KW': 30,
    'Rigging': 31,
    '3 kW + NU': 32,
    'Prikverlichting': 33,
    'Troll Pendel': 34,
}
dataset_text["LABEL"] = dataset_text["LABEL"].map(label_to_id)

In [15]:
# Split texts and labels into two halves (test_size=0.5); the fixed random_state
# makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_text['Hellen Naam order'],
    dataset_text['LABEL'],
    test_size=0.5,
    random_state=42,
)

In [16]:
# Turn every text into a fixed-length sequence of integer word ids.
max_features = 10000  # passed to the tokenizer as num_words
max_length = 150      # passed to pad_sequences as maxlen
tokenizer = Tokenizer(num_words=max_features, split=' ')
# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(X_train.values)
# NOTE(review): X_train / X_test are overwritten here, so this cell cannot be
# re-run without first re-running the split cell.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train.values), maxlen=max_length)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test.values), maxlen=max_length)

In [17]:
# One-hot encode the training labels.
# num_classes is pinned to 35 (label ids 0-34, matching the 35-unit output layer).
# Without it the width is inferred from the largest id present in the training
# split and can come out narrower than the model output.
y_train = to_categorical(y_train, num_classes=35)

In [18]:
# Define the first model: embedding -> Conv1D -> pooling -> Dense x2 -> LSTM -> softmax.
# The layers are imported from tensorflow.keras, the same package that supplies
# Sequential, Dense, Embedding and LSTM, rather than from the standalone `keras`
# package, so that one Keras implementation is used throughout the model.
# Flatten is not used in this cell; the import is kept in case later cells rely on it.
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten
embed_dim = 128
lstm_out = 196
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(Conv1D(128, 5, activation="relu"))
model.add(MaxPooling1D())
# These Dense layers sit before the LSTM, so they receive the 3-D sequence output
# of the pooling layer.
model.add(Dense(256, activation="relu"))
model.add(Dense(128, activation="relu"))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))

# One output unit per label id (0-34).
model.add(Dense(35, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='Nadam', metrics=['accuracy'])

In [19]:
# Print the layer summary, then fit for 20 epochs with 20% of the training data
# held out for validation (validation_split=0.2).
batch_size = 32
model.summary()
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=20,
    validation_split=0.2,
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 128)          1280000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 150, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 146, 128)          82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 73, 128)           0         
_________________________________________________________________
dense_3 (Dense)              (None, 73, 256)           33024     
_________________________________________________________________
dense_4 (Dense)              (None, 73, 128)           32896     
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)              

In [20]:
from sklearn.metrics import classification_report

# Score the test sequences; the predicted class id for each row is the index of
# its highest score.
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=1)
y_pred_labels

array([31, 31, 31, 31, 31, 31,  4, 31,  6, 31, 31, 17, 25,  6, 31, 31,  0,
       31, 31,  6,  6, 24, 31, 31, 31,  6, 31, 31, 33,  3, 31,  0, 31,  3,
       31, 31, 31, 17,  3,  6, 31,  6, 31, 31,  6, 13, 31, 33, 31, 31, 31,
        0, 31, 31, 31, 31,  4, 31, 31,  3, 31, 31, 31,  3, 31, 31,  6, 31,
       31, 31, 31, 31, 31, 31, 31, 31,  6, 31, 31,  6, 31, 31, 31, 31, 31,
       31, 31, 31,  0, 31, 31, 31, 31, 31, 31, 31, 31, 17, 31,  6, 31,  7,
       33, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,  6,
       31, 31, 33, 13, 31, 33, 31, 31, 33, 31, 31, 31, 31, 23, 31, 31, 31,
        6, 31, 31, 31, 31, 31,  5, 31,  6, 31, 31, 31,  4, 31, 31, 31, 31,
       31, 31, 31, 31, 31, 33, 33,  3, 31, 31, 31, 31, 31, 31, 33, 33, 31,
       31, 31, 31, 31, 31, 31, 31, 31,  6, 31, 23, 31, 31, 31, 31, 31, 25,
       31, 31,  6,  6, 31, 31, 31, 31, 31, 31,  6, 33, 31, 31, 24, 31, 31,
       31, 31, 31, 31, 13,  0, 31, 31, 31, 31,  5,  7, 31, 31, 31, 31,  6,
       31, 31, 31, 31,  4

In [21]:
# Per-class precision / recall / F1 on the test split.
# y_test still holds integer class ids (only y_train was one-hot encoded), so it is
# compared directly with the argmax predictions.
# zero_division=0 keeps the reported 0.00 for undefined metrics while making the
# choice explicit instead of triggering scikit-learn's undefined-metric warning.
print(classification_report(y_test, y_pred_labels, zero_division=0))

              precision    recall  f1-score   support

           0       0.25      0.40      0.31         5
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         2
           3       0.50      0.78      0.61         9
           4       0.43      0.27      0.33        22
           5       0.33      1.00      0.50         3
           6       0.97      0.90      0.94        41
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.71      0.71      0.71         7
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          17       0.17      0.50      0.25         2
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [34]:

# Second model: embedding -> Conv1D -> two stacked SimpleRNN layers -> softmax.
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X_train.shape[1]))
# NOTE(review): 2056 filters looks like a typo for 2048 — confirm before changing.
model.add(Conv1D(2056, 5, activation='relu'))
# return_sequences=True so the second recurrent layer receives the whole sequence.
model.add(SimpleRNN(1024, return_sequences=True))
model.add(SimpleRNN(1024))
# softmax instead of sigmoid: each row carries exactly one label id and the model is
# compiled with categorical_crossentropy, so the 35 outputs should form a single
# probability distribution over the classes.
model.add(Dense(35, activation='softmax'))


In [35]:
from tensorflow import keras

# NOTE(review): this SGD optimizer is created but never passed to compile(); the
# model is compiled with the 'Nadam' string below. Confirm which one is intended.
opt = keras.optimizers.SGD(learning_rate=0.1)
model.compile(
    loss='categorical_crossentropy',
    optimizer='Nadam',
    metrics=['accuracy'],
)

# Print the layer summary, then fit for 15 epochs with 20% of the training data
# held out for validation; batch_size comes from an earlier cell.
model.summary()
history = model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=batch_size,
    validation_split=0.2,
)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 150, 128)          1280000   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 146, 2056)         1317896   
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 146, 1024)         3154944   
_________________________________________________________________
simple_rnn_7 (SimpleRNN)     (None, 1024)              2098176   
_________________________________________________________________
dense_9 (Dense)              (None, 35)                35875     
Total params: 7,886,891
Trainable params: 7,886,891
Non-trainable params: 0
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15

In [36]:
# Evaluate the second model on the test split.
y_pred = model.predict(X_test)

# Predicted class id = index of the highest score in each row.
y_pred_labels = np.argmax(y_pred, axis=1)
# y_test holds integer class ids, so it is compared directly with the predictions.
# zero_division=0 keeps the reported 0.00 for undefined metrics while making the
# choice explicit instead of triggering scikit-learn's undefined-metric warning.
print(classification_report(y_test, y_pred_labels, zero_division=0))




              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         9
           4       0.00      0.00      0.00        22
           5       0.00      0.00      0.00         3
           6       0.44      0.39      0.42        41
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         7
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         2
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
