In [1]:
import json
import re
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the raw records that the later cells iterate over.
# The encoding is stated explicitly: without it, open() falls back to the
# platform's default codec, so the same file could decode differently (or fail)
# from one machine to another. JSON text is expected to be UTF-8.
with open('dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Preprocess data

In [3]:
def preprocess_text(text):
    """Normalize a raw status string before tokenization.

    Every character that is neither a word character nor whitespace is
    deleted, each run of whitespace is collapsed to a single space, leading
    and trailing whitespace is dropped, and the result is lower-cased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-case string (empty if nothing but punctuation
        and whitespace was supplied).
    """
    # Punctuation is removed outright; no space is inserted in its place,
    # so "a-b" becomes "ab".
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace runs, then trim the ends so that padded and
    # unpadded spellings of the same status normalize to the same string.
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

In [4]:
# Walk the records once, collecting the cleaned external status as the model
# input and the untouched internal status as its target label.
X, y = [], []
for record in data:
    X.append(preprocess_text(record['externalStatus']))
    y.append(record['internalStatus'])

In [43]:
import pandas as pd

# Pair the cleaned inputs with their labels, one column each, for inspection.
status_columns = {'Column1': X, 'Column2': y}
df = pd.DataFrame(data=status_columns)

# A bare expression on the last line renders the frame as the cell output.
df


Unnamed: 0,Column1,Column2
0,port out,Port Out
1,terminal in,Inbound Terminal
2,port in,Port In
3,vessel departure from first pol vessel name ti...,Departure
4,vessel arrival at final pod vessel name tian f...,Arrival
...,...,...
1217,import loaded on rail,Loaded on Vessel
1218,full transshipment loaded,Loaded on Vessel
1219,full transshipment loaded,Loaded on Vessel
1220,export loaded on vessel,Loaded on Vessel


In [44]:
# Swap the placeholder headers for descriptive ones (old name -> new name).
descriptive_names = {
    'Column1': 'External_Status',
    'Column2': 'Internal_Status',
}
df = df.rename(columns=descriptive_names)


In [45]:
# Display the frame again, now carrying the renamed column headers.
df

Unnamed: 0,External_Status,Internal_Status
0,port out,Port Out
1,terminal in,Inbound Terminal
2,port in,Port In
3,vessel departure from first pol vessel name ti...,Departure
4,vessel arrival at final pod vessel name tian f...,Arrival
...,...,...
1217,import loaded on rail,Loaded on Vessel
1218,full transshipment loaded,Loaded on Vessel
1219,full transshipment loaded,Loaded on Vessel
1220,export loaded on vessel,Loaded on Vessel


In [17]:
# Hold out 40% of the samples for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
split_options = {'test_size': 0.4, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Model Development

In [18]:
# Convert labels to numerical format.
# The distinct labels are sorted before they are numbered. Iterating a bare
# set of strings follows hash order, which changes from one interpreter run to
# the next, so class ids would otherwise differ after every kernel restart and
# could not be matched against a saved model's outputs.
# key=str keeps the ordering well-defined even if a label is not a string.
label_to_index = {label: i for i, label in enumerate(sorted(set(y), key=str))}
y_train_numeric = np.array([label_to_index[label] for label in y_train])
y_test_numeric = np.array([label_to_index[label] for label in y_test])

In [19]:
# Learn the vocabulary from the training texts only, then map both splits to
# integer sequences with that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to one fixed length, padding at the end.
max_len = 100
pad_options = {'maxlen': max_len, 'padding': 'post'}
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

In [39]:
# Import Bidirectional layer
from tensorflow.keras.layers import Bidirectional

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation
# and batch shuffling repeat when the notebook is re-run. Without this the
# accuracy reported below changes from run to run. (Some GPU kernels may still
# be non-deterministic.)
tf.keras.utils.set_random_seed(42)

# One output unit per class; taken from the label mapping so the layer width
# always matches the ids used in y_train_numeric / y_test_numeric.
num_classes = len(label_to_index)

# Define model with Bi-LSTM
model = tf.keras.Sequential([
    # +1 because the tokenizer's word indices start at 1; 0 is the padding value.
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_len),
    Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile model
# The sparse loss is used because the targets are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
# NOTE(review): validation_data is the held-out test split, so the per-epoch
# validation metrics are measured on the same samples that the final
# model.evaluate call scores. Nothing here selects a model based on them, but a
# separate validation split would be needed before adding early stopping or
# tuning against these numbers.
model.fit(X_train_pad, y_train_numeric, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test_numeric))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x246ab933790>

In [40]:
# Look up the integer class id of every held-out label and pack them into an array.
test_label_ids = []
for label in y_test:
    test_label_ids.append(label_to_index[label])
y_test_numeric = np.array(test_label_ids)

# Score the trained model on the test split; evaluate() yields the loss
# followed by the accuracy metric the model was compiled with.
test_metrics = model.evaluate(X_test_pad, y_test_numeric)
loss, accuracy = test_metrics
print('Accuracy:', accuracy)


Accuracy: 0.977505087852478
