# Multi-Class Text Classification Fine-Tuning

In [None]:
## Setup: Install Dependencies
'''
Start by installing the Transformers library from GitHub repository.
'''
#!pip install git+https://github.com/huggingface/transformers.git

## Step: Import All Necessary Libraries

In [None]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import pandas as pd
import json
import gc

In [None]:
# Read the labelled inventory spreadsheet from the local project folder.
# The path uses a forward slash: a backslash followed by "d" is an invalid
# escape sequence in a normal string literal (a SyntaxWarning on recent
# Python versions), and "/" is accepted on both Windows and POSIX systems.
inventory_df = pd.read_excel("Multi-Class-Text-Classification/data_test.xlsx")
inventory_df.head()

In [None]:
# Keep only the rows whose Category is one of the two known labels;
# every other row is dropped from the frame.
inventory_df = inventory_df[inventory_df['Category'].isin(['Material', 'Service'])]

In [None]:
# List the distinct values left in the Category column after filtering
inventory_df['Category'].unique()

In [None]:
# Encode Category as integer codes and store them in a new 'encoded_cat' column.
# NOTE(review): pandas assigns category codes in sorted label order, so this
# should give Material -> 0 and Service -> 1 — confirm against the data.
inventory_df['encoded_cat'] = inventory_df['Category'].astype('category').cat.codes
inventory_df.head()

In [None]:
# Remove the 'Task Plan' column; it is not referenced by the cells that follow.
inventory_df = inventory_df.drop(columns='Task Plan') # keyword form; returns a new frame instead of mutating in place
inventory_df.head(20)

In [None]:
# Separate the integer class codes (targets) from the raw description
# strings (model input, not tokenized yet).
data_labels = inventory_df["encoded_cat"].tolist()  # labels
data_texts = inventory_df["Description"].tolist()  # features

### Split into Train, Validation and Test Sets Using scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of all examples for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts,
    data_labels,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Second cut: set aside 1% of the remaining training examples as a small
# held-out set for inference (testing).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.01,
    shuffle=True,
    random_state=0,
)

### Download Model needed from Transformers Library

In [None]:
# Load the pretrained tokenizer that matches the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Tokenize the training and validation texts in two separate calls, with
# truncation and padding enabled (per the Transformers docs, padding=True pads
# to the longest sequence within each call).
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [None]:
# Pair each example's tokenizer outputs (as a plain dict of features) with
# its label, producing one (features, label) element per example.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))

### Set Parameter for model

In [None]:
# Load pretrained DistilBERT with a sequence-classification head configured
# for two labels.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# An earlier variant of this cell used Adam(learning_rate=5e-5) together with
# loss=model.compute_loss; the configuration below replaces it.

# Adam with an explicit epsilon; the loss is the model's own hf_compute_loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=['accuracy'])

### Train model using Tensorflow:
Training is set to 1 epoch (one pass over the training data). For better accuracy, consider increasing the number of epochs and tuning the hyperparameters.

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop when validation loss has not improved for 3 consecutive epochs and
# restore the best weights seen. With epochs=1 below this can never trigger;
# it only matters once the epoch count is raised.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Batching is done on the datasets themselves (.batch(16)), so `batch_size` is
# not passed to fit(): the Keras docs say it must not be specified when the
# input is a tf.data.Dataset. The validation set is left unshuffled because
# evaluation does not benefit from a random example order.
model.fit(train_dataset.shuffle(1000).batch(16),
          epochs=1,
          validation_data=val_dataset.batch(16),
          callbacks=[early_stopping])

In [None]:
# Print a summary of the trained model's architecture
model.summary()

### Save Trained Model in Local Directory

In [None]:
from tensorflow.keras.models import load_model

In [None]:
save_directory = "Inventory" # change this to your preferred location

# Save the model and the tokenizer into the same directory so that both can
# be reloaded from that single path.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
# Reload the tokenizer and the fine-tuned model from the saved directory;
# the inference cells use these reloaded copies.
loaded_tokenizer = DistilBertTokenizer.from_pretrained(save_directory)
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(save_directory)

In [None]:
# Label encoding used by the model:
# 0: Material
# 1: Service

### Test Model 

In [None]:
# Pick one held-out description to try the model on.
# NOTE: index 10 raises IndexError when the test split has fewer than 11 items.
test_text = test_texts[10]
test_text

In [None]:
# Hand-written sample description, wrapped in a list (a batch of one).
# NOTE(review): per the Transformers docs, tokenizer.encode() treats a list of
# strings as pre-split tokens rather than as a batch of sentences — confirm
# that the cells consuming this variable tokenize it as intended.
test_text = ['Bed with foamed bedfloor, 6x4x8 orthopaedic Vitafoam Supreme, Gazelle pillow,etc']

In [None]:
# Tokenize by calling the tokenizer directly instead of using .encode():
# .encode() interprets a list of strings as already-split tokens, so a
# one-element list holding a whole sentence would be looked up as a single
# vocabulary entry instead of being tokenized. Calling the tokenizer accepts
# either one string or a batch (list) of strings.
predict_input = loaded_tokenizer(test_text,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")

# The first element of the model output holds the classification logits.
output = loaded_model(**predict_input)[0]

# Index of the highest-scoring class for the first (only) example.
prediction_value = tf.argmax(output, axis=1).numpy()[0]
prediction_value

In [None]:
# Tokenize by calling the tokenizer directly instead of using .encode():
# .encode() interprets a list of strings as already-split tokens, so a
# one-element list holding a whole sentence would be looked up as a single
# vocabulary entry instead of being tokenized. Calling the tokenizer accepts
# either one string or a batch (list) of strings.
predict_input = loaded_tokenizer(test_text,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")

# The first element of the model output holds the classification logits.
output = loaded_model(**predict_input)[0]

# Index of the highest-scoring class for the first (only) example.
prediction_value = tf.argmax(output, axis=1).numpy()[0]

# Convert the numeric prediction to its category label:
# 0 -> Material, any other value (1 for this two-label model) -> Service.
prediction_label = "Material" if prediction_value == 0 else "Service"

print("Predicted Category:", prediction_label)


In [None]:
def predict_category(text):
    """Return the predicted class index for one item description.

    Uses the module-level ``loaded_tokenizer`` and ``loaded_model``.

    Args:
        text: The description to classify, given either as a single string
            or as a list of strings. When a list is given, only the
            prediction for its first element is returned.

    Returns:
        The index of the highest-scoring class as a NumPy integer
        (0 = Material, 1 = Service in this notebook's label encoding).
    """
    # Call the tokenizer directly rather than .encode(): .encode() treats a
    # list of strings as pre-split tokens, whereas the call form accepts both
    # a plain string and a batch of strings.
    encoded = loaded_tokenizer(text,
                               truncation=True,
                               padding=True,
                               return_tensors="tf")

    # The first element of the model output holds the classification logits.
    logits = loaded_model(**encoded)[0]

    # argmax over the class axis, then take the first example in the batch.
    return tf.argmax(logits, axis=1).numpy()[0]
# -----------------------------------------------------
# Predict a class index for every held-out test description, in order.
y_pred = [predict_category(sample) for sample in test_texts]
# -------------------------------------------
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Pin the label order to [0, 1] so the matrix is always 2x2 and lines up with
# the hard-coded tick labels, even if the small test split (or the
# predictions) happens to contain only one of the two classes.
confusion = confusion_matrix(test_labels, y_pred, labels=[0, 1])

# Draw the matrix as an annotated heatmap: rows are true classes, columns are
# predicted classes.
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)
sns.heatmap(confusion, annot=True, fmt="d", cmap="Blues", cbar=False, square=True,
            xticklabels=["Material", "Service"], yticklabels=["Material", "Service"])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Report precision/recall/F1 per class. Fixing labels to [0, 1] and naming
# them makes the rows read "Material"/"Service" instead of bare codes, and
# keeps both rows present even if the test split lacks one class.
print(classification_report(test_labels, y_pred, labels=[0, 1],
                            target_names=["Material", "Service"]))

### Apply the Model to the Unlabelled Dataset

In [None]:
# Load the spreadsheet again; its unlabelled rows are selected in the next cell.
# NOTE(review): the training cell reads this file name from the
# "Multi-Class-Text-Classification" folder, while this call reads it from the
# current working directory — confirm both point at the same file.
unlabelled_df = pd.read_excel("data_test.xlsx")
unlabelled_df.head()

In [None]:
# Drop the 'Task Plan' column and keep only the rows marked 'Unlabelled'.
# The drop is non-inplace with errors='ignore', so this cell can be re-run
# safely: a second run no longer fails because the column is already gone.
unlabelled_df = unlabelled_df.drop(columns='Task Plan', errors='ignore')
unlabelled_df = unlabelled_df[unlabelled_df['Category'] == 'Unlabelled']
unlabelled_df.head()

In [None]:
# Number of rows in the unlabelled frame
len(unlabelled_df)

In [None]:
# Pull the description strings out of the unlabelled frame. This list must be
# defined here: it is iterated below and reused when building the prediction
# table, and nothing else in the notebook creates it.
unlabelled_data = unlabelled_df["Description"].to_list()

# Predict a class index for every unlabelled description, in order.
unlabelled_predictions = [predict_category(description) for description in unlabelled_data]

In [None]:
# Pair every unlabelled description with its predicted class code, then
# write the table to CSV without the index column.
prediction_columns = {
    "Description": unlabelled_data,
    "Category": unlabelled_predictions,
}
prediction_df = pd.DataFrame(prediction_columns)
prediction_df.to_csv("prediction.csv", index=False)

In [None]:
# Count the rows whose predicted class code is 1
int((prediction_df["Category"] == 1).sum())

In [None]:
# Tokenize by calling the tokenizer directly instead of using .encode():
# .encode() interprets a list of strings as already-split tokens, so a
# one-element list holding a whole sentence would be looked up as a single
# vocabulary entry instead of being tokenized. Calling the tokenizer accepts
# either one string or a batch (list) of strings.
predict_input = loaded_tokenizer(test_text,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")

# The first element of the model output holds the classification logits.
output = loaded_model(**predict_input)[0]

# Index of the highest-scoring class for the first (only) example.
prediction_value = tf.argmax(output, axis=1).numpy()[0]

# Convert the numeric prediction to its category label:
# 0 -> Material, any other value (1 for this two-label model) -> Service.
prediction_label = "Material" if prediction_value == 0 else "Service"

print("Predicted Category:", prediction_label)
