In [1]:
# pip install tensorflow==2.16.1

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import pickle

# Load and preprocess data
df = pd.read_excel(r'D:/SHARING PROJECTS/CHIETA\CHATBOT\dataset\combined dataset.xlsx')
df['KEYWORDS'] = df['KEYWORDS'].str.lower().str.strip()



In [3]:
df.head()

Unnamed: 0,KEYWORDS,Purpose
0,bursaries,Bursaries
1,scholarships,Bursaries
2,grants,Bursaries
3,fellowships,Bursaries
4,financial aid,Bursaries


In [4]:
X = df['KEYWORDS']
y = df['Purpose']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)



In [5]:
# Get the number of unique classes for the output layer
num_classes = len(label_encoder.classes_)

num_classes

18

In [6]:
# Vectorize the input text data using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

# Save the fitted TF-IDF vectorizer for later use
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [7]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [8]:
# Define the model with the correct input shape and output layer for multi-class classification
input_dim = X_train.shape[1]  # Set input_dim to the number of features after TF-IDF
model = Sequential()
model.add(Input(shape=(input_dim,)))  # Adjust input shape to match the TF-IDF output
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(units=num_classes, activation='softmax'))  # Softmax for multi-class classification

In [9]:
model.summary()

In [10]:
# Compile the model with the appropriate loss function
model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

# Train the model
model.fit(X_train.toarray(), y_train, epochs=800, batch_size=32)  # Convert sparse matrix to dense for compatibility

Epoch 1/800
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 635us/step - accuracy: 0.0884 - loss: 2.8815
Epoch 2/800
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 633us/step - accuracy: 0.1001 - loss: 2.8464
Epoch 3/800
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 611us/step - accuracy: 0.0988 - loss: 2.8261
Epoch 4/800
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 603us/step - accuracy: 0.0933 - loss: 2.8118
Epoch 5/800
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 606us/step - accuracy: 0.0956 - loss: 2.8013
Epoch 6/800
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 605us/step - accuracy: 0.1229 - loss: 2.7801
Epoch 7/800
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 652us/step - accuracy: 0.1837 - loss: 2.7477
Epoch 8/800
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 598us/step - accuracy: 0.1926 - loss: 2.6884
Epoch 9/800
[1m

<keras.src.callbacks.history.History at 0x1a37f469520>

In [17]:
# Save the trained model
# Save the model in SavedModel format
model.save('sequentialModelDeepLearning..keras')
# keras.saving.save_model(model, 'ss.keras')

# print("Model and vectorizer saved successfully.")

In [21]:
import pickle
from tensorflow.keras.models import load_model
import numpy as np

# Load the saved model
model = load_model('C:/Users/JBS/sequentialModelDeepLearning..keras')

# Load the saved TfidfVectorizer
with open('tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# Load the saved LabelEncoder
with open('label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)


  saveable.load_own_variables(weights_store.get(inner_path))


In [33]:
# Example of new input data for prediction
new_input = ["What is the role of a skills development facilitator in employee assessments?"]  # Replace with actual keywords for prediction

# Transform the new input using the loaded TF-IDF vectorizer
transformed_input = vectorizer.transform(new_input)

# Convert sparse matrix to dense array for prediction compatibility
transformed_input = transformed_input.toarray()

# Make a prediction
predictions = model.predict(transformed_input)

# Get the index of the class with the highest probability
predicted_class_index = np.argmax(predictions, axis=1)[0]

# Decode the predicted class label
predicted_class_label = label_encoder.inverse_transform([predicted_class_index])[0]

print(f"Predicted category for '{new_input[0]}': {predicted_class_label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Predicted category for 'What is the role of a skills development facilitator in employee assessments?': What is a skills development facilitator
