In [1]:
import os
import string

import nltk
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Download the NLTK resources used by the preprocessing step.
# 'punkt_tab' is the tokenizer data word_tokenize loads on newer NLTK
# releases (3.9+); 'punkt' is kept so older installs keep working.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load the data
# The spreadsheet location can be overridden with the RISK_DATASET_PATH
# environment variable so the notebook is runnable on other machines; the
# original location stays the default.
file_path = os.environ.get(
    "RISK_DATASET_PATH",
    "C:/Users/isura/Downloads/chatbotmodel/Dataset_risk_keywords.xlsx",
)
df = pd.read_excel(file_path)

# Define preprocessing function
# Lazily-built NLTK helpers shared by every preprocess_text call. Building the
# stop-word set and the lemmatizer once avoids redoing that work for each row.
_PREPROCESS_RESOURCES = {}


def preprocess_text(text):
    """Turn a raw prompt into a space-joined string of cleaned tokens.

    The text is tokenized with NLTK, tokens that are not purely alphabetic
    are discarded, the rest are lowercased, English stop words are removed
    and the remaining tokens are lemmatized.

    Args:
        text: Raw prompt. Anything that is not a ``str`` (for example the NaN
            pandas uses for an empty spreadsheet cell) is treated as empty.

    Returns:
        The cleaned text, or ``''`` when the input is empty / not a string or
        no token survives the filters.
    """
    # Guard first: word_tokenize raises on non-string input such as NaN.
    if not isinstance(text, str) or not text.strip():
        return ''

    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    stop_words = _PREPROCESS_RESOURCES['stop_words']
    lemmatizer = _PREPROCESS_RESOURCES['lemmatizer']

    cleaned_tokens = []
    for token in word_tokenize(text):
        # isalpha() drops punctuation and anything containing digits
        if not token.isalpha():
            continue
        token = token.lower()
        if token in stop_words:
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(cleaned_tokens)

# --- Text -> padded integer sequences ---
# Limits shared by training and inference.
maxlen = 100       # length every sequence is padded/truncated to
max_words = 10000  # upper bound on the number of words used as features

# Clean every prompt and keep the result next to the raw text.
df['Preprocessed_Prompt'] = df['Prompt'].apply(preprocess_text)
cleaned_prompts = df['Preprocessed_Prompt']

# Learn the vocabulary from the cleaned prompts.
# NOTE(review): the tokenizer is fit on the full dataset, before the
# train/test split that happens further down.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(cleaned_prompts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Map each prompt to word indices and bring all rows to the same length.
sequences = tokenizer.texts_to_sequences(cleaned_prompts)
data = pad_sequences(sequences, maxlen=maxlen)

# Encode the labels
label_encoder = LabelEncoder()
label_ids = label_encoder.fit_transform(df['Risk Level'])  # integer class ids
num_classes = len(label_encoder.classes_)
labels = to_categorical(label_ids, num_classes=num_classes)  # Convert to one-hot encoding

# Split the data into training and testing sets.
# The classes are imbalanced (hence the oversampling below), so the split is
# stratified to keep the class proportions the same in both parts.
# Stratification needs at least two samples per class; fall back to a plain
# random split otherwise instead of raising.
class_counts = np.bincount(label_ids)
stratify_on = label_ids if class_counts.min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=42, stratify=stratify_on
)

# Oversample the minority classes (training split only).
# Resampling is done on integer class ids and the one-hot matrix is rebuilt
# afterwards, so its width is always num_classes; the sampler is seeded so a
# re-run produces the same training set.
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled_ids = oversampler.fit_resample(
    X_train, np.argmax(y_train, axis=1)
)
y_train_resampled = to_categorical(y_train_resampled_ids, num_classes=num_classes)

# Define the model
# The width of the output layer follows the one-hot label matrix instead of a
# hard-coded class count, so the network stays valid if the label set changes.
num_classes = labels.shape[1]

model = Sequential([
    Embedding(max_words, 128, input_length=maxlen),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# Compile the model
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# in recent tf.keras releases and rejected by Keras 3.
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

# Fit on the oversampled training set.
# NOTE(review): the held-out test split doubles as validation data here, so
# the per-epoch val_* numbers and the final test scores come from the same rows.
history = model.fit(
    X_train_resampled,
    y_train_resampled,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=20,
    verbose=2,
)

# Score the held-out split
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)
print("Test Loss:", loss)

# Per-class metrics: reduce probability rows and one-hot rows to class ids
y_pred_prob = model.predict(X_test)
y_pred = y_pred_prob.argmax(axis=1)
y_true = np.argmax(y_test, axis=1)
print("Classification Report:\n", classification_report(y_true, y_pred))





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\isura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\isura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\isura\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Found 1188 unique tokens.





Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1280000   
                                                                 
 spatial_dropout1d (Spatial  (None, 100, 128)          0         
 Dropout1D)                                                      
                                                                 
 bidirectional (Bidirection  (None, 256)               263168    
 al)                                                             
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 4)                 2










145/145 - 115s - loss: 1.0031 - accuracy: 0.5622 - val_loss: 0.4229 - val_accuracy: 0.8462 - 115s/epoch - 795ms/step
Epoch 2/20
145/145 - 96s - loss: 0.1837 - accuracy: 0.9406 - val_loss: 0.1726 - val_accuracy: 0.9558 - 96s/epoch - 659ms/step
Epoch 3/20
145/145 - 108s - loss: 0.0722 - accuracy: 0.9819 - val_loss: 0.1421 - val_accuracy: 0.9715 - 108s/epoch - 744ms/step
Epoch 4/20
145/145 - 91s - loss: 0.0417 - accuracy: 0.9881 - val_loss: 0.1608 - val_accuracy: 0.9729 - 91s/epoch - 628ms/step
Epoch 5/20
145/145 - 95s - loss: 0.0311 - accuracy: 0.9903 - val_loss: 0.1468 - val_accuracy: 0.9758 - 95s/epoch - 658ms/step
Epoch 6/20
145/145 - 92s - loss: 0.0225 - accuracy: 0.9931 - val_loss: 0.1550 - val_accuracy: 0.9786 - 92s/epoch - 637ms/step
Epoch 7/20
145/145 - 89s - loss: 0.0265 - accuracy: 0.9922 - val_loss: 0.1456 - val_accuracy: 0.9815 - 89s/epoch - 612ms/step
Epoch 8/20
145/145 - 87s - loss: 0.0213 - accuracy: 0.9933 - val_loss: 0.1390 - val_accuracy: 0.9801 - 87s/epoch - 601ms/step

In [5]:
# Define a function to test a prompt
def test_prompt(prompt):
    # Preprocess the prompt
    preprocessed_prompt = preprocess_text(prompt)
    
    # Tokenize and pad the preprocessed prompt
    sequence = tokenizer.texts_to_sequences([preprocessed_prompt])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)
    
    # Use the trained model to predict the risk level
    prediction_prob = model.predict(padded_sequence)
    predicted_class = np.argmax(prediction_prob)
    
    # Decode the predicted class using the label encoder
    predicted_label = label_encoder.classes_[predicted_class]
    
    return predicted_label, prediction_prob

# Smoke-test the classifier on one hand-written complaint
prompt = "I have severe abdominal pain"
predicted_label, prediction_prob = test_prompt(prompt)

# Show the decoded label first, then the raw per-class scores
for caption, value in (
    ("Predicted Risk Level:", predicted_label),
    ("Prediction Probabilities:", prediction_prob),
):
    print(caption, value)


Predicted Risk Level: High
Prediction Probabilities: [[9.9999952e-01 1.8999774e-10 4.2904315e-07 8.1562040e-12]]


In [6]:
import pickle

# Save the preprocessing artifacts needed at inference time.
# Decoding a prediction requires the fitted label encoder as well as the
# tokenizer, so both are written out; 'tokenizer.pickle' keeps its original
# name and format for existing consumers.
# NOTE: pickle files can execute code when loaded - only unpickle these from
# a trusted location.
for artifact_path, artifact in (
    ('tokenizer.pickle', tokenizer),
    ('label_encoder.pickle', label_encoder),
):
    with open(artifact_path, 'wb') as handle:
        pickle.dump(artifact, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Write the trained model to 'my_model.keras' in the current working directory
model.save('my_model.keras')