In [1]:
!pip install tensorflow numpy pandas nltk openpyxl



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import string
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Download necessary NLP resources
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nkrid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load dataset (Ensure the file is in the same directory as your notebook)
file_path = "mental_health_chatbot_dataset.xlsx"
df = pd.read_excel(file_path)

# Show dataset preview
df.head()

Unnamed: 0,ID,User Input,Bot Response,Intent,Sentiment,Context
0,1,I feel really down today.,I'm sorry to hear that. Do you want to talk ab...,Sadness,Negative,Depression
1,2,I had a panic attack.,That sounds tough. Try to take deep breaths. I...,Anxiety,Negative,Panic Attack
2,3,I'm so stressed with work.,That sounds overwhelming. Would you like to tr...,Stress,Negative,Work Stress
3,4,I can't sleep at night.,I'm here to help. Have you tried deep breathin...,Insomnia,Negative,Sleep Issues
4,5,I'm feeling better today!,That's great! What helped you feel better?,Happiness,Positive,General


In [4]:
# Function to clean text
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = nltk.word_tokenize(text)  # Tokenize
    return " ".join(text)

# Apply preprocessing to user inputs and bot responses
df["User Input"] = df["User Input"].apply(preprocess_text)
df["Bot Response"] = df["Bot Response"].apply(preprocess_text)

# Display cleaned data
df.head()

Unnamed: 0,ID,User Input,Bot Response,Intent,Sentiment,Context
0,1,i feel really down today,im sorry to hear that do you want to talk abou...,Sadness,Negative,Depression
1,2,i had a panic attack,that sounds tough try to take deep breaths im ...,Anxiety,Negative,Panic Attack
2,3,im so stressed with work,that sounds overwhelming would you like to try...,Stress,Negative,Work Stress
3,4,i cant sleep at night,im here to help have you tried deep breathing ...,Insomnia,Negative,Sleep Issues
4,5,im feeling better today,thats great what helped you feel better,Happiness,Positive,General


In [28]:
# Define hyperparameters
MAX_LEN = 20  # Maximum sequence length
VOCAB_SIZE = 10000  # Limit vocabulary size

# Create a tokenizer
tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters="")
tokenizer.fit_on_texts(df["User Input"].tolist() + df["Bot Response"].tolist())

# Convert text to sequences
input_sequences = tokenizer.texts_to_sequences(df["User Input"])
response_sequences = tokenizer.texts_to_sequences(df["Bot Response"])

# Apply padding
input_sequences = pad_sequences(input_sequences, maxlen=MAX_LEN, padding="post")
response_sequences = pad_sequences(response_sequences, maxlen=MAX_LEN, padding="post")

# Convert to NumPy arrays
input_sequences = np.array(input_sequences)
response_sequences = np.array(response_sequences)

# Check shapes
input_sequences.shape, response_sequences.shape


((30, 20), (30, 20))

In [30]:
import pickle

# Save tokenizer to a file
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [31]:
# Define encoder
encoder_inputs = Input(shape=(MAX_LEN,))
encoder_embedding = Embedding(VOCAB_SIZE, 128)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Define decoder
decoder_inputs = Input(shape=(MAX_LEN,))
decoder_embedding = Embedding(VOCAB_SIZE, 128)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Dense output layer
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Build and compile model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

# Show model summary
model.summary()


In [32]:
# Train the model
model.fit([input_sequences, response_sequences], response_sequences, batch_size=64, epochs=50, validation_split=0.2)

Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 9.2111 - val_loss: 9.2005
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step - loss: 9.1990 - val_loss: 9.1879
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step - loss: 9.1844 - val_loss: 9.1684
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 254ms/step - loss: 9.1618 - val_loss: 9.1331
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step - loss: 9.1203 - val_loss: 9.0580
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 225ms/step - loss: 9.0323 - val_loss: 8.8757
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step - loss: 8.8248 - val_loss: 8.5070
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278ms/step - loss: 8.4197 - val_loss: 8.0878
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

<keras.src.callbacks.history.History at 0x24a47463590>

In [33]:
import h5py
print("h5py is installed. Version:", h5py.__version__)


h5py is installed. Version: 3.13.0


In [34]:
model.save("mental_health_chatbot.keras")
print("Model Successfully Saved!!")

Model Successfully Saved!!


In [35]:
from tensorflow.keras.models import load_model

# Load the trained chatbot model
model = load_model("mental_health_chatbot.keras")

print("Model loaded successfully!")


Model loaded successfully!


In [36]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle


In [37]:
from tensorflow.keras.models import load_model
import numpy as np

# Load tokenizer
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Load trained model
model = load_model("mental_health_chatbot.keras")

# Define function to generate chatbot responses
def get_bot_response(user_input):
    user_seq = tokenizer.texts_to_sequences([user_input])
    user_seq = pad_sequences(user_seq, maxlen=MAX_LEN, padding="post")  # Ensure consistent length

    # Pass user input through the model
    prediction = model.predict([user_seq, np.zeros((1, MAX_LEN))])  # Ensure decoder input is correct
    predicted_index = np.argmax(prediction, axis=-1)

    # Convert predicted sequence back to words
    response_words = [word for word, index in tokenizer.word_index.items() if index in predicted_index[0]]

    return " ".join(response_words)


# Run chatbot loop
while True:
    print(MAX_LEN)
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]:
        print("Chatbot: Goodbye!")
        break
    response = get_bot_response(user_input)
    print("Chatbot:", response)


20
You: Hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 315ms/step
Chatbot: im
20
You: How are you?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Chatbot: im
20
You: Blah Blah
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Chatbot: im
20
You: exit
Chatbot: Goodbye!


In [38]:
print(tokenizer.word_index)


{'you': 1, 'i': 2, 'to': 3, 'im': 4, 'can': 5, 'feel': 6, 'help': 7, 'that': 8, '’': 9, 'what': 10, 'is': 11, 'a': 12, 'be': 13, 'have': 14, 'here': 15, 'about': 16, 'tried': 17, 's': 18, 'with': 19, 'feeling': 20, 'like': 21, 'want': 22, 'how': 23, 'and': 24, 'really': 25, 'so': 26, 'some': 27, 'do': 28, 'for': 29, 'would': 30, 'today': 31, 'day': 32, 'emotions': 33, 'the': 34, 'listen': 35, 'alone': 36, 'life': 37, 'sorry': 38, 'sounds': 39, 'tough': 40, 'try': 41, 'deep': 42, 'relaxation': 43, 'techniques': 44, 'breathing': 45, 'or': 46, 'way': 47, 'talking': 48, 'exercises': 49, 'on': 50, 'one': 51, 'step': 52, 'might': 53, 'had': 54, 'attack': 55, 'better': 56, 'think': 57, 'me': 58, 'having': 59, 'need': 60, 'managing': 61, 'my': 62, 'just': 63, 'someone': 64, 'overthinking': 65, 'deal': 66, 'stress': 67, 'selfesteem': 68, 'professional': 69, 'responsibilities': 70, 'negative': 71, 'thoughts': 72, 'an': 73, 'bad': 74, 'dreams': 75, 'stuck': 76, 'in': 77, 'build': 78, 'confidence'

In [39]:
print("Sample Input Sequence:", input_sequences[:5])
print("Sample Response Sequence:", response_sequences[:5])


Sample Input Sequence: [[  2   6  25 103  31   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  2  54  12 104  55   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  4  26 105  19 106   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  2 107 108 109 110   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  4  20  56  31   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]]
Sample Response Sequence: [[  4  38   3 144   8  28   1  22   3  82  16 145   0   0   0   0   0   0
    0   0]
 [  8  39  40  41   3 146  42 147   4  15  29   1   0   0   0   0   0   0
    0   0]
 [  8  39  83  30   1  21   3  41  27  43  44   0   0   0   0   0   0   0
    0   0]
 [  4  15   3   7  14   1  17  42  45  46 148  84  85   0   0   0   0   0
    0   0]
 [ 86  87  10 149   1   6  56   0   0   0   0   0   0   0   0   0   0   0
    0   0]]


In [40]:
decoded_responses = tokenizer.sequences_to_texts(response_sequences[:5])
print(decoded_responses)

['im sorry to hear that do you want to talk about it', 'that sounds tough try to take deep breaths im here for you', 'that sounds overwhelming would you like to try some relaxation techniques', 'im here to help have you tried deep breathing or meditation before bed', 'thats great what helped you feel better']


In [41]:
model.input  # Check input structure


[<KerasTensor shape=(None, 20), dtype=float32, sparse=False, name=input_layer_2>,
 <KerasTensor shape=(None, 20), dtype=float32, sparse=False, name=input_layer_3>]

In [42]:
decoder_input = np.zeros((1, MAX_LEN))  # Example decoder input (modify if needed)

predicted_sequence = model.predict([input_sequences[:1], decoder_input])
print(predicted_sequence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[[[1.1284223e-02 1.8427793e-02 9.9566299e-04 ... 1.2550480e-06
   1.5250345e-06 1.4948503e-06]
  [1.9975832e-02 2.4150236e-02 1.1443802e-03 ... 1.0105922e-06
   1.2932737e-06 1.2431459e-06]
  [3.3729289e-02 2.9346872e-02 1.2542300e-03 ... 9.6087342e-07
   1.2864533e-06 1.2182938e-06]
  ...
  [8.0921674e-01 1.8271808e-02 2.1336253e-03 ... 8.5411024e-08
   8.4877378e-08 8.1688540e-08]
  [8.0923754e-01 1.8270062e-02 2.1333850e-03 ... 8.5404579e-08
   8.4872759e-08 8.1683474e-08]
  [8.0924505e-01 1.8269371e-02 2.1332931e-03 ... 8.5403094e-08
   8.4871928e-08 8.1682366e-08]]]


In [43]:
predicted_tokens = np.argmax(predicted_sequence, axis=-1)  # Get highest probability token indices


In [44]:
predicted_text = tokenizer.sequences_to_texts(predicted_tokens)
print("Predicted Response:", predicted_text[0])  # Extract first response


Predicted Response: im im im


In [45]:
import numpy as np

def sample_with_temperature(predictions, temperature=0.7):
    predictions = np.log(predictions + 1e-9) / temperature
    exp_preds = np.exp(predictions)
    probabilities = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(probabilities), p=probabilities)

predicted_tokens = [sample_with_temperature(pred, temperature=0.8) for pred in predicted_sequence[0]]
predicted_text = tokenizer.sequences_to_texts([predicted_tokens])

print("Improved Response:", predicted_text[0])


Improved Response: have loneliness overwhelming what so on im im you


In [46]:
start_token = tokenizer.word_index.get("start", 1)  # Use "start" token if available
decoder_input = np.array([[start_token] + [0] * (MAX_LEN - 1)])  # Pad to max length

predicted_sequence = model.predict([input_sequences[:1], decoder_input])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 373ms/step


In [47]:
from collections import Counter

word_counts = Counter(" ".join(df["Bot Response"]).split())
print(word_counts["im"])  # See how often "im" appears


11


In [48]:
predicted_tokens = np.argmax(predicted_sequence, axis=-1)
predicted_text = tokenizer.sequences_to_texts(predicted_tokens)
print("Fixed Response:", predicted_text[0])


Fixed Response: im im im
