In [1]:
!pip install tensorflow numpy pandas nltk openpyxl



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import string
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Download necessary NLP resources
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nkrid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load dataset (Ensure the file is in the same directory as your notebook)
file_path = "mental_health_chatbot_useful.xlsx"
df = pd.read_excel(file_path)

# Show dataset preview
df.head()

Unnamed: 0,ID,User Input,Bot Response,Intent,Sentiment,Context
0,1,I don't feel like getting out of bed.,I'm here for you. Would you like to talk about...,Depression,Negative,General
1,2,I keep waking up in the middle of the night.,Sleep issues can be tough. Have you tried limi...,Insomnia,Negative,Health
2,3,I don't feel like getting out of bed.,I'm here for you. Would you like to talk about...,Depression,Negative,General
3,4,I had a panic attack yesterday.,That sounds really difficult. Deep breathing a...,Anxiety,Negative,Health
4,5,I keep waking up in the middle of the night.,Sleep issues can be tough. Have you tried limi...,Insomnia,Negative,Health


In [4]:
# Function to clean text
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = nltk.word_tokenize(text)  # Tokenize
    return " ".join(text)

# Apply preprocessing to user inputs and bot responses
df["User Input"] = df["User Input"].apply(preprocess_text)
df["Bot Response"] = df["Bot Response"].apply(preprocess_text)

# Display cleaned data
df.head()

Unnamed: 0,ID,User Input,Bot Response,Intent,Sentiment,Context
0,1,i dont feel like getting out of bed,im here for you would you like to talk about w...,Depression,Negative,General
1,2,i keep waking up in the middle of the night,sleep issues can be tough have you tried limit...,Insomnia,Negative,Health
2,3,i dont feel like getting out of bed,im here for you would you like to talk about w...,Depression,Negative,General
3,4,i had a panic attack yesterday,that sounds really difficult deep breathing an...,Anxiety,Negative,Health
4,5,i keep waking up in the middle of the night,sleep issues can be tough have you tried limit...,Insomnia,Negative,Health


In [5]:
# Define hyperparameters
MAX_LEN = 50  # Maximum sequence length
VOCAB_SIZE = 20000  # Limit vocabulary size

# Create a tokenizer
tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters="")
tokenizer.fit_on_texts(df["User Input"].tolist() + df["Bot Response"].tolist())

# Convert text to sequences
input_sequences = tokenizer.texts_to_sequences(df["User Input"])
response_sequences = tokenizer.texts_to_sequences(df["Bot Response"])

# Apply padding
input_sequences = pad_sequences(input_sequences, maxlen=MAX_LEN, padding="post")
response_sequences = pad_sequences(response_sequences, maxlen=MAX_LEN, padding="post")

# Convert to NumPy arrays
input_sequences = np.array(input_sequences)
response_sequences = np.array(response_sequences)

# Check shapes
input_sequences.shape, response_sequences.shape


((500, 50), (500, 50))

In [6]:
import pickle

# Save tokenizer to a file
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [7]:
# Define encoder
encoder_inputs = Input(shape=(MAX_LEN,))
encoder_embedding = Embedding(VOCAB_SIZE, 128)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Define decoder
decoder_inputs = Input(shape=(MAX_LEN,))
decoder_embedding = Embedding(VOCAB_SIZE, 128)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Dense output layer
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Build and compile model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

# Show model summary
model.summary()


In [8]:
# Train the model
model.fit([input_sequences, response_sequences], response_sequences, batch_size=64, epochs=50, validation_split=0.2)

Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1s/step - loss: 9.7165 - val_loss: 7.9605
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1s/step - loss: 7.1394 - val_loss: 4.1422
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/step - loss: 3.4474 - val_loss: 1.7710
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1s/step - loss: 1.5872 - val_loss: 1.3566
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1s/step - loss: 1.3516 - val_loss: 1.2802
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1s/step - loss: 1.2629 - val_loss: 1.1815
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/step - loss: 1.1655 - val_loss: 1.1175
Epoch 8/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/step - loss: 1.1074 - val_loss: 1.0800
Epoch 9/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/ste

<keras.src.callbacks.history.History at 0x2015e9e8950>

In [9]:
model.save("mental_health_chatbot.keras")
print("Model Successfully Saved!!")

Model Successfully Saved!!


In [10]:
from tensorflow.keras.models import load_model
import numpy as np

# Load tokenizer
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Load trained model
model = load_model("mental_health_chatbot.keras")

# Define function to generate chatbot responses
def get_bot_response(user_input):
    user_seq = tokenizer.texts_to_sequences([user_input])
    user_seq = pad_sequences(user_seq, maxlen=MAX_LEN, padding="post")  # Ensure consistent length

    # Pass user input through the model
    prediction = model.predict([user_seq, np.zeros((1, MAX_LEN))])  # Ensure decoder input is correct
    predicted_index = np.argmax(prediction, axis=-1)

    # Convert predicted sequence back to words
    response_words = [word for word, index in tokenizer.word_index.items() if index in predicted_index[0]]

    return " ".join(response_words)


# Run chatbot loop
while True:
    print(MAX_LEN)
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]:
        print("Chatbot: Goodbye!")
        break
    response = get_bot_response(user_input)
    print("Chatbot:", response)


50
You: Hi
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 368ms/step
Chatbot: you
50
You: Hello i am not feeling good
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Chatbot: you
50
You: exit
Chatbot: Goodbye!


In [11]:
print("Sample Input Sequence:", input_sequences[:5])
print("Sample Response Sequence:", response_sequences[:5])


Sample Input Sequence: [[ 1 31  4  9 69 70 23 24  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 1 76 77 78 79 30 80 23 30 81  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 1 31  4  9 69 70 23 24  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 1 21 22 37 38 39  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 1 76 77 78 79 30 80 23 30 81  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]]
Sample Response Sequence: [[11 25 71  2 32  2  9  3 72 19 73 74 10 75  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [82 83 

In [12]:
decoded_responses = tokenizer.sequences_to_texts(response_sequences[:5])
print(decoded_responses)

['im here for you would you like to talk about whats on your mind', 'sleep issues can be tough have you tried limiting screen time before bed', 'im here for you would you like to talk about whats on your mind', 'that sounds really difficult deep breathing and grounding techniques might help', 'sleep issues can be tough have you tried limiting screen time before bed']


In [13]:
predicted_tokens = np.argmax(predicted_sequence, axis=-1)  # Get highest probability token indices
predicted_text = tokenizer.sequences_to_texts(predicted_tokens)
print("Predicted Response:", predicted_text[0])  # Extract first response


NameError: name 'predicted_sequence' is not defined

In [14]:
decoder_input = np.zeros((1, MAX_LEN))  # Example decoder input (modify if needed)

predicted_sequence = model.predict([input_sequences[:1], decoder_input])
print(predicted_sequence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[[[4.94640060e-02 1.40515027e-07 1.39161855e-01 ... 1.18347742e-07
   1.44147151e-07 1.21556923e-07]
  [5.91432869e-01 1.18866318e-07 1.41672865e-01 ... 1.04225286e-07
   1.27670006e-07 1.07875174e-07]
  [7.47213304e-01 7.18390822e-08 9.35428664e-02 ... 6.39137809e-08
   7.83382248e-08 6.66485818e-08]
  ...
  [9.94438648e-01 5.36238343e-09 5.69184602e-04 ... 5.51007817e-09
   6.63103705e-09 5.64846481e-09]
  [9.94438648e-01 5.36236300e-09 5.69184078e-04 ... 5.51006751e-09
   6.63103705e-09 5.64845459e-09]
  [9.94438648e-01 5.36236300e-09 5.69184078e-04 ... 5.51005686e-09
   6.63102462e-09 5.64844349e-09]]]


In [15]:
predicted_tokens = np.argmax(predicted_sequence, axis=-1)  # Get highest probability token indices
predicted_text = tokenizer.sequences_to_texts(predicted_tokens)
print("Predicted Response:", predicted_text[0])  # Extract first response


Predicted Response: you


In [16]:
import numpy as np

def sample_with_temperature(predictions, temperature=0.7):
    predictions = np.log(predictions + 1e-9) / temperature
    exp_preds = np.exp(predictions)
    probabilities = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(probabilities), p=probabilities)

predicted_tokens = [sample_with_temperature(pred, temperature=0.8) for pred in predicted_sequence[0]]
predicted_text = tokenizer.sequences_to_texts([predicted_tokens])

print("Improved Response:", predicted_text[0])


Improved Response: be


In [17]:
from collections import Counter

word_counts = Counter(" ".join(df["Bot Response"]).split())
print(word_counts["you"])  # See how often "im" appears


325


In [18]:
predicted_tokens = np.argmax(predicted_sequence, axis=-1)
predicted_text = tokenizer.sequences_to_texts(predicted_tokens)
print("Fixed Response:", predicted_text[0])


Fixed Response: you
