In [5]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('punkt_tab') # <-- Add this new line here
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Load the dataset
df = pd.read_csv('Mental_Health_FAQ_for_Chatbot_by_Muneeb_1000.csv')

# Display the first few rows
df.head()

Unnamed: 0,Question,Answer
0,What should I do if I am experiencing stress?,Stress is a natural response to pressure. Try ...
1,What should I do if I am experiencing anxiety?,Anxiety is a feeling of worry or fear. Practic...
2,What should I do if I am experiencing depression?,Depression is persistent sadness or loss of in...
3,What should I do if I am experiencing panic at...,Panic attacks are sudden episodes of intense f...
4,What should I do if I am experiencing overthin...,Overthinking involves repetitive negative thou...


In [7]:
# Initialize the lemmatizer and define English stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # 1. Convert text to lowercase
    text = str(text).lower()

    # 2. Remove punctuation and special characters using Regex
    text = re.sub(r'[^\w\s]', '', text)

    # 3. Tokenize (split into words)
    tokens = word_tokenize(text)

    # 4 & 5. Remove stopwords and Lemmatize
    cleaned_tokens = []
    for word in tokens:
        if word not in stop_words:
            base_word = lemmatizer.lemmatize(word)
            cleaned_tokens.append(base_word)

    # Rejoin the cleaned words back into a single string
    return " ".join(cleaned_tokens)

# Test the function on a sample sentence to see how it works
sample = "What are the warning signs of mental illness?"
print("Original:", sample)
print("Processed:", preprocess_text(sample))



In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. First, create the 'Processed_Question' column by applying your function to the entire dataset
print("Processing all questions... this might take a few seconds.")
df['Processed_Question'] = df['Question'].apply(preprocess_text)

# 2. Initialize the vectorizer
vectorizer = TfidfVectorizer()

# 3. Fit the vectorizer on our newly created preprocessed questions
tfidf_matrix = vectorizer.fit_transform(df['Processed_Question'])

print("Success!")
print("Shape of our TF-IDF Matrix:", tfidf_matrix.shape)
print("This means we have", tfidf_matrix.shape[0], "questions, each represented by", tfidf_matrix.shape[1], "unique word features.")

Processing all questions... this might take a few seconds.
Success!
Shape of our TF-IDF Matrix: (1000, 56)
This means we have 1000 questions, each represented by 56 unique word features.


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def get_bot_response(user_input):
    # Preprocess the user's raw input
    cleaned_input = preprocess_text(user_input)

    # Convert the cleaned input into a vector
    input_vector = vectorizer.transform([cleaned_input])

    # Calculate cosine similarity
    similarity_scores = cosine_similarity(input_vector, tfidf_matrix)

    # Find the index of the highest score
    best_match_index = similarity_scores.argmax()
    highest_score = similarity_scores[0, best_match_index]

    # Fallback threshold
    if highest_score < 0.20:
        return "I'm sorry, I don't have enough information on that specific topic. Could you rephrase your question?"

    # Return the corresponding answer
    return df['Answer'].iloc[best_match_index]

# --- Chat Loop ---
print("=== Mental Health FAQ Chatbot is Online! ===")
print("(Type 'quit', 'exit', or 'stop' to end the chat)\n")

while True:
    user_query = input("You: ")

    if user_query.lower() in ['quit', 'exit', 'stop']:
        print("Chatbot: Take care of yourself! Goodbye.")
        break

    response = get_bot_response(user_query)
    print(f"Chatbot: {response}\n")

=== Mental Health FAQ Chatbot is Online! ===
(Type 'quit', 'exit', or 'stop' to end the chat)

You: hello
Chatbot: I'm sorry, I don't have enough information on that specific topic. Could you rephrase your question?

You: what is stress
Chatbot: Stress is a natural response to pressure. Try deep breathing, short breaks, regular exercise, and organizing your tasks to manage it better.

You: i wnat to kill myself because of stress
Chatbot: Stress is a natural response to pressure. Try deep breathing, short breaks, regular exercise, and organizing your tasks to manage it better.

You: What should I do if I am experiencing anxiety?
Chatbot: Anxiety is a feeling of worry or fear. Practicing mindfulness, controlled breathing, and limiting caffeine can help reduce symptoms.



KeyboardInterrupt: Interrupted by user

In [11]:
import pickle

# 1. Save the trained vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# 2. Save the mathematical TF-IDF matrix
with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

# 3. Save the dataframe (so the local app knows the answers)
df.to_pickle('dataset.pkl')

print("Files saved! Check the folder icon on the left to download them.")

Files saved! Check the folder icon on the left to download them.
