<a href="https://colab.research.google.com/github/Krukruthi745/Corpus-Based-Chatbot/blob/main/chatbot2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import numpy as np
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer

# ----------------------------
# Download required NLTK data
# ----------------------------
# Fetch the NLTK data packages used below: 'punkt' and 'punkt_tab' for
# sentence/word tokenization, 'wordnet' for the WordNet lemmatizer.
for _resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_resource)

# ----------------------------
# Read the corpus
# ----------------------------
# Load the whole corpus as lower-cased text. The context manager closes the
# file handle as soon as the read finishes, so it is not leaked for the rest
# of the session. errors='ignore' drops undecodable bytes instead of raising.
with open('/content/symptom_medicine_sentences.txt', 'r', errors='ignore') as f:
    raw = f.read().lower()

# Tokenize
sent_tokens = nltk.sent_tokenize(raw)   # sentences (the retrieval units)
word_tokens = nltk.word_tokenize(raw)   # words

# ----------------------------
# Lemmatization
# ----------------------------
# Single lemmatizer instance shared by every call to LemTokens.
lemmer = WordNetLemmatizer()


def LemTokens(tokens):
    """Return a list with each token in ``tokens`` lemmatized, in input order."""
    return list(map(lemmer.lemmatize, tokens))

# Translation table for str.translate: every ASCII punctuation code point maps
# to None, which deletes that character.
remove_punct_dict = str.maketrans('', '', string.punctuation)

def LemNormalize(text):
    """Lower-case ``text``, delete punctuation, tokenize and lemmatize it.

    Returns the list of lemmatized word tokens.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned)
    return LemTokens(words)

# ----------------------------
# Greeting function
# ----------------------------
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey")
GREETING_RESPONSES = ["hi", "hey", "hello", "I am glad you are talking to me!"]


def greet(sentence):
    """Return a random greeting if ``sentence`` contains a known greeting.

    Matching is case-insensitive and works on whole words or whole phrases, so
    the multi-word entry "what's up" is recognized and "hi" does not match
    inside "this". Punctuation attached to the ends of a word ("Hello!",
    "hi,") is ignored.

    Args:
        sentence: The user's message.

    Returns:
        One of GREETING_RESPONSES chosen at random, or None when no greeting
        from GREETING_INPUTS occurs in ``sentence``.
    """
    # Trim punctuation from the ends of each word only, so the apostrophe
    # inside "what's" survives; drop words that were pure punctuation.
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    # Pad with spaces so every greeting can be tested as a whole-word phrase.
    padded = " " + " ".join(word for word in words if word) + " "
    if any(f" {greeting} " in padded for greeting in GREETING_INPUTS):
        return random.choice(GREETING_RESPONSES)
    return None

# ----------------------------
# TF-IDF Vectorizer
# ----------------------------
# Build the vectorizer with LemNormalize as its tokenizer, so corpus sentences
# and user queries are normalized (lower-cased, punctuation removed,
# lemmatized) the same way before weighting.
TfidfVec = TfidfVectorizer(tokenizer=LemNormalize)
# Fit on the corpus sentences and keep their TF-IDF matrix; response() uses a
# position in this matrix to index back into sent_tokens.
tfidf_corpus = TfidfVec.fit_transform(sent_tokens)

# ----------------------------
# Response function
# ----------------------------
def response(user_response):
    """Return the corpus sentence most similar to ``user_response``.

    The query is vectorized with the fitted TF-IDF model and compared against
    every corpus sentence by cosine similarity. When the best score is 0, a
    fixed apology string is returned instead of a sentence.
    """
    query_vec = TfidfVec.transform([user_response])
    similarities = cosine_similarity(query_vec, tfidf_corpus)[0]

    # argsort is ascending, so the last position holds the best match.
    best = similarities.argsort()[-1]
    if similarities[best] == 0:
        return "I am sorry! I don't understand you."
    return sent_tokens[best]

# ----------------------------
# Conversation loop
# ----------------------------
# Read-eval-print loop: runs until the user says bye or thanks the bot.
flag = True
print("BOT: My name is Stark. Let's have a conversation! Also, if you want to exit any time, just type Bye!")

while flag:
    # Normalize case and surrounding whitespace so inputs such as "Bye " or
    # " thanks" are still recognized as commands.
    user_response = input().strip().lower()
    if user_response == 'bye':
        flag = False
        print("BOT: Bye! take care..")
    elif user_response in ('thanks', 'thank you'):
        flag = False
        print("BOT: You are welcome..")
    else:
        # Call greet() once and reuse the result instead of scanning the
        # input (and drawing a random reply) a second time.
        greeting = greet(user_response)
        if greeting is not None:
            print("BOT: " + greeting)
        else:
            print("BOT: " + response(user_response))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


BOT: My name is Stark. Let's have a conversation! Also, if you want to exit any time, just type Bye!
BOT: the medicine for fever is paracetamol or ibuprofen.
BOT: the medicine for malaria is chloroquine or artemisinin-based combination therapy (act).
BOT: I am sorry! I don't understand you.
