In [16]:
import spacy
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [57]:
# Load the spaCy English pipeline; its token vectors are what the embedding
# code below averages.
nlp = spacy.load("en_core_web_md")

# Alternative data source (disabled): one article from the NPR dataset.
# npr.csv must be present in the working directory to use it.
#df = pd.read_csv('./npr.csv')  # Ensure npr.csv is in the correct path
#sample_document = df['Article'].iloc[1]

# Pass the encoding explicitly so the read does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes the text file is saved as UTF-8 - confirm.
with open('./machine_learning_nlp.txt', encoding='utf-8') as f:
    sample_document = f.read()

In [58]:
# Function to turn raw text into a spaCy Doc ready for embedding
def preprocess_text(text):
    """Lower-case ``text`` and parse it with the module-level ``nlp`` pipeline.

    Args:
        text: Raw string to parse (a document sentence or a user query).

    Returns:
        The spaCy ``Doc`` produced for the lower-cased text. No tokens are
        removed here; stop-word and punctuation filtering is left to the code
        that consumes the ``Doc``.
    """
    # The previous version also built a list of lemmas and then discarded it;
    # that work never influenced the returned Doc, so it has been removed.
    return nlp(text.lower())

In [61]:
# Function to get sentence embedding by averaging token vectors
def get_embedding(doc):
    """Average the vectors of the content tokens in ``doc``.

    Stop words, punctuation and whitespace tokens are skipped. Whitespace
    tokens (for example newlines inside a multi-line sentence) are neither
    stop words nor punctuation, so without the explicit ``is_space`` check
    they would be averaged into the embedding.

    Args:
        doc: Iterable of tokens exposing ``vector``, ``is_stop``,
            ``is_punct`` and ``is_space`` (e.g. a spaCy ``Doc``).

    Returns:
        A 1-D NumPy array: the element-wise mean of the kept token vectors,
        or an all-zero vector with the pipeline's vector width when no token
        survives the filter.
    """
    vectors = [
        token.vector
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    if not vectors:
        # Nothing left to average (empty text, or only stop words /
        # punctuation / whitespace): fall back to a zero vector.
        return np.zeros(nlp.vocab.vectors.shape[1])
    return np.mean(vectors, axis=0)

In [63]:
# Function to split a document into sentences and embed each one
def process_document(document):
    """Segment ``document`` into sentences and compute one embedding per sentence.

    Args:
        document: Full text of the document as a single string.

    Returns:
        A ``(sentences, sentence_embeddings)`` tuple of two parallel lists:
        the stripped, non-empty sentence strings and the embedding computed
        for each of them.
    """
    parsed = nlp(document)

    # Keep only sentences that still contain text after stripping whitespace.
    sentences = []
    for span in parsed.sents:
        stripped = span.text.strip()
        if stripped:
            sentences.append(stripped)

    # Each sentence goes through the same preprocessing as a user query
    # before it is embedded.
    parsed_sentences = list(map(preprocess_text, sentences))
    sentence_embeddings = list(map(get_embedding, parsed_sentences))
    return sentences, sentence_embeddings



In [None]:
# Function to find best matching sentences
def get_best_response(user_input, sentence_embeddings, sentences, threshold=0.4, top_k=2):
    """Return the document sentences most similar to ``user_input``.

    Args:
        user_input: The user's query string.
        sentence_embeddings: Sequence of sentence vectors, parallel to
            ``sentences``.
        sentences: Sequence of sentence strings.
        threshold: A sentence is returned only if its cosine similarity to
            the query is strictly greater than this value. Defaults to 0.4.
        top_k: Maximum number of sentences to return. Defaults to 2.

    Returns:
        ``(responses, best_similarity)`` where ``responses`` is a list of
        ``(sentence, similarity)`` pairs ordered from most to least similar,
        or ``(message, 0.0)`` with an apology string when no sentence clears
        the threshold or there are no sentences to search.
    """
    fallback = ("Sorry, I don’t have relevant information about that in the document.", 0.0)

    # cosine_similarity cannot compare against an empty set of vectors, so
    # answer with the fallback instead of letting it raise.
    if len(sentences) == 0 or len(sentence_embeddings) == 0:
        return fallback

    # Preprocess and embed user input
    input_embedding = get_embedding(preprocess_text(user_input))

    # Compute cosine similarity between the query and every sentence
    similarities = cosine_similarity([input_embedding], sentence_embeddings)[0]

    # Indices of the best candidates, most similar first. A non-positive
    # top_k yields no candidates rather than slicing from the end.
    top_indices = np.argsort(similarities)[::-1][:max(int(top_k), 0)]
    responses = [
        (sentences[idx], similarities[idx])
        for idx in top_indices
        if similarities[idx] > threshold
    ]

    if not responses:
        return fallback
    return responses, similarities.max()



In [67]:
# Chatbot function
def chatbot(document):
    """Run an interactive question/answer loop over ``document``.

    The document is split into sentences and embedded once up front. Each
    user query is then matched against those sentences and the best matches
    are printed together with their similarity scores. The loop ends when the
    user types ``exit`` or ``quit`` (case-insensitive, surrounding whitespace
    ignored), or when input ends or is interrupted.

    Args:
        document: Full text of the document to answer questions about.

    Returns:
        None. All interaction happens through ``input`` and ``print``.
    """
    sentences, sentence_embeddings = process_document(document)
    if not sentences:
        print("Error: No valid sentences found in the document.")
        return

    print("Chatbot: Hello! Ask me about the document (type 'exit'or 'quit' to quit).")
    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel should end the chat
            # cleanly instead of surfacing a traceback.
            print("\nChatbot: Goodbye!")
            break

        if user_input.lower() in ("exit", "quit"):
            print("Chatbot: Goodbye!")
            break
        if not user_input:
            # Nothing to match against; prompt again.
            continue

        response, max_similarity = get_best_response(user_input, sentence_embeddings, sentences)
        if isinstance(response, str):
            # A plain string is the "no relevant sentence" fallback message.
            print(f"Chatbot: {response} (Similarity: {max_similarity:.2f})")
        else:
            for sent, sim in response:
                print(f"Chatbot: {sent} (Similarity: {sim:.2f})")

# Run the chatbot
if __name__ == "__main__":
    chatbot(sample_document)

Chatbot: Hello! Ask me about the document (type 'exit'or 'quit' to quit).


You:  what is machine learning?


Chatbot: Categories of Machine Learning
Machine Learning encompasses several approaches, each suited to different types of problems:

Supervised Learning: (Similarity: 0.87)
Chatbot: What is Machine Learning?
Machine Learning is a subfield of AI that empowers computers to learn from data and improve over time without being explicitly programmed. (Similarity: 0.87)


You:  decision trees


Chatbot: Both fields require significant computational resources, raising concerns about energy consumption and accessibility. (Similarity: 0.61)
Chatbot: As researchers and engineers push the boundaries of these fields, ML and NLP will continue to drive innovation, making technology more intelligent, accessible, and human-centric. (Similarity: 0.60)


You:  quit


Chatbot: Goodbye!
