In [1]:
import wikipediaapi
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import faiss
from gensim.models import Word2Vec
import re
import string
import csv
import json

### Fetching text from Wikipedia and Preprocessing

In [2]:

# Initialize a Wikipedia API client
# NOTE(review): the string passed here is the URL of the User-Agent policy page, not a
# descriptive user agent for this project -- consider replacing it with "<app>/<version> (<contact>)".
wiki_wiki = wikipediaapi.Wikipedia("https://meta.wikimedia.org/wiki/User-Agent_policy")

# Define a list of page titles you want to fetch
page_titles = ["Artificial Intelligence", "Machine Learning", "Deep Learning", "Natural Language Processing","Greeting","Hi"]

# Collect the text of every page that exists
page_texts = []
for page_title in page_titles:
    # Fetch the page content
    page = wiki_wiki.page(page_title)

    # Check if the page exists
    if page.exists():
        # Keep the full text content of the page
        page_texts.append(page.text)
    else:
        print(f"Wikipedia page '{page_title}' does not exist.")

# Join the pages with a newline. Plain `+=` concatenation adds no separator, so the last
# paragraph of one page would be glued to the first paragraph of the next page and the
# downstream paragraph splitter (which splits on '\n') would treat them as one chunk.
concatenated_content = "\n".join(page_texts)

# Print the concatenated content as a single string
print("Concatenated Wikipedia Pages:")
print(concatenated_content)

Concatenated Wikipedia Pages:
Artificial intelligence (AI) is the intelligence of machines or software, as opposed to the intelligence of humans or animals. It is also the field of study in computer science that develops and studies intelligent machines. "AI" may also refer to the machines themselves.
AI technology is widely used throughout industry, government and science. Some high-profile applications are: advanced web search engines (e.g., Google Search), recommendation systems (used by YouTube, Amazon, and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Waymo), generative or creative tools (ChatGPT and AI art), and competing at the highest level in strategic games (such as chess and Go).Artificial intelligence was founded as an academic discipline in 1956. The field went through multiple cycles of optimism followed by disappointment and loss of funding, but after 2012, when deep learning surpassed all previous AI techniques, there was a vast

In [3]:
def preprocess_wikipedia_text(wikipedia_text, min_length=80):
    """Split raw page text into cleaned paragraphs.

    The text is split on newline characters, each piece is stripped of
    leading/trailing whitespace, and only pieces strictly longer than
    ``min_length`` characters are kept (this drops blank lines and short
    lines such as section headings).

    Args:
        wikipedia_text: The raw text to split (a single string).
        min_length: A paragraph is kept only if its stripped length is
            greater than this value. Defaults to 80.

    Returns:
        A list of stripped paragraph strings, in their original order.
    """
    stripped_lines = (line.strip() for line in wikipedia_text.split('\n'))
    return [paragraph for paragraph in stripped_lines if len(paragraph) > min_length]

# Example usage:
# NOTE: despite the name, each element is a whole paragraph string, not a token.
# The name is kept because the knowledge-base cell further down reads `cleaned_tokens`.
# (The former `"".join(i)` pass only re-joined the characters of each string into the
# same string, so it has been removed.)
cleaned_tokens = preprocess_wikipedia_text(concatenated_content)
cleaned_tokens[:2]

['Artificial intelligence (AI) is the intelligence of machines or software, as opposed to the intelligence of humans or animals. It is also the field of study in computer science that develops and studies intelligent machines. "AI" may also refer to the machines themselves.',
 'AI technology is widely used throughout industry, government and science. Some high-profile applications are: advanced web search engines (e.g., Google Search), recommendation systems (used by YouTube, Amazon, and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Waymo), generative or creative tools (ChatGPT and AI art), and competing at the highest level in strategic games (such as chess and Go).Artificial intelligence was founded as an academic discipline in 1956. The field went through multiple cycles of optimism followed by disappointment and loss of funding, but after 2012, when deep learning surpassed all previous AI techniques, there was a vast increase in funding and

### Knowledge Base with Wiki and FAQ data

In [4]:
#loading the sample FAQ data (one "question<TAB>answer" pair per line)
# NOTE: the next cell re-initializes `questions` and `answers` to empty lists, so the
# pairs loaded here do not end up in the knowledge base as the notebook stands.
questions = []
answers = []
with open("chatbot dataset.txt", 'r') as f:
    for line in f:
        # Drop the line terminator so it does not end up inside the answer text
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            # Blank or malformed line (no tab): skip it instead of raising IndexError
            continue
        questions.append(fields[0])
        answers.append(fields[1])

In [5]:
# Initialize an empty list to store questions and answers
questions = []
answers = []

# Path to the CSV file containing FAQ data
csv_file_path = "Mental_Health_FAQ.csv"  # Replace with your actual file path

# Open and read the CSV file
with open(csv_file_path, "r", newline="",encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)

    # Skip the header row up front. Dropping element 0 *after* filtering would remove a
    # real question/answer pair whenever the header itself does not have 3 columns.
    # (Assumes the file starts with a header row, as the original `[1:]` slicing did.)
    next(reader, None)

    for row in reader:
        # Only well-formed rows (id, question, answer) are kept
        if len(row) == 3:
            _question_id, question, answer = row
            questions.append(question)
            answers.append(answer)

In [6]:
# Specify the path to your .txt file
file_path = "Aadhar_Faq.txt"

# Read the contents of the file (JSON text is UTF-8, so do not rely on the platform default)
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

# Parse the JSON text: a list of objects with "question" and "answer" keys
faq_data = json.loads(text_data)

# Build both columns first, so an item with a missing key raises before either list is
# modified and `questions` / `answers` can never get out of step.
faq_questions = [item["question"] for item in faq_data]
faq_answers = [item["answer"] for item in faq_data]

# Append the FAQ entries to the lists built by the previous cell.
# NOTE: re-running this cell on its own appends the same entries again.
questions.extend(faq_questions)
answers.extend(faq_answers)

In [7]:
# Show the size of both lists; they are paired by position below, so the two numbers should match
len(answers),len(questions)

(121, 121)

In [8]:
# Question -> answer lookup; if a question text occurs more than once, the last answer wins
qa_dict = {faq_question: faq_answer for faq_question, faq_answer in zip(questions, answers)}

In [9]:
# Minimal text normalisation used for FAQ text and user queries
def preprocess(text):
    """Lower-case ``text`` and split it on whitespace.

    Returns the resulting list of tokens (an empty list for empty or
    whitespace-only input).
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

# Tokenize every question and answer.
# Rebinding the loop variable (`question = preprocess(question)`) only changes the local
# name and throws the result away, so the tokens are collected into new lists instead.
# `questions` / `answers` are intentionally left as raw strings: the knowledge base and
# the evaluation below both rely on the original answer text.
question_tokens = [preprocess(question) for question in questions]
answer_tokens = [preprocess(answer) for answer in answers]

In [10]:
# Knowledge base = Wikipedia paragraphs first, followed by the FAQ answers (a new list)
knowledge_base = [*cleaned_tokens, *answers]

### Vectorizing with TF-IDF

In [11]:
# Work on a shallow copy of the knowledge base entries
knowledge_base_text = list(knowledge_base)

# Fit a TF-IDF model on the entries and keep the result as a dense numpy array
# (one row per knowledge base entry, one column per vocabulary term)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(knowledge_base_text).toarray()

### BERT-based Embeddings

In [13]:
import torch
from transformers import BertTokenizer, BertModel

knowledge_base_text = [text for text in knowledge_base]
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Encode the knowledge base using BERT embeddings
token_level_embeddings = []
sentence_level_embeddings = []
for text in knowledge_base_text:
    # Calling the tokenizer (instead of `.encode`) also returns the attention mask.
    # Every text is padded to 128 tokens, so the mask must be passed to the model;
    # otherwise BERT attends to the [PAD] positions and the embeddings of the real
    # tokens are distorted.
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",  # tensors already carry a batch dimension of 1
    )
    with torch.no_grad():
        # First output = hidden state of every token: (1, 128, hidden_size)
        hidden_states = model(**encoded)[0]

        # Masked mean over the token axis -> one vector per text: (1, hidden_size).
        # The mask always contains at least the special tokens, so the divisor is > 0.
        mask = encoded["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        pooled = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1)

    token_level_embeddings.append(hidden_states.numpy())
    sentence_level_embeddings.append(pooled.numpy())

# Token-level embeddings, same layout as before: (num_texts, 128, hidden_size)
knowledge_base_embeddings = np.vstack(token_level_embeddings)

# One vector per text, (num_texts, hidden_size): this 2-D float32 array is the form a
# vector index needs; the 3-D token-level array above cannot be indexed directly.
knowledge_base_sentence_embeddings = np.vstack(sentence_level_embeddings)

###  FAISS INDEX

In [14]:
# Build a Faiss index using the dense TF-IDF matrix
d = tfidf_matrix.shape[1]  # Dimensionality of the vectors

index = faiss.IndexFlatL2(d)  # L2 distance index (you can choose other indexes based on your needs)

# Add vectors to the index.
# FAISS stores float32 vectors, while TfidfVectorizer produces float64 by default, so the
# conversion is made explicit (and C-contiguous) instead of depending on whether the
# installed FAISS Python wrapper converts the input for us.
index.add(np.ascontiguousarray(tfidf_matrix, dtype=np.float32))

In [15]:
# Define a function to query the knowledge base
def query_knowledge_base(question, k=1):
    """Return the knowledge base entries closest to ``question``.

    The question is normalised with ``preprocess``, vectorised with the fitted
    TF-IDF ``vectorizer`` and looked up in the FAISS ``index``.

    Args:
        question: The user's question as a plain string.
        k: Number of nearest neighbours to request from the index.

    Returns:
        A list of at most ``k`` entries from ``knowledge_base``, in the order
        returned by ``index.search``. Fewer than ``k`` entries are returned
        when the index holds fewer than ``k`` vectors.
    """
    # Vectorize and preprocess the user question
    normalized_question = " ".join(preprocess(question))
    question_vector = vectorizer.transform([normalized_question]).toarray()

    # FAISS expects a contiguous float32 matrix; the TF-IDF output is float64 by default
    question_vector = np.ascontiguousarray(question_vector, dtype=np.float32)

    # Perform a nearest-neighbor search in the Faiss index
    _distances, neighbor_ids = index.search(question_vector, k)

    # FAISS fills missing results with label -1; skip those, otherwise
    # `knowledge_base[-1]` would silently return the last entry as a "match".
    top_k_entries = [knowledge_base[i] for i in neighbor_ids[0] if i >= 0]

    return top_k_entries

### Queries

In [235]:
# Example query: fetch the single closest knowledge base entry
user_question = "How can I get the details corrected after 96 hours?"
top_k_entries = query_knowledge_base(question=user_question, k=1)

# List the hits, numbered from 1
for i, entry in enumerate(top_k_entries):
    print(f"Entry {i+1}: {entry}")

Entry 1: If you are not able to get your details corrected in 96 hours window, you can get your details updated once your Aadhaar is generated.


In [17]:
# Example query: fetch the five closest knowledge base entries
user_question = "what are the ke aspects Natural language processing"
top_k_entries = query_knowledge_base(question=user_question, k=5)

# List the hits, numbered from 1
for i, entry in enumerate(top_k_entries):
    print(f"Entry {i+1}: {entry}")

Entry 1: Challenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.
Entry 2: Up to the 1980s, most natural language processing systems were based on complex sets of hand-written rules.  Starting in the late 1980s, however, there was a revolution in natural language processing with the introduction of machine learning algorithms for language processing.  This was due to both the steady increase in computational power (see Moore's law) and the gradual lessening of the dominance of Chomskyan theories of linguistics (e.g. transformational grammar), whose theoretical underpinnings discouraged the sort of corpus linguistics that underlies the machine-learning approach to language processing.
Entry 3: As an example, George Lakoff offers a methodology to build natural language processing (NLP) algorithms through the perspective of cognitive science, along with the findings of cognitive linguistics, with two 

In [249]:
# Example query: a single-word question, closest entry only
user_question = "Greeting"
top_k_entries = query_knowledge_base(question=user_question, k=1)

# List the hits, numbered from 1
for i, entry in enumerate(top_k_entries):
    print(f"Entry {i+1}: {entry}")

Entry 1: A spoken greeting or verbal greeting is a customary or ritualised word or phrase used to introduce oneself or to greet someone.  Greeting habits are highly culture- and situation-specific and may change within a culture depending on social status.


In [204]:
# Evaluate retrieval on a slice of the FAQ pairs.
# NOTE: the expected answers are themselves entries of the knowledge base, so this
# measures how often a question retrieves its own answer (a top-k hit rate), not how
# well the system answers unseen questions.
EVAL_TOP_K = 3  # a question counts as correct if its answer is among this many results

# Building a dict collapses repeated question texts, so the total is the dict size
qa_dict2 = dict(zip(questions[75:100],answers[75:100]))
correct_responses = 0
total_responses = len(qa_dict2)

# Loop through the dataset and evaluate the chatbot
for question,answer in qa_dict2.items() :
    user_question = question
    expected_answer = answer

    # Query the chatbot for the EVAL_TOP_K closest entries
    top_k_answers = query_knowledge_base(user_question, k=EVAL_TOP_K)

    # Check if the expected answer is among the retrieved entries
    if expected_answer in top_k_answers:
        correct_responses += 1

# Calculate accuracy (an empty evaluation slice yields 0 instead of ZeroDivisionError)
accuracy = (correct_responses / total_responses) * 100 if total_responses else 0.0

# Print the accuracy
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 92.00%
