## TEXT PREPROCESSING

In [None]:
pip install pandas

Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-win_amd64.whl.metadata (18 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting numpy>=1.20.3 (from pandas)
  Using cached numpy-1.24.4-cp38-cp38-win_amd64.whl.metadata (5.6 kB)
Downloading pandas-2.0.3-cp38-cp38-win_amd64.whl (10.8 MB)
   ---------------------------------------- 0.0/10.8 MB ? eta -:--:--
   ------- -------------------------------- 2.1/10.8 MB 16.8 MB/s eta 0:00:01
   ------------------------------------- -- 10.2/10.8 MB 29.0 MB/s eta 0:00:01
   -------------------------------------- - 10.5/10.8 MB 27.3 MB/s eta 0:00:01
   -------------------------------------- - 10.5/10.8 MB 27.3 MB/s eta 0:00:01
   ---------------------------------------  10.7/10.8 MB 12.4 MB/s eta 0:00:01
   ---------------------------------------- 10.8/10.8 MB 9.8 MB/s eta 0:0

In [10]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.11.6-cp38-cp38-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 13.2 MB/s eta 0:00:00
Using cached regex-2024.11.6-cp38-cp38-win_amd64.whl (274 kB)
Using cached click-8.1.7-py3-none-any.whl (97 kB)
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, joblib, click, nltk
Successfully installed click-8.1.7 joblib-1.4.2 nltk-3.9.1 regex-202

In [13]:
pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp38-cp38-win_amd64.whl.metadata (8.2 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.10.1-cp38-cp38-win_amd64.whl.metadata (58 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.5-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.0-cp38-cp38-win_amd64.whl.metadata (6.5 kB)
Downloading gensim-4.3.3-cp38-cp38-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   - -------------------------------------- 1.0/24.0 MB 10.1 MB/s eta 0:00:03
   --- ------------------------------------ 2.1/24.0 MB 5.6 MB/s eta 0:00:04
   --- ------------------------------------ 2.1/24.0 MB 5.6 MB/s eta 0:00:04
   ----- ---------------------------------- 3.1/24.0 MB 4.5 MB/s eta 0:00:05
   ------ --------------------------------- 4.2/24.0 MB 4.3 MB/s eta 0:00:05
   -------- ------------------------------- 5.0/24.0 MB 4

In [None]:
import pandas as pd
import re
import os
os.environ['NLTK_DATA'] = 'C:/nltk_data'
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import nltk
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
import nltk
nltk.data.path.append('C:/nltk_data')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')


# Load the data
ai_df = pd.read_csv("AI.csv")


# rename columns
ai_df.rename(columns={"Question":"question","Answer":"answer"},inplace=True)

# Concat df
df = pd.concat([ai_df])

# Copy df to a new one
cleaned_df = df.copy()

# Function to preprocess text
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove special characters and line breaks
    text = re.sub(r'([^\s\w_])+', ' ', text)
    text = re.sub(r'[\n\r]', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', ' ', text)
    return text

# Apply preprocessing to 'Question' column
cleaned_df['question_processed'] = cleaned_df['question'].apply(preprocess_text)

# Function to tokenize text
def tokenize_text(text):
    text = text.lower()
    tokenizer = word_tokenize('\w+|\$[\d\.]+|\S+')
    words = word_tokenize(text)
    return words

# Function to remove stopwords
def remove_stopwords(words):
    stop = set(stopwords.words('english'))
    filtered_words = [w for w in words if w not in stop]
    return filtered_words

# Function to get Part of Speech tags
def get_part_of_speech_tags(word):
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return tag_dict.get(tag, wordnet.NOUN)

# Function to perform lemmatization
def lemmatize_text(words):
    lemmatizer = WordNetLemmatizer()
    lem_words = [lemmatizer.lemmatize(word, get_part_of_speech_tags(word)) for word in words]
    return lem_words


# Preprocess 'Question' column
cleaned_df['question_processed'] = cleaned_df['question_processed'].apply(tokenize_text)
cleaned_df['question_processed'] = cleaned_df['question_processed'].apply(remove_stopwords)
cleaned_df['question_processed'] = cleaned_df['question_processed'].apply(lemmatize_text)

# Convert tokens back to string
cleaned_df['question_processed'] = cleaned_df['question_processed'].apply(lambda x: ' '.join(x))

cleaned_df.head(4)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Kedhar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kedhar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kedhar\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Kedhar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kedhar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kedhar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,question,answer,question_processed
0,Who did the first work generally recognized as...,Warren McCulloch and Walter Pitts (1943).\n,first work generally recognize ai
1,What sources was drawn on the formation of the...,knowledge of the basic physiology and function...,source drawn formation first work generally re...
2,Who created the Hebbian learning rule?,Donald Hebb (1949).\n,create hebbian learn rule
3,When the first neural network is built?,1950.\n,first neural network built


## TRAINING LDA MODEL AND GETTING DICTIONARY

In [19]:
# Function to convert text to words
def text_to_words(texts):
    return [[word for word in simple_preprocess(str(text), deacc=True)]
            for text in texts]

text = cleaned_df.question_processed.values.tolist()
text_words = text_to_words(text)
dict_word = corpora.Dictionary(text_words)
corpus_vec = [dict_word.doc2bow(text) for text in text_words]

# Train LDA model
lda_model = gensim.models.LdaModel(corpus=corpus_vec, id2word=dict_word, num_topics=3, iterations=20)

lda_model.save('lda_model')

In [20]:
lda_model_1 = gensim.models.LdaModel.load('lda_model')
topics = lda_model_1.show_topic(2, topn=3)
print(topics)

[('problem', 0.027484434), ('agent', 0.021622017), ('model', 0.014423209)]


TOPIC NAME EXTRACTION

In [None]:
# Function to extract topics using LDA model and return topic numbers
def extract_topics(text):
    text_words = text_to_words([text])
    # Create corpus
    corpus_vec = [dict_word.doc2bow(words) for words in text_words]
    # Get topic distribution
    topics = lda_model.get_document_topics(corpus_vec[0])
    # Extract most probable topic
    topic_num = max(topics, key=lambda x: x[1])[0]
    return topic_num


# Function to get topic names based on representative words
def infer_topic_names(lda_model, dict_word, num_words=3):
    topic_names = {}
    for i in range(lda_model.num_topics):
        #print("i is", i)
        words = lda_model.show_topic(i, topn=num_words)
        #print("words is", words)
        topic_names[i] = ', '.join([word for word, _ in words])
        #print("topic names ",topic_names)
    return topic_names

DISPLAYING OUTPUT FOR TRAINED LDA MODELS

In [25]:
# Get inferred topic names
topic_names = infer_topic_names(lda_model, dict_word)

# Apply topic extraction function to each question
cleaned_df['Topic_Num'] = cleaned_df['question_processed'].apply(extract_topics)

# Map topic numbers to inferred topic names
cleaned_df['Topic'] = cleaned_df['Topic_Num'].map(topic_names)

# Drop the 'Topic_Num' column if needed
cleaned_df = cleaned_df.drop(columns=['Topic_Num'])
pd.set_option('display.max_colwidth', 300)
cleaned_df.head(20)

Unnamed: 0,question,answer,question_processed,Topic
0,Who did the first work generally recognized as AI?,Warren McCulloch and Walter Pitts (1943).\n,first work generally recognize ai,"search, algorithm, first"
1,What sources was drawn on the formation of the first work generally recognized as AI?,knowledge of the basic physiology and function of neurons in the brain; a formal analysis of propositional logic due to Russell and Whitehead; and Turing's theory of computation.\n,source drawn formation first work generally recognize ai,"search, algorithm, first"
2,Who created the Hebbian learning rule?,Donald Hebb (1949).\n,create hebbian learn rule,"search, algorithm, first"
3,When the first neural network is built?,1950.\n,first neural network built,"search, agent, environment"
4,What is the first neural network called?,The SNARC.\n,first neural network call,"search, agent, environment"
5,"""Who introduced the Turing test",machine learning,introduce turing test,"problem, agent, model"
6,Alan Turing prefer what method on creating human-level Al?,He prefer to develop learning algorithms and then teach the machine rather than by programming its intelligence by hand.\n,alan turing prefer method create human level al,"problem, agent, model"
7,Who presented the Logic Theorist (LT)?,Allen Newell and Herbert Simon from Carnegie Tech.\n,present logic theorist lt,"search, agent, environment"
8,What does General Problem Solver (GPS) is designed for?,GPS was designed from the start to imitate human problem-solving protocols.\n,general problem solver gps design,"problem, agent, model"
9,Which model was robably the first program to embody the “thinking humanly” approach?,General Problem Solver (GPS).\n,model robably first program embody think humanly approach,"search, agent, environment"


In [28]:
pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.3.2-cp38-cp38-win_amd64.whl (9.3 MB)
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   ---- ----------------------------------- 1.0/9.3 MB 10.1 MB/s eta 0:00:01
   ---- ----------------------------------- 1.0/9.3 MB 10.1 MB/s eta 0:00:01
   --------------------- ------------------ 5.0/9.3 MB 8.9 MB/s eta 0:00:01
   ---------------------------------------  9.2/9.3 MB 12.4 MB/s eta 0:00:01
   ---------------------------------------- 9.3/9.3 MB 11.5 MB/s eta 0:00:00
Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.3.2 threadpoolctl-3.5.0
Note: you may need to restart the kernel to use updated packages.


EMBEDDED PROCESSED QUESTION & STOREIN MONGODB

In [29]:
# Turn each question into a vector
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a list of cleaned question
question_documents = cleaned_df['question_processed'].values

# Declare rge tfidf Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1153)
tfidf_matrix = tfidf_vectorizer.fit_transform(question_documents)

In [30]:
import pickle
with open('dictionary.pkl', 'wb') as f:
    pickle.dump(dict_word, f)

# Save Tfidf Vectorizer model
with open('tfidf_vectorizer.pkl','wb') as f:
    pickle.dump(tfidf_vectorizer, f)

In [31]:
# Turn each matrix into array
question_vector = tfidf_matrix.toarray()

# Create list of answer
answer = cleaned_df['answer'].values

# Create list of topic words
topic_words = cleaned_df['Topic'].values

data_list = []
for i in range(0,len(answer)):
    data = {}
    data['embed_question']=question_vector[i].tolist()
    data['answer']=answer[i]
    data['topic_words']=topic_words[i]
    data_list.append(data)

In [34]:
print(data_list)

[{'embed_question': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.364330480930185, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [36]:
pip install pymongo





In [37]:
# Initiate MongoDB Instance
import pymongo
from pymongo import MongoClient

# Connect to the MongoDB instance

try:
    mongodb_client = pymongo.MongoClient("mongodb://localhost:27017/",serverSelectionTimeoutMS=5000)
except:
    print('error occured')

In [38]:
# Create a new database and a new connection
db = mongodb_client['bt']
collection = db['data']
# Insert new data
insert_results = collection.insert_many(data_list)

# print result
print("Inserted document ID:", insert_results.inserted_ids)

Inserted document ID: [ObjectId('674e93f3fbb9da52e4d1293c'), ObjectId('674e93f3fbb9da52e4d1293d'), ObjectId('674e93f3fbb9da52e4d1293e'), ObjectId('674e93f3fbb9da52e4d1293f'), ObjectId('674e93f3fbb9da52e4d12940'), ObjectId('674e93f3fbb9da52e4d12941'), ObjectId('674e93f3fbb9da52e4d12942'), ObjectId('674e93f3fbb9da52e4d12943'), ObjectId('674e93f3fbb9da52e4d12944'), ObjectId('674e93f3fbb9da52e4d12945'), ObjectId('674e93f3fbb9da52e4d12946'), ObjectId('674e93f3fbb9da52e4d12947'), ObjectId('674e93f3fbb9da52e4d12948'), ObjectId('674e93f3fbb9da52e4d12949'), ObjectId('674e93f3fbb9da52e4d1294a'), ObjectId('674e93f3fbb9da52e4d1294b'), ObjectId('674e93f3fbb9da52e4d1294c'), ObjectId('674e93f3fbb9da52e4d1294d'), ObjectId('674e93f3fbb9da52e4d1294e'), ObjectId('674e93f3fbb9da52e4d1294f'), ObjectId('674e93f3fbb9da52e4d12950'), ObjectId('674e93f3fbb9da52e4d12951'), ObjectId('674e93f3fbb9da52e4d12952'), ObjectId('674e93f3fbb9da52e4d12953'), ObjectId('674e93f3fbb9da52e4d12954'), ObjectId('674e93f3fbb9da52e

## TEST

In [45]:
def getTopics(userQuestion):
    # Load the dictionary from the file
    lda_model = gensim.models.LdaModel.load(r'lda_model')
    with open(r'dictionary.pkl', 'rb') as f:
        dict_word = pickle.load(f)

    new_data = pd.DataFrame(data=[userQuestion], columns=['Question'])

    #new_data.head()
    new_data['Question_processed'] = new_data['Question'].apply(preprocess_text)
    new_data['Question_processed'] = new_data['Question_processed'].apply(tokenize_text)
    new_data['Question_processed'] = new_data['Question_processed'].apply(remove_stopwords)
    new_data['Question_processed'] = new_data['Question_processed'].apply(lemmatize_text)
    new_data['Question_processed'] = new_data['Question_processed'].apply(lambda x: ' '.join(x))

    new_text = new_data.Question_processed.values.tolist()
    new_text_words = text_to_words(new_text)
    new_corpus_vec = [dict_word.doc2bow(text) for text in new_text_words]

    topic_names = infer_topic_names(lda_model, dict_word)

    # Infer topics for the new data
    new_topics = [max(lda_model.get_document_topics(doc), key=lambda x: x[1])[0] for doc in new_corpus_vec]

    # Interpret the inferred topics for the new data
    new_data['Topic_Num'] = new_topics
    new_data['Topic'] = new_data['Topic_Num'].map(topic_names)

    user_output = new_data[['Question', 'Question_processed', 'Topic']]
    # Extract the 'Topic' column from user_output
    topics = user_output['Topic']

    topics_list = topics.astype(str).tolist()

    # Join the elements of the list into a single string
    topics_string = ' '.join(topics_list)

    return topics_string

## RETRIEVE MOST RELEVENT ANSWER( BASED ON USER INPUT)

In [None]:
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client['bt']
collection = db['data']

# Function to calculate cosine similarity
def compute_cosine_similarity(query_vector, stored_vectors):
    return cosine_similarity([query_vector], stored_vectors)

# Load the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    tfidf_vectorizer = pickle.load(file)

# Input question
question = "utility-based function"

# Vectorize the input question using the loaded TF-IDF vectorizer
vec_question = tfidf_vectorizer.transform([question]).toarray().flatten()

# Retrieve stored vectors from the MongoDB collection (assuming each document has an 'embed_question' field with vector data)
stored_documents = list(collection.find({}, {'_id': 0, 'embed_question': 1, 'answer': 1, 'category': 1}))  # Convert cursor to list
stored_vectors = []
answers = []
categories = []

# Extract stored vectors, answers, and categories
for doc in stored_documents:
    stored_vectors.append(doc['embed_question'])
    answers.append(doc['answer'])
    # Check if 'category' exists, otherwise append 'N/A' or another default value
    category = doc.get('category', 'N/A')  # 'N/A' is the default value if 'category' is missing
    categories.append(category)

# Convert stored vectors to a numpy array for cosine similarity calculation
stored_vectors = np.array(stored_vectors)

# Compute cosine similarity between the query vector and stored vectors
similarities = compute_cosine_similarity(vec_question, stored_vectors)

# Find the index of the most similar document
most_similar_index = np.argmax(similarities)

# Retrieve the answer, category, and similarity score for the most similar document
answer = answers[most_similar_index]
category = categories[most_similar_index]
similarity_score = similarities[0][most_similar_index]

# Output the result
print(f"Answer: {answer}")
print(f"Category: {category}")
print(f"Cosine Similarity Score: {similarity_score}")


Answer: A utility function defines the final numeric value to player p when the game ends in terminal state s.

Category: N/A
Cosine Similarity Score: 1.0000000000000002
