In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
def load_data(file_path, label):
    """Read one question per line from a text file and label each of them.

    Args:
        file_path: Path of a UTF-8 encoded text file with one question per line.
        label: Category assigned to every question read from the file.

    Returns:
        A ``(questions, labels)`` tuple of two lists of equal length.
        ``questions`` holds the whitespace-stripped, non-empty lines in file
        order and ``labels`` repeats ``label`` once per question.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank or whitespace-only lines are skipped: they would otherwise
        # become empty documents that still carry a label.
        questions = [text for text in stripped_lines if text]
    labels = [label] * len(questions)
    return questions, labels

In [23]:
# Directory that holds the question files. Change this single value to run the
# notebook against data stored somewhere else.
DATA_DIR = '/content'

# One (file path, category label) pair per source file.
file_paths = [
    (f'{DATA_DIR}/coding questions.txt', 'coding'),
    (f'{DATA_DIR}/ot.txt', 'llms'),
    (f'{DATA_DIR}/Bing Questions.txt', 'microsoft'),
    (f'{DATA_DIR}/ddsearch.txt', 'duckduckgo'),
]

In [24]:
# Gather the questions and labels of every source file into two parallel lists.
all_questions, all_labels = [], []

for file_path, label in file_paths:
    questions, labels = load_data(file_path, label)
    all_questions += questions
    all_labels += labels

In [25]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    all_questions,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Upper bound on the number of vocabulary terms kept by the vectorizer.
MAX_TFIDF_FEATURES = 10000

tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
# The vocabulary and IDF weights are learned from the training questions only;
# the test questions are merely transformed with what was learned.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [27]:
# Seed the tree so that a fresh "Restart & Run All" builds the same model:
# scikit-learn permutes the candidate features at each split, so an unseeded
# tree can differ between runs. The seed matches the one used for the split.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_tfidf, y_train)


In [28]:
# Predict a category label for every question in the held-out test split.
y_pred = clf.predict(X_test_tfidf)

In [29]:
# Share of test questions whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 1.00


In [30]:
# Classify a single ad-hoc question with the fitted vectorizer and tree.
user_question = "python program to find the sum of two numbers"
# transform() works on a collection of documents, so the question is wrapped in a list.
user_question_tfidf = tfidf_vectorizer.transform([user_question])
# One input row yields one predicted label; pick it out of the result.
predictions = clf.predict(user_question_tfidf)
predicted_category = predictions[0]
print(f"The question is about: {predicted_category}")


The question is about: coding


In [31]:
# Classify a single ad-hoc question with the fitted vectorizer and tree.
user_question = "what is microsoft?"
# transform() works on a collection of documents, so the question is wrapped in a list.
user_question_tfidf = tfidf_vectorizer.transform([user_question])
# One input row yields one predicted label; pick it out of the result.
predictions = clf.predict(user_question_tfidf)
predicted_category = predictions[0]
print(f"The question is about: {predicted_category}")


The question is about: microsoft


In [32]:
# Classify a single ad-hoc question with the fitted vectorizer and tree.
user_question = "Which company holds the record for the world's fastest supercomputer?"
# transform() works on a collection of documents, so the question is wrapped in a list.
user_question_tfidf = tfidf_vectorizer.transform([user_question])
# One input row yields one predicted label; pick it out of the result.
predictions = clf.predict(user_question_tfidf)
predicted_category = predictions[0]
print(f"The question is about: {predicted_category}")


The question is about: duckduckgo
