In [None]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Read one question per line from a text file and tag each with the same label
def load_data(file_path, label):
    """Load questions from a UTF-8 text file and pair each with ``label``.

    Each line of the file is treated as one question. Surrounding whitespace
    is stripped, and lines that are empty after stripping are skipped so that
    blank separator lines do not become empty training samples.

    Args:
        file_path: Path to the text file to read.
        label: Label assigned to every question read from this file.

    Returns:
        A ``(questions, labels)`` tuple of two lists of equal length:
        the stripped, non-empty lines and one copy of ``label`` per question.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the handle directly; no need to hold the raw lines in memory.
        stripped = (line.strip() for line in file)
        # Drop blank / whitespace-only lines: an empty string carries no text
        # features and would only add noise to the labelled data.
        questions = [text for text in stripped if text]
    labels = [label] * len(questions)
    return questions, labels

In [None]:
# Source files and the category label given to every question inside each one.
# Built from an ordered path -> label mapping; the result is a list of
# (path, label) tuples in the order written here.
file_paths = list({
    '/content/coding questions.txt': 'coding',
    '/content/ot.txt': 'llms',
    '/content/Bing Questions.txt': 'microsoft',
    '/content/ddsearch.txt': 'duckduckgo',
}.items())


In [None]:
# Pool the questions and labels from every source file into two parallel lists
all_questions, all_labels = [], []

for file_path, label in file_paths:
    questions, labels = load_data(file_path, label)
    # In-place concatenation keeps both lists index-aligned
    all_questions += questions
    all_labels += labels

In [None]:
# Hold out 20% of the pooled questions for evaluation; the fixed seed makes
# the split reproducible across runs.
# NOTE(review): no `stratify` argument is passed, so class proportions in the
# two splits are left to chance -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    all_questions,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Convert the raw question text into TF-IDF feature matrices.
# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the test split.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
# Fit a Multinomial Naive Bayes model on the TF-IDF training features
clf = MultinomialNB()
clf.fit(X=X_train_tfidf, y=y_train)


In [None]:
# Run the trained classifier over the held-out test features
y_pred = clf.predict(X=X_test_tfidf)

In [None]:
# Compare the predictions with the true test labels and report accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.99


In [None]:
# Example: classify a single new question with the fitted vectorizer and model
user_question = "java program to find the second largest element in an array"
# transform() takes an iterable of documents, so wrap the one question in a list
user_question_tfidf = tfidf_vectorizer.transform([user_question])
# One input document yields exactly one prediction; unpack it directly
[predicted_category] = clf.predict(user_question_tfidf)
print(f"The question is about: {predicted_category}")


The question is about: coding
