In [1]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


In [2]:
# Download the NLTK resources used by this notebook (cached locally after the first run).
# nltk.download() signals failure by returning False instead of raising, so the result is
# checked here; an unchecked failure would only surface later, when the corpus is first read.
# NOTE(review): 'punkt' is not used by any cell visible here (tokenising is done with
# str.split) — confirm whether it is still needed before removing it.
for resource in ('punkt', 'stopwords'):
  if not nltk.download(resource):
    raise RuntimeError(f"Failed to download NLTK resource '{resource}'")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Load NLTK's stop-word list for the configured language; preprocess_text removes
# these common words (e.g. "the", "a") from every question.
STOPWORD_LANGUAGE = 'english'
stop_words = stopwords.words(STOPWORD_LANGUAGE)

In [6]:
# Function to preprocess text
def preprocess_text(text, stopword_list=None):
  """Normalise a piece of text for TF-IDF feature extraction.

  The text is lowercased, every character that is neither alphanumeric nor
  whitespace is replaced by a space, stop words are dropped, and the remaining
  tokens are joined with single spaces.

  Args:
    text: The raw string to clean.
    stopword_list: Optional iterable of lowercase words to remove. When
      omitted, the notebook-level ``stop_words`` list is used.

  Returns:
    The cleaned text; an empty string if no tokens survive.
  """
  # Fall back to the notebook-level list only when the caller supplies none.
  if stopword_list is None:
    stopword_list = stop_words
  # A set gives constant-time membership tests instead of scanning a list per token.
  excluded = set(stopword_list)
  lowered = text.lower()
  # Replace punctuation with a space rather than deleting it: deleting fuses
  # "what's" into "whats" and "well-known" into "wellknown", which hides the
  # stop-word parts ("what", "s") from the filter below.
  cleaned = "".join(
    char if char.isalnum() or char.isspace() else " " for char in lowered
  )
  tokens = [word for word in cleaned.split() if word not in excluded]
  return " ".join(tokens)

In [7]:
# Toy training set: each record pairs a question with the answer label to learn.
qa_pairs = (
  ("What is the capital of France?", "Paris"),
  ("Who is the father of modern physics?", "Albert Einstein"),
  ("What is the tallest mountain in the world?", "Mount Everest"),
)
data = [{"question": prompt, "answer": reply} for prompt, reply in qa_pairs]

In [8]:
# Split the records into model inputs (cleaned question text) and target labels.
questions = list(map(preprocess_text, (record["question"] for record in data)))
answers = [record["answer"] for record in data]

In [9]:
# Learn a TF-IDF vocabulary from the cleaned questions and encode them in one step.
# The fitted vectorizer is kept so answer_question can encode new questions with it.
vectorizer = TfidfVectorizer()
question_features = vectorizer.fit_transform(raw_documents=questions)

In [10]:
# Fit a logistic-regression classifier mapping TF-IDF question vectors to answer labels.
# The seed is fixed so repeated runs build the same model.
RANDOM_SEED = 0
model = LogisticRegression(random_state=RANDOM_SEED)
model.fit(X=question_features, y=answers)

In [11]:
def answer_question(user_question, min_confidence=0.0, fallback=None):
  """Answer a free-text question with the trained classifier.

  The question is cleaned with preprocess_text, encoded with the fitted
  vectorizer and classified by model. The classifier can only ever return one
  of the answers it was trained on, so a question unrelated to the training
  data still receives one of those answers unless min_confidence is set.

  Args:
    user_question: The question as raw text.
    min_confidence: Minimum top-class probability (from model.predict_proba)
      needed to accept the prediction. The default 0.0 disables the check and
      always returns the model's prediction.
    fallback: Value returned when the top-class probability is below
      min_confidence.

  Returns:
    The predicted answer label, or fallback when the prediction is rejected.
  """
  # Apply the same cleaning that was used on the training questions.
  preprocessed_question = preprocess_text(user_question)
  # transform() (not fit_transform) reuses the vocabulary learned during training.
  question_vector = vectorizer.transform([preprocessed_question])
  if min_confidence > 0.0:
    # One row of class probabilities for the single question passed in.
    probabilities = model.predict_proba(question_vector)[0]
    if probabilities.max() < min_confidence:
      return fallback
  # predict() returns an array with one label per input row; take the only one.
  return model.predict(question_vector)[0]

In [12]:
# Demo query. This question is not in the training data, and the classifier can only
# choose among the three answers it was trained on, so the printed answer is one of those.
user_question = "What is the largest country in the world?"
answer = answer_question(user_question)
print(f"Question: {user_question}", f"Answer: {answer}", sep="\n")

Question: What is the largest country in the world?
Answer: Mount Everest
