<a href="https://colab.research.google.com/github/Ganesh1Bachol/Polynomial-AI/blob/main/Question_answering_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from transformers import BertForQuestionAnswering, AutoTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import torch

# Downloading NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenizing text into words
    tokens = word_tokenize(text)

    # Removing punctuation and converting to lowercase
    tokens = [word.lower() for word in tokens if word.isalpha()]

    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Applying stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Joining the tokens back into string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

# Reading the context from a .txt file
document_path = '/content/Krishna.txt'
with open(document_path, 'r', encoding='utf-8') as file:
    context = file.read()

# Preprocessing the context
preprocessed_context = preprocess_text(context)

# Defining questions
questions = [
    "Krishna is worshiped as an incarnation of which hindu god?",
    "Krishna and his brother Balarama returned to Mathura to slay whom?"
]

# Loading the pre-trained BERT model and tokenizer
model = BertForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')
tokenizer = AutoTokenizer.from_pretrained('deepset/bert-base-cased-squad2')

# Processing each question
for question in questions:
    # Tokenizing input
    question = preprocess_text(question)  # Preprocess the question as well
    inputs = tokenizer(question, preprocessed_context, return_tensors='pt', truncation=True, padding=True)

    # Forward pass through the model
    outputs = model(**inputs)

    # Fetching start and end logits from the model output
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Finding the tokens with the highest start and end logits
    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits)

    # Converting token indices to actual text
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_index:end_index+1]))

    print(f"Question: {question}")
    print(f"Answer: {answer}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Question: krishna worship incarn hindu god
Answer: vishnu
Question: krishna brother balarama return mathura slay
Answer: [CLS]
