In [None]:
import re
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
class NaiveBayesWSD:
    """Multinomial Naive Bayes classifier for word-sense disambiguation.

    Each training document is a sentence labelled with the sense in which
    the ambiguous word is used. The model stores per-sense word counts and
    sense priors, and scores a new sentence with add-one (Laplace) smoothed
    log-likelihoods.

    Attributes are plain dicts/sets so they can be inspected directly:
    ``vocab`` (set of known words), ``word_counts`` ({(word, sense): count}),
    ``class_probs`` ({sense: prior}), ``class_counts`` ({sense: number of
    training documents}) and ``sense_totals`` ({sense: number of word tokens
    seen with that sense}).
    """

    def __init__(self):
        self.vocab = set()
        self.word_counts = {}
        self.class_probs = {}
        # Cumulative document count per sense, kept so that repeated calls
        # to train() produce priors consistent with the accumulated counts.
        self.class_counts = {}
        # Total number of word tokens observed per sense; this is the
        # denominator of P(word | sense).
        self.sense_totals = {}
        # Stop-word set, loaded lazily on the first preprocess() call.
        self._stop_words = None

    def preprocess(self, sentence):
        """Tokenize *sentence* into lowercase content words.

        Stop words are removed, as are tokens that contain no alphanumeric
        character (i.e. pure punctuation such as "." or ",").

        Returns:
            A list of token strings, in sentence order.
        """
        # Build the stop-word set once instead of on every call.
        if getattr(self, "_stop_words", None) is None:
            self._stop_words = set(stopwords.words('english'))
        stop_words = self._stop_words

        tokens = word_tokenize(sentence.lower())
        return [
            token
            for token in tokens
            if token not in stop_words and any(ch.isalnum() for ch in token)
        ]

    def train(self, training_data):
        """Update the model from an iterable of ``(document, sense)`` pairs.

        May be called more than once; counts accumulate and the priors in
        ``class_probs`` are recomputed from all documents seen so far.
        """
        for doc, sense in training_data:
            self.class_counts[sense] = self.class_counts.get(sense, 0) + 1

            for word in self.preprocess(doc):
                self.vocab.add(word)
                key = (word, sense)
                self.word_counts[key] = self.word_counts.get(key, 0) + 1
                self.sense_totals[sense] = self.sense_totals.get(sense, 0) + 1

        total_docs = sum(self.class_counts.values())
        for sense, count in self.class_counts.items():
            self.class_probs[sense] = count / total_docs

    def calculate_probability(self, word, sense):
        """Return the Laplace-smoothed estimate of P(word | sense).

        Computed as ``(count(word, sense) + 1) / (tokens(sense) + |vocab|)``
        so that the probabilities of all vocabulary words sum to 1 for a
        given sense.

        Raises:
            ZeroDivisionError: if the vocabulary is empty and no words have
                been counted for *sense* (e.g. the model is untrained).
        """
        alpha = 1
        word_count = self.word_counts.get((word, sense), 0) + alpha

        # The denominator is the number of word tokens seen with this sense
        # (NOT the count of this word across senses).
        sense_total = self.sense_totals.get(sense)
        if sense_total is None:
            # No cached total (e.g. counts were assigned directly rather
            # than through train()); derive it from word_counts.
            sense_total = sum(
                count
                for (_, counted_sense), count in self.word_counts.items()
                if counted_sense == sense
            )

        return word_count / (sense_total + alpha * len(self.vocab))

    def classify(self, sentence):
        """Return the most probable sense for *sentence*.

        Words that never occurred in training are ignored: they carry no
        evidence for any sense, and smoothing them would only penalise
        senses that happen to have more training text.

        Raises:
            ValueError: if the model has not been trained.
        """
        if not self.class_probs:
            raise ValueError("cannot classify: the model has not been trained")

        words = [word for word in self.preprocess(sentence) if word in self.vocab]

        probabilities = {}
        for sense, prior in self.class_probs.items():
            # Sum logs rather than multiply probabilities to avoid underflow.
            log_prob = math.log(prior)
            for word in words:
                log_prob += math.log(self.calculate_probability(word, sense))
            probabilities[sense] = log_prob

        return max(probabilities, key=probabilities.get)

# Example usage
if __name__ == "__main__":
    # Labelled examples of the ambiguous word "bank", as (sentence, sense) pairs.
    training_data = [
        ("The bank is closed.", "finance"),
        ("He sat on the bank of the river", "river"),
        ("The bank gave me a loan", "finance"),
        ("The fish swam along the river bank", "river"),
        (
            "The professors said that the bank for referring will be give soon ",
            "academic",
        ),
        ("I need to deposit my money in the bank", "finance"),
        (
            "The pond bank has a lot of flowers growing there",
            "river",
        ),
        (
            "The students were given previous year questions bank to prepare",
            "academic",
        ),
    ]

    # Fit a fresh classifier on the labelled examples.
    model = NaiveBayesWSD()
    model.train(training_data)

    # Read one sentence from standard input and report its predicted sense.
    test_sentence = input("Enter the sentence")
    predicted_sense = model.classify(test_sentence)
    print("Predicted sense:", predicted_sense)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter the sentenceThe students answered with the question bank given
Predicted sense: academic
