In [2]:
import  wikipediaapi
import pickle
import re
import nltk

from nltk.corpus import stopwords


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\E_Hom\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [35]:
class NaiveBayesClassifier:
    def __init__(self):
        self.classes = []
        self.class_probabilities = {}
        self.word_counts = {}
        self.word_probabilities = {}

    def load_data(self, train_domains, test_domains):

      # Initialize the Wikipedia API
      wiki = wikipediaapi.Wikipedia('en')

    # Define the domains and titles of the documents to download
      domains = {
        'planes': ['Aircraft', 'Boeing 747', 'Airbus A380', 'Wright Brothers', 'Air traffic control', 'Jet engine',
    'Propeller', 'Glider', 'Flight simulator', 'Military aircraft'],
        'tech': ['Machine_learning', 'Data mining', 'Artificial intelligence', 'Neural network', 'Deep learning', 'Cybersecurity',
    'Internet of Things', 'Augmented Reality', 'Cryptocurrency', 'Virtual Reality']
    }


      # Download the training documents
      training_data = []
      for domain in train_domains:
          for title in domains[domain]:
              page = wiki.page(title)
              if page.exists():
                  text = page.text[:2000]  # take only the first 2000 characters 
                  training_data.append((text, domain))
              if len(training_data) >= 5 * len(train_domains):  # Stop downloading when 5 docs per domain are downloaded
                  break
          print(f"Loaded {len(training_data)} training documents for '{domain}' domain")


      # Download the test documents
      test_data = []
      for domain in test_domains:
          title = domains[domain][0]  # Choose the first title as the test document
          page = wiki.page(title)
          if page.exists():
              text = page.text[:1000]  # take only the first 10000 characters (about 100 words)
              test_data.append((text, domain))
          else:
              print(f"Error: Could not find test document for '{domain}' domain")
    
      return training_data, test_data


    def class_prob(self, doc_label):
       
             probability = self.class_probabilities[doc_label]
            
             return self.class_probabilities[doc_label]
    

    def preprocess(self, doc):
      # remove punctuation and lowercase all words
      doc = re.sub(r'[^\w\s]', ' ', doc)
      doc = doc.lower()
      # remove digits
      doc = re.sub(r'\d+', '', doc)
      # remove stop words
      stop_words = set(stopwords.words('english'))
      words = doc.split()
      words = [word for word in words if word not in stop_words]
      doc = ' '.join(words)
      # remove extra whitespace
      doc = re.sub(r'\s+', ' ', doc).strip()
      return doc

    def get_tokens(self, doc):
    # preprocess the document
      preprocessed_doc = self.preprocess(doc)
      # initialize variables
      start = 0
      tokens = []
    # loop through each character in the preprocessed document
      for i in range(len(preprocessed_doc)):
          # check if the current character is whitespace or punctuation
          if preprocessed_doc[i].isspace() or preprocessed_doc[i] in ('!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'):
              # get the current token and add it to the list of tokens
              token = preprocessed_doc[start:i]
              if token:
                  tokens.append(token)
              # update the start index
              start = i + 1
    # get the final token and add it to the list of tokens
      final_token = preprocessed_doc[start:]
      if final_token:
          tokens.append(final_token)
      # return the set of unique tokens in the document
      unique_tokens = set(tokens)
      print("unique word",unique_tokens)
      return unique_tokens
   

    

    def conditional_prob(self, word, label):
        # check if the word, label tuple is in the word_probabilities dictionary
        if (label, word) in self.word_probabilities:
            # if it is, retrieve the probability from the dictionary and print it
            probability = self.word_probabilities[(label, word)]
            print(f"Probability of word '{word}' given label '{label}': {probability}")
            # return the probability
            return probability
        else:
            # if the word, label tuple is not in the dictionary, return 0
            return 0


    def train(self, training_data):
        # calculate class probabilities
        total_docs = len(training_data)
        # iterate over the possible classes
        for c in self.classes:
            # count the number of documents in this class
            docs_in_class = len([d for d in training_data if d[1] == c])
            # calculate the probability of this class by dividing the number of documents in this class by the total number of documents
            self.class_probabilities[c] = docs_in_class / total_docs
        # count words
        # iterate over the training data
        for doc, label in training_data:
            # preprocess the document
            preprocessed_doc = self.preprocess(doc)
            # split the document into tokens
            tokens = self.get_tokens(preprocessed_doc)
            # iterate over the tokens
            for token in tokens:
                # if the label is not already in the word_counts dictionary, add it
                if label not in self.word_counts:
                    self.word_counts[label] = {}
                # if the token is not already in the word_counts dictionary for this label, add it
                if token not in self.word_counts[label]:
                    self.word_counts[label][token] = 0
                # increment the count for this word in this label
                self.word_counts[label][token] += 1
        # calculate word probabilities
        # iterate over the possible labels
        for label in self.classes:
            # calculate the total number of words in this label
            total_words_in_class = sum(self.word_counts[label].values())
            # iterate over the words for this label
            for word in self.word_counts[label]:
                # calculate the probability of this word for this label using the count and Laplace smoothing
                self.word_probabilities[(label, word)] = (self.word_counts[label][word] + 1) / (total_words_in_class + len(self.word_counts[label]))


    # This method saves the trained model to a file
    def save_model(self, filename):
        # Open a file in write mode and use pickle to dump the model
        # classes, class_probabilities, and word_probabilities into the file.
        with open(filename, 'wb') as f:
            pickle.dump((self.classes, self.class_probabilities, self.word_probabilities), f)

    # This method loads a trained model from a file
    def load_model(self, filename):
        # Open a file in read mode and use pickle to load the model classes,
        # class_probabilities, and word_probabilities from the file.
        with open(filename, 'rb') as f:
            self.classes, self.class_probabilities, self.word_probabilities = pickle.load(f)

    # This method takes in some test data and returns the predicted labels for each document
    def predict(self, test_data):
        # Initialize an empty list to store predictions
        predictions = []
        # Loop through each document and its actual label in the test data
        for doc, actual_label in test_data:
            # Preprocess the document (e.g. remove stop words, punctuation, etc.)
            preprocessed_doc = self.preprocess(doc)
            # Get the tokens in the preprocessed document
            tokens = self.get_tokens(preprocessed_doc)
            # Initialize a dictionary to store scores for each class
            scores = {c: 0 for c in self.classes}
            # Loop through each token in the document
            for token in tokens:
                # For each class, add the conditional probability of the token given the class to the score
                for c in self.classes:
                    scores[c] += self.conditional_prob(token, c)
            # For each class, add the class probability to the score
            for c in self.classes:
                scores[c] += self.class_prob(c)
            # Predict the label with the highest score
            predicted_label = max(scores, key=scores.get)
            # Add the preprocessed document, predicted label, and actual label to the predictions list
            predictions.append((preprocessed_doc, predicted_label, actual_label))
        # Return the list of predictions
        return predictions


In [36]:
# create NaiveBayesClassifier instance
nb_classifier = NaiveBayesClassifier()

# define the classes
classes = ['planes', 'tech']

# set the classes in the classifier
nb_classifier.classes = classes



In [37]:
# load the training and test data
train_data, test_data = nb_classifier.load_data(classes, classes)

# train the model
nb_classifier.train(train_data)




Loaded 10 training documents for 'planes' domain
Loaded 11 training documents for 'tech' domain
unique word {'world', 'methods', 'prior', 'gravity', 'model', 'china', 'airfoil', 'centuries', 'place', 'helium', 'led', 'also', 'able', 'common', 'th', 'usage', 'vehicles', 'era', 'hydrogen', 'thrust', 'pilot', 'flight', 'times', 'flown', 'remotely', 'paramotors', 'criteria', 'counters', 'activity', 'crewed', 'building', 'float', 'ships', 'war', 'surrounding', 'weight', 'advances', 'safe', 'rd', 'technical', 'aerostats', 'whereas', 'postwar', 'five', 'took', 'direct', 'jet', 'include', 'manned', 'hot', 'self', 'examples', 'great', 'history', 'one', 'sky', 'stories', 'fly', 'adds', 'including', 'gliders', 'wars', 'lanterns', 'ascent', 'much', 'eras', 'designing', 'different', 'century', 'second', 'descent', 'large', 'engines', 'ancient', 'aviation', 'celebrations', 'airplanes', 'controlled', 'low', 'flying', 'classified', 'added', 'characterized', 'use', 'bc', 'day', 'vehicle', 'either', 'li

In [38]:
# save the trained model to hard disk
nb_classifier.save_model('naive_payes_model.pkl')

# load the trained model from hard disk
nb_classifier.load_model('naive_payes_model.pkl')


In [39]:
# make predictions on the test data
predictions = nb_classifier.predict(test_data)

# print the predictions
for doc, predicted_label, actual_label in predictions:
    print(f"\n\nPredicted label: {predicted_label} -> Actual label: {actual_label} \n\n Document: \n\n{doc}")

unique word {'gravity', 'model', 'airfoil', 'centuries', 'place', 'able', 'common', 'th', 'usage', 'vehicles', 'thrust', 'pilot', 'flight', 'times', 'flown', 'remotely', 'paramotors', 'criteria', 'counters', 'activity', 'crewed', 'building', 'safe', 'whereas', 'took', 'direct', 'include', 'manned', 'hot', 'self', 'examples', 'history', 'stories', 'fly', 'including', 'gliders', 'ascent', 'designing', 'cent', 'different', 'descent', 'engines', 'aviation', 'airplanes', 'controlled', 'flying', 'classified', 'vehicle', 'either', 'developed', 'larger', 'aeronautics', 'craft', 'blimps', 'onboard', 'type', 'many', 'balloons', 'human', 'surrounds', 'static', 'may', 'modern', 'go', 'cases', 'unmanned', 'air', 'force', 'airships', 'downward', 'dynamic', 'using', 'computers', 'helicopters', 'called', 'science', 'first', 'aerial', 'propulsion', 'support', 'back', 'others', 'gaining', 'however', 'lift', 'aircraft'}
Probability of word 'gravity' given label 'planes': 0.0008378718056137411
Probability