In [652]:
# Import necessary libraries
import math  # For log-space probability arithmetic
import os  # For reading and writing files
import pickle  # For saving and loading the model
import re  # For working with regular expressions

import requests  # For making HTTP requests
import wikipedia  # For accessing Wikipedia articles and their content
from gensim.parsing.preprocessing import STOPWORDS  # A set of common stop words

# Download NLTK's Punkt tokenizer data
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [653]:
def download_documents(topic_list, num_docs):
    """
    Collects up to ``num_docs`` sections from the Wikipedia page of each topic.

    Every topic is resolved to a single page with ``wikipedia.page``. The page
    content is split on the heading delimiter (a blank line followed by
    ``"== "``); the text before the first heading is skipped and the next
    ``num_docs`` pieces are kept as individual documents.

    Args:
        topic_list (list): A list of strings naming the topics to look up.
        num_docs (int): The maximum number of sections to keep per topic.
            Values of zero or less keep nothing.

    Returns:
        A flat list of section strings in topic order. A page with fewer than
        ``num_docs`` sections contributes only the sections it has, so the
        result may be shorter than ``len(topic_list) * num_docs``.

    Note:
        Exceptions raised by ``wikipedia.page`` are not handled here and
        propagate to the caller.
    """

    # Clamp so that a negative count cannot turn into a from-the-end slice bound
    section_limit = max(num_docs, 0)

    docs = []
    for topic in topic_list:
        # Fetch the page for the current topic and take its plain-text content
        content = wikipedia.page(topic).content
        # Split the content into sections based on the "==" heading delimiter
        sections = content.split('\n\n== ')
        # Index 0 is the text before the first heading; slicing simply stops
        # early when the page has fewer sections than requested.
        docs.extend(sections[1:section_limit + 1])

    return docs

In [654]:
# Collect up to 5 sections each from the 'Cities weather' and 'stock market' pages for training
weather = download_documents(['Cities weather'], 5)
market = download_documents(['stock market'], 5)

# Combine the two lists of documents into a single training list
train_docs = weather + market

# Build the labels from the number of sections actually returned, so that labels
# and documents stay aligned even when a page has fewer than 5 sections
train_labels = ['weather'] * len(weather) + ['market'] * len(market)

# Collect up to 3 sections per topic for testing.
# NOTE: download_documents always starts from the first section of a page, so these
# test sections are the same as the first three training sections of each topic.
# Accuracy measured on them does not reflect performance on unseen text.
weather = download_documents(['Cities weather'], 3)
market = download_documents(['stock market'], 3)

# Combine the two lists of test documents into a single test list
test_docs = weather + market

In [655]:
def class_prob(doc_labels):
    """
    Computes the prior probability of every class from a list of labels.

    Args:
        doc_labels (list): One class label per document.

    Returns:
        A dictionary mapping each distinct label to the fraction of entries in
        doc_labels that carry it. An empty input yields an empty dictionary.
    """

    n_docs = len(doc_labels)

    # Tally how often each label occurs, in order of first appearance
    tally = {}
    for lbl in doc_labels:
        tally[lbl] = tally.get(lbl, 0) + 1

    # Turn the raw counts into relative frequencies
    return {lbl: occurrences / n_docs for lbl, occurrences in tally.items()}

In [656]:
# Extra stop words that are always removed in addition to the supplied set
_CUSTOM_STOP_WORDS = frozenset({"ll", "m", "re", "s", "ve", "nt", "t", "us"})


def preprocessing(doc, stop_words=None):
    """
    Performs basic text preprocessing on a given document: removes punctuation and
    digits, converts to lowercase, and drops stop words and one-character words.

    Args:
        doc (str): A string representing the document to be preprocessed.
        stop_words (iterable of str, optional): Stop words to be removed from the
            document. When omitted (None), the stop words provided by gensim are
            used. The argument is never modified; a fixed set of custom stop
            words is combined with it internally.

    Returns:
        A preprocessed version of the input document as a string, with the
        remaining words separated by single spaces.
    """

    # Build the effective stop-word set without mutating the caller's set (or a
    # shared default), so repeated calls cannot leak state into each other
    base_stop_words = STOPWORDS if stop_words is None else stop_words
    effective_stop_words = _CUSTOM_STOP_WORDS.union(base_stop_words)

    # Remove punctuation from the document
    doc = re.sub(r'[^\w\s]', '', doc)

    # Remove digits from the document
    doc = re.sub(r'[0-9]+', '', doc)

    # Convert the document to lowercase
    doc = doc.lower()

    # Keep words longer than one character that are not stop words;
    # split() already yields whitespace-free tokens
    words = [
        word for word in doc.split()
        if len(word) > 1 and word not in effective_stop_words
    ]

    # Join the remaining words back into a single string
    return ' '.join(words)

In [657]:
def getTokens(doc):
    """
    Returns the distinct whitespace-separated words of a document.

    Args:
        doc (str): The document to tokenize.

    Returns:
        A set containing every unique word found in doc.
    """

    # Splitting on whitespace and building a set drops repeated words in one step
    return set(doc.split())

In [658]:
def conditional_prob(preprocessed_docs, train_labels, class_name, token, vocab_size):
    """
    Computes the add-one (Laplace) smoothed probability of a token given a class.

    The estimate is
        (occurrences of token in the class + 1) / (total tokens in the class + vocab_size)
    where both counts are taken over whole whitespace-separated words of the
    documents labelled with class_name.

    Args:
        preprocessed_docs (list): A list of preprocessed documents, where each document is represented as a string.
        train_labels (list): A list of labels indicating the class of each document in preprocessed_docs.
            Documents and labels are paired by position.
        class_name (str): The name of the class for which the conditional probability is being calculated.
        token (str): The token for which the conditional probability is being calculated.
        vocab_size (int): The size of the vocabulary (i.e., the total number of unique tokens in all documents).

    Returns:
        The smoothed conditional probability of the token given the class, as a float.
    """

    token_occurrences = 0   # times the token appears in documents of the class
    class_token_total = 0   # total number of tokens in documents of the class

    for doc, label in zip(preprocessed_docs, train_labels):
        if label != class_name:
            continue
        doc_tokens = doc.split()
        class_token_total += len(doc_tokens)
        # Count whole tokens; a substring count on the raw string would also
        # match the token inside longer words
        token_occurrences += doc_tokens.count(token)

    # Add-one smoothing keeps unseen tokens from getting probability zero
    return (token_occurrences + 1) / (class_token_total + vocab_size)


In [659]:
def fit_model(train_docs, train_labels, path='probs.pkl'):
    """
    Trains the classifier and stores its probabilities on disk.

    Args:
        train_docs (list): Raw training documents, each given as a string.
        train_labels (list): The class label of each document in train_docs.
        path (str): File path the fitted probabilities are written to.

    The file receives a pickled tuple ``(class_probs, class_word_probs)``:
    class_probs maps each label to its class probability, and class_word_probs
    maps every ``(label, token)`` pair to the token's conditional probability.
    """
    # Prior probability of each class
    priors = class_prob(train_labels)

    # Clean every training document
    cleaned_docs = list(map(preprocessing, train_docs))

    # Vocabulary: all distinct tokens across the cleaned training documents
    vocabulary = set()
    for cleaned in cleaned_docs:
        vocabulary.update(getTokens(cleaned))
    n_vocab = len(vocabulary)

    # Conditional probability of every vocabulary token under every class
    likelihoods = {
        (label, token): conditional_prob(cleaned_docs, train_labels, label, token, n_vocab)
        for label in priors
        for token in vocabulary
    }

    # Persist both tables together in a single pickle file
    with open(path, 'wb') as out_file:
        pickle.dump((priors, likelihoods), out_file)

# Fit the model on the training documents; the probabilities are written to the default path 'probs.pkl'
fit_model(train_docs, train_labels)


In [663]:
def predict(test_doc, model_path):
    """
    Loads the stored probabilities and predicts the category of a document.

    Each class is scored in log space as
        log P(class) + sum of log P(token | class)
    over the distinct tokens of the preprocessed document. Summing logarithms
    avoids the underflow/overflow that a running product of probabilities
    suffers on long documents.

    Args:
        test_doc (str): A document represented as a string.
        model_path (str): A string of the path to load the model from. The file
            must contain a pickled ``(class_probs, token_probs)`` tuple as
            written by fit_model.

    Returns:
        The label with the highest score. When several labels tie, the one that
        comes last in the stored class order is returned.
    """
    # Load model from disk.
    # NOTE(security): pickle.load can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(model_path, 'rb') as f:
        class_priors, token_probs = pickle.load(f)

    # Preprocess the document and reduce it to its distinct tokens
    tokens = getTokens(preprocessing(test_doc))

    # Create list of labels
    labels = list(class_priors.keys())

    # Start every class at the log of its prior probability
    log_scores = {label: math.log(class_priors[label]) for label in labels}

    for token in tokens:
        # Only tokens with a stored probability for every class contribute;
        # tokens not covered by the model are ignored
        if all((label, token) in token_probs for label in labels):
            for label in labels:
                log_scores[label] += math.log(token_probs[(label, token)])

    # max() keeps the first maximum it meets, so scanning the labels in reverse
    # resolves ties in favour of the later label
    return max(reversed(labels), key=log_scores.__getitem__)

# Classify every training document with the saved model
print("Predict of the training data :-\n")
for train_doc in train_docs:
    print(predict(train_doc, 'probs.pkl'))

print('------------------------------------\n')

# Classify every test document with the saved model
print("Predict of the testing data :-\n")
for test_doc in test_docs:
    print(predict(test_doc, 'probs.pkl'))

Predict of the training data:-

weather
weather
weather
weather
weather
market
market
market
market
market
------------------------------------

Predict of the testing data:-

weather
weather
weather
market
market
market
