In [None]:
#########################################
#########################################
# CS 395C Social Media Behavior
# Spring 2023 Semester
#----------------------------------------
# Artifact Title: Class Demo #5
#----------------------------------------
# Author: Jeremiah Onaolapo
# University of Vermont (UVM)
#########################################

**Note:** This notebook explores the bag-of-words model. It is a vector space representation that allows one to extract features from text (e.g., for machine learning purposes).

In [None]:
# import packages
import numpy as np
import pprint as pp
import string

-------

In [None]:
def load_text_from_file(file_location, encoding="utf-8"):
    """Load a text file and return its non-blank lines joined by single spaces.

    Args:
        file_location: Filename or path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale encoding.

    Returns:
        One string made of every non-blank line, each stripped of leading
        and trailing whitespace, joined with a single space. An empty (or
        all-blank) file yields the empty string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid in `encoding`.
    """
    with open(file_location, encoding=encoding) as f:
        stripped_lines = (line.strip() for line in f)
        # The join must run inside the `with` block: the generator reads
        # from the file lazily, so the handle has to still be open.
        return " ".join(line for line in stripped_lines if line)

In [None]:
def preprocess_text_data(s):
    """Return `s` with ASCII punctuation deleted and all letters lowercased.

    The characters removed are exactly those in `string.punctuation`;
    whitespace and every other character are left in place.
    """
    # A translation table that maps a code point to None deletes that character.
    punctuation_deleter = dict.fromkeys(map(ord, string.punctuation))
    return s.translate(punctuation_deleter).lower()

In [None]:
def construct_model_dictionary(text_corpus):
    """Build the corpus vocabulary as an alphabetically sorted list.

    Every document in `text_corpus` (a dict whose values are the document
    texts) is split on single space characters only. Each piece is
    stripped of surrounding whitespace and empty pieces are discarded.
    The unique tokens across all documents are returned in ascending
    order. The return type is list, not dict.
    """
    vocabulary = set()
    for document_text in text_corpus.values():
        stripped_pieces = (piece.strip() for piece in document_text.split(" "))
        # Consecutive spaces produce empty pieces; keep only real tokens.
        vocabulary.update(piece for piece in stripped_pieces if piece)

    return sorted(vocabulary)

In [None]:
def score_document(input_doc, model_dictionary):
    """Encode a document as a binary presence vector over the vocabulary.

    Args:
        input_doc: Document text. Tokens are obtained by splitting on
            single spaces, stripping each piece, and dropping empty pieces.
        model_dictionary: Sequence of vocabulary tokens; its order fixes
            the order of the output vector.

    Returns:
        A list as long as `model_dictionary` holding 1 where the
        corresponding token occurs in the document and 0 where it does
        not. This is a binary vector, not a count of frequencies.
    """
    # A set gives constant-time membership tests, so scoring costs
    # O(len(document) + len(vocabulary)) instead of their product.
    doc_tokens = {piece.strip() for piece in input_doc.split(" ") if piece.strip()}

    return [1 if word in doc_tokens else 0 for word in model_dictionary]

In [None]:
def find_distinguishing_tokens(doc_vectors, model_dictionary):
    """List the tokens that occur in one document but not in the other.

    Args:
        doc_vectors: Dict mapping a document label to its presence vector
            (one entry per token in `model_dictionary`). A label that
            contains "gatsby" is taken as the gatsby document; any other
            label is taken as the alice document (if several labels fall
            in the same group, the last one iterated wins). A non-zero
            entry counts as "token present".
        model_dictionary: Sequence of vocabulary tokens, in the same
            order as the entries of each vector.

    Returns:
        A dict with this schema:
            {"IN_GATSBY_not_in_alice": [tokens present in the GATSBY
                document but not in the alice document],
             "IN_ALICE_not_in_gatsby": [tokens present in the ALICE
                document but not in the gatsby document]}
        Keys are inserted in the order the two documents are first seen
        in `doc_vectors`.

    Raises:
        ValueError: If either document is missing from `doc_vectors`, or
            if a vector's length differs from `len(model_dictionary)`.

    Side effects:
        Prints a sanity-check line (the intersection of the two token
        lists, which should be empty) followed by a blank line.
    """
    gatsby_vec = None
    alice_vec = None
    key_order = []  # result keys, in order of first appearance in doc_vectors
    for label, vector in doc_vectors.items():
        # An explicit bool dtype keeps empty vectors usable: np.array([])
        # would default to float64, which bitwise operations reject.
        if "gatsby" in label:
            gatsby_vec = np.asarray(vector, dtype=bool)
            result_key = "IN_GATSBY_not_in_alice"
        else:
            alice_vec = np.asarray(vector, dtype=bool)
            result_key = "IN_ALICE_not_in_gatsby"
        if result_key not in key_order:
            key_order.append(result_key)

    if gatsby_vec is None or alice_vec is None:
        raise ValueError(
            "doc_vectors must contain one document whose label contains "
            "'gatsby' and one other document"
        )
    expected_shape = (len(model_dictionary),)
    if gatsby_vec.shape != expected_shape or alice_vec.shape != expected_shape:
        raise ValueError(
            "each document vector must have exactly one entry per token "
            "in model_dictionary"
        )

    # create a "mask" using bitwise XOR of document vectors:
    # it is True exactly where the two documents disagree
    xor_of_vectors = np.bitwise_xor(gatsby_vec, alice_vec)

    # AND-ing a document with the mask keeps the tokens only it contains
    IN_GATSBY_not_in_alice_vec = np.bitwise_and(gatsby_vec, xor_of_vectors)
    IN_ALICE_not_in_gatsby_vec = np.bitwise_and(alice_vec, xor_of_vectors)

    IN_GATSBY_only = [
        token
        for token, present in zip(model_dictionary, IN_GATSBY_not_in_alice_vec)
        if present
    ]
    IN_ALICE_only = [
        token
        for token, present in zip(model_dictionary, IN_ALICE_not_in_gatsby_vec)
        if present
    ]

    # sanity check
    print("sanity check: this should be an empty set ->", set(IN_GATSBY_only).intersection(set(IN_ALICE_only)))
    print("")

    tokens_by_key = {
        "IN_GATSBY_not_in_alice": IN_GATSBY_only,
        "IN_ALICE_not_in_gatsby": IN_ALICE_only,
    }
    distinguishers = {key: tokens_by_key[key] for key in key_order}

    return distinguishers

-------

In [None]:
#METADATA: Source of text corpus (reference information)

# Two input files excerpted from the following sources:
# 1. "The Great Gatsby" (1925) by F. Scott Fitzgerald. Full text available online at https://www.gutenberg.org/ebooks/64317
# 2. "Alice's Adventures in Wonderland" (1865) by Lewis Carroll. Full text available online at https://www.gutenberg.org/ebooks/11

In [None]:
# Central goal: Construct a basic bag-of-words model over the given corpus

input_files = ["the_great_gatsby_excerpt.txt", "alice_in_wonderland_excerpt.txt"]

# maps a document label (the file name without ".txt") to its cleaned text
text_corpus = {}

# load and clean input files
for elem in input_files:
    text_data = load_text_from_file(elem)
    cleaned_text = preprocess_text_data(text_data)
    label = elem.replace(".txt", "")
    # peek inside: show only the first 80 characters of the cleaned text
    preview = cleaned_text[:80]
    print("after cleaning {}:\n     {}...MORE TEXT...\n".format(elem, preview))
    text_corpus[label] = cleaned_text

In [None]:
# construct model dictionary over the entire corpus
# (a sorted list of the unique tokens found across all documents in text_corpus)
model_dictionary = construct_model_dictionary(text_corpus)
# report the vocabulary size, then show the full token list
print("len(model_dictionary)", len(model_dictionary))
print(model_dictionary)

In [None]:
# construct document vectors:
# one binary presence vector per document, keyed by the document's label
doc_vectors = {
    label: score_document(cleaned_text, model_dictionary)
    for label, cleaned_text in text_corpus.items()
}

# show each label with its vector, separated by blank lines
for label, current_vec in doc_vectors.items():
    print(label, current_vec)
    print("")

In [None]:
# a fun task: find the tokens that appear in one document but not the other
# (distinguishers maps a label such as "IN_GATSBY_not_in_alice" to a list of tokens)
distinguishers = find_distinguishing_tokens(doc_vectors, model_dictionary)
for label, v in distinguishers.items():
    # report how many tokens each list holds, plus its first five as a sample
    print("tokens {} -> count = {};\nselected examples = {}\n".format(label, len(v), v[:5]))
    # uncomment the next line to print the full token list
    #print(v)
