# Extract Unicode Character from Google Doc

**Author**: <u>Angelo Turri</u>
<br>
**Date**: <u>08/13/2024</u>
<br><br>


The functions in this notebook read a published Google Doc, given its URL, and extract a series of Unicode characters together with their x- and y-coordinates. Using these coordinates, the Unicode characters are then arranged on a grid so that, when printed, they form the shape of the intended alphabetical character(s).

In [268]:
# Published Google Doc that the functions below read characters and coordinates from.
url = 'https://docs.google.com/document/d/e/2PACX-1vRMx5YQlZNa3ra8dYYxmv-QIQ3YJe8tbI3kqcuC7lQiZm-CSEznKfN_HYNSpoXcZIV3Y_O3YoUB1ecq/pub'

In [269]:
import urllib.request  
from bs4 import BeautifulSoup

In [270]:
def get_raw_coords(url):
    """
    Download a Google Doc page and return the text of its table entries.

    The page at ``url`` is fetched and parsed as HTML, and the text of every
    <p> element is collected in document order. The first five paragraphs
    and the last paragraph are then discarded.

    Parameters:
        url: Address of the published Google Doc to read.

    Returns:
        A flat list of strings. For the document this notebook targets, the
        entries repeat as x-coordinate, character, y-coordinate.
    """

    # Open the page inside a context manager so the HTTP response (and its
    # underlying socket) is closed as soon as the markup has been parsed.
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    # All coordinates and symbols live in <p> tags.
    paragraphs = [para.get_text() for para in soup.find_all("p")]

    # Drop the leading five and trailing one paragraphs.
    # NOTE(review): these offsets are specific to the current layout of the
    # document (presumably intro text and the table header) -- confirm if
    # the document format changes.
    return paragraphs[5:-1]

In [280]:
# Fetch the flat list of entries from the document; the bare name on the
# last line makes the notebook display the result.
raw_coords = get_raw_coords(url)
raw_coords

['0',
 '█',
 '0',
 '0',
 '█',
 '1',
 '0',
 '█',
 '2',
 '1',
 '▀',
 '1',
 '1',
 '▀',
 '2',
 '2',
 '▀',
 '1',
 '2',
 '▀',
 '2',
 '3',
 '▀',
 '2']

In [271]:
def get_coords(raw_coords):
    """
    Group a flat list of entries into [x, symbol, y] triples.

    The input is read three items at a time, in this order:

    1) x-coordinate (converted to int)
    2) symbol (kept as-is)
    3) y-coordinate (converted to int)

    Parameters:
        raw_coords: Flat sequence whose length is a multiple of three.
            It is not modified.

    Returns:
        A list of ``[x, symbol, y]`` lists, in input order. An empty
        input gives an empty list.

    Raises:
        ValueError: If the input length is not a multiple of three, or
            if a coordinate cannot be converted to an integer.
    """

    # Fail early with a clear message instead of an IndexError halfway
    # through the data when the last group is incomplete.
    if len(raw_coords) % 3 != 0:
        raise ValueError(
            "raw_coords must contain groups of three items "
            f"(x, symbol, y); got {len(raw_coords)} items"
        )

    coords = []

    # Step through the list by index rather than repeatedly re-slicing the
    # remainder, which would copy the tail of the list on every iteration.
    for start in range(0, len(raw_coords), 3):
        x_text, symbol, y_text = raw_coords[start:start + 3]
        coords.append([int(x_text), symbol, int(y_text)])

    return coords

In [283]:
# Convert the flat list into [x, symbol, y] groups and display them.
coords = get_coords(raw_coords)
coords

[[0, '█', 0],
 [0, '█', 1],
 [0, '█', 2],
 [1, '▀', 1],
 [1, '▀', 2],
 [2, '▀', 1],
 [2, '▀', 2],
 [3, '▀', 2]]

In [272]:
def get_symbols(coords):
    """
    Lay the symbols out on a grid and return the grid rows as strings.

    Each entry of ``coords`` is a group of three:

    1) x-coordinate
    2) symbol
    3) y-coordinate

    The grid spans the smallest rectangle containing every coordinate.
    Cells with no symbol are filled with a space. Rows are returned from
    the highest y-coordinate to the lowest, so printing them one after
    another draws the picture the right way up. If two entries share a
    cell, the later one wins.

    Parameters:
        coords: Iterable of ``[x, symbol, y]`` groups with integer
            coordinates and string symbols.

    Returns:
        A list of strings, one per grid row. An empty input gives an
        empty list.
    """

    # Nothing to draw; min()/max() below would fail on empty input.
    if not coords:
        return []

    x_values = [x for x, _, _ in coords]
    y_values = [y for _, _, y in coords]
    x_min, x_max = min(x_values), max(x_values)
    y_min, y_max = min(y_values), max(y_values)

    # Blank grid: one row per y value, one column per x value.
    width = x_max - x_min + 1
    height = y_max - y_min + 1
    grid = [[' '] * width for _ in range(height)]

    # Row 0 holds the highest y so the output reads top to bottom;
    # columns are offset so the smallest x lands in column 0.
    for x, symbol, y in coords:
        grid[y_max - y][x - x_min] = symbol

    return [''.join(row) for row in grid]

In [284]:
# Build the grid rows from the coordinate groups and display them.
symbols = get_symbols(coords)
symbols

['█▀▀▀', '█▀▀ ', '█   ']

In [285]:
# Print each grid row on its own line, in list order.
for symbol in symbols:
    print(symbol)

█▀▀▀
█▀▀ 
█   


In [273]:
def unicode_from_grid(url):
    """
    Print the picture described by the Google Doc at ``url``.

    Chains the helper functions: the document's entries are fetched,
    grouped into coordinate triples, arranged into grid rows, and each
    row is printed on its own line. Nothing is returned.
    """

    # Fetch -> group -> lay out on a grid.
    rows = get_symbols(get_coords(get_raw_coords(url)))

    for row in rows:
        print(row)

In [274]:
# Run the whole pipeline on the document and print the result.
unicode_from_grid(url)

█▀▀▀
█▀▀ 
█   
