Jeremiah Mubiru and James Telzrow <br>
2024-05-06 <br>
CSDS490 <br>
Experimentation/Exploratory Project 

To run this notebook, you must install:
* BeautifulSoup4
* OpenCV-Python (Possibly OpenCV-Python-Headless)
* Requests
* scikit-learn

This notebook can be used to create a dataset, or it can use an existing one (available in Google Drive)

In [82]:
# Necessary imports

from os import listdir
from pathlib import Path
import random
import string

from bs4 import BeautifulSoup
import cv2
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# In the following cells, we will demonstrate a fast and efficient way of
# determining the state from which a license plate originates. We do 
# this by computing the histogram of such images, and using the resulting
# vectors to map the image into a high-dimensional Euclidean space.
# The theory is that visually similar license plates (i.e. those issued by
# the same state) will have similar histograms, and thus will be mapped
# into the same region in Euclidean space.
# Thus by creating a point cloud using a large training dataset, we can 
# apply a k-nearest-neighbors approach to determine the state of origin.

In [33]:
# Here, we use acme.com's license plate maker website to create an 
# image of a specific type of license plate for the specified 
# state, with the specified text.
# Usually, the type of plate (the value of the "plate" argument) 
# is the year in which it was issued.
# However, some states issue "special edition" license plates 
# to raise awareness for a cause or commemorate an event, so this
# can sometimes accept values like "Cure Cancer" or "Challenger".
# This returns the image as bytes.

def get_plate_image(state, plate, text, timeout=30):
    """Fetch a generated license-plate image from acme.com's license maker.

    Two requests are made: the first asks the CGI script to render a plate
    and returns an HTML page; the second downloads the file referenced by
    the first anchor (with an ``href``) found on that page.

    Args:
        state: Value sent as the ``state`` query parameter.
        plate: Value sent as the ``plate`` query parameter.
        text: Value sent as the ``text`` query parameter.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout, ``requests`` may block indefinitely.

    Returns:
        The body of the second response, as ``bytes``.

    Raises:
        requests.HTTPError: If either request returns an error status.
        ValueError: If the returned page contains no anchor with an ``href``.
    """
    request_url = 'https://www.acme.com/licensemaker/'
    response_containing_page = requests.get(
        request_url + 'licensemaker.cgi',
        params={
            'state': state,
            'plate': plate,
            'text': text,
        },
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response_containing_page.raise_for_status()

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and emits a warning), so results could
    # differ between environments.
    parsed_page = BeautifulSoup(response_containing_page.text, 'html.parser')
    image_anchor = parsed_page.find('a', href=True)
    if image_anchor is None:
        raise ValueError(
            'No image link found in license maker response for '
            f'state={state!r}, plate={plate!r}, text={text!r}'
        )

    response_containing_image = requests.get(
        request_url + image_anchor['href'],
        timeout=timeout,
    )
    response_containing_image.raise_for_status()
    return response_containing_image.content

In [34]:
# License plate designs change over time; for brevity we don't consider 
# all of them here.
# Here, we list all 50 states, and the particular years of plates that we
# want to consider for each state.

# All 50 states, in alphabetical order. The position of a state in this list
# is used elsewhere in the notebook as its integer class label.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming'
]

# Plate designs (identified by year of issue) to request for each state.
# Each state maps to a list so that more designs can be added later.
# Every state appears exactly once: a repeated key in a dict literal is
# silently collapsed by Python, which hides copy-paste mistakes.
years = {
    'Alabama': ['2002'],
    'Alaska': ['1982'],
    'Arizona': ['1996'],
    'Arkansas': ['1996'],
    'California': ['1998'],
    'Colorado': ['2000'],
    'Connecticut': ['1987'],
    'Delaware': ['1970'],
    'Florida': ['1997'],
    'Georgia': ['1998'],
    'Hawaii': ['1991'],
    'Idaho': ['1997'],
    'Illinois': ['2002'],
    'Indiana': ['1999'],
    'Iowa': ['1997'],
    'Kansas': ['1995'],
    'Kentucky': ['1997'],
    'Louisiana': ['1994'],
    'Maine': ['1999'],
    'Maryland': ['1986'],
    'Massachusetts': ['1988'],
    'Michigan': ['1983'],
    'Minnesota': ['1993'],
    'Mississippi': ['1997'],
    'Missouri': ['1998'],
    'Montana': ['1991'],
    'Nebraska': ['1993'],
    'Nevada': ['1987'],
    'New Hampshire': ['1999'],
    'New Jersey': ['1993'],
    'New Mexico': ['1991'],
    'New York': ['1986'],
    'North Carolina': ['1982'],
    'North Dakota': ['1993'],
    'Ohio': ['1997'],
    'Oklahoma': ['1994'],
    'Oregon': ['1988'],
    'Pennsylvania': ['2000'],
    'Rhode Island': ['1996'],
    'South Carolina': ['1998'],
    'South Dakota': ['2000'],
    'Tennessee': ['2000'],
    'Texas': ['2000'],
    'Utah': ['1996'],
    'Vermont': ['1985'],
    'Virginia': ['1980'],
    'Washington': ['1998'],
    'West Virginia': ['1995'],
    'Wisconsin': ['1987'],
    'Wyoming': ['1992'],
}

# Guard against a state being added to one collection but not the other.
assert set(years) == set(states), 'states and years must list the same states'

# Filled in by the dataset-creation cell: state -> year -> plate text -> bytes.
plate_images = {}

In [35]:
# Here we create our dataset.

# The number of plates from each state that will be in the dataset.
number_of_plates_to_generate = 50
# The number of characters on each plate
plate_text_length = 7

# Characters from which random plate texts are drawn.
plate_alphabet = string.ascii_uppercase + string.digits

# Plates are keyed by their text, so the requested number of *distinct* texts
# must actually exist; otherwise the retry loop below could never finish.
if len(plate_alphabet) ** plate_text_length < number_of_plates_to_generate:
    raise ValueError(
        'plate_text_length is too small to generate '
        f'{number_of_plates_to_generate} distinct plate texts'
    )

# Get the plate images from acme.com
for state in states:
    plate_images[state] = {}
    for year in years[state]:
        plates_for_year = {}
        plate_images[state][year] = plates_for_year
        # Loop on the number of stored plates rather than on the number of
        # attempts: a repeated random text would otherwise overwrite an
        # earlier entry and leave this state with fewer plates than requested.
        while len(plates_for_year) < number_of_plates_to_generate:
            text = ''.join(random.choices(plate_alphabet, k=plate_text_length))
            if text in plates_for_year:
                continue
            plates_for_year[text] = get_plate_image(state, year, text)

In [36]:
# Specify a directory where the dataset will be saved.
# Keep the trailing slash: other cells build file paths by concatenating this
# string directly with a file name.
image_directory = './images/'

In [38]:
# Save dataset to disk
# Create the target directory first so that a fresh checkout (where the
# directory does not exist yet) does not fail when opening the first file.
Path(image_directory).mkdir(parents=True, exist_ok=True)

for state in states:
    for year in years[state]:
        for plate_text, plate_bytes in plate_images[state][year].items():
            # File names encode the label as <state>_<year>_<text>.jpg
            file_name = state + '_' + year + '_' + plate_text + '.jpg'
            # The context manager guarantees the handle is flushed and closed,
            # instead of leaving one open file per image to the garbage collector.
            with open(image_directory + file_name, 'wb') as output_file:
                output_file.write(plate_bytes)

In [39]:
# Every instance of this class represents a particular image in the dataset.
class Labeled_Image:
    """A single dataset image together with its label and colour histograms.

    Attributes are stored exactly as passed to the constructor:
    ``state``, ``year`` and ``text`` describe the plate, ``image`` holds the
    pixel data, and ``blue_histogram`` / ``green_histogram`` /
    ``red_histogram`` hold one histogram per colour channel.
    """

    def __init__(self, state, year, text, image, blue_histogram, green_histogram, red_histogram):
        # Label information describing the plate.
        self.state, self.year, self.text = state, year, text
        # The pixel data itself.
        self.image = image
        # One histogram per colour channel.
        self.blue_histogram, self.green_histogram, self.red_histogram = (
            blue_histogram,
            green_histogram,
            red_histogram,
        )

# Each histogram has 32 bins. The histogram range is half-open, [0, 256):
# the upper bound is exclusive, so 256 itself is never counted.
number_of_bins = 32
possible_values = [0, 256]

labeled_images = []

# Load the dataset from disk, and compute the histograms for each color channel of each image.
# In this way, we map images to points in R^{number_of_bins * 3}
# Sorting makes the order of labeled_images independent of the arbitrary
# order in which the operating system lists the directory.
for image_file in sorted(listdir(image_directory)):
    # File names are expected to look like <state>_<year>_<text>.<extension>
    image_file_name = image_file.split('.')[0]
    name_parts = image_file_name.split('_')
    if len(name_parts) != 3:
        # Not a dataset file (e.g. .DS_Store or a checkpoint folder); skip it
        # rather than failing on the tuple unpacking below.
        continue
    state, year, text = name_parts

    image = cv2.imread(image_directory + image_file, cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        continue

    # cv2.split yields the channels in the order they are stored, which the
    # attribute names below treat as blue, green, red.
    blue_histogram, green_histogram, red_histogram = [
        cv2.calcHist([channel], [0], None, [number_of_bins], possible_values)
        for channel in cv2.split(image)
    ]

    labeled_images.append(
        Labeled_Image(
            state=state,
            year=year,
            text=text,
            image=image,
            blue_histogram=blue_histogram,
            green_histogram=green_histogram,
            red_histogram=red_histogram,
        )
    )

In [45]:
# A function that maps state names to the corresponding integer 
# when states are organized in zero-indexed alphabetical order
def get_numerical_state_label(state):
    """Return the zero-based position of ``state`` in the ``states`` list.

    Raises ``ValueError`` (from ``list.index``) if the name is not listed.
    """
    label = states.index(state)
    return label

In [74]:
# Now we arrange these images into a numpy array so we can perform predictions using scikit learn, and assign them integer labels

# Every row in this array corresponds to an image.
# The first "number_of_bins" columns correspond to blue histogram values, the second correspond to green histogram values, and the third correspond to red histogram values.
# The final column is the label, an integer indicating which state the plate is from.

# One row per image: 3 * number_of_bins histogram values followed by the label.
feature_count = 3 * number_of_bins
labeled_imgs_array = np.zeros((len(labeled_images), feature_count + 1))

for row, plate in enumerate(labeled_images):
    # Blue, green and red histograms are laid out side by side, in that order.
    labeled_imgs_array[row, :feature_count] = np.concatenate([
        plate.blue_histogram.flatten(),
        plate.green_histogram.flatten(),
        plate.red_histogram.flatten(),
    ])
    # The last column holds the integer state label.
    labeled_imgs_array[row, feature_count] = get_numerical_state_label(plate.state)

# Normalize each channel's histogram so that its bins sum to one for every image.
channel_columns = [
    slice(channel * number_of_bins, (channel + 1) * number_of_bins)
    for channel in range(3)
]
blue_channel_sums, green_channel_sums, red_channel_sums = (
    labeled_imgs_array[:, columns].sum(axis=1) for columns in channel_columns
)

# Work on a copy so the raw counts stay available in labeled_imgs_array.
nrmlzd_lb_img_arr = labeled_imgs_array.copy()
for columns, channel_sums in zip(
    channel_columns,
    (blue_channel_sums, green_channel_sums, red_channel_sums),
):
    # Broadcast the per-image sum across that channel's bins.
    nrmlzd_lb_img_arr[:, columns] = nrmlzd_lb_img_arr[:, columns] / channel_sums[:, np.newaxis]


In [130]:
# Next, we split the dataset into training and test sets, to train
# and evaluate our classifier.
# The last column of the array is the label; every other column is a feature.
# Fixing the seed makes the split, and therefore the accuracy reported below,
# reproducible across kernel restarts.
split_seed = 42
training_imgs, test_imgs, train_labels, test_labels = train_test_split(
    nrmlzd_lb_img_arr[:, :-1],
    nrmlzd_lb_img_arr[:, -1],
    test_size=0.875,
    random_state=split_seed,
)

In [134]:
# Next, we actually train and score our classifier.
# Even when trained on only 12.5 percent of the available
# data, this approach is able to achieve 97.7 percent accuracy.
# Accuracy increases significantly if the training dataset is 
# larger.
# Build a 3-nearest-neighbours model and train it in one expression;
# fit() returns the estimator itself, so the calls can be chained.
classifier = KNeighborsClassifier(n_neighbors=3).fit(training_imgs, train_labels)

# Mean accuracy on the held-out images (shown as the cell's output).
classifier.score(test_imgs, test_labels)

0.9776051188299817