<h1>License Plate Classifier</h1>

<h2>Data_Preprocessing</h2>
This segment encodes the license plate data from (https://github.com/datanews/license-plates) and save the dataframe as a pickle (serialized) file.

In [3]:
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
import torch


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

import sklearn.model_selection as model_selection

# import files
accepted = pd.read_csv('accepted-plates.csv')
rejected = pd.read_csv('rejected-plates.csv')

# create dataset
accepted['label'] = 1
rejected['label'] = 0
dataset = pd.concat([accepted, rejected], ignore_index=True)

# filter out nan values
dataset = dataset[dataset['plate'].notna()]

print(dataset.head())

# print the number of accepted and rejected plates
print(dataset['label'].value_counts())

print(f'length of dataset: {len(dataset)}')
# make the dataset smaller
dataset = dataset.sample(frac=0.01, random_state=1)
print(f'length of dataset: {len(dataset)}')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


         date   plate  label
0  2010-10-01  ANDARE      1
1  2010-10-01   1TWIN      1
2  2010-10-01  11VROD      1
3  2010-10-01  4SKNMC      1
4  2010-10-01  7IRON6      1
1    131989
0      1646
Name: label, dtype: int64
length of dataset: 133635
length of dataset: 1336


In [4]:

from numba import cuda
import torch
import numpy as np
# Initialize the GPU
# cuda.select_device(0)

# use BERT to encode the text characters
def encode_text(text):
    # encode the text characters (not words)
    input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
    # get the hidden states from the model
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]
    # get the first token embedding
    features = last_hidden_states[:, 0, :].numpy()
    return features

# split dataset into train and test
X = dataset['plate']
y = dataset['label']

# encode the text
encoded = X.apply(encode_text)

In [None]:

from numba import cuda
import torch
import numpy as np
'''
# Initialize the GPU
cuda.select_device(0)

# use BERT to encode the text characters
@cuda.jit
def process_features(features, output):
    pos = cuda.grid(1)
    if pos < features.shape[0]:
        # process the features here as needed
        output[pos] = features[pos]

# split dataset into train and test
X = dataset['plate'].values  # convert to Numpy array
y = dataset['label'].values  # convert to Numpy array

# encode the text on the CPU
encoded = np.zeros((len(X), 768))  # assuming the feature size is 768
for i, text in enumerate(X):
    input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]
    encoded[i] = last_hidden_states[:, 0, :].numpy()

# process the encoded data on the GPU
output = np.zeros_like(encoded)
threadsperblock = 32
blockspergrid = (len(encoded) + (threadsperblock - 1)) // threadsperblock
process_features[blockspergrid, threadsperblock](encoded, output)
'''

In [5]:
print(f'encoded shape is {encoded.shape}')

# filter the encoded dataframe to only include the rows that are in the dataset dataframe
encoded_filtered = encoded.loc[dataset.index]

# reshape the encoded values to 1D array
encoded_plate = encoded_filtered.values.reshape(-1)

# add the encoded plate to the dataset
dataset['encoded_plate'] = encoded_plate
print(dataset.head())


encoded shape is (1336,)
             date     plate  label  \
94621  2013-06-22    I970SS      1   
63814  2012-06-02    PD4419      1   
30496  2011-06-23    173INF      1   
38111  2011-09-07  IAMA46ER      1   
59102  2012-04-19  SUPES429      1   

                                           encoded_plate  
94621  [[-0.6183268, 0.21730568, 0.17258556, -0.12839...  
63814  [[-0.9495514, -0.113612294, -0.21341404, -0.31...  
30496  [[-0.51866204, -0.13206097, 0.036276437, -0.32...  
38111  [[-0.77778316, -0.049498416, -0.15912797, -0.0...  
59102  [[-0.58588433, 0.053693242, -0.10338338, -0.14...  


In [6]:
# serialize the dataset as to not have to re-encode the text
import pickle

with open('dataset.pickle', 'wb') as f:
    pickle.dump(dataset, f)


In [10]:
# load the dataset
with open('dataset.pickle', 'rb') as f:
    dataset = pickle.load(f)


# train the model
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# since there are more rejected plates than accepted, we need to balance the dataset
from imblearn.over_sampling import RandomOverSampler

# ros = RandomOverSampler(random_state=0)
# X_resampled, y_resampled = ros.fit_resample(encoded.to_numpy().reshape(-1, 1), y)

# split the dataset into train and test
X_train, X_test, y_train, y_test = model_selection.train_test_split(encoded, y, test_size=0.2, random_state=0)

# print the distribution of the labels
print('\n\nTrain set distribution after: ', np.unique(y_train, return_counts=True))




Train set distribution after:  (array([0, 1]), array([  18, 1050]))


In [11]:
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')

print(f'encoded shape: {encoded.shape}')

# train the model using a random forest classifier
from sklearn.ensemble import RandomForestClassifier

# create the model
model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

model.fit(np.vstack(X_train), y_train)

# make predictions
y_pred = model.predict(np.vstack(X_train))
y_pred = model.predict(np.vstack(X_test))

# print the accuracy
print(f'accuracy: {accuracy_score(y_test, y_pred)}')

X_train shape: (1068,)
y_train shape: (1068,)
encoded shape: (1336,)
accuracy: 0.9925373134328358


In [9]:
# check accuracy and f1 score and confusion matrix
print(accuracy_score(y_test.tolist(), y_pred))
print(classification_report(y_test.tolist(), y_pred))
print(confusion_matrix(y_test.tolist(), y_pred))

0.9925373134328358
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      1.00       266

    accuracy                           0.99       268
   macro avg       0.50      0.50      0.50       268
weighted avg       0.99      0.99      0.99       268

[[  0   2]
 [  0 266]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
