In [1]:
# This is the main notebook used to generate training/validation datasets.

In [2]:
%matplotlib inline

In [3]:
import os
import sys
import random

from torchvision import transforms
import torch.utils.data as data
from scipy.ndimage.filters import gaussian_filter
import cv2

from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import create_dataset

import json
import unicodedata
import string
import itertools
import bisect
from functools import partial

In [4]:
# Resolve the neural-net helper library relative to the current user's home
# directory and append it to the module search path so that modules located
# there can be imported by later cells.
home_directory = os.path.expanduser('~')
nn_library_path = ''.join([
    home_directory,
    '/Documents/HarveyMuddWork/Neural_Nets_Research/neural_nets_research/neural_nets_library',
])
sys.path.append(nn_library_path)

In [5]:
from dataset import greyscale_image_loader, tightest_image_crop, square_padding, vertical_scale_preserve_aspect_ratio

In [6]:
def show(img):
    """Draw ``img`` in a new matplotlib figure as a greyscale picture.

    The input is converted with ``np.asarray`` and rendered with the 'gray'
    colour map and nearest-neighbour interpolation.
    """
    pixels = np.asarray(img)
    plt.figure()
    plt.imshow(pixels, cmap='gray', interpolation='nearest')

In [7]:
def overlap_concat(img1, img2, overlap_amt):
    """Join two images side by side with their inner edges overlapping.

    The last ``overlap_amt`` columns of ``img1`` are added element-wise to the
    first ``overlap_amt`` columns of ``img2``; the result is placed between the
    untouched left part of ``img1`` and the untouched right part of ``img2``.

    Returns:
        (stitched, overlapping): the concatenated array and the number of
        pixels in the summed strip whose value equals 254.
    """
    # NOTE(review): 254 presumably marks two 255-valued pixels colliding via
    # uint8 wrap-around (255 + 255 -> 254); assumes 0/255 uint8 inputs - confirm.
    width1 = len(img1[0])
    seam_start = width1 - overlap_amt

    blended = img1[:, seam_start:width1] + img2[:, :overlap_amt]
    overlapping = np.count_nonzero(blended == 254)

    stitched = np.concatenate(
        (img1[:, :seam_start], blended, img2[:, overlap_amt:]),
        axis=1,
    )
    return stitched, overlapping

In [8]:
def get_concat(img1, img2, max_over):
    """Concatenate two images, overlapping them until their pixels collide.

    The overlap width is increased one column at a time (up to, but not
    including, the width of the narrower image) until ``overlap_concat``
    reports at least ``max_over`` colliding pixels. If that happens at the
    very first width the result is returned as is; otherwise the overlap is
    reduced by a random amount in [1, 9] (never below 1) and the images are
    joined at that width. When no width reaches ``max_over`` the search is
    repeated with a threshold of 1, and if that also fails the images are
    joined with a one-column overlap.
    """
    search_limit = min(len(img1[0]), len(img2[0]))
    required = max_over

    while True:
        for shift in range(1, search_limit):
            merged, touching = overlap_concat(img1, img2, shift)
            if touching < required:
                continue

            if shift > 1:
                # Back off by a random number of columns.
                backoff = random.randrange(1, 10)
                merged, _ = overlap_concat(img1, img2, max(1, shift - backoff))
            return merged

        if required == 1:
            # Nothing ever collided: fall back to the minimal overlap.
            return overlap_concat(img1, img2, 1)[0]

        # Retry the whole search with the weakest possible threshold.
        required = 1

In [9]:
def np_tightest_image_crop(img):
    """Crop a 2-D image array to the bounding box of its non-zero pixels.

    Args:
        img: numpy array whose first two axes are rows and columns.

    Returns:
        The slice of ``img`` spanning the first to last row and the first to
        last column that contain a non-zero value (inclusive).

    Raises:
        ValueError: if ``img`` contains no non-zero pixel, since there is no
            bounding box to crop to.
    """
    nonzero_indices = img.nonzero()
    rows, cols = nonzero_indices[0], nonzero_indices[1]

    if rows.size == 0:
        raise ValueError('cannot crop an image without any non-zero pixel')

    top_i, bottom_i = rows.min(), rows.max()
    left_i, right_i = cols.min(), cols.max()

    return img[top_i:bottom_i+1, left_i:right_i+1]

In [10]:
def choices(elements, probabilities):
    """Draw one item from ``elements`` according to ``probabilities``.

    Args:
        elements: indexable, sized collection of candidates.
        probabilities: one weight per element; expected to sum to 1.

    Returns:
        The element whose cumulative-probability interval contains a single
        ``random.random()`` draw.

    Raises:
        IndexError: if ``elements`` is empty.
    """
    if len(elements) == 0:
        raise IndexError('cannot choose from an empty sequence')

    cumdist = list(itertools.accumulate(probabilities))
    # Floating-point rounding can leave the last cumulative value slightly
    # below 1 (e.g. ten times 0.1), in which case bisect may point one past
    # the end; clamp so such a draw selects the final element.
    index = min(bisect.bisect(cumdist, random.random()), len(elements) - 1)
    return elements[index]

In [11]:
def add_random_dot(img):
    """Add a blurred blob of noise near the bottom-right corner of ``img``.

    A single impulse with a random strength in [50, 150) is placed within the
    last five rows and columns, blurred with a Gaussian filter (sigma 5) and
    thresholded at 0.5. Pixels of the resulting blob that do not coincide with
    existing non-zero pixels are set to the image's maximum value.

    Args:
        img: 2-D numpy array (the callers in this file pass uint8 images).

    Returns:
        A new array: ``img`` plus the blob.
    """
    height, width = img.shape
    hDot, wDot = random.randrange(height-5, height), random.randrange(width - 5, width)

    # The blur must be computed in floating point: gaussian_filter produces
    # its output in the dtype of its input, so an integer buffer (as obtained
    # from zeros_like on a uint8 image) discards the fractional values the
    # 0.5 threshold below relies on.
    dot = np.zeros(img.shape, dtype=float)
    dot[hDot, wDot] = random.random() * 100 + 50
    dot = gaussian_filter(dot, 5)

    # Keep only blob pixels that fall on background so existing strokes are
    # neither altered nor overflowed by the addition below.
    blob = np.logical_and(dot > 0.5, np.logical_not(img))
    blob = blob.astype('uint8')

    return img + blob*np.amax(img)

def close_to_line_segment(tolerance, x_mid, y_mid, slope, length_squared):
    """Build a mask function for pixels lying close to a short line segment.

    Args:
        tolerance: maximum allowed deviation ``|(x - x_mid) * slope + y_mid - y|``.
        x_mid, y_mid: coordinates of the segment's midpoint.
        slope: change in ``y`` per unit of ``x``.
        length_squared: maximum squared distance from the midpoint.

    Returns:
        A function ``f(x, y)`` taking two coordinate arrays of equal shape and
        returning an array shaped and typed like ``x`` that is 1 where the
        point is within ``tolerance`` of the line and within
        ``length_squared`` of the midpoint, and 0 elsewhere.
    """
    def close_to_line_segment2(x, y):
        # Work in floating point: with unsigned coordinate arrays (such as the
        # uint8 indices np.fromfunction can supply) the differences and their
        # squares would wrap around instead of going negative or growing.
        x_float = np.asarray(x, dtype=float)
        y_float = np.asarray(y, dtype=float)

        result = np.ones_like(x)
        result[(x_float - x_mid)**2 + (y_float - y_mid)**2 > length_squared] = 0
        result[abs((x_float - x_mid) * slope + y_mid - y_float) > tolerance] = 0
        return result

    return close_to_line_segment2

def add_random_line(img):
    """Add a short random line segment of noise to ``img``.

    The segment's midpoint, slope (in [-5, 5)), half-length (4 to 9 pixels)
    and thickness tolerance (0.1, 1.1 or 2.1) are drawn at random. Segment
    pixels that do not coincide with existing non-zero pixels are set to the
    image's maximum value.

    Args:
        img: 2-D numpy array (the callers in this file pass uint8 images).

    Returns:
        A new array: ``img`` plus the line segment.
    """
    tolerance = random.randrange(3) + 0.1
    height, width = img.shape
    x_mid, y_mid = random.randrange(height), random.randrange(width)
    slope = (random.random() - 0.5) * 10
    length_squared = random.randrange(4,10) ** 2

    # Generate the coordinate grids as signed integers: uint8 coordinates wrap
    # around in the subtractions done by the mask function and cannot
    # represent indices above 255. Only the final 0/1 mask is stored as uint8.
    in_segment = close_to_line_segment(tolerance, x_mid, y_mid, slope, length_squared)
    line = np.fromfunction(in_segment, (height, width), dtype=int).astype('uint8')

    # Do not draw over existing strokes (also avoids overflow in the sum below).
    line[np.logical_and(line, img)] = 0

    return img + line*np.amax(img)
    
def add_noise(img):
    """Apply zero to three random noise artefacts to ``img`` (a numpy array).

    The number of artefacts is 0, 1, 2 or 3 with probability 0.75, 0.15, 0.05
    and 0.05 respectively; each artefact is, with equal chance, a random dot
    or a random line.
    """
    artefact_count = choices(range(4), [0.75, 0.15, 0.05, 0.05])

    for _ in range(artefact_count):
        painter = add_random_dot if random.random() < 0.5 else add_random_line
        img = painter(img)

    return img

In [27]:
class TextDataset(object):
    """Synthetic text-image dataset assembled from single-character images.

    On construction the character image folders are listed and split 80/20
    into training and validation pools, and for every requested sample a text
    label (a random number or a dictionary word) plus one image path per
    character is drawn. Images are rendered lazily by ``generate_image`` and
    written out in bulk by ``generate_words``.

    Attributes:
        training_img_paths / validation_img_paths: per sample, the list of
            character image paths.
        training_text_labels / validation_text_labels: per sample, the label.
        tuple_mode: per character key, whether its folders were given as
            ``(probability, folder)`` tuples. The value recorded is that of
            the last folder entry processed for the key.
        seed, deskewed, noise: the corresponding constructor arguments.
        deskew_matrix: affine matrix used for deskewing (only set when
            ``deskewed`` is true).
    """

    # Possible lengths of generated numbers and their probabilities.
    _POSSIBLE_NUM_DIGITS = [1, 2, 3, 4, 5]
    _NUM_DIGIT_DIST = [0.02, 0.2, 0.6, 0.1, 0.08]

    def __init__(self, word_dictionary, image_folder_names,
                 number_of_training_data_points=500000, number_of_validation_data_points=10000, 
                 number_prob=0.5, deskewed=True, noise=True, seed=1000, variation_characters=None):
        """Sample the labels and character image paths of both splits.

        Args:
            word_dictionary: sequence of words to draw non-numeric labels from.
            image_folder_names: maps each character to a list of folders,
                each either a path or a ``(probability, path)`` tuple.
            number_of_training_data_points: number of training samples.
            number_of_validation_data_points: number of validation samples.
            number_prob: probability that a sample is a number, not a word.
            deskewed: whether images whose path contains 'SanbornDigits' are
                deskewed when rendered.
            noise: whether random noise is added to rendered images.
            seed: seed for Python's global ``random`` generator.
            variation_characters: optional mapping; each word character is
                replaced by its mapped value with probability 0.5. Characters
                missing from the mapping are left unchanged.
        """
        self.seed = seed
        self.deskewed = deskewed
        self.noise = noise
        self.tuple_mode = {}

        if self.deskewed:
            self.deskew_matrix = np.array([[1, 0, 0], [-0.51, 1, 32*0.51/2]])

        training_image_path_names = {}
        validation_image_path_names = {}

        for key, folder_list in image_folder_names.items():
            training_image_path_names[key] = []
            validation_image_path_names[key] = []

            for folder in folder_list:
                weighted = isinstance(folder, tuple)
                self.tuple_mode[key] = weighted
                prob, folder_name = folder if weighted else (None, folder)

                # os.listdir returns entries in arbitrary order; sort them so
                # the training/validation split is the same on every run.
                image_paths = [os.path.join(folder_name, file)
                               for file in sorted(os.listdir(folder_name))]
                split = int(0.8*len(image_paths))

                if weighted:
                    training_image_path_names[key].append((prob, image_paths[:split]))
                    validation_image_path_names[key].append((prob, image_paths[split:]))
                else:
                    training_image_path_names[key].extend(image_paths[:split])
                    validation_image_path_names[key].extend(image_paths[split:])

        # Seed before sampling so the drawn labels and image paths are
        # reproducible for a given seed.
        self.reset_seed()

        self.training_img_paths, self.training_text_labels = self._build_samples(
            number_of_training_data_points, training_image_path_names,
            word_dictionary, number_prob, variation_characters)
        self.validation_img_paths, self.validation_text_labels = self._build_samples(
            number_of_validation_data_points, validation_image_path_names,
            word_dictionary, number_prob, variation_characters)

        # Re-seed so image rendering starts from the same generator state
        # regardless of how many samples were drawn above.
        self.reset_seed()

    @staticmethod
    def _random_number_label(num_digits):
        """Return a random number with exactly ``num_digits`` digits, as text."""
        # randrange excludes its stop value, so the stop must be 10**num_digits
        # for the largest number (9, 99, 999, ...) to be reachable. Leading
        # zeros, and therefore the number 0, are never produced.
        return str(random.randrange(10**(num_digits - 1), 10**num_digits))

    def _pick_image(self, image_path_names, char):
        """Choose one image path for ``char`` from the given split's pools."""
        candidates = image_path_names[char]

        if self.tuple_mode[char]:
            # Weighted folders: first pick a folder, then a file inside it.
            probabilities = [prob for prob, _ in candidates]
            folders = [paths for _, paths in candidates]
            candidates = choices(folders, probabilities)

        return random.choice(candidates)

    def _build_samples(self, count, image_path_names, word_dictionary,
                       number_prob, variation_characters):
        """Draw ``count`` samples; returns ``(image path lists, labels)``."""
        img_paths = []
        text_labels = []

        for _ in range(count):
            if random.random() < number_prob:
                num_digits = choices(self._POSSIBLE_NUM_DIGITS, self._NUM_DIGIT_DIST)
                label = self._random_number_label(num_digits)
            else:
                label = ""

                for char in random.choice(word_dictionary):
                    if variation_characters is not None and random.random() < 0.5:
                        char = variation_characters.get(char, char)

                    label += char

            text_labels.append(label)
            img_paths.append([self._pick_image(image_path_names, char) for char in label])

        return img_paths, text_labels

    def generate_words(self, output_folder, output_path, training=False):
        """Render every sample of one split and build a dataset from them.

        Each image is saved as '<index>.png' in ``output_folder``; the images
        and labels are then ordered by image width and handed to
        ``create_dataset.createDataset`` together with ``output_path``.
        Both splits use the same file names, so rendering them into the same
        folder overwrites files with equal indices.
        """
        if training:
            num_images = len(self.training_img_paths)
            labels = self.training_text_labels
        else:
            num_images = len(self.validation_img_paths)
            labels = self.validation_text_labels

        def path_by_index(n):
            return os.path.join(output_folder, str(n) + '.png')

        for i in range(num_images):
            self.generate_image(i, training).save(path_by_index(i))

        def width_by_index(n):
            return greyscale_image_loader(path_by_index(n)).size[0]

        images_by_width_index = sorted(range(num_images), key=width_by_index)
        image_paths_by_width = [path_by_index(n) for n in images_by_width_index]
        text_labels_by_width = [labels[n] for n in images_by_width_index]
        create_dataset.createDataset(output_path, image_paths_by_width, text_labels_by_width)

    def _load_character(self, path, zero_padding):
        """Load one character image as a padded array 32 pixels high.

        The image is cropped to its non-zero content, scaled to a height of
        ``32 - 2 * zero_padding``, zero-padded by ``zero_padding`` on every
        side and, when deskewing is enabled and the path contains
        'SanbornDigits', warped with ``deskew_matrix``.
        """
        img = np.asarray(greyscale_image_loader(path))
        img = Image.fromarray(np_tightest_image_crop(img))
        img = vertical_scale_preserve_aspect_ratio(img, 32 - (2*zero_padding))
        img = np.pad(np.asarray(img), zero_padding, 'constant', constant_values=0)

        if self.deskewed and 'SanbornDigits' in path:
            h, w = img.shape
            img = cv2.warpAffine(img, self.deskew_matrix, (w,h), flags=cv2.INTER_NEAREST)

        return img

    def generate_image(self, i, training=False):
        """Render sample ``i`` of the chosen split as a PIL image.

        The character images are loaded, joined left to right with a random
        overlap threshold and, if enabled, decorated with random noise.

        Raises:
            IndexError: if ``i`` is out of range or the sample has no
                character images (e.g. an empty word).
        """
        zero_padding = random.randrange(1,3)

        if training:
            curr_img_paths = self.training_img_paths[i]
        else:
            curr_img_paths = self.validation_img_paths[i]

        if not curr_img_paths:
            raise IndexError('sample %d has no character images' % i)

        img = self._load_character(curr_img_paths[0], zero_padding)

        rand_over = random.randrange(1,3)

        for path in curr_img_paths[1:]:
            img2 = self._load_character(path, zero_padding)
            img = get_concat(img, img2, rand_over)

        if self.noise:
            img = add_noise(img)

        return Image.fromarray(img)

    def reset_seed(self):
        """Re-seed Python's global ``random`` generator with ``self.seed``."""
        random.seed(self.seed)

In [13]:
all_letters = string.ascii_letters

def unicodeToAscii(s):
    """Reduce ``s`` to plain ASCII letters.

    The string is decomposed (NFD), combining marks (category 'Mn') are
    dropped, and every remaining character that is not an ASCII letter is
    discarded.
    """
    kept = []

    for c in unicodedata.normalize('NFD', s):
        if unicodedata.category(c) == 'Mn':
            continue
        if c in all_letters:
            kept.append(c)

    return ''.join(kept)

def make_dictionary(dictionary_file):
    """Read a word list (one word per line) into upper-case ASCII words.

    Lines that are not purely alphabetic are skipped. The remaining words are
    reduced to ASCII letters with ``unicodeToAscii`` and upper-cased.

    Args:
        dictionary_file: path of the text file to read.

    Returns:
        List of non-empty upper-case words, in file order.
    """
    word_dictionary = []

    with open(dictionary_file) as dictionary:
        for line in dictionary:
            line = line.strip()

            if not line.isalpha():
                continue

            word = unicodeToAscii(line).upper()

            # An alphabetic word made only of non-ASCII letters reduces to an
            # empty string; keeping it would yield a sample with no characters.
            if word:
                word_dictionary.append(word)

    return word_dictionary

In [14]:
# Folder names of the non-Sanborn letter images, listed in the order A..Z.
not_sanborn_folder_codes = ['B10', 'B11', 'B12', 'B13', 'B14', 'B15', 'B16', 'B17', 'B18', 'B19',
                            'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27', 'C28', 'C29',
                            'D30', 'D31', 'D32', 'D33', 'D34', 'D35']

# Each dictionary maps a character to a one-element list holding the absolute
# path of the folder containing that character's images.
not_sanborn_char_folder_names = {}
sanborn_char_folder_names = {}

for c, code in zip(string.ascii_uppercase, not_sanborn_folder_codes):
    not_sanborn_char_folder_names[c] = ['/home/mehdi2277/Documents/HarveyMuddWork/Clinic/by_class/' + code]
    sanborn_char_folder_names[c] = ['/media/mehdi2277/MyFiles/large_datasets/text_classification/labeled_letters/' + c]

sanborn_digits = {}

for i in range(10):
    digit = str(i)
    sanborn_digits[digit] = ['/media/mehdi2277/MyFiles/large_datasets/text_classification/SanbornDigits/' + digit]

In [15]:
# Character -> folder list using only the Sanborn letter and digit folders.
pure_sanborn_words = {}
pure_sanborn_words.update((c, sanborn_char_folder_names[c]) for c in string.ascii_uppercase)
pure_sanborn_words.update((str(i), sanborn_digits[str(i)]) for i in range(10))

In [16]:
# Character -> folder list in which every letter draws with equal weight (0.5)
# from its Sanborn folder and its non-Sanborn folder; digits use the Sanborn
# digit folders only.
mixed_sanborn_words = {}

for c in string.ascii_uppercase:
    candidate_folders = sanborn_char_folder_names[c] + not_sanborn_char_folder_names[c]
    mixed_sanborn_words[c] = [(0.5, folder_name) for folder_name in candidate_folders]

for i in range(10):
    digit = str(i)
    mixed_sanborn_words[digit] = sanborn_digits[digit]

In [None]:
# Character -> folder list where upper-case keys point at the Sanborn letter
# folders and lower-case keys at the non-Sanborn folder of the same letter;
# digits use the Sanborn digit folders.
mixed_variation_sanborn_words = {c: sanborn_char_folder_names[c] for c in string.ascii_uppercase}
mixed_variation_sanborn_words.update(
    {c: not_sanborn_char_folder_names[c.upper()] for c in string.ascii_lowercase})
mixed_variation_sanborn_words.update(
    {str(i): sanborn_digits[str(i)] for i in range(10)})

# Maps every upper-case letter to its lower-case counterpart.
mixed_variation_characters = dict(zip(string.ascii_uppercase, string.ascii_lowercase))

In [17]:
# Character -> folder list in which only J, Q, X and Z draw with equal weight
# (0.5) from both their Sanborn and non-Sanborn folders; every other letter
# and all digits use the Sanborn folders only.
mixed_rare_sanborn_words = {}

for c in string.ascii_uppercase:
    if c in ('J', 'Q', 'X', 'Z'):
        mixed_rare_sanborn_words[c] = [
            (0.5, folder_name)
            for folder_name in sanborn_char_folder_names[c] + not_sanborn_char_folder_names[c]
        ]
    else:
        mixed_rare_sanborn_words[c] = sanborn_char_folder_names[c]

# The digits belong in this dictionary; previously they were written into
# mixed_sanborn_words, leaving mixed_rare_sanborn_words without digit entries.
for i in range(10):
    mixed_rare_sanborn_words[str(i)] = sanborn_digits[str(i)]

In [None]:
# Character -> folder list where upper-case keys point at the Sanborn letter
# folders and lower-case keys at the non-Sanborn folder of the same letter,
# except that j, q, x and z get no lower-case entry; digits use the Sanborn
# digit folders.
mixed_rare_variation_sanborn_words = {}

for c in string.ascii_uppercase:
    mixed_rare_variation_sanborn_words[c] = sanborn_char_folder_names[c]
    
for c in string.ascii_lowercase:
    # c is lower-case here, so it must be upper-cased before comparing with
    # the excluded letters; comparing it directly never matched.
    if c.upper() in ('J', 'Q', 'X', 'Z'):
        continue
    
    mixed_rare_variation_sanborn_words[c] = not_sanborn_char_folder_names[c.upper()]
    
for i in range(10):
    mixed_rare_variation_sanborn_words[str(i)] = sanborn_digits[str(i)]

# Maps every upper-case letter to its lower-case counterpart, except J, Q, X
# and Z, which map to themselves.
mixed_rare_variation_characters = {}

for c in string.ascii_uppercase:
    if c in ('J', 'Q', 'X', 'Z'):
        mixed_rare_variation_characters[c] = c
    else:
        mixed_rare_variation_characters[c] = c.lower()

In [18]:
# Word sources: the system word list, plus an empty list for runs that should
# not draw from a dictionary.
dictionary_file = '/usr/share/dict/words'
use_word_dictionary = make_dictionary(dictionary_file)
no_word_dictionary = list()

In [31]:
# Build the mixed Sanborn dataset and write the training split followed by the
# validation split. Both splits are rendered into the same image folder.
output_image_folder = '/media/mehdi2277/MyFiles/large_datasets/text_classification/produced_words'

sanborn_dataset = TextDataset(
    use_word_dictionary,
    mixed_sanborn_words,
    number_of_training_data_points=500000,
    number_of_validation_data_points=10000,
    number_prob=0.5,
    deskewed=False,
    noise=False,
)

for database_path, is_training in (
    ('/media/mehdi2277/MyFiles/large_datasets/text_classification/lmdb_files/mixed_sanborn_word_training', True),
    ('/media/mehdi2277/MyFiles/large_datasets/text_classification/lmdb_files/mixed_sanborn_word_validation', False),
):
    sanborn_dataset.generate_words(output_image_folder, database_path, training=is_training)

Written 1000 / 500000
Written 2000 / 500000
Written 3000 / 500000
Written 4000 / 500000
Written 5000 / 500000
Written 6000 / 500000
Written 7000 / 500000
Written 8000 / 500000
Written 9000 / 500000
Written 10000 / 500000
Written 11000 / 500000
Written 12000 / 500000
Written 13000 / 500000
Written 14000 / 500000
Written 15000 / 500000
Written 16000 / 500000
Written 17000 / 500000
Written 18000 / 500000
Written 19000 / 500000
Written 20000 / 500000
Written 21000 / 500000
Written 22000 / 500000
Written 23000 / 500000
Written 24000 / 500000
Written 25000 / 500000
Written 26000 / 500000
Written 27000 / 500000
Written 28000 / 500000
Written 29000 / 500000
Written 30000 / 500000
Written 31000 / 500000
Written 32000 / 500000
Written 33000 / 500000
Written 34000 / 500000
Written 35000 / 500000
Written 36000 / 500000
Written 37000 / 500000
Written 38000 / 500000
Written 39000 / 500000
Written 40000 / 500000
Written 41000 / 500000
Written 42000 / 500000
Written 43000 / 500000
Written 44000 / 5000

Written 347000 / 500000
Written 348000 / 500000
Written 349000 / 500000
Written 350000 / 500000
Written 351000 / 500000
Written 352000 / 500000
Written 353000 / 500000
Written 354000 / 500000
Written 355000 / 500000
Written 356000 / 500000
Written 357000 / 500000
Written 358000 / 500000
Written 359000 / 500000
Written 360000 / 500000
Written 361000 / 500000
Written 362000 / 500000
Written 363000 / 500000
Written 364000 / 500000
Written 365000 / 500000
Written 366000 / 500000
Written 367000 / 500000
Written 368000 / 500000
Written 369000 / 500000
Written 370000 / 500000
Written 371000 / 500000
Written 372000 / 500000
Written 373000 / 500000
Written 374000 / 500000
Written 375000 / 500000
Written 376000 / 500000
Written 377000 / 500000
Written 378000 / 500000
Written 379000 / 500000
Written 380000 / 500000
Written 381000 / 500000
Written 382000 / 500000
Written 383000 / 500000
Written 384000 / 500000
Written 385000 / 500000
Written 386000 / 500000
Written 387000 / 500000
Written 388000 /