In [None]:
from google.cloud import vision
from google.cloud import storage
from PIL import Image
import numpy as np
import os, sys
from helper import *
import csv  
from io import BytesIO
from IPython import display

# Set the environment variable that the Google Cloud client libraries read to locate the
# service-account key file.
# NOTE(review): the path is relative to the notebook's working directory and the key file name
# is hard-coded here - confirm the key itself is kept out of version control.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"../guppies-test-4c48569421d8.json"

### Read all file names from the Google Cloud Storage bucket. We can then cycle through the files in this list and write the results to a CSV file.

In [None]:
# List every object in the bucket. All images exist in both .jpg and .nef formats, so keep
# only the names containing "JPG" to avoid handling each photo twice.
all_files = ListAvaliableFiles("guppy_images")
all_jpg_files = [name for name in all_files if "JPG" in name]

# Load the saved random permutation used to assign images to each person.
# It was generated once with:
#     random_array = np.arange(len(all_jpg_files))
#     np.random.shuffle(random_array)
#     np.save('../Data/radom_array.npy', random_array)
random_array = np.load('../Data/radom_array.npy')

# The first 500 shuffled indices go to Jordan, the next 500 to Sunny.
Jordan_array, Sunny_array = random_array[0:500], random_array[500:1000]

# Translate the assigned indices back into file names.
Jordan_files = [all_jpg_files[index] for index in Jordan_array]
Sunny_files = [all_jpg_files[index] for index in Sunny_array]

### Open a file from the Google Cloud Storage bucket (with RetreiveImage) or from local disk (with LoadImage), crop it, read the label and correct the output.

In [None]:
# Run the whole label-reading pipeline on one image in verbose mode.
# The helpers used here (RetreiveImage, CroppedImage, ReadImage, FindErrors) come from
# `helper`; their exact behaviour is not visible in this notebook.
"""Retreive cloud image"""
# file = all_jpg_files[50]
# Name of a single example image inside the bucket.
file= '1301/Lower Lalaja/28A/100NCD60/DSC_1728.JPG'
image = RetreiveImage(file, verbose=True)

"""Read local image"""
# file = '/Users/jordan/Desktop/Guppies/Data/raw/DSC_1053.JPG'
# image = LoadImage(file)

# Crop the image, read the text from the crop, then post-correct the string that was read.
cropped_image = CroppedImage(image, verbose=True)
# ReadImage returns a pair; only the first element (the string) is used further down.
output_string, word_confidences = ReadImage(cropped_image, verbose=True)
label = FindErrors(output_string, verbose=True)
# Show the raw and corrected labels together for comparison.
print("Initial label:", output_string,
      "\nCorrected label:", label)

In [None]:
# Open the full (uncropped) `image` from the previous cell; as the last expression of the
# cell it is rendered as the cell's output.
Image.open(image)

### Write filename, initial prediction, corrected prediction and the manually inputted truth to the truth.csv file for a set of file names.
The files have been randomised and split.

In [None]:
# Manual labelling loop: for every file assigned to Sunny, show the cropped label, ask the
# user to type the true label, and append one row
# [filename, initial prediction, corrected prediction, truth] to truth.csv.
#
# newline='' is what the csv module requires for files handed to csv.writer; without it
# extra blank rows can appear in the output on platforms that translate line endings.
with open('../Data/truth.csv', 'a', newline='') as f:
    writer = csv.writer(f)

    for file in Sunny_files:
        # Read the data and predict the label.
        image = RetreiveImage(file, verbose=False)
        cropped_image = CroppedImage(image, verbose=False)

        # wait=True defers the clear until new output arrives, so the current crop stays
        # visible while the user types and is replaced by the next crop.
        display.display(Image.open(cropped_image))
        display.clear_output(wait=True)

        initial_prediction = ReadImage(cropped_image, verbose=False)[0]
        corrected_prediction = FindErrors(initial_prediction)

        # Provide truth
        truth = input("Truth:")

        writer.writerow([file, initial_prediction, corrected_prediction, truth])
        # Flush after every row so manually entered labels are not lost if the kernel dies
        # part-way through a long labelling session.
        f.flush()

### Read through all cloud files and write filename, initial prediction and corrected prediction to the predictions.csv file.

In [None]:
# Run the prediction pipeline over every JPG in the bucket and write one row per file
# (filename, raw prediction, corrected prediction) to predictions.csv.
files = all_jpg_files

# newline='' is required by the csv module for writer targets; it prevents spurious blank
# rows on platforms that translate line endings.
with open('../Data/predictions.csv', 'w', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(['filename', 'prediction', 'corrected'])

    for i, file in enumerate(files):
        # Progress counter, overwritten in place thanks to the carriage return.
        print(f'{i+1}/{len(files)}',end='\r')

        # Read the data
        image = RetreiveImage(file, verbose=False)
        cropped_image = CroppedImage(image, verbose=False)
        # NOTE(review): the opened image is never used. Kept only in case the helpers depend
        # on the state this call leaves `cropped_image` in - confirm and remove.
        Image.open(cropped_image)
        initial_prediction = ReadImage(cropped_image, verbose=False)[0]
        corrected_prediction = FindErrors(initial_prediction)

        # write the data
        # Use the name of the file that was actually processed. Indexing `all_files` with `i`
        # was wrong: `files` is the JPG-only subset, so positions in the two lists differ.
        writer.writerow([file, initial_prediction, corrected_prediction])

### Some code to reanalyse predictions once we have changed the algorithm

In [None]:
import csv

# Base name (without extension) of the CSV to re-process. Later cells read
# f'../Data/{pred_file}_new.csv', so this name must stay defined.
pred_file = 'Truth'

# Re-run FindErrors on the stored raw predictions and write the refreshed rows to
# '<pred_file>_new.csv'. Each output row is
# [filename, raw prediction, new corrected prediction, truth].
#
# The input is opened first so that a missing input file does not leave behind a freshly
# truncated output file. newline='' is what the csv module requires for both reader and
# writer files.
with open(f'../Data/{pred_file}.csv', 'r', newline='') as f:
    with open(f'../Data/{pred_file}_new.csv', 'w', newline='') as f_new:
        filereader = csv.reader(f)
        writer = csv.writer(f_new)
        for row in filereader:
            # Skip blank rows (present in files that were written without newline='').
            if not row:
                continue

            # Column 1 holds the raw prediction; column 3 holds the manually entered truth.
            new_prediction = FindErrors(row[1])
            writer.writerow([row[0], row[1], new_prediction, row[3]])


Now delete the old file and rename the new one.

### Some code to check the accuracy of the algorithm

In [None]:
import csv

# Compare the corrected prediction (column 2) with the truth (column 3) for every row of
# '<pred_file>_new.csv' and sort the file names into three lists:
#   correct_files   - corrected prediction equals the truth
#   invalid_files   - mismatch, and the corrected prediction is the string '1'
#   incorrect_files - every other mismatch
# For mismatches in the middle field of an 'a-b-c' label, each differing character pair
# (true, predicted) is collected in character_confusions.
correct_files = []
incorrect_files = []
invalid_files = []

character_confusions = []

# newline='' is what the csv module requires for files handed to csv.reader.
with open(f'../Data/{pred_file}_new.csv', 'r', newline='') as f:
    filereader = csv.reader(f)
    # start=1 so every message reports the same 1-based row number (the 'error' message
    # previously printed a 0-based number while the other messages were 1-based).
    for row_number, row in enumerate(filereader, start=1):
        if not row:
            continue

        filename, predicted_label, true_label = row[0], row[2], row[3]

        if true_label == predicted_label:
            correct_files.append(filename)
            continue

        # A corrected prediction of '1' is counted as invalid rather than incorrect.
        if predicted_label == '1':
            invalid_files.append(filename)
            print(row_number, "Invalid")
            continue

        incorrect_files.append(filename)

        true = true_label.split('-')
        pred = predicted_label.split('-')

        # Only labels with exactly three dash-separated fields can be compared field by field.
        if len(true) != 3 or len(pred) != 3:
            print(row_number, 'error')
            continue

        for i in range(3):
            if true[i] == pred[i]:
                continue
            print(row_number, true[i], pred[i])

            # Mistake in identity (middle field): record character-level confusions, but only
            # when both strings have the same length so positions line up.
            if (i == 1) and (len(true[i]) == len(pred[i])):
                for true_char, pred_char in zip(true[i], pred[i]):
                    if true_char != pred_char:
                        character_confusions.append((true_char, pred_char))

print("\nNumber Correct:", len(correct_files),
      "\nNumber Incorrect:", len(incorrect_files),
      "\nNumber Invalid:", len(invalid_files))

# Always produce an (n, 2) string array, even when no confusions were found, so that the
# column indexing done by later cells stays valid.
character_confusions = np.array(character_confusions, dtype=str).reshape(-1, 2)

In [None]:
# Accuracy summary.
#   attempted = correct + incorrect (invalid rows excluded)
#   total     = correct + incorrect + invalid
n_correct = len(correct_files)
n_attempted = n_correct + len(incorrect_files)
n_total = n_attempted + len(invalid_files)

# The labels say "%", so report percentages rather than 0-1 fractions. An empty denominator
# yields NaN instead of raising ZeroDivisionError.
attempted_percent = 100 * n_correct / n_attempted if n_attempted else float('nan')
total_percent = 100 * n_correct / n_total if n_total else float('nan')

print("Attempted % Correct:", attempted_percent)
print("Total % Correct:", total_percent)

In [None]:
# Persist the names of the incorrectly predicted files as a NumPy array.
incorrect_files_path = '../Data/incorrect_files.npy'
np.save(incorrect_files_path, np.array(incorrect_files))

Make the confusion matrix.

In [None]:
# Split the (true, predicted) character pairs into their two columns ...
true_chars = [pair[0] for pair in character_confusions]
pred_chars = [pair[1] for pair in character_confusions]

# ... and collect the sorted distinct characters of each column; these become the row and
# column labels of the confusion matrix.
true_chars_unique = np.unique(true_chars)
pred_chars_unique = np.unique(pred_chars)

In [None]:
# Count how often each true character (rows) was read as each predicted character (columns).
n_true, n_pred = len(true_chars_unique), len(pred_chars_unique)
confusion_matrix = np.zeros((n_true, n_pred))

for row_idx, true_char in enumerate(true_chars_unique):
    for col_idx, pred_char in enumerate(pred_chars_unique):
        # Boolean mask of the recorded pairs that are exactly (true_char, pred_char).
        is_pair = (character_confusions[:, 0] == true_char) & (character_confusions[:, 1] == pred_char)
        confusion_matrix[row_idx, col_idx] = np.sum(is_pair)

## Save the confusion matrix. This can be read by the code to directly choose the appropriate replacement.

In [None]:
# Save the matrix together with its row labels (true characters) and column labels
# (predicted characters), one .npy file each.
confusion_data = [confusion_matrix, true_chars_unique, pred_chars_unique]
confusion_paths = [
    '../Data/ConfusionData/confusion_matrix.npy',
    '../Data/ConfusionData/confusion_true_chars.npy',
    '../Data/ConfusionData/confusion_pred_chars.npy',
]

for confusion_path, confusion_array in zip(confusion_paths, confusion_data):
    np.save(confusion_path, confusion_array)

In [None]:
import matplotlib.pyplot as plt

# Reload the saved confusion data from disk and draw it as a heat map.
ConfusionMatrixPath = '../Data/ConfusionData'

confusion_matrix = np.load(ConfusionMatrixPath + '/confusion_matrix.npy')
true_chars_unique = np.load(ConfusionMatrixPath + '/confusion_true_chars.npy')
pred_chars_unique = np.load(ConfusionMatrixPath + '/confusion_pred_chars.npy')

fig, ax = plt.subplots()
heatmap = ax.imshow(confusion_matrix)

# One tick per character: true characters down the rows, predicted ones along the columns.
ax.set_yticks(np.arange(len(true_chars_unique)))
ax.set_yticklabels(true_chars_unique)
ax.set_xticks(np.arange(len(pred_chars_unique)))
ax.set_xticklabels(pred_chars_unique)

fig.colorbar(heatmap, ax=ax)
ax.set_xlabel("Predicted Character")
ax.set_ylabel("True Character")

### Reads image from error file array in full verbose mode.

In [None]:
# Re-run the pipeline in verbose mode on one file from an error list, for debugging.
# Note: this rebinds the notebook-level name `files`.
files = invalid_files

# Hard-coded pick: the sixth entry. Raises IndexError if the list has fewer than six items.
file = files[5]
image = RetreiveImage(file, verbose=True)

# Crop the image, read the text from the crop, then post-correct the string that was read.
cropped_image = CroppedImage(image, verbose=True)
# ReadImage returns a pair; only the first element (the string) is used further down.
output_string, word_confidences = ReadImage(cropped_image, verbose=True)
label = FindErrors(output_string, verbose=True)
# Show the raw and corrected labels together for comparison.
print("Initial label:", output_string,
      "\nCorrected label:", label)

In [None]:
# Open the full (uncropped) `image` of the file inspected in the previous cell; as the last
# expression of the cell it is rendered as the cell's output.
Image.open(image)

### Code used to combine truth_sunny and truth_Jordan files.

In [None]:
# files = ['../Data/truth_Jordan.csv', '../Data/truth_sunny.csv']

# whole_file = []

# for file in files:
#     with open(file, 'r') as f:
#         filereader = csv.reader(f)
#         for row in filereader:
#             if row != []:
#                 whole_file.append(row)


# with open('../Data/Truth.csv', 'w') as Truth:
#     writer = csv.writer(Truth)
#     for row in whole_file:
#         writer.writerow(row)

### Code used to download all incorrect images.

In [None]:
import random

truth_path = '../Data/Truth_new.csv'

# Call ErrorFiles once and reuse the result instead of parsing the CSV twice.
# This cell uses element 0 as the list of error files and element 2 as the list of correct
# files - presumably ErrorFiles returns them in that order; confirm against helper.
error_file_groups = ErrorFiles(truth_path)
error_files = error_file_groups[0]
correct_files = error_file_groups[2]

# Take a random sample of up to 200 correctly read images.
# The shuffle is unseeded, so the sample differs from run to run.
random.shuffle(correct_files)
correct_subset = correct_files[0:200]


def _download_blobs(bucket, blob_names, destination_dir):
    """Download each named blob from `bucket` into `destination_dir`.

    The local file name is the blob name with every "/" replaced by "-", so the bucket's
    folder structure is flattened into a single directory. Progress is printed as "i/total"
    after every download.
    """
    # Create the target directory up front so the first download cannot fail on a missing path.
    os.makedirs(destination_dir, exist_ok=True)
    total = len(blob_names)
    for i, blob_name in enumerate(blob_names):
        # Create a blob object from the filepath
        blob = bucket.blob(blob_name)
        # Download the file to a destination
        blob.download_to_filename('%s/%s' % (destination_dir, blob_name.replace("/", "-")))
        print("%i/%i" % (i + 1, total))


# Initialise the client and the bucket object once, not once per downloaded file.
storage_client = storage.Client()
bucket = storage_client.get_bucket("guppy_images")

_download_blobs(bucket, error_files, '../Data/ErrorImages')
_download_blobs(bucket, correct_subset, '../Data/CorrectImages')

In [None]:
# List every row whose true label (column 3) has a '0' character in its middle
# dash-separated field, printing that field and the 0-based row index.
# newline='' is what the csv module requires for files handed to csv.reader.
with open(f'../Data/{pred_file}_new.csv', 'r', newline='') as f:
    filereader = csv.reader(f)
    for n, row in enumerate(filereader):
        # Blank or short rows have no truth column; skip them instead of raising IndexError.
        if len(row) < 4:
            continue

        label_fields = row[3].split('-')
        # A truth label without a '-' has no middle field to inspect.
        if len(label_fields) < 2:
            continue

        identity = label_fields[1]
        if '0' in identity:
            print(identity, n)