## Import all the Necessary Libraries and Define Some Functions

In [None]:
#imports for file handling, imaging, OCR, hashing, persistence & data handling used below
import os
from PIL import Image, ImageDraw, ImageFilter, ImageEnhance
import imagehash
import pytesseract
#NOTE(review): hard-coded, machine-specific location of the tesseract binary (Homebrew cellar,
#version 5.2.0) -- this must be edited on any machine where tesseract lives elsewhere
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.2.0/bin/tesseract'
import ssl
#NOTE(review): swaps the default HTTPS context factory for the unverified one, so certificate
#verification is skipped for the urlopen() downloads below -- presumably a workaround for local
#certificate errors; confirm this is acceptable before reusing outside of a demo
ssl._create_default_https_context = ssl._create_unverified_context
import pickle
import pandas as pd
import numpy as np
from urllib.request import urlopen
from natsort import natsorted

#helper that persists any picklable variable to a local file for later use
def save_object(obj, filename):
    '''Help: Serialize the given object with pickle (highest available protocol) and write it to
    the given file path. A file already at that path is replaced. Prints a confirmation message
    and returns nothing.'''
    #binary write mode truncates, so any previous contents are discarded
    with open(filename, mode='wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"File saved at {filename}")

#counterpart helper that reads a pickled file back into a variable
def load_object(filename):
    '''Help: Open the file at the provided path, unpickle its contents, print a confirmation
    message and return the restored object. Unpickling can execute arbitrary code, so only use
    this on files you created yourself or otherwise trust.'''
    with open(filename, mode='rb') as pickle_file:
        restored = pickle.load(pickle_file)
    print(f"File loaded from {filename}")
    return restored

def find_potential_card_matches(parsed_title, verbose=False):
    '''Help: Given the "words" read from the card title by the OCR (a list of words, or a single
    whitespace-separated string), find all printings that could possibly match. Words that are not
    in unique_cardname_words, and single-character words, are discarded. Every name in
    valid_card_names that contains at least one remaining word (as a whole word) is a candidate,
    and every row of card_database with a candidate name is collected. Printings for which no
    image url is available are excluded, as are duplicate rows.

    Reads the module-level unique_cardname_words, valid_card_names and card_database.
    Returns a new Pandas dataframe (same columns and row order as card_database) with all
    potential matches; it is empty when nothing matches.'''

    #the docstring has always promised a string, so split one into words instead of
    #iterating over it character by character
    if isinstance(parsed_title, str):
        parsed_title = parsed_title.split()

    #keep only the OCR words that occur in real card names, and skip single letter
    #words to cut down on potential options
    valid_cardname_words = [word for word in parsed_title
                            if len(word) > 1 and word in unique_cardname_words]

    if verbose:
        print(f'OCR Result:{parsed_title} filtered to {valid_cardname_words}\n')

    #find every card name sharing at least one whole word with the valid OCR words;
    #a set makes the names unique and a single pass over the names covers all keywords
    keywords = set(valid_cardname_words)
    potential_matches = {name for name in valid_card_names
                         if not keywords.isdisjoint(name.split())}
    if verbose:
        print(f'Potential Matches: {potential_matches}')

    #select all printings of the candidate names that have an image url in one vectorised step
    #(DataFrame.append, used here previously, no longer exists in pandas 2.x)
    name_matches = card_database['name'].isin(potential_matches)
    has_image = card_database['image_url'].notna()

    #drop duplicate entries and return an independent copy, so that callers can add
    #columns to the result without touching card_database
    potential_matches_df = card_database.loc[name_matches & has_image].drop_duplicates().copy()

    return potential_matches_df
        
def initial_card_scan(local_img, verbose=False, crop_portion=0.15, basewidth=1000):
    '''Help: Given a PIL image, crop and use tesseract OCR to quickly read the title area on the card.

    local_img: PIL image of a whole card.
    verbose: when True, print the words that the OCR returned.
    crop_portion: fraction of the image height, measured from the top, that is kept as the
        title area (subject to how the card was photographed).
    basewidth: width in pixels that the cropped title is resized to before the OCR runs.

    Returns a tuple (parsed_title, local_img_title): the list of whitespace-separated "words"
    recognized by the OCR, even wrong ones, and the cropped & resized image that was scanned.'''

    #crop the image to just the strip at the top where the card title typically is written;
    #the bottom edge is rounded explicitly rather than handing PIL a float coordinate
    width, height = local_img.size
    title_bottom = int(round(height * crop_portion))
    local_img_title = local_img.crop((0, 0, width, title_bottom))

    #resize so that each image is basewidth px wide, maintaining aspect ratio
    #(never let the height collapse to 0, which resize() rejects)
    wpercent = basewidth / float(local_img_title.size[0])
    hsize = max(1, int(float(local_img_title.size[1]) * wpercent))
    #Image.ANTIALIAS no longer exists in Pillow 10+; LANCZOS is the same filter
    local_img_title = local_img_title.resize((basewidth, hsize), Image.LANCZOS)

    #use tesseract OCR to read the title
    parsed_title = pytesseract.image_to_string(local_img_title).split()

    if verbose:
        print(f'Parsed Characters: {parsed_title}')

    return parsed_title, local_img_title

def card_match_scan(local_img, potential_matches_df, verbose=False):
    '''Help: Given the local_img card & the dataframe of potential matches, rank the potential
    matches by how closely their reference image resembles the local card.

    local_img: PIL image of the card to identify.
    potential_matches_df: dataframe with (at least) 'multiverse_id' and 'image_url' columns; the
        image behind every url is downloaded, so this needs network access.
    verbose: when True, print progress for every image that is hashed.

    Side effect: a 'hash_results' column holding the hash difference of every row is added to
    potential_matches_df itself (other notebook cells read that column afterwards).
    Returns the (up to) 15 rows with the smallest hash difference, most likely match first.'''

    #hash the original local image
    local_img_hash = imagehash.average_hash(local_img)

    if verbose:
        print('Local image hash generated...')

    #one hash difference per potential match, in row order
    hash_results = []

    #for each potential match, hash the card image and compare to the local image
    for _, potential_match in potential_matches_df.iterrows():
        #get the multiverse ID of the potential match
        multiverse_id = potential_match['multiverse_id']

        #get the image url, pull the image file and hash it; the with-block closes the
        #connection as soon as the hash has been computed
        img_url = potential_match['image_url']
        with urlopen(img_url) as response:
            ref_img_hash = imagehash.average_hash(Image.open(response))

        #subtracting two image hashes gives their difference: smaller means more alike
        hash_similarity = local_img_hash - ref_img_hash
        hash_results.append(hash_similarity)

        if verbose:
            print(f'{multiverse_id} opened & hashed with a score of {hash_similarity} ...')

    #add the hash_results to the original dataframe
    potential_matches_df['hash_results'] = np.array(hash_results, dtype=int)

    #sort the dataframe in likeliness of card match, and grab the top 15 most likely hash results
    initial_matches = potential_matches_df.sort_values('hash_results').iloc[:15]

    return initial_matches
    

## Load our prepared Database & Cardname List

In [None]:
#load the packaged card data
card_database = load_object('card_database.p')
unique_cardname_words = load_object('unique_cardname_words.p')
valid_card_names = load_object('valid_card_names.p')
#and create the necessary folder structure; exist_ok lets this cell be re-run without
#failing on folders that were already created by an earlier run
for folder in ('Demo Images', 'Processed OCR Input Images', 'Raw OCR Input Images'):
    os.makedirs(folder, exist_ok=True)

## Create a sample set of common cards & images to use

In [None]:
#choose the card whose printings (with an image) will be used for the demo

card_name = 'Llanowar Elves'       #'Giant Growth', 'Cancel', 'Llanowar Elves' etc..

#select every database row whose name is exactly the chosen card
all_matches = card_database.loc[card_database['name'] == card_name]

#collect the index labels of the rows lacking an image url, then drop those rows
no_img_entries = all_matches.loc[all_matches['image_url'].isna()].index
result = all_matches.drop(index=no_img_entries)

#report how many printings remain
print(f"{len(result)} printings of {card_name} found!")


In [None]:
#display the dataframe of printings (with images) selected above as the cell output
result

In [None]:
#now pull & save each image file for demo use
#walk the needed columns in lockstep so every row is handled exactly once, even if a
#multiverse_id were to appear more than once in result
for multiverse_id, image_url, name, set_name in zip(result['multiverse_id'], result['image_url'],
                                                     result['name'], result['set_name']):
    #load the image file from the internet; the with-block closes the connection afterwards
    with urlopen(image_url) as response:
        image_file = Image.open(response)
        #save a copy locally with the multiverse_id as the filename
        image_file.save(f'Demo Images/{multiverse_id}.png')
    #print a brief message about the results
    print(f"{name} from {set_name} saved as {multiverse_id}.png ({image_file.size})")

## Run each card through the OCR and see what comes back

In [None]:
#run the OCR over the title area of every demo image exactly as it was downloaded
demo_files = [entry for entry in natsorted(os.listdir('Demo Images')) if entry.endswith('.png')]
for card in demo_files:
    #load the saved card image
    img = Image.open(f'Demo Images/{card}')
    #crop to the title area and read it with tesseract OCR
    ocr_result, scanned_img = initial_card_scan(img, verbose=False)
    #keep a copy of exactly what the OCR was given, for reference
    scanned_img.save(f'Raw OCR Input Images/{card}')

    print(f"{card} raw OCR scan found: {ocr_result}")
        

## Process the cards then run them through the OCR scanner again

In [None]:
#pull each card image saved locally, boost brightness & contrast, then OCR the title area again
for card in natsorted(os.listdir('Demo Images')):
    #only the downloaded .png card images are of interest
    if not card.endswith('.png'):
        continue
    img = Image.open(f'Demo Images/{card}')
    #increase the image brightness by 25%
    img = ImageEnhance.Brightness(img).enhance(1.25)
    #increase the image contrast by 25%
    img = ImageEnhance.Contrast(img).enhance(1.25)
    #run the card through the OCR scan again
    ocr_result, scanned_img = initial_card_scan(img, False)

    #save for reference the image that was scanned
    scanned_img.save(f'Processed OCR Input Images/{card}')

    #label the output as processed so it is not mistaken for the raw scan output
    print(f"{card} processed OCR scan found: {ocr_result}")


## Take the best-case OCR result and find all close MTG cards

In [None]:
#take any of the above results that look decent
#NOTE: the two values below are runnable stand-ins (the words of the chosen card name and the
#first downloaded printing). Replace them with a word list printed by one of the OCR scans
#and the multiverse_id (the file name without .png) of the demo image it was read from.
ocr_result = card_name.split()
current_multiverse_id = result['multiverse_id'].iloc[0]

#determine all possible MTG cards that could be a match
potential_matches_df = find_potential_card_matches(ocr_result)
potential_matches_df

## From the set of close card matches, use image hashing to find the closest one

In [None]:
#reopen the demo image that was chosen, so that it can be hashed
chosen_image_path = f"Demo Images/{current_multiverse_id}.png"
local_img = Image.open(chosen_image_path)

#hash it against every potential match (printing progress) and keep the closest matches
hashing_results = card_match_scan(local_img, potential_matches_df, verbose=True)



In [None]:
#rank every potential match by its 'hash_results' score (the column must already be present on
#potential_matches_df) and keep the 15 lowest-scoring rows
hashed_matches = potential_matches_df.sort_values(by='hash_results').head(15)

#show only the identifying columns of those rows, without the dataframe index
report_columns = ['multiverse_id', 'image_url', 'hash_results']
print(hashed_matches[report_columns].to_string(index=False))

## Thanks for watching! Stay tuned for a better method to actually do this!
