# Data Wrangler for Pokemon Identifier Project

Note: Place any additional gathered images into Tmp Directory

In [1]:
import os
import string
import csv
import re
import requests
import shutil
import tensorflow as tf
import numpy as np
from bs4 import BeautifulSoup
from PIL import Image
from imutils import paths #used to get the paths of all images in a dir

## Helper Functions

### Global Values

In [2]:
#whether or not to gather images from the web
gatherFromWeb = True

#maximum number of frames to extract from each gif (0 means no limit -- every frame is extracted)
numFramesExtractGif = 0

#generations of pokemon to prepare for the final dataset
generationsToPrepare = [1, 2, 3, 4, 5, 6, 7, 8]

#list of URLs for internet sourced images (filled in by browseForImages)
listOfImageURLs = []

#percent of images in scraped directory that will be used for training -- from 0 to 1
#NOTE(review): not referenced by any of the visible cells -- confirm it is still needed
percentToUseForTrain = .9

#directory where images will be placed before being processed
tempDirectory = '../Tmp/'

#directory for image datasets
coreImageDir = "../Datasets/Images/"

#directory where scraped images will be placed
gatherDirectory = '../Datasets/Images/Scraped/'

#directory for main neural net data
mainInfoDirectory = '../Datasets/Main/'

#directory the verified images are copied into, one sub-directory per pokemon
DIR_MODE_IMAGES = '../Datasets/Main/Images/'

#list of websites to scrape
TARGETURLS = ['https://play.pokemonshowdown.com/sprites/']
# ["https://play.pokemonshowdown.com/sprites/"]

#NOTE(review): not referenced by any of the visible cells -- presumably toggles gif re-extraction, confirm
REPROCESS_GIFS = True

#### Make directories that will be needed

In [3]:
#create every directory the notebook writes into
#os.makedirs also creates missing parents (e.g. '../Datasets/') and exist_ok
#makes re-running this cell harmless when the directories already exist
for requiredDir in (
    coreImageDir,
    gatherDirectory,
    tempDirectory,
    mainInfoDirectory,
    os.path.join(mainInfoDirectory, 'Images'),
):
    os.makedirs(requiredDir, exist_ok=True)

### Regex Helpers

In [4]:
#matches any string that contains a forward slash (used to spot directory links)
compiledRE_forwardSlash = re.compile(r'/')
#file extension matchers -- the dot is escaped so that only a literal
#'.gif' / '.png' suffix matches (an unescaped '.' would match any character)
compiledRE_gif = re.compile(r'\.gif$')
compiledRE_png = re.compile(r'\.png$')

### String Helpers

In [5]:
def extractFileNameFromPath(path: str, removeExtension: bool):
    """Return the file name portion of a '/'-separated path.

    path: path or URL whose last '/'-separated component is the file name.
    removeExtension: when truthy, everything from the last '.' of the file
        name onwards is stripped.

    Returns the file name as a string. A name without any '.' is returned
    unchanged even when removeExtension is set.
    """
    #rfind returns -1 when there is no '/', so +1 correctly starts at index 0
    fullName = path[path.rfind('/') + 1:]
    if not removeExtension:
        return fullName

    extensionBeginIndex = fullName.rfind('.')
    if extensionBeginIndex == -1:
        #no extension present -- slicing with -1 would drop the last character
        return fullName
    return fullName[:extensionBeginIndex]

In [6]:
def removeFileNameFromPath(path: str):
    """Return the directory portion of a '/'-separated path (no trailing '/').

    path: path or URL to strip the last component from.

    Returns everything before the last '/'. When the path contains no '/'
    there is no directory portion and an empty string is returned.
    """
    nameBeginIndex = path.rfind('/')
    if nameBeginIndex == -1:
        #no separator -- slicing with -1 would just chop the last character off
        return ''
    return path[:nameBeginIndex]

In [7]:
def generateScrapedPath(file: string):
    """Build the scraped-dataset directory path for the pokemon named in a file path.

    file: file name or path that is searched for a known pokemon name.

    Returns gatherDirectory joined with the matched pokemon name, or False
    when no pokemon name is found in the given string.
    """
    matchedName = compiledData.getProperPokemonName(file)
    if matchedName is False:
        return False
    return os.path.join(gatherDirectory, matchedName)

### Class

In [8]:
class DataWrangler:
    """Holds pokedex IDs, names and generations read from the pokedex CSV and
    resolves file names / paths to the pokemon they belong to."""

    #CSV file the pokedex data is loaded from when an instance is created
    completeDataset = '../Datasets/GeneralData/UpdatedCompletePokemonDataset/pokedex_(Update_04.21).csv'
    #NOTE(review): not referenced by the visible code
    imageWebLocations = [
        ''
    ]

    def __init__(self):
        #the three lists are parallel: index i describes the same pokemon in each
        self.uniqueDexIDs = []
        self.uniqueDexNames = []
        self.pokemonGenerations = []
        #NOTE(review): never filled in by the visible code
        self.pokeDictionary = {}
        self.populateDataFromFile(DataWrangler.completeDataset)

    def populateDataFromFile(self, filePath: str, generations: list=None):
        """Append the ID, name and generation columns of every data row in the CSV.

        filePath: path of the comma separated pokedex file; the first row is
            treated as a header and skipped.
        generations: accepted for interface compatibility, currently unused.

        Columns 1, 2 and 5 are read as pokedex ID, name and generation.
        Rows with fewer than six columns (e.g. blank lines) are skipped.
        """
        #newline='' is what the csv module expects so quoted newlines survive
        with open(filePath, newline='') as file:
            csv_reader = csv.reader(file, delimiter=',')

            #skip the header row
            #figure out what columns in the dataset contain the pokemon name and pokedexID -- TODO
            next(csv_reader, None)

            for line in csv_reader:
                if len(line) < 6:
                    continue
                self.uniqueDexIDs.append(line[1])
                self.uniqueDexNames.append(line[2])
                self.pokemonGenerations.append(line[5])

    #pick the correct pokemon that a given filename should associate with
    def getProperPokemonName(self, inString: str):
        """Return the lower-cased pokemon name contained in inString, or False.

        Every known name is searched for case-insensitively. When several
        names are contained in the string (the name 'eternatus' also contains
        'natu') the longest one wins; among equally long names the one that
        appears first in the pokedex list is kept.
        """
        searchString = inString.lower()
        currBestMatch = None

        for name in self.uniqueDexNames:
            currName = name.lower()
            #an empty name is contained in every string, never treat it as a match
            if not currName or currName not in searchString:
                continue
            if currBestMatch is None or len(currName) > len(currBestMatch):
                currBestMatch = currName

        if currBestMatch is None:
            return False
        return currBestMatch

    def getPokemonGeneration(self, pokemonName: str) -> int:
        """Return the generation of the named pokemon as an int.

        The comparison is case-insensitive. False is returned when the name is
        unknown or when pokemonName is not a string, which covers the False
        value getProperPokemonName returns for "no match".
        """
        if not isinstance(pokemonName, str):
            return False

        wantedName = pokemonName.lower()
        for dexName, generation in zip(self.uniqueDexNames, self.pokemonGenerations):
            if dexName.lower() == wantedName:
                return int(generation)
        return False

#shared DataWrangler instance used by the helper functions and cells of this notebook
#creating it reads the pokedex CSV at DataWrangler.completeDataset
compiledData = DataWrangler()

### Image Helpers

In [9]:
#split a given gif into separate images -- will return paths to all new files
def gifToImages(pathToGif: str, destinationPath: str):
    """Save frames of a gif as individual PNG files.

    pathToGif: path of the gif to split.
    destinationPath: directory the frames are written to; it is created
        (including parents) when missing.

    At most numFramesExtractGif evenly spaced frames are extracted; a value
    of 0 extracts every frame. Each frame is saved as
    '<gif name>-<frame number>.png'.

    Returns the list of created file paths. The list is empty when pathToGif
    is not an existing file, so callers can always iterate the result.
    """
    createdFilePaths = []
    if not os.path.isfile(pathToGif):
        return createdFilePaths

    baseName = extractFileNameFromPath(pathToGif, True)

    with Image.open(pathToGif) as openGif:
        numFrames = openGif.n_frames

        #check if the number of frames in a given gif is more than the max number defined to get
        if numFramesExtractGif != 0 and numFrames > numFramesExtractGif:
            numToExtract = numFramesExtractGif
        else:
            numToExtract = numFrames

        os.makedirs(destinationPath, exist_ok=True)

        #evenly spaced frame indexes from the first to the last frame
        framesToGet = np.linspace(0, numFrames - 1, numToExtract).astype(np.int64)
        for frameNumber in framesToGet:
            frameNumber = int(frameNumber)
            openGif.seek(frameNumber)
            finalFullPath = os.path.join(destinationPath, f'{baseName}-{frameNumber}.png')
            createdFilePaths.append(finalFullPath)
            #NOTE(review): earlier code fetched the palette of each frame and had a
            #branch re-applying a stored palette that could never run; both were
            #dropped as they had no effect on the saved file
            openGif.save(finalFullPath)
    return createdFilePaths

# gifToImages('./Tmp/abomasnow-mega.gif', generateScrapedPath('./Tmp/abomasnow-mega.gif'))

In [10]:
#apply any formatting that is needed for the given image and place into correct directory
def processImage(pathToImage: str, isWebPath: bool, overrideDestinationPath: str=None):
    """Copy a PNG from the temp directory, or split a gif, into the destination.

    pathToImage: path of the image to process.
    isWebPath: accepted for interface compatibility, currently unused.
    overrideDestinationPath: destination directory; gatherDirectory is used
        when this is None.

    A '.png' path that lies inside tempDirectory and contains a known pokemon
    name is copied into '<destination>/<pokemon name>/'. A '.gif' path is
    split into frames that are written directly into the destination, and
    each frame is then passed through this function again. Any other path is
    ignored.
    """
    if overrideDestinationPath is None:
        destinationPath = gatherDirectory
    else:
        destinationPath = overrideDestinationPath

    pokemonName = compiledData.getProperPokemonName(pathToImage)
    isPng = compiledRE_png.search(pathToImage) is not None
    isGif = compiledRE_gif.search(pathToImage) is not None

    if pokemonName is not False and tempDirectory in pathToImage and isPng:
        #current image is in the temp directory, copy to other directory
        labeledDir = os.path.join(destinationPath, pokemonName)
        #makedirs also creates a destination that does not exist yet
        os.makedirs(labeledDir, exist_ok=True)
        shutil.copy2(pathToImage, labeledDir)
    elif isGif:
        #it is a gif -- if processing image from other dataset (not currently in tmp), shouldnt do extra copy to tmp directory :: TODO: UNLESS EXTRA PROCESSING IS NEEDED (fixing images in some way)
        #guard against a missing result so a vanished gif does not break the loop
        createdGifImages = gifToImages(pathToImage, destinationPath) or []

        #will need to process each image just created
        for newImage in createdGifImages:
            processImage(newImage, False, destinationPath)

# processImage('./Datasets/Images/1300-big-front-gifs/001-bulbasaur-s.gif', False)

### Web Helpers

In [11]:
def downloadImage(imageURL: str):
    """Download an image into tempDirectory unless it is already there.

    imageURL: URL of the image. The local file name is
        '<parent directory on the server>--<remote file name>'.

    A failed request (connection problem, timeout or HTTP error status) is
    reported and skipped, so no error page is ever stored as an image.
    """
    #create filename for new file - get file name from URL along with parent directory on remote server (combine)
    nameBeginIndex = imageURL.rfind('/')
    pathWithoutName = imageURL[:nameBeginIndex]
    extendedDirIndex = pathWithoutName.rfind('/')
    fileName = pathWithoutName[extendedDirIndex+1:] + '--' + imageURL[nameBeginIndex+1:]

    fullNewFilePath = os.path.join(tempDirectory, fileName)

    if os.path.isfile(fullNewFilePath):
        #already downloaded on an earlier run
        return

    #download the file from the remote and place in new path
    try:
        #the timeout keeps one unresponsive request from stalling the whole scrape
        response = requests.get(imageURL, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f'{imageURL} could not be downloaded, skipping: {error}')
        return

    with open(fullNewFilePath, 'wb') as f:
        f.write(response.content)
            
# downloadImage('https://play.pokemonshowdown.com/sprites/ani-back/ferroseed.gif')

### Supporting methods for image search

In [12]:
#recursively search through a provided URL to find gifs and pngs
def browseForImages(currRoot, _visited=None):
    """Collect image URLs from a directory listing page and its sub-directories.

    currRoot: URL of the listing page; link texts are appended to it, so it
        is expected to end with '/'.
    _visited: internal set of URLs already crawled, callers should not pass it.

    Every link whose text ends in '.png' or '.gif' is appended to the global
    listOfImageURLs. Links whose text contains '/' are followed recursively.
    URLs containing 'afd' are skipped, as are pages that fail to load.
    """
    if _visited is None:
        _visited = set()

    #avoid april fools day images on pokemon showdown
    #also never crawl the same page twice
    if "afd" in currRoot or currRoot in _visited:
        return
    _visited.add(currRoot)

    try:
        page = requests.get(currRoot, timeout=30)
        page.raise_for_status()
    except requests.RequestException as error:
        print(f'{currRoot} could not be browsed, skipping: {error}')
        return

    soup = BeautifulSoup(page.content, "html.parser")

    #pngs first, then gifs
    for imagePattern in (compiledRE_png, compiledRE_gif):
        for image in soup.find_all("a", string=imagePattern):
            listOfImageURLs.append(currRoot + image.text)

    #navigate through all of the possible directories
    for each in soup.find_all("a", string=compiledRE_forwardSlash):
        #following parent / self links would walk out of the target or loop
        if each.text.strip() in ('../', './'):
            continue
        browseForImages(currRoot + each.text, _visited)

## Gather data from internet resources

#### Gather image paths into list and then download images as needed

In [13]:
#only touch the network when web gathering is switched on
if gatherFromWeb is True:
    #first pass: crawl every target site and collect the image URLs
    for siteRoot in TARGETURLS:
        browseForImages(siteRoot)

    #second pass: hand every collected URL to the downloader
    for imageURL in listOfImageURLs:
        downloadImage(imageURL)

Took 168 minutes to complete -- with processing

## Sort data gathered into useable dataset for testing and training

In [None]:
def searchForFiles(currentDir):
    """Walk a directory tree and hand every matching file to processImage.

    currentDir: directory to descend into, or a single file path.

    A path that is not a directory is processed when it contains a known
    pokemon name whose generation is listed in generationsToPrepare; the
    destination passed on is gatherDirectory joined with that name.
    """
    if not os.path.isdir(currentDir):
        #this child has to be a file -- copy to core dataset
        matchedName = compiledData.getProperPokemonName(currentDir)
        if matchedName is False:
            return
        if compiledData.getPokemonGeneration(matchedName) not in generationsToPrepare:
            return
        processImage(currentDir, False, os.path.join(gatherDirectory, matchedName))
        return

    #directory -- recurse into every entry it contains
    for entry in os.listdir(currentDir):
        searchForFiles(os.path.join(currentDir, entry))
searchForFiles(tempDirectory)

## Verify Data

## Test Images to ensure proper format

In [None]:
def testFile(filePath) -> bool:
    """Check that a file is a usable single-frame PNG image.

    filePath: path of the file to check.

    Returns True only when the path is a non-empty regular file, does not
    contain 'afd', can be opened and fully loaded as an image, is in PNG
    format, has a single frame and does not start with the bytes 'RIFF'.
    A message is printed for every rejected file.
    """
    #check for a directory first so its size is never inspected
    if os.path.isdir(filePath) or os.path.getsize(filePath) == 0:
        print(filePath + " is zero length or is directory, ignoring")
        return False
    if "afd" in filePath:
        print(filePath + " this is garbage file, removing")
        return False

    #attempt to open file to confirm that it is a valid file
    try:
        #the with block closes the image on every path, including rejections
        with Image.open(filePath) as tmp:
            tmp.load()
            imageFormat = tmp.format
            #images without frame information are treated as single-frame
            frameCount = getattr(tmp, 'n_frames', 1)
    except Exception:
        print(filePath + " failed to open, ignoring")
        return False

    if imageFormat != 'PNG':
        print(filePath + " is not correct format, ignoring")
        return False
    if frameCount > 1:
        print(imageFormat)
        print(filePath + " has more than one frame, ignoring")
        return False

    #ensure all images are encoded in the correct format
    with open(filePath, 'rb') as imageFile:
        #only the first four bytes are needed for the signature check
        if imageFile.read(4) == b'RIFF':
            print(filePath + " isnt right type, ignoring")
            return False
    return True

# testFile('../Datasets/Main/Images/Train/gyarados/pokemon--gyarados.png')

In [None]:
listOfPokemonDirs = os.listdir(gatherDirectory)
# listOfPokemonDirs = os.listdir('../Datasets/Main/Images/Train/')
listOfAllImages = list(paths.list_images(coreImageDir))
verifiedFiles = []

#the loop variable stays named 'file' on purpose: it is a module level name
#that other cells of this notebook may read
for file in listOfAllImages:
    pokemonName = compiledData.getProperPokemonName(file)
    #images whose path matches no known pokemon cannot be labeled, skip them
    if pokemonName is False:
        continue
    gen = compiledData.getPokemonGeneration(pokemonName)
    if gen in generationsToPrepare and testFile(file) is True:
        verifiedFiles.append(file)

#copy verified files to core model directory
for file in verifiedFiles:
    pokemonName = compiledData.getProperPokemonName(file)
    finalPokemonDir = os.path.join(DIR_MODE_IMAGES, pokemonName)
    #without the directory, copy2 would write every image to one file of that name
    os.makedirs(finalPokemonDir, exist_ok=True)
    shutil.copy2(file, finalPokemonDir)

In [None]:
# for pokemonDir in listOfPokemonDirs:
#     #check if the pokemon is in the generation of targeted pokemon 
#     pokemonName = compiledData.getProperPokemonName(pokemonDir)
#     gen = compiledData.getPokemonGeneration(pokemonName)
#     if gen in generationsToPrepare:


#         #decide if to copy image, then copy if so 
#         pathPokemonDir = os.path.join(gatherDirectory, pokemonDir)
#         # pathPokemonDir = os.path.join(pokemonDir, 
#         # fileList = os.listdir('../Datasets/Main/Images/Train/gyarados')|
#         fileList = os.listdir(pathPokemonDir)
#         verifiedList = []
        
#         for file in fileList: 
#             fullPath = os.path.join(pathPokemonDir, file)
#             # fullPath = os.path.join('../Datasets/Main/Images/Train/gyarados', file)
#             if testFile(fullPath) is True:
#                 verifiedList.append(file)

#         for file in fileList:
#             randCounter += 1
#             finalPokemonDir = os.path.join(DIR_MODEL_IMAGES, pokemonName)
#             currPath = os.path.join(pathPokemonDir, file)
#             if os.path.isdir(finalPokemonDir) is False: 
#                 os.mkdir(finalPokemonDir)
#             shutil.copy2(currPath, os.path.join(finalPokemonTrainDir, file))
#             os.remove(currPath)