In [1]:
import os
import csv
from csv import DictReader
import itertools
import pathlib2 as pathlib
import shutil
from PIL import Image
import pandas as pd

def oldPathDict(roots):
    '''
    Get dictionary of all files we want to transfer, grouped by barcode.
    Input- iterable of root directories; each one is walked recursively
    Output- dictionary of barcode: [list of paths to all files with barcode]
            The barcode is the part of the file name before the first "." or "_".
            Within one barcode the most recently discovered path comes first.
    Details- does not keep any txt, _l, _m, _s files. This is a plain substring
             test on the file name, so any name containing one of those markers
             anywhere is skipped. Hidden files and directories (leading ".") are
             skipped as well. Does not specify file extension.
    <https://stackoverflow.com/questions/2909975/python-list-directory-subdirectory-and-files>
    '''
    unwanted=("_m","_s","txt","_l")
    oldPathDictionary={}
    for root in roots:
        for path, subdirs, files in os.walk(root):
            # Prune hidden directories in place so os.walk never descends into them
            subdirs[:] = [d for d in subdirs if not d.startswith('.')]
            for name in files:
                # Ignore hidden files, those that start with "."
                if name.startswith('.'):
                    continue
                # Do not keep any files from unwanted list
                if any(marker in name for marker in unwanted):
                    continue
                oldPath=os.path.join(path,name)
                # os.path.basename respects the platform separator, unlike split("/")
                fileName=os.path.basename(oldPath)
                barcode=fileName.split(".")[0].split("_")[0]
                # Prepend so the per-barcode ordering matches what callers already get
                oldPathDictionary.setdefault(barcode,[]).insert(0,oldPath)
    return oldPathDictionary

def portalDict(occurrencesFile,portalName,colName="catalogNumber"):
    '''
    Get dictionary of all barcodes and respective portal names
    Input - occurrences.csv, name of portal, and name of column with barcodes
    Output - Dictionary of barcodes and portal name for each barcode.
             Barcodes are kept exactly as written in the CSV (they are NOT
             upper-cased); duplicate barcodes collapse into a single key.
    Raises - KeyError if colName is not a column of the CSV.
    '''
    import sys
    # The "U" flag was removed in Python 3.11 (open() raises ValueError there).
    # Python 3 text mode already translates newlines universally, so "U" is only
    # still needed when running under Python 2.
    mode = "r" if sys.version_info[0] >= 3 else "rU"
    with open(occurrencesFile, mode) as csv_file:
        return {row[colName]: portalName for row in DictReader(csv_file)}

def newPathNames(bcp,oldPath,barcodeSplit,portalDictionary,portalName,newRoot=None):
    '''
    Use old path to image, and portal to create a new path to place images.
    Makes an old and new path to a large (_l) image for each image. Does not confirm existence of this _l file.
    Input - bcp: barcode (unused here, kept so existing callers still work)
            oldPath: current path of the image file
            barcodeSplit: [collection letters, digits] of the barcode, ex: ["LSU","01020304"]
            portalDictionary: barcode:portal (unused here, kept so existing callers still work)
            portalName: name of portal, used as a folder name
            newRoot: new parent folder. When left as None the module-level newRoot is used,
                     which is what this function always did before the parameter existed.
    Output - newDir,newPath,oldLarge,newLarge,fileName
             fileName is the upper-cased base name of oldPath.
    Raises - IndexError if barcodeSplit has fewer than two parts.
             NameError if newRoot is None and no module-level newRoot is defined.
    '''
    if newRoot is None:
        try:
            newRoot=globals()["newRoot"]
        except KeyError:
            raise NameError("name 'newRoot' is not defined")
    # Grab name of file, collection (lsu,no,etc), numerical part of barcode
    originalName=os.path.basename(oldPath)
    fileName=str(originalName).upper()
    # splitext only splits off the last extension, so names with extra dots or
    # with no extension at all are handled instead of mangled / IndexError
    originalStem,originalExt=os.path.splitext(originalName)
    upperStem,upperExt=os.path.splitext(fileName)
    # The existing large image sits next to the original and shares its on-disk
    # spelling, so it must NOT be upper-cased (matters on case-sensitive filesystems)
    largeFile_low=originalStem+"_l"+originalExt
    # The copied large image follows the upper-cased naming of the new tree
    largeFile_up=upperStem+"_l"+upperExt
    collection=barcodeSplit[0]
    number=barcodeSplit[1]
    # Split apart barcode number to create new file path:
    # drop the last three digits, the next three name the second folder,
    # whatever is left names the first folder
    cutoffThree=number[:-3]
    secondFolder=cutoffThree[-3:]
    firstFolder=cutoffThree[:-3]
    # Create folders from barcode and portal information
    # ex: LSU01020304 -> root/portal/LSU/01/020/LSU01020304.JPG
    newPath=os.path.join(newRoot,portalName,collection,firstFolder,secondFolder,fileName)
    # Get directory path so the caller can check if folders need to be created
    newDir=os.path.dirname(newPath)
    oldLarge=os.path.join(os.path.dirname(oldPath),largeFile_low)
    newLarge=os.path.join(newDir,largeFile_up)
    return newDir,newPath,oldLarge,newLarge,fileName

def _upperBarcode(barcode):
    '''
    Upper-case a barcode for case-insensitive matching. Values without an
    upper() method (ex: None from a short CSV row) are returned unchanged.
    '''
    try:
        return barcode.upper()
    except AttributeError:
        return barcode

def moveFiles(newRoot,oldPathDictionary,portalDictionary,portalName):
    '''
    Organizes files based on barcode and portal. Files are copied (not removed from the old location).
    Input - New parent folder. Dictionary of old paths. Dictionary of barcodes and their portal. Name of portal.
    Output - Dictionary of files copied {filename:[barcode,portal,newpath]}.
             Dictionary of barcodes with no image {barcode:portal}.
             Dictionary of copied files whose large (_l) image could not be copied {filename:newlargepath}.
    Details - barcodes are matched to image files case-insensitively. A file whose
              destination already exists is skipped and is not reported.
    '''
    # Make all keys(barcodes) into uppercase. values(list of paths) will stay as is.
    # Lists are merged so barcodes differing only by case do not overwrite each other.
    oldDictionary_BCcaps={}
    for barcode,paths in oldPathDictionary.items():
        oldDictionary_BCcaps.setdefault(barcode.upper(),[]).extend(paths)
    # filename:[barcode,portal,newpath]
    filesMovedDict={}
    # barcode:portal
    barcodeNoImageDict={}
    # files that need a large image made. filename:newlargepath
    noLargeDict={}
    # Iterate through barcodes that are in the portal database
    for bcp in portalDictionary:
        oldPaths=oldDictionary_BCcaps.get(_upperBarcode(bcp))
        # If barcode has no image files
        if not oldPaths:
            # keep track of specify records with no image file. barcode:portal
            barcodeNoImageDict[bcp]=portalDictionary[bcp]
            continue
        # Split apart letters and numbers from barcode
        barcodeSplit = ["".join(x) for _, x in itertools.groupby(bcp, key=str.isdigit)]
        # Iterate through all image files associated with barcode
        for oldPath in oldPaths:
            # Pass newRoot explicitly so the destination honours this function's argument
            newDir,newPath,oldLarge,newLarge,fileName=newPathNames(bcp,oldPath,barcodeSplit,portalDictionary,portalName,newRoot)
            if os.path.exists(newPath):
                continue
            pathlib.Path(newDir).mkdir(parents=True, exist_ok=True)
            shutil.copy2(oldPath,newPath)
            filesMovedDict[fileName]=[bcp,portalName,newPath]
            try:
                shutil.copy2(oldLarge,newLarge)
            except (IOError, OSError):
                # Large image missing or unreadable: note it so one can be generated later
                noLargeDict[fileName]=newLarge
    return filesMovedDict,barcodeNoImageDict,noLargeDict


def dictToBigList(filesMovedDict):
    '''
    Flattens filename:[barcode,portal,newpath] into one list holding only the
    newpath entry (third item) of every record, in dictionary iteration order.
    '''
    return [record[2] for record in filesMovedDict.values()]

def corruptImageFinder(allFilesList):
    '''
    Takes list of all absolute paths to files. Checks each one for image, and corruption.
    Input - list of paths to files
    Output - single dictionary {file name: path} of every file that either cannot
             be opened as an image or cannot be fully loaded (corrupted). The two
             cases are deliberately reported together.
    '''
    corruptImageDict={}
    for f in allFilesList:
        try:
            # Image.open only reads the header; load() decodes the pixel data,
            # which is where truncated or corrupted files fail.
            # The with-block releases the file handle even when load() raises.
            with Image.open(f) as v_image:
                v_image.load()
        except Exception:
            # Any failure marks the file as bad; one unreadable file must not abort the scan
            corruptImageDict[os.path.basename(f)]=f
    return corruptImageDict

# Driver: copy every image belonging to one portal into the new folder layout,
# check the copies for corruption, and write CSV reports to outFolder.

# Specify full path to folder for output lists
outFolder='/Users/ChatNoir/Projects/HerbariumRA/'

# Specify full path to DwC-A occurrences.csv file downloaded from portal, name of portal, column name for barcodes in occurrences.csv
occurrencesFile="/Users/ChatNoir/Projects/HerbariumRA/LSU-Bryophytes_backup_2018-10-01_115050_DwC-A/occurrencesfake.csv"
portalName="bryophyte"
colName="catalogNumber"

# Specify full path of the new parent folder for images
newRoot='/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsuNEW/'

# Specify full path of current parent folder of images
rootLSU = '/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsu/'
rootNO = '/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/no/'
rootNLU = '/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/nlu/'
oldRoots = [rootLSU,rootNO,rootNLU]

# Get dictionary of current image paths for each barcode
# barcode:[filepath1,...filepathN]
oldPathDictionary=oldPathDict(oldRoots)

# Get dictionary of barcodes and their portal
# barcode:portal
portalDictionary=portalDict(occurrencesFile,portalName,colName)


# Move files and keep track of files that were moved, and barcodes that don't have images 
filesMovedDict,barcodeNoImageDict,noLargeDict=moveFiles(newRoot,oldPathDictionary,portalDictionary,portalName)

# Get list of all new image paths
newPathList = dictToBigList(filesMovedDict)

# Get lists of images with issues
corruptImageDict = corruptImageFinder(newPathList)

# Output Lists!! 

# Images that could not be opened or loaded. filename:newpath
dfBad = pd.DataFrame.from_dict(corruptImageDict,orient='index',columns=['File Path'])
dfBad.index.name = 'Image File Name'
dfBad.to_csv(os.path.join(outFolder,(portalName+"_corruptImages.csv")),sep=",")

# Images copied without a large (_l) version. filename:newlargepath
# (write dfNoLarge itself; this report used to receive a copy of dfBad)
dfNoLarge = pd.DataFrame.from_dict(noLargeDict,orient='index',columns=['File Path'])
dfNoLarge.index.name = 'Image File Name'
dfNoLarge.to_csv(os.path.join(outFolder,(portalName+"_noLargeImages.csv")),sep=",")

# Portal records with no image file. barcode:portal
# (write dfNoImage itself; headers name what the frame actually holds)
dfNoImage = pd.DataFrame.from_dict(barcodeNoImageDict,orient='index',columns=['Group'])
dfNoImage.index.name = 'Catalogue Number'
dfNoImage.to_csv(os.path.join(outFolder,(portalName+"_noImages.csv")),sep=",")



print(oldPathDictionary)


{'LSU00072633': ['/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsu/0/7/26/33/LSU00072633.jpg'], 'NO0088855': ['/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/no/vasc_plants/0/8/88/55/NO0088855.CR2', '/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/no/vasc_plants/0/8/88/55/NO0088855.jpg'], 'LSU00021305': ['/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsu/0/2/13/5/LSU00021305.jpg'], 'LSU00072512': ['/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsu/0/7/25/12/LSU00072512.jpg'], 'lsu00157996': ['/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsu/0/15/79/96/lsu00157996_5.jpg', '/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsu/0/15/79/96/lsu00157996_1.jpg'], 'lsu00157997': ['/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsu/0/15/79/97/lsu00157997_1.jpg'], 'LSU00137769': ['/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsu/0/13/77/69/L

In [4]:
import pandas as pd

# Images that could not be opened or loaded. filename:newpath
dfBad = pd.DataFrame.from_dict(corruptImageDict,orient='index',columns=['File Path'])
dfBad.index.name = 'Image File Name'
# sep belongs to to_csv, not os.path.join; the join call is closed before it
dfBad.to_csv(os.path.join(outFolder,(portalName+"_corruptImages")),sep=",")

# Images copied without a large (_l) version. filename:newlargepath
dfNoLarge = pd.DataFrame.from_dict(noLargeDict,orient='index',columns=['File Path'])
dfNoLarge.index.name = 'Image File Name'
dfNoLarge.to_csv(os.path.join(outFolder,(portalName+"_noLargeImages")),sep=",")

# Portal records with no image file. barcode:portal
dfNoImage = pd.DataFrame.from_dict(barcodeNoImageDict,orient='index',columns=['Group'])
dfNoImage.index.name = 'Catalogue Number'
dfNoImage.to_csv(os.path.join(outFolder,(portalName+"_noImages")),sep=",")



<type 'dict'>
<type 'dict'>
<type 'dict'>


Unnamed: 0_level_0,File Path
Image File Name,Unnamed: 1_level_1
LSU00158045_5.JPG,/Users/ChatNoir/Projects/HerbariumRA/data_stor...
LSU00042493.jpg,/Users/ChatNoir/Projects/HerbariumRA/data_stor...
LSU00021272.jpg,/Users/ChatNoir/Projects/HerbariumRA/data_stor...
LSU00072633.jpg,/Users/ChatNoir/Projects/HerbariumRA/data_stor...
LSU00157996_5.JPG,/Users/ChatNoir/Projects/HerbariumRA/data_stor...
NLU0043454.jpg,/Users/ChatNoir/Projects/HerbariumRA/data_stor...
NLU0035517.jpg,/Users/ChatNoir/Projects/HerbariumRA/data_stor...
LSU00157997_5.JPG,/Users/ChatNoir/Projects/HerbariumRA/data_stor...
LSU00021305.jpg,/Users/ChatNoir/Projects/HerbariumRA/data_stor...
LSU00042491.jpg,/Users/ChatNoir/Projects/HerbariumRA/data_stor...


In [5]:
# Dump the record of copied files to a CSV for inspection.
import pandas as pd
# filesMovedDict is the first value returned by moveFiles: filename:[barcode,portal,newpath]
data = filesMovedDict
# One row per file name; the three list items become the three named columns
df = pd.DataFrame.from_dict(data, orient='index',columns=['Catalogue Number','Group','File Path'])
df.index.name = 'Image File Name'
print(df)
outFile='/Users/ChatNoir/Projects/HerbariumRA/out.out'
# Written as comma-separated text despite the .out extension
df.to_csv(outFile,sep=",")

                  Catalogue Number      Group  \
Image File Name                                 
LSU00158045_5.JPG      LSU00158045  bryophyte   
LSU00042493.jpg        LSU00042493  bryophyte   
LSU00021272.jpg        LSU00021272  bryophyte   
LSU00072633.jpg        LSU00072633  bryophyte   
LSU00157996_5.JPG      LSU00157996  bryophyte   
NLU0043454.jpg          NLU0043454  bryophyte   
NLU0035517.jpg          NLU0035517  bryophyte   
LSU00157997_5.JPG      LSU00157997  bryophyte   
LSU00021305.jpg        LSU00021305  bryophyte   
LSU00042491.jpg        LSU00042491  bryophyte   
LSU00137769.JPG        LSU00137769  bryophyte   
LSU00157997_1.JPG      LSU00157997  bryophyte   
LSU00138294.JPG        LSU00138294  bryophyte   
LSU00137770.JPG        LSU00137770  bryophyte   
LSU00138295.JPG        LSU00138295  bryophyte   
LSU00021301.jpg        LSU00021301  bryophyte   
LSU00072512.jpg        LSU00072512  bryophyte   
LSU00122218.jpg        LSU00122218  bryophyte   
LSU00176631.JPG     

In [None]:
##### Input 

# Specify full path of current parent folder of images
oldRoot = '/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsu/'
# Specify full path of the new parent folder for images
newRoot='/Users/ChatNoir/Projects/HerbariumRA/data_storage_fake/nfsshare/lsuNEW/'
# Specify full path to DwC-A occurrences.csv file downloaded from portal, name of portal, column name for barcodes in occurrences.csv
occurrencesFile="/Users/ChatNoir/Projects/HerbariumRA/LSU-Bryophytes_backup_2018-10-01_115050_DwC-A/occurrencesfake.csv"
portalName="bryophyte"
colName="catalogNumber"

##### Output

# barcode:[filepath1,...filepathN]
oldPathDictionary

# barcode:portal
portalDictionary

# filename:[barcode,portal,newpath]
filesMovedDict

# barcode:portal
barcodeNoImageDict

# filename:newlargepath, for copied files whose large (_l) image could not be copied
noLargeDict

# all new image paths
newPathList 

# non image files and corrupt image files (both are recorded in the same dictionary)
# filename:newpath
corruptImageDict

###### NOTES ##########

Workflow for running script:
Run once for each portal



To Do:
- try csv output
- |Image File Name | Catalog Number | Group | Path List |
- figure out how to update sql database on a regular basis (probably separate script)







https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory-in-python



Images without a Specify entry - should be 0. Output a list of all barcodes that have an image on cbfla.
    - all of these should exist in the master Specify list
Specify entry without an image - during each portal move, output a list of all barcodes in Specify that have no image file with that barcode.
    - these may have been moved already, or may simply not have an image.
    - will need to compare this list to the list of all image barcodes

root+lsu(?)+portal+collection+
nfsshare/lsu/vascular/lsu/##/###/file.jpg 
#print(noImageBarcodeDict)

nfsshare/lsu/vascular/lsu/##/###/file.jpg 
nfsshare/vascular/lsu/##/###/file.jpg

LSU01020304 -> ['01','020']
vascular/lsu/01/020/LSU01020304.jpg
vascular/lsu/00/099/LSU00099999.jpg



NO0010203 -> ['0','010']
vascular/no/0/010/NO0010203.jpg


# Portal Dictionary. key = barcode. value = portal type
'''
Vascular Plants: http://sernecportal.org
Lichens: http://lichenportal.org
Bryophytes: http://bryophyteportal.org
Fungi: http://mycoportal.org
Algae: http://macroalgae.org
'''
occurrences file - look for the 'catalogNumber' column (name supplied by user input)