In [48]:
import numpy as np
import os
import pandas as pd
import traceback

In [150]:
# Build the set of lower-cased image basenames listed in the filename dump.
# The listing is read inside a `with` block so the file handle is closed
# as soon as the contents are in memory.
with open('dar_all_jpg_filenames.txt') as names_file:
    image_names_raw = names_file.read().split('\n')

image_names = set()

for name in image_names_raw:
    # Normalise case and drop surrounding whitespace (including a stray '\r')
    # so trailing characters cannot defeat the extension check.
    name = name.lower().strip()
    if name.endswith('.jpg'):
        # Keep only the basename. It is stripped as well because, per the note
        # in this notebook, some image filenames carry leading spaces that the
        # catalogue filenames do not include.
        image_names.add(name.split('/')[-1].strip())

This notebook relates each image "filename" to its "eid" (useful for finding the images at their URLs), its "class_mark" (useful for relating the images to the text transcript) and its "sort_class_mark" (which might be useful!). NOTE: Strip your filenames before searching! Some filenames have leading spaces (yes, actually), and those spaces are not included here.

# Identify useful CSVs

In [16]:
def find_good_csvs(PATH):
    """Collect 'MS-DAR-' CSV files under PATH whose header mentions 'filename'.

    PATH is walked recursively. A file is a candidate when its name, ignoring
    surrounding whitespace, ends with '.csv', contains 'MS-DAR-' and does not
    start with '.'. A candidate is kept when the first line of the file
    contains 'filename' (case-insensitive). A notice is printed for every
    candidate whose first line is longer than 500 characters.

    Parameters
    ----------
    PATH : str
        Root directory to search.

    Returns
    -------
    list of str
        Paths of the qualifying files, in the order ``os.walk`` yields them.
    """
    csvs = []

    for dirpath, _, files in os.walk(PATH):
        for raw_name in files:
            # Match on the trimmed name, but remember the real one: the path
            # must be built from the name as it exists on disk.
            f = raw_name.strip()
            if not (f.endswith('.csv') and 'MS-DAR-' in f and not f.startswith('.')):
                continue

            file_path = os.path.join(dirpath, raw_name)

            try:
                with open(file_path, encoding='latin8') as f_file:
                    header = f_file.readline()
            except (OSError, UnicodeError):
                # Report the unreadable file and carry on with the remaining
                # files instead of silently abandoning the whole directory.
                traceback.print_exc()
                continue

            if len(header) > 500:
                print(f + " is not ideal")

            if 'filename' in header.lower():
                csvs.append(file_path)

    return csvs

In [12]:
def concatenate_good_csvs(good_csvs):
    """Read every path in ``good_csvs`` as a latin8 CSV and stack the frames.

    Returns the result of ``pd.concat`` over the loaded frames, in input
    order; each frame keeps its own row index.
    """
    frames = [pd.read_csv(path, encoding='latin8') for path in good_csvs]
    return pd.concat(frames)

In [23]:
# Locate every usable transcript CSV and stack them into a single frame.
good_csvs = find_good_csvs('/home/jacob/Downloads/hackathon/data/Original_transcript_folder_from_AMNH/')
data = concatenate_good_csvs(good_csvs)

# Keep only the identifier columns, drop exact duplicate rows, discard rows
# without a filename, and store eid as a string of its integer value.
# This is written as one chain that returns a new frame: it avoids in-place
# mutation of, and column assignment on, a frame derived by selection, which
# is the pattern pandas flags with SettingWithCopyWarning.
# NOTE: astype(int) raises if any remaining eid is missing.
data = (
    data[['eid', 'filename', 'class-mark', 'sort-class-mark']]
    .drop_duplicates()
    .loc[lambda frame: frame['filename'].notnull()]
    .assign(eid=lambda frame: frame['eid'].astype(int).astype(str))
)

In [52]:
def filter_file_names(file_name, group):
    """Reduce all rows sharing one filename to a single identifier row.

    Parameters
    ----------
    file_name : str
        The filename the rows in ``group`` share.
    group : pandas.DataFrame
        Rows for that filename; must have the columns 'eid', 'class-mark'
        and 'sort-class-mark'.

    Returns
    -------
    numpy.ndarray or None
        ``[file_name, eid, class_mark, sort_class_mark]`` as an object array,
        or ``None`` when any of the three identifier columns holds more than
        one distinct non-null value. A column with no non-null value at all
        is reported as NaN.

    Notes
    -----
    The row is always assembled in the same order, whatever the group size,
    so every returned row lines up with the column labels applied later.
    """
    row = [file_name]

    for column in ('eid', 'class-mark', 'sort-class-mark'):
        distinct = set(group[column].dropna().values)

        if len(distinct) > 1:
            # Conflicting identifiers for one filename: cannot pick a winner.
            return None

        # An empty set means every value in this column was null.
        row.append(distinct.pop() if distinct else np.nan)

    # dtype=object keeps NaN as a real missing value instead of the text 'nan'.
    return np.array(row, dtype=object)

In [55]:
def get_final_files_names():
    """Build the array of identifier rows, one per filename in ``data``.

    Groups the module-level ``data`` frame by 'filename', reduces each group
    with ``filter_file_names`` and keeps only the groups for which that
    function returned a row (i.e. not ``None``).
    """
    candidate_rows = (
        filter_file_names(name, rows) for name, rows in data.groupby('filename')
    )
    kept_rows = [candidate for candidate in candidate_rows if candidate is not None]
    return np.array(kept_rows)

In [56]:
# Collect one identifier row per filename; filenames whose rows carry
# conflicting identifiers are left out by get_final_files_names().
final_file_names = get_final_files_names()

In [58]:
# Label the collected rows and save them, without the row index, so the
# later cells can reload the table from disk.
final_file_names = pd.DataFrame.from_records(
    data=final_file_names,
    columns=['file_name', 'eid', 'class_mark', 'sort_class_mark'],
)
final_file_names.to_csv(path_or_buf='file_identifiers.csv', index=False)

# Let's find the page orientation ('r' / 'v' suffix of the sort class mark)

In [63]:
# Reload the identifier table written to 'file_identifiers.csv' above.
# From here on `data` refers to this frame, not the raw transcript rows.
data = pd.read_csv('file_identifiers.csv')

In [64]:
# Mean of the "ends with 'v'" flag over the lower-cased sort class marks,
# i.e. the share of marks that end in 'v' regardless of case.
data.sort_class_mark.str.lower().str.endswith('v').mean()

0.47537243691935571

In [65]:
# Mean of the "ends with 'r'" flag over the lower-cased sort class marks,
# i.e. the share of marks that end in 'r' regardless of case.
data.sort_class_mark.str.lower().str.endswith('r').mean()

0.48133692485294927

In [69]:
def find_orientation(sort_class_mark):
    """Return the orientation suffix of a sort class mark.

    Parameters
    ----------
    sort_class_mark : str or missing value
        The mark to inspect. Case and surrounding whitespace are ignored.

    Returns
    -------
    str or None
        'v' if the normalised mark ends with 'v', 'r' if it ends with 'r',
        otherwise ``None``. Non-string input (for example the NaN that
        ``read_csv`` produces for an empty cell, or ``None``) also yields
        ``None`` instead of raising.
    """
    # Missing marks are not strings and have no .lower(); treat them as
    # "no orientation" so the function can be applied to a whole column.
    if not isinstance(sort_class_mark, str):
        return None

    normalized = sort_class_mark.lower().strip()

    for suffix in ('v', 'r'):
        if normalized.endswith(suffix):
            return suffix

    return None

In [72]:
# Tag every row with its orientation suffix, then overwrite the identifier
# CSV so the saved table includes the new column.
data['orientation'] = data['sort_class_mark'].apply(func=find_orientation)
data.to_csv(path_or_buf='file_identifiers.csv', index=False)

In [162]:
# Number of catalogued filenames (lower-cased) that also occur in the image
# listing. The table loaded from 'file_identifiers.csv' names this column
# 'file_name'; it has no 'filename' column.
# Divide by len(image_names) to get the matched fraction instead.
len(set(data.file_name.str.lower().values).intersection(image_names))

29633