### APNR dataset

In [1]:
import getpass
import os, sys, random, ssl
import urllib, urllib.request

import pandas as pd
import pysftp  # pip install pysftp
# sys.path.join('/Users/manishrai/Desktop/UMN/Research/Zooniverse/Code/')
# Show up to 100 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 100)

In [2]:
def get_sampled_subject_set(data_df, Species, species_count):
    """Return the capture ids to use for one species, down-sampling common species.

    Arguments:
        data_df - dataframe with 'species' and 'capture_id' columns
        Species - Species name
        species_count - count of species in a dataframe

    Returns:
        A list of capture ids for `Species`:
          * all of them when species_count <= 100,
          * a random sample of 100 when 100 < species_count <= 1000,
          * a random sample of 500 when species_count > 1000.

    Raises:
        ValueError - from random.sample, when data_df holds fewer rows for
        the species than the requested sample size.
    """
    # Always read from the frame that was passed in; the sampling branches
    # previously read a notebook-level global instead of `data_df`.
    capture_ids = list(data_df[data_df['species'] == Species]['capture_id'])

    if species_count <= 100:
        return capture_ids

    sample_size = 100 if species_count <= 1000 else 500
    return random.sample(capture_ids, sample_size)

def get_images_from_url(dataset, image_name_index, url_col_index, outpath):
    """Download one image per row of `dataset` and save it as '<outpath><name>.jpg'.

    Arguments:
        dataset - dataframe with one row per image to download
        image_name_index - positional index of the column holding the image name
        url_col_index - positional index of the column holding the image URL
        outpath - prefix prepended to the file name as-is (include the trailing
                  path separator when it is a directory)

    Rows that share the same image name are written to the same file, so the
    later download overwrites the earlier one.
    """
    # Fall back to unverified HTTPS unless PYTHONHTTPSVERIFY is set.
    # NOTE(review): this replaces the process-wide default HTTPS context, so
    # certificate verification is disabled for every later urllib request too.
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
            getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context

    # The download loop runs regardless of the SSL fallback above; it used to
    # be nested inside that `if`, so nothing was fetched when the env var was set.
    for i in range(dataset.shape[0]):
        # Purely positional lookups: iloc[row, col] does not depend on labels
        image_name = dataset.iloc[i, image_name_index]
        image_url = dataset.iloc[i, url_col_index]

        print('Processing image: %d' % i)

        urllib.request.urlretrieve(image_url, outpath + '{0}.jpg'.format(image_name))

In [3]:
# Directory holding the Zooniverse database exports. Set the ZOONIVERSE_DB_DIR
# environment variable to point at another location; the default is the
# original server path.
filepath = os.environ.get('ZOONIVERSE_DB_DIR', '/home/ubuntu/data/zooniverse-support/db')

In [4]:
# Load the APN season-1 export.
# 'empty' and 'count' are renamed so the columns can be read with attribute
# access (e.g. df1.count_species); under their original names they would
# collide with DataFrame.empty / DataFrame.count. index=str turns the row
# labels into strings.
df1 = (
    pd.read_csv(os.path.join(filepath, 'APN_S1_export.csv'))
    .rename(index=str, columns={'empty': 'blank_images', 'count': 'count_species'})
)
print(df1.dtypes)
print(df1['count_species'].unique())
df1.head()

capture_id          object
blank_images        object
species             object
count_species       object
standing             int64
resting              int64
moving               int64
eating               int64
interacting          int64
babies               int64
season              object
capturetimestamp    object
location            object
split_name          object
image1              object
image2              object
image3              object
dtype: object
['1' '2' '0' '4' '3' '5' '11-50' '7' '6' '51+' '8' '9' '10']


Unnamed: 0,capture_id,blank_images,species,count_species,standing,resting,moving,eating,interacting,babies,season,capturetimestamp,location,split_name,image1,image2,image3
0,APN_S1#JJ5#1#184,species,impala,1,0,0,1,0,0,0,APN_S1,2017-08-26-22-23-04,JJ5,train_apn_s1,APN/APN_S1/JJ5/JJ5_R1/APN_S1_JJ5_R1_IMAG0398.JPG,,
1,APN_S1#JJ5#2#44,species,zebra,2,1,0,1,0,0,0,APN_S1,2017-09-17-10-58-47,JJ5,train_apn_s1,APN/APN_S1/JJ5/JJ5_R2/APN_S1_JJ5_R2_IMAG0108.JPG,APN/APN_S1/JJ5/JJ5_R2/APN_S1_JJ5_R2_IMAG0109.JPG,APN/APN_S1/JJ5/JJ5_R2/APN_S1_JJ5_R2_IMAG0110.JPG
2,APN_S1#K034#1#18,species,elephant,1,0,0,1,0,0,0,APN_S1,2017-07-08-11-40-12,K034,train_apn_s1,APN/APN_S1/K034/K034_R1/APN_S1_K034_R1_IMAG005...,APN/APN_S1/K034/K034_R1/APN_S1_K034_R1_IMAG005...,APN/APN_S1/K034/K034_R1/APN_S1_K034_R1_IMAG005...
3,APN_S1#K082#2#51,species,warthog,1,0,0,0,0,0,0,APN_S1,2017-10-10-17-33-30,K082,train_apn_s1,APN/APN_S1/K082/K082_R2/APN_S1_K082_R2_IMAG010...,APN/APN_S1/K082/K082_R2/APN_S1_K082_R2_IMAG011...,APN/APN_S1/K082/K082_R2/APN_S1_K082_R2_IMAG011...
4,APN_S1#N1#1#160,species,duiker,1,0,0,1,0,0,0,APN_S1,2017-07-27-18-19-58,N1,train_apn_s1,APN/APN_S1/N1/N1_R1/APN_S1_N1_R1_IMAG0444.JPG,,


In [5]:
# Keep only captures annotated with exactly one animal.
# Single-animal captures matter for Snapshot Focus: the machine draws the
# bounding box and volunteers validate it, and with one animal per image the
# box can be tied to the species annotation.
#
# The comparison is done on the string form of the column so this cell gives
# the same result when re-run after count_species has already been converted
# to int. `.assign` builds a new frame rather than setting a column on a
# filtered slice.
df1 = (
    df1[df1['count_species'].astype(str) == '1']
    # count_species is a count, so store it as an integer
    .assign(count_species=lambda single: single['count_species'].astype(int))
)
print(df1.dtypes)
print(df1.shape)
df1.head()

capture_id          object
blank_images        object
species             object
count_species        int64
standing             int64
resting              int64
moving               int64
eating               int64
interacting          int64
babies               int64
season              object
capturetimestamp    object
location            object
split_name          object
image1              object
image2              object
image3              object
dtype: object
(5828, 17)


Unnamed: 0,capture_id,blank_images,species,count_species,standing,resting,moving,eating,interacting,babies,season,capturetimestamp,location,split_name,image1,image2,image3
0,APN_S1#JJ5#1#184,species,impala,1,0,0,1,0,0,0,APN_S1,2017-08-26-22-23-04,JJ5,train_apn_s1,APN/APN_S1/JJ5/JJ5_R1/APN_S1_JJ5_R1_IMAG0398.JPG,,
2,APN_S1#K034#1#18,species,elephant,1,0,0,1,0,0,0,APN_S1,2017-07-08-11-40-12,K034,train_apn_s1,APN/APN_S1/K034/K034_R1/APN_S1_K034_R1_IMAG005...,APN/APN_S1/K034/K034_R1/APN_S1_K034_R1_IMAG005...,APN/APN_S1/K034/K034_R1/APN_S1_K034_R1_IMAG005...
3,APN_S1#K082#2#51,species,warthog,1,0,0,0,0,0,0,APN_S1,2017-10-10-17-33-30,K082,train_apn_s1,APN/APN_S1/K082/K082_R2/APN_S1_K082_R2_IMAG010...,APN/APN_S1/K082/K082_R2/APN_S1_K082_R2_IMAG011...,APN/APN_S1/K082/K082_R2/APN_S1_K082_R2_IMAG011...
4,APN_S1#N1#1#160,species,duiker,1,0,0,1,0,0,0,APN_S1,2017-07-27-18-19-58,N1,train_apn_s1,APN/APN_S1/N1/N1_R1/APN_S1_N1_R1_IMAG0444.JPG,,
8,APN_S1#UMH8#1#65,species,giraffe,1,1,0,0,0,0,0,APN_S1,2017-07-31-18-00-12,UMH8,train_apn_s1,APN/APN_S1/UMH8/UMH8_R1/APN_S1_UMH8_R1_IMAG014...,,


In [6]:
# Count how many capture events each species appears in.
# After .count(), the 'count_species' column holds the number of rows per species.
df2 = (
    df1[['species', 'count_species']]
    # axis=0 is the groupby default; the explicit keyword is deprecated in
    # recent pandas, so it is left out.
    .groupby(by=['species'], as_index=False)
    .count()
    .sort_values(by='count_species')
)
# NOTE(review): 9905 is a hard-coded denominator that is not derived from any
# frame in this notebook (df1 has 5828 rows at this point) - presumably the
# total number of captures before filtering; confirm and replace with a
# computed value.
df2['pct_cnt'] = (df2['count_species'] * 100 / 9905).round(1)
df2 = df2[df2['species'] != 'empty']
df2

Unnamed: 0,species,count_species,pct_cnt
9,caracal,1,0.0
1,aardwolf,1,0.0
51,serval,1,0.0
29,hyenastriped,1,0.0
42,nyala,2,0.0
33,koribustard,2,0.0
61,zorilla,2,0.0
57,wildcat,2,0.0
24,hartebeest,3,0.0
49,sable,3,0.0


Uncomment the cell below when running the notebook for the first time: it creates the `capture_event_id.csv` file that the following cell reads.

In [7]:
# # This randomly samples capture event ids, so run it only once to keep the sample consistent.
# # Re-running it may produce a different sample each time.
# event_ids = []
# for s in df2['species']:
#     event_ids = event_ids + get_sampled_subject_set(df1, s, int(df2[df2['species']==s]['count_species']))

# capture_event_id = pd.DataFrame(event_ids).rename(index=str, columns={0: 'CaptureEventID'})
# capture_event_id.to_csv(os.path.join(filepath, 'APNR/capture_event_id.csv'), sep = ',', index=False)

In [8]:
# Read back the capture event ids that were sampled and saved earlier,
# then keep only the rows of df1 that belong to that sample.
capture_event_id = pd.read_csv(
    os.path.join(filepath, 'APNR/capture_event_id.csv'),
    sep=',',
)
in_sample = df1['capture_id'].isin(capture_event_id['CaptureEventID'].tolist())
df3 = df1[in_sample]
print(df3.shape)
df3.head()

(2116, 17)


Unnamed: 0,capture_id,blank_images,species,count_species,standing,resting,moving,eating,interacting,babies,season,capturetimestamp,location,split_name,image1,image2,image3
0,APN_S1#JJ5#1#184,species,impala,1,0,0,1,0,0,0,APN_S1,2017-08-26-22-23-04,JJ5,train_apn_s1,APN/APN_S1/JJ5/JJ5_R1/APN_S1_JJ5_R1_IMAG0398.JPG,,
3,APN_S1#K082#2#51,species,warthog,1,0,0,0,0,0,0,APN_S1,2017-10-10-17-33-30,K082,train_apn_s1,APN/APN_S1/K082/K082_R2/APN_S1_K082_R2_IMAG010...,APN/APN_S1/K082/K082_R2/APN_S1_K082_R2_IMAG011...,APN/APN_S1/K082/K082_R2/APN_S1_K082_R2_IMAG011...
10,APN_S1#13U#1#86,species,duiker,1,0,0,1,0,0,0,APN_S1,2017-07-28-08-16-07,13U,train_apn_s1,APN/APN_S1/13U/13U_R1/APN_S1_13U_R1_IMAG0212.JPG,APN/APN_S1/13U/13U_R1/APN_S1_13U_R1_IMAG0213.JPG,APN/APN_S1/13U/13U_R1/APN_S1_13U_R1_IMAG0214.JPG
11,APN_S1#UMH9#2#88,species,impala,1,0,0,1,0,0,0,APN_S1,2017-09-22-23-46-28,UMH9,train_apn_s1,APN/APN_S1/UMH9/UMH9_R2/APN_S1_UMH9_R2_IMAG012...,,
13,APN_S1#K014#1#219,species,impala,1,0,0,1,0,0,0,APN_S1,2017-08-13-17-01-33,K014,train_apn_s1,APN/APN_S1/K014/K014_R1/APN_S1_K014_R1_IMAG058...,APN/APN_S1/K014/K014_R1/APN_S1_K014_R1_IMAG058...,APN/APN_S1/K014/K014_R1/APN_S1_K014_R1_IMAG058...


In [9]:
# df3 = df1[df1['CaptureEventID'].isin(event_ids)]
# outpath = '/Users/manishrai/Desktop/test_dir/'  
# get_images_from_url(df3, image_name_index=0, url_col_index=6, outpath = '/Users/manishrai/Desktop/test_dir/')

In [10]:
# Open the SFTP connection used by the download cells below.
hostname = "login.msi.umn.edu"
# Credentials are read from the environment, falling back to an interactive
# prompt, so they are never stored in the notebook or its saved outputs.
username = os.environ.get("MSI_USERNAME") or input("MSI username: ")
password = os.environ.get("MSI_PASSWORD") or getpass.getpass("MSI password: ")
# NOTE(review): pysftp raises "SSHException: No hostkey for host ... found" when
# the server's key is not in the local known_hosts file - presumably the host
# has to be added there first (e.g. by connecting once with ssh); confirm.
sftp = pysftp.Connection(hostname, username=username, password=password)

SSHException: No hostkey for host login.msi.umn.edu found.

In [None]:
cd /home/ubuntu/data/tensorflow/my_workspace/camera-trap-detection/snapshot-safari/APNR/subject_set
for i in range(df3.shape[0]):
    sftp.get('/panfs/roc/groups/5/packerc/shared/albums/' + df3.iloc[i].image1)
print('done')

In [107]:
cd /home/ubuntu/data/tensorflow/my_workspace/camera-trap-detection/snapshot-safari/APNR/subject_set

/home/ubuntu/data/tensorflow/my_workspace/camera-trap-detection/snapshot-safari/APNR/subject_set


In [122]:
# Spot check: fetch only the first image of the capture event at position 1 of df3.
# Needs the `sftp` connection from the connection cell to be open.
sftp.get('/panfs/roc/groups/5/packerc/shared/albums/' + df3['image1'].iloc[1])

NameError: name 'sftp' is not defined