In [99]:
import os
import numpy as np
import pandas as pd

import sklearn.linear_model
import sklearn.tree
import sklearn.metrics
import sklearn.neighbors

from matplotlib import pyplot as plt
from pprint import pprint 
#import seaborn as sns

# Debug switch: when True, the `if VERBOSE:` cells below print intermediate data.
VERBOSE = False

In [100]:
# Determine which whale videos to include, based on the labeled CSVs that exist.
# The whale name and video index (e.g. Daffodil_1) are taken from the file name of
# each labeled CSV; a matching feature-vector CSV is expected for every one of them.

labeled_path = '../training_data/labeled_data/'
fv_path = '../training_data/fv_norm_CSVs/'

# os.listdir only reads the top level of the directory (os.walk would descend into
# every sub-directory just to throw the result away). Only regular *.csv files are
# kept, and the names are sorted so the video order is the same on every run.
whale_names = sorted(
    os.path.splitext(f)[0]
    for f in os.listdir(labeled_path)
    if f.endswith('.csv') and os.path.isfile(os.path.join(labeled_path, f))
)

CSV_names = [fv_path + n + '_images_fv_norm.csv' for n in whale_names]
labeled_names = [labeled_path + n + '.csv' for n in whale_names]

# Verify that every corresponding feature-vector file exists.
existing = [
    fv_path + m
    for m in os.listdir(fv_path)
    if os.path.isfile(os.path.join(fv_path, m))
]
existing_lookup = set(existing)  # set membership instead of a list scan per name
for n in CSV_names:
    if n not in existing_lookup:
        print("ALERT: file ", n, " not found")


In [101]:
# Path to the frames directory under the labeled data folder.
# NOTE(review): the trailing "whale_video/" hint suggests one sub-directory per
# video below this path — confirm against the directory layout.
frame_path = '../training_data/labeled_data/frames/' #whale_video/

In [102]:
# Show the discovered video names and the two path lists derived from them.
for _listing in (whale_names, labeled_names, CSV_names):
    pprint(_listing)

# The three lists must have the same length; anything else means the file
# bookkeeping went wrong somewhere.
_counts = (len(whale_names), len(labeled_names), len(CSV_names))
if VERBOSE:
    print(*_counts)
if len(set(_counts)) != 1:
    print("ALERT: some error in files was detected")

['Daffodil_1', 'Daffodil_2', 'Fan_1', 'Fan_2', 'Gom_1', 'Grommet_1']
['../training_data/labeled_data/Daffodil_1.csv',
 '../training_data/labeled_data/Daffodil_2.csv',
 '../training_data/labeled_data/Fan_1.csv',
 '../training_data/labeled_data/Fan_2.csv',
 '../training_data/labeled_data/Gom_1.csv',
 '../training_data/labeled_data/Grommet_1.csv']
['../training_data/fv_norm_CSVs/Daffodil_1_images_fv_norm.csv',
 '../training_data/fv_norm_CSVs/Daffodil_2_images_fv_norm.csv',
 '../training_data/fv_norm_CSVs/Fan_1_images_fv_norm.csv',
 '../training_data/fv_norm_CSVs/Fan_2_images_fv_norm.csv',
 '../training_data/fv_norm_CSVs/Gom_1_images_fv_norm.csv',
 '../training_data/fv_norm_CSVs/Grommet_1_images_fv_norm.csv']


In [103]:
# Load, for every video, its feature-vector table and its label table (neither
# file has a header row). Both lists stay index-aligned with whale_names.
x_data, y_data = [], []
for labeled_name, CSV_name, w in zip(labeled_names, CSV_names, whale_names):
    # read both files before touching the lists, so a failed read appends nothing
    xd, yd = [pd.read_csv(path, header=None) for path in (CSV_name, labeled_name)]
    print(w)
    x_data += [xd]
    y_data += [yd]
    

Daffodil_1
Daffodil_2
Fan_1
Fan_2
Gom_1
Grommet_1


In [104]:
# Debug only: peek at the first two raw label frames, then the first two raw
# feature frames.
if VERBOSE:
    print(y_data[:2])
    print(x_data[:2])

In [105]:
# y_data is a list of DataFrames.
# Change this function to affect what the model is trained to recognize.
def process_y(arr):
    """Binarize label frames: 1 where a label string contains 'O', else 0.

    Parameters
    ----------
    arr : list of pandas.DataFrame
        Label frames to convert.

    Returns
    -------
    list of pandas.DataFrame
        New frames with the same index and columns; the inputs are not
        modified. Every string cell becomes 1 if it contains an uppercase
        'O' and 0 otherwise. Non-string cells (for example NaN) are passed
        through unchanged, which is also what the regex-based replace did.

    Notes
    -----
    The cells are mapped explicitly instead of chaining two
    ``DataFrame.replace(..., regex=True)`` calls. The chained form only
    produced numeric columns because ``replace`` silently downcast the
    object-dtype result, a behavior pandas has deprecated; without it the
    labels stay object dtype, which scikit-learn does not accept as
    classification targets.
    """
    def _to_binary(cell):
        # same rule as regex '.*O.*' -> 1, then any remaining string -> 0
        if isinstance(cell, str):
            return 1 if 'O' in cell else 0
        return cell

    # Series.map infers the result dtype, so all-string columns come back as integers
    return [df.apply(lambda col: col.map(_to_binary)) for df in arr]
        
def y_trim(arr):
    """Split each label frame into its first column and the remaining columns.

    Parameters
    ----------
    arr : list of pandas.DataFrame
        Label frames whose first column holds the image names and whose
        column labeled 0 is to be removed.

    Returns
    -------
    tuple
        ``(imgs, frames)``: ``imgs`` is a list with one list of first-column
        values per input frame, and ``frames`` is a list of new frames with
        the column labeled 0 dropped.
    """
    image_lists = []
    for frame in arr:
        first_column = frame.iloc[:, 0]
        image_lists.append(first_column.tolist())

    label_frames = []
    for frame in arr:
        label_frames.append(frame.drop(columns=0))

    return image_lists, label_frames

# Split off the first (image-name) column, then binarize the remaining label columns.
# NOTE: both statements rebind y_data, so this cell is meant to run once per load;
# re-running it without reloading is expected to fail in y_trim, because the
# column labeled 0 has already been dropped.
imgs, y_data = y_trim(y_data)
y_data = process_y(y_data)

In [106]:
# Debug only: first two processed label frames, the per-column maximum of the
# first one, and the image names of the first video.
if VERBOSE:
    print(y_data[:2])
    print(y_data[0].max())
    print(imgs[0])

In [107]:
def x_trim(arr):
    """Return new feature frames with the column labeled 0 removed.

    Parameters
    ----------
    arr : list of pandas.DataFrame
        Feature frames that each contain a column labeled 0.

    Returns
    -------
    list of pandas.DataFrame
        One new frame per input frame, without column 0; the inputs are
        left untouched.
    """
    trimmed = []
    for frame in arr:
        trimmed.append(frame.drop(columns=0))
    return trimmed

# Drop the column labeled 0 from every feature frame. This rebinds x_data, so the
# cell is meant to run once per load (a second run would look for a column that
# is already gone).
x_data = x_trim(x_data)

In [108]:
# Debug only: peek at the first two trimmed feature frames.
if VERBOSE:
    print(x_data[:2])

In [115]:
# Leave-one-video-out cross-validation: for each video, train on all the other
# videos and evaluate on the one that was held out.
for i, name in enumerate(whale_names):
    # training portion = every video except index i
    x_icl = x_data[:i] + x_data[i+1:]
    y_icl = y_data[:i] + y_data[i+1:]
    imgs_icl = imgs[:i] + imgs[i+1:]
    if not x_icl:
        # with a single video there is nothing left to train on
        raise ValueError("need at least two videos to hold one out for testing")

    x_tr = np.concatenate(x_icl)  # stack the per-video frames row-wise into one array
    y_tr = np.concatenate(y_icl)
    # flatten the list of lists in linear time (sum(lists, []) re-copies on every step)
    imgs_tr = [img for video_imgs in imgs_icl for img in video_imgs]

    x_test = x_data[i]
    y_test = y_data[i]
    y_imgs = imgs[i]

    # Only the first label column is the target, for fitting and for scoring alike.
    clf = sklearn.linear_model.LogisticRegression(solver='liblinear').fit(x_tr, y_tr[:, 0])
    print("for heldout video: " + name)
    print(clf.score(x_test, y_test.iloc[:, 0]))
    print(x_tr.shape, y_tr.shape)
    print("weight parameters: ", clf.coef_)

    # per-image class probabilities for the held-out video, paired with the image name
    probs = clf.predict_proba(x_test)
    probs_list = probs.tolist()
    print(list(zip(probs_list, y_imgs)))
    print()

for heldout video: Daffodil_1
0.9679144385026738
(340, 6) (340, 1)
weight parameters:  [[-1.03178834 -0.90161759 -0.07092122  0.17553344  0.38228788  0.03208299]]
[([0.9656514323059912, 0.03434856769400882], '../whale_videos/Daffodil_1_images/Daffodil_1_00001000.jpg'), ([0.9109916623569299, 0.08900833764307013], '../whale_videos/Daffodil_1_images/Daffodil_1_00002000.jpg'), ([0.9155644344668653, 0.08443556553313468], '../whale_videos/Daffodil_1_images/Daffodil_1_00003000.jpg'), ([0.9647970500697611, 0.03520294993023882], '../whale_videos/Daffodil_1_images/Daffodil_1_00004000.jpg'), ([0.9580346856804264, 0.04196531431957357], '../whale_videos/Daffodil_1_images/Daffodil_1_00005000.jpg'), ([0.9230749650328716, 0.07692503496712842], '../whale_videos/Daffodil_1_images/Daffodil_1_00006000.jpg'), ([0.9280716470299057, 0.07192835297009426], '../whale_videos/Daffodil_1_images/Daffodil_1_00007000.jpg'), ([0.9207371607159566, 0.07926283928404344], '../whale_videos/Daffodil_1_images/Daffodil_1_0000

In [110]:
# NOTE(review): scratch cell, unrelated to the analysis above — candidate for deletion.
min([1, 2, 3])

1