###### Installing dependencies and importing them.

In [1]:
# Imports used throughout the notebook.


import matplotlib.pyplot as plt
import numpy as np
import os

from PIL import Image

from sklearn.metrics import roc_curve

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Provide a drop-in replacement so the cells that call it keep working.
    from sklearn.metrics import RocCurveDisplay

    def plot_roc_curve(estimator, X, y, **kwargs):
        """Compatibility shim: draw the ROC curve of `estimator` on (X, y).

        Forwards to RocCurveDisplay.from_estimator and returns the resulting
        display object.
        """
        return RocCurveDisplay.from_estimator(estimator, X, y, **kwargs)
from sklearn.metrics import roc_auc_score


import pickle
import time

###### Connecting to my Google Drive

# Loading The data.

In [2]:
# Image directories, one per class; later cells label files from AVES_PATH
# as 1 and files from MAMMALS_PATH as 0.
# NOTE(review): the paths are relative, so they resolve against the kernel's
# working directory. Presumably the images are already resized to 224x224
# (later cells reshape to 224*224*3) — confirm.
AVES_PATH = "./images_resized/aves/"
MAMMALS_PATH = "./images_resized/mammals/"

In [3]:
# Disabled cell: the loader below is wrapped in a string literal, so nothing
# is executed or loaded; the cell only evaluates to (and echoes) that string.
# The later cells load images through loadBatch() instead.
'''
MAMMALS_FULL_DATASET = []
filesList = os.listdir(MAMMALS_PATH)
for i, f in enumerate(filesList):
  try:
    imgdata = list(Image.open(os.path.join(MAMMALS_PATH,f)).convert('RGB').getdata())
    imgdataReshaped = [x for sets in imgdata for x in sets]
    MAMMALS_FULL_DATASET.append(imgdataReshaped)
  except:
    print("Failed loading image: ", str(i))
    pass
len(MAMMALS_FULL_DATASET)
'''

'\nMAMMALS_FULL_DATASET = []\nfilesList = os.listdir(MAMMALS_PATH)\nfor i, f in enumerate(filesList):\n  try:\n    imgdata = list(Image.open(os.path.join(MAMMALS_PATH,f)).convert(\'RGB\').getdata())\n    imgdataReshaped = [x for sets in imgdata for x in sets]\n    MAMMALS_FULL_DATASET.append(imgdataReshaped)\n  except:\n    print("Failed loading image: ", str(i))\n    pass\nlen(MAMMALS_FULL_DATASET)\n'

In [4]:
# Disabled cell: the loader below is wrapped in a string literal, so nothing
# is executed or loaded; the cell only evaluates to (and echoes) that string.
# The later cells load images through loadBatch() instead.
'''
AVES_FULL_DATASET = []
filesList = os.listdir(AVES_PATH)
for i, f in enumerate(filesList):
  print("Loading image: ", str(i), end='\r')
  imgdata = list(Image.open(os.path.join(AVES_PATH,f)).getdata())
  imgdataReshaped = [x for sets in imgdata for x in sets]
  AVES_FULL_DATASET.append(imgdataReshaped)

len(AVES_FULL_DATASET)
'''

'\nAVES_FULL_DATASET = []\nfilesList = os.listdir(AVES_PATH)\nfor i, f in enumerate(filesList):\n  print("Loading image: ", str(i), end=\'\r\')\n  imgdata = list(Image.open(os.path.join(AVES_PATH,f)).getdata())\n  imgdataReshaped = [x for sets in imgdata for x in sets]\n  AVES_FULL_DATASET.append(imgdataReshaped)\n\nlen(AVES_FULL_DATASET)\n'

In [None]:
def loadBatch(batchsize, nth_batch, path):
    """Load one batch of images from a directory as flattened RGB rows.

    The directory listing is sorted and the files at positions
    ``[batchsize * nth_batch, batchsize * nth_batch + batchsize)`` are opened
    with PIL, converted to RGB and flattened to ``height * width * 3`` values
    (R, G, B for each pixel, row by row).

    Parameters
    ----------
    batchsize : int
        Maximum number of files to read.
    nth_batch : int
        Zero-based index of the batch within the sorted listing.
    path : str
        Directory containing the image files.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(n_loaded, height * width * 3)``, one row
        per successfully loaded image. An empty 1-D array is returned when the
        batch starts past the end of the listing or no file could be loaded.

    Notes
    -----
    Files that cannot be read as images, or whose size differs from the first
    image loaded in the batch, are reported and skipped.
    """
    # os.listdir() returns entries in arbitrary order; sorting keeps batch
    # boundaries stable between runs, so batch 0 and batch 1 never overlap.
    filesList = sorted(os.listdir(path))
    start = batchsize * nth_batch
    end = start + batchsize
    if start >= len(filesList):
        return np.array([], dtype=np.uint8)

    rows = []
    for i, f in enumerate(filesList[start:end], start=start):
        try:
            # The context manager closes the file handle as soon as the pixels
            # are copied. uint8 is required: channel values span 0..255, which
            # does not fit in int8.
            with Image.open(os.path.join(path, f)) as img:
                pixels = np.asarray(img.convert('RGB'), dtype=np.uint8).reshape(-1)
        except (OSError, ValueError) as err:
            print("Failed loading image: ", str(i), f, err)
            continue
        if rows and pixels.shape != rows[0].shape:
            # np.stack needs equally sized rows; skip the odd one out.
            print("Skipping image with unexpected size: ", str(i), f)
            continue
        rows.append(pixels)

    if not rows:
        return np.array([], dtype=np.uint8)
    # Stack once at the end instead of re-concatenating inside the loop.
    return np.stack(rows)

In [None]:
# A batch size of 1,000,000 is used as "load everything"; it is assumed to
# exceed the number of files in each folder. Birds come first, then mammals.
aves_all = loadBatch(1000000, 0, AVES_PATH)
mammals_all = loadBatch(1000000, 0, MAMMALS_PATH)
X = np.concatenate((aves_all, mammals_all), axis=0)


In [None]:
# Length of the combined array along its first axis.
len(X)

In [None]:
def show(imgs, titles=None, rows=3, cols=3, shape=(224, 224, 3)):
  """Display flattened images on a rows x cols grid.

  Parameters
  ----------
  imgs : sequence
      Flattened images; each entry is reshaped to `shape` before display.
  titles : sequence, optional
      One title per image. It is used only when its length equals
      `len(imgs)`; otherwise each image is titled with its index.
  rows, cols : int
      Grid dimensions. Images beyond `rows * cols` are not drawn, because
      `plt.subplot` raises for positions outside the grid.
  shape : tuple of int
      Shape each flattened image is reshaped to (default 224x224 RGB).
  """
  # Compare with None explicitly: `titles` may be a NumPy array, whose truth
  # value is ambiguous, and a mutable default list is best avoided.
  use_titles = titles is not None and len(titles) == len(imgs)
  capacity = rows * cols
  plt.figure(figsize=(7, 7))
  for i, avian in enumerate(imgs):
    if i >= capacity:
      break
    plt.subplot(rows, cols, i+1)
    plt.imshow(np.array(avian).reshape(shape))
    if use_titles:
      plt.title( str(titles[i]), fontsize=8)
    else:
      plt.title( str(i), fontsize=10)
    plt.axis('off')
  plt.show()

In [None]:
# Preview the first nine entries of X on show()'s default 3x3 grid.
show(X[:9])

# Model training

### Logistic regression
  * We fit the logistic regression on the data in batches until the whole training set has been used. Fitting everything in a single run is also possible, but it is more expensive.
  An 85%–15% train/test split gives roughly 3,230 training samples.

In [None]:
# trainLogisticRegression() runs round(TRAINING_SIZE / BATCH_SIZE) batches;
# BATCH_SIZE is the number of files requested per class from loadBatch().
TRAINING_SIZE = 3200
BATCH_SIZE = 3200

In [None]:
def trainLogisticRegression(pen='l2'):
  """Train a logistic-regression classifier (aves = 1, mammals = 0).

  Images are read in round(TRAINING_SIZE / BATCH_SIZE) batches (at least one)
  per class through loadBatch(), scaled to [0, 1] by dividing by 255, and the
  model is fitted ONCE on all collected samples.

  LogisticRegression.fit() starts from scratch on every call, so fitting
  inside the batch loop would keep only what was learned from the last batch.
  The price is that all batches are held in memory at the same time.

  Parameters
  ----------
  pen : str
      Value passed as `penalty` to LogisticRegression.

  Returns
  -------
  LogisticRegression
      The fitted classifier.
  """
  clf = LogisticRegression(random_state=0, n_jobs=-1, max_iter=500, penalty=pen)
  # round() can yield 0 when TRAINING_SIZE is much smaller than BATCH_SIZE,
  # which would return an unfitted model; always read at least one batch.
  n = max(1, round(TRAINING_SIZE / BATCH_SIZE))
  n_features = 224*224*3
  feature_batches = []
  label_batches = []
  print()
  for i in range(n):
    print("Loading Aves batch data: ", str(i))
    start_time = time.time()
    aves = np.array(loadBatch(BATCH_SIZE, i, AVES_PATH)).reshape(-1, n_features)
    print("Time taken: ", (time.time() - start_time) )

    print("-------------\nLoading Mammals batch data: ", str(i))
    start_time = time.time()
    mammals = np.array(loadBatch(BATCH_SIZE, i, MAMMALS_PATH)).reshape(-1, n_features)
    print("Time taken: ", (time.time() - start_time) )

    # Keep the raw batches; scaling happens once after concatenation.
    feature_batches.append(aves)
    feature_batches.append(mammals)
    label_batches.append(np.ones(aves.shape[0], dtype=int))
    label_batches.append(np.zeros(mammals.shape[0], dtype=int))
    print("Finished batch: ", str(i))
    print("------------------------------------------------")

  X = np.concatenate(feature_batches, axis = 0) / 255
  y = np.concatenate(label_batches, axis = 0)

  print("-------------\nFitting model on samples: ", str(X.shape[0]))
  start_time = time.time()
  clf.fit(X, y)
  print("Time taken: ",  (time.time() - start_time) )
  return clf
  

In [None]:
%%time
# Train with the default penalty ('l2'); %%time reports how long the cell took.
trained_logistic_regression = trainLogisticRegression()

In [None]:
# Persist the trained model. The `with` block guarantees the file is flushed
# and closed even if pickling fails part-way.
filename = 'logistic_regression.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(trained_logistic_regression, model_file)

### Gathering the Testing data

In [None]:
# Birds: second batch of 3200 files, labelled 1.
TEST_PICTURES = np.array(loadBatch(3200, 1, AVES_PATH))
n_aves_test = TEST_PICTURES.shape[0]
TEST_LABELS = np.array([1] * n_aves_test)
print(len(TEST_PICTURES) , len(TEST_LABELS))

# Mammals: second batch of 3200 files, appended after the birds, labelled 0.
mammals_test = np.array(loadBatch(3200, 1, MAMMALS_PATH))
TEST_PICTURES = np.concatenate((TEST_PICTURES, mammals_test), axis = 0)

# Every row added by the concatenation above is a mammal.
n_mammals_test = TEST_PICTURES.shape[0] - TEST_LABELS.shape[0]
TEST_LABELS = np.concatenate((TEST_LABELS, np.array([0] * n_mammals_test)), axis = 0)

print(len(TEST_PICTURES) , len(TEST_LABELS))





In [None]:
# Fixed seed so the shuffled order (and the sample shown below) is the same
# on every run.
TEST_PICTURES, TEST_LABELS = shuffle(TEST_PICTURES, TEST_LABELS, random_state=0)

In [None]:
# Show nine shuffled test images (indices 596..604) titled with their labels
# (1 = aves, 0 = mammals).
show(TEST_PICTURES[600-4:600+5], TEST_LABELS[600-4:600+5])

In [None]:
# The model was fitted on pixel values divided by 255 (see
# trainLogisticRegression), so the test pixels must be scaled the same way.
log_pred = trained_logistic_regression.predict(TEST_PICTURES.reshape(-1, 224*224*3) / 255)

In [None]:
acc = accuracy_score(TEST_LABELS, log_pred)
# ROC AUC needs a continuous score: hard 0/1 predictions collapse the curve
# to a single threshold. Use the predicted probability of class 1 (aves),
# computed on inputs scaled like the training data (pixels / 255).
test_scores = trained_logistic_regression.predict_proba(TEST_PICTURES.reshape(-1, 224*224*3) / 255)[:, 1]
auc = roc_auc_score(TEST_LABELS, test_scores)
print("acc: " , acc , " AUC: ", auc)

In [None]:
# Feed the model inputs scaled like the training data (pixels / 255);
# otherwise the curve describes a different input distribution.
plot_roc_curve(trained_logistic_regression, X = TEST_PICTURES.reshape(-1, 224*224*3) / 255, y = TEST_LABELS)
plt.title("LOGISTIC REGRESSION ROC")
plt.show()

In [None]:
# Birds: first 800 files (a subset of the training batch), labelled 1.
TRAIN_PARTIAL_PIC = np.array(loadBatch(800, 0, AVES_PATH))
# Integer dtype is set explicitly so the labels stay integers even when a
# class contributes no images.
TRAIN_PARTIAL_LABELS = np.ones(TRAIN_PARTIAL_PIC.shape[0], dtype=int)
print(len(TRAIN_PARTIAL_PIC) , len(TRAIN_PARTIAL_LABELS))

# Mammals: first 800 files, appended after the birds, labelled 0.
TRAIN_PARTIAL_PIC = np.concatenate( (TRAIN_PARTIAL_PIC,
                                np.array(loadBatch(800,0 , MAMMALS_PATH))
                                ),
                               axis = 0)

# Every row added by the concatenation above is a mammal.
TRAIN_PARTIAL_LABELS = np.concatenate(
                            ( TRAIN_PARTIAL_LABELS,
                              np.zeros(TRAIN_PARTIAL_PIC.shape[0] - TRAIN_PARTIAL_LABELS.shape[0], dtype=int)
                            ),
                            axis = 0)


print(len(TRAIN_PARTIAL_PIC) , len(TRAIN_PARTIAL_LABELS))

# Fixed seed so the shuffled order (and the sample shown below) is the same
# on every run.
TRAIN_PARTIAL_PIC, TRAIN_PARTIAL_LABELS = shuffle(TRAIN_PARTIAL_PIC, TRAIN_PARTIAL_LABELS, random_state=0)

show(TRAIN_PARTIAL_PIC[200-4:200+5], TRAIN_PARTIAL_LABELS[200-4:200+5])


In [None]:
# The model was fitted on pixel values divided by 255 (see
# trainLogisticRegression), so evaluation inputs are scaled the same way.
train_partial_X = TRAIN_PARTIAL_PIC.reshape(-1, 224*224*3) / 255
log_pred_train = trained_logistic_regression.predict(train_partial_X)

acc = accuracy_score(TRAIN_PARTIAL_LABELS, log_pred_train)
# ROC AUC needs a continuous score; use the predicted probability of class 1
# (aves) rather than the hard 0/1 predictions.
train_partial_scores = trained_logistic_regression.predict_proba(train_partial_X)[:, 1]
auc = roc_auc_score(TRAIN_PARTIAL_LABELS, train_partial_scores)
print("acc: " , acc , " AUC: ", auc)

plot_roc_curve(trained_logistic_regression, X = train_partial_X, y = TRAIN_PARTIAL_LABELS)
plt.title("LOGISTIC REGRESSION ROC TRAINING PARTS")
plt.show()