In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
import seaborn as sns

from IPython.display import display

from hpsklearn import HyperoptEstimator, any_classifier, extra_trees, any_preprocessing, svc, xgboost_classification
from hyperopt import tpe, hp

from keras.applications.mobilenet import MobileNet, preprocess_input
#from keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input
from keras.preprocessing import image
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import SGD
from sklearn import linear_model, ensemble, svm, model_selection, dummy, feature_selection, naive_bayes

In [None]:
# Image shape handed to the network; all but the last entry are reused as the
# `target_size` when images are loaded.
input_shape = (224, 224, 3)

In [None]:
# Feature extractor: MobileNet with ImageNet weights, no classification head
# (include_top=False) and pooling='avg' requested.
# NOTE(review): each `model = ...` line below rebinds `model`; whichever ran
# last is the extractor used by the rest of the notebook.
model = MobileNet(weights='imagenet', include_top=False, input_shape=input_shape, pooling='avg')

# Alternative extractor. NOTE(review): the InceptionResNetV2 import at the top
# of the file is commented out, so this line needs it re-enabled to run.
model = InceptionResNetV2(weights='imagenet', include_top=False, input_shape=input_shape, pooling='avg')

# Alternative: MobileNet without a `pooling` argument.
model = MobileNet(weights='imagenet', include_top=False, input_shape=(224,224,3))

# Smoke test: push a single image through the extractor.
img_path = 'phos/a00289bff4a2699940f08833e727ea34338997bf.png'
img = image.load_img(img_path, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)  # add a leading axis of length 1
x = preprocess_input(x)

features = model.predict(x)

# Data bottleneck preprocessing

In [None]:
def calc_predictions(cs=None, model=None):
    """Yield one feature record for every file in the given class folders.

    Parameters
    ----------
    cs : iterable of iterables of str, optional
        ``cs[i]`` lists the directories whose files belong to class ``i``.
        ``None`` is treated as "no classes" and nothing is yielded.
    model : object with a ``predict`` method
        Called as ``model.predict(x)`` on each preprocessed image.

    Yields
    ------
    dict
        ``{'features': <output of model.predict>, 'c': <class index>}``.
    """
    if cs is None:
        return
    for class_idx, folders in enumerate(cs):
        for folder in folders:
            for n, fname in enumerate(os.listdir(folder + '/'), start=1):
                # progress report every 50 files of the current folder
                if n % 50 == 0:
                    print(n, class_idx, fname)
                img = image.load_img(folder + '/' + fname, target_size=input_shape[:-1])
                x = image.img_to_array(img)
                x = np.expand_dims(x, axis=0)
                x = preprocess_input(x)
                yield dict(features=model.predict(x), c=class_idx)
    # The generator simply ends here: raising StopIteration inside a generator
    # body is converted into RuntimeError by Python 3.7+ (PEP 479).

In [None]:
# The position in this list is the class index: 0 = not-phos folder, 1 = phos folder.
class_dirs = [['phosifier/sank/not-phos'], ['phosifier/sank/phos']]
records = calc_predictions(cs=class_dirs, model=model)
df = pd.DataFrame.from_dict(records)

# Data saving/loading

# Save the current `df`; the three lines are alternative output file names.
df.to_json('mobilenet-data.json')

df.to_json('mobilenet-sank-avg-data.json')

df.to_json('inresv3-data.json')

# Reload saved features; each read rebinds `df`, so the last one executed wins.
df = pd.read_json('mobilenet-data.json')

df = pd.read_json('mobilenet-sank-avg-data.json')

In [None]:
# Preview the first rows of the feature table.
df.head()

In [None]:
# Shape of the raw `features` column array.
df.features.values.shape

In [None]:
# Length of one flattened feature vector, read from the row labelled 0.
# numpy is used directly (the `scipy.array` alias is deprecated) and only that
# single row is flattened instead of the whole column twice.
nf = np.array(df.features[0]).flatten().shape[0]
nf

In [None]:
# Flatten every stored feature array to 1-D. Re-running the cell is harmless
# because flattening an already flat vector changes nothing.
# np.array replaces the deprecated scipy.array alias.
df.features = df.features.apply(lambda feat: np.array(feat).flatten())

In [None]:
# Preview again, now with flattened feature vectors.
df.head()

In [None]:
# Stack the per-row 1-D feature vectors into one 2-D array (rows = samples).
# np.array replaces the deprecated scipy.array alias.
X = np.array(df.features.values.tolist())
# Class labels, aligned with the rows of X.
r = df.c.values

In [None]:
# Keep handles on the complete data set before X and r are rebound by the split.
Xtot, rtot = X, r

In [None]:
# Hold out 20% of the samples for testing.
# The split starts from Xtot/rtot rather than from X/r so that re-running the
# cell does not split an already reduced training set, and random_state pins
# the shuffle so the reported scores are reproducible.
X, Xt, r, rt = model_selection.train_test_split(Xtot, rtot, test_size=.2, random_state=0)

# Classifiers

Various classifiers fitted on the extracted features.

Below is a table of logistic-regression performance for various training sets and pooling modes.

|Net|Pooling|LinCV|LinTest|Best|TestSplit|Train data|
|:-|:-|:-|:-|:-|:-|:-|
|MobileNet|avg|.79|.81|.83|.2|sank|
|MobileNet|max|.79|.83|.|.2|sank|
|MobileNet|none|.|.|.|.2|sank|

In [None]:
%%time
# Logistic regression with built-in 10-fold cross-validation, fitted on the training split.
m = linear_model.LogisticRegressionCV(cv=10)
m = m.fit(X, r)

In [None]:
# CV scores stored under key 1, averaged over axis 0, followed by their maximum.
cv_means = m.scores_[1].mean(axis=0)
cv_means, cv_means.max()

In [None]:
# Score of `m` on the held-out split.
m.score(Xt,rt)

In [None]:
%%time
# Same model, refitted on the complete data set; `m` now refers to this fit.
m = linear_model.LogisticRegressionCV(cv=10)
m = m.fit(Xtot, rtot)

In [None]:
# 10-fold cross-validation scores of `m` on the complete data set.
model_selection.cross_val_score(m, Xtot, rtot, cv=10)

In [None]:
# Indices of the five largest coefficients of `m`: argpartition(-5) moves the
# five largest values into the last five positions (in no particular order).
# matplotlib is imported in the import cell at the top of the notebook.
idxs = m.coef_[0].argpartition(-5)[-5:]
len(idxs)

In [None]:
# One figure per selected feature: its distribution among samples with a truthy
# label, overlaid with its distribution among samples with a falsy label.
for idx in idxs:
    plt.figure()
    pos_vals = [row[idx] for row, label in zip(X, r) if label]
    neg_vals = [row[idx] for row, label in zip(X, r) if not label]
    sns.distplot(pos_vals)
    sns.distplot(neg_vals)

# Gradient boosting with 64 estimators: fit on the training split, score on the held-out split.
ensemble.GradientBoostingClassifier(n_estimators=2**6).fit(X,r).score(Xt,rt)

# NOTE(review): presumably a regularisation strength noted from an earlier run
# (C=1291 is used for `m2` below) — confirm.
C=1291.54966501

In [None]:
# Fitted `C_` and coefficients of `m`.
m.C_, m.coef_

In [None]:
# Plain logistic regression with a fixed C of 1291.
m2 = linear_model.LogisticRegression(C=1291)
m2.fit(X, r)
m2.score(Xt, rt)

In [None]:
# Logistic regression with library defaults, as a baseline.
m3 = linear_model.LogisticRegression().fit(X, r)
m3.score(Xt, rt)

In [None]:
# L1-penalised logistic regression. The solver is named explicitly: the 'l1'
# penalty is not supported by the 'lbfgs' solver that recent scikit-learn
# versions use by default, whereas 'liblinear' supports it.
m4 = linear_model.LogisticRegression(penalty='l1', solver='liblinear').fit(X, r)
m4.score(Xt, rt)

In [None]:
# (number of non-zero coefficients, total number of coefficients) of m4.
coef = m4.coef_[0]
(coef != 0).sum(), len(coef)

# Feature selection driven by an L1-penalised logistic regression, followed by
# a random forest on the reduced features. 'liblinear' is named explicitly
# because the default 'lbfgs' solver of recent scikit-learn rejects penalty='l1'.
mm = feature_selection.SelectFromModel(
    linear_model.LogisticRegression(penalty='l1', solver='liblinear')
).fit(X, r)
nX = mm.transform(X)
nXt = mm.transform(Xt)

mmf1 = ensemble.RandomForestClassifier(n_estimators=2**9, n_jobs=-1)
mmf1.fit(nX, r)
mmf1.score(nXt, rt)

In [None]:
# Linear support-vector classifier with library defaults.
ms1 = svm.LinearSVC()
ms1.fit(X, r)
ms1.score(Xt, rt)

# Randomised search (10 draws, 5-fold CV) over C for an RBF-kernel SVC.
ms2 = model_selection.RandomizedSearchCV(
    svm.SVC(kernel='rbf'),
    param_distributions={'C': sc.stats.expon(0, 10)},
    cv=5,
    n_iter=10,
)
ms2.fit(X, r)
ms2.score(Xt, rt)

In [None]:
# RBF-kernel SVC with the remaining parameters at their defaults.
ms2 = svm.SVC(kernel='rbf').fit(X, r)
ms2.score(Xt, rt)

# Random forest with 128 trees, using all available cores.
mf1 = ensemble.RandomForestClassifier(n_estimators=2 ** 7, n_jobs=-1).fit(X, r)
mf1.score(Xt, rt)

In [None]:
# Extra-trees ensemble with 64 trees, using all available cores.
mf2 = ensemble.ExtraTreesClassifier(n_estimators=2 ** 6, n_jobs=-1).fit(X, r)
mf2.score(Xt, rt)

In [None]:
# Dummy classifier: the reference score any real model has to beat.
md = dummy.DummyClassifier()
md.fit(X, r)
md.score(Xt, rt)

# General hyperparameter optimizers
Using hyperopt (through hpsklearn) to search for a model and its hyperparameters.

# Search configuration over any classifier and any preprocessing step.
estim = HyperoptEstimator(
    classifier=any_classifier('my_clf'),
    preprocessing=any_preprocessing('my_prp'),
    algo=tpe.suggest,
    max_evals=100,
    trial_timeout=200,
    verbose=1,
)

In [None]:
# Search configuration restricted to SVC classifiers, with any preprocessing step.
estim = HyperoptEstimator(
    classifier=svc('clf'),
    preprocessing=any_preprocessing('my_prp'),
    algo=tpe.suggest,
    max_evals=100,
    trial_timeout=200,
    verbose=1,
)

In [None]:
%%time
# Run the hyperparameter search on the complete data set.
# NOTE(review): Xtot contains the held-out rows (Xt, rt), so the score printed
# in the following cell is not an out-of-sample estimate — confirm this is intended.
estim.fit(Xtot, rtot)

In [None]:
# Score of the search result on the held-out split.
test_score = estim.score(Xt, rt)
print(test_score)

In [None]:
# Losses of the trials that reported one. The filter compares against None
# explicitly so that a legitimate loss of 0.0 is not dropped together with the
# missing entries (presumably trials that failed or timed out — confirm).
all_losses = estim.trials.losses()
losses = [loss for loss in all_losses if loss is not None]
sns.distplot(losses)
# Fraction of trials that reported a loss.
len(losses) / len(all_losses)

In [None]:
# numpy replaces the deprecated scipy aliases (sc.array / sc.sum).
loss_arr = np.array(losses)
plt.plot(losses)
# Number of losses strictly below a reference loss taken from the seven
# smallest ones (argpartition(7) leaves those seven in no particular order).
# Stored in a variable so the value can be inspected.
n_below_ref = np.sum(loss_arr[loss_arr.argpartition(7)[:7][-1]] > loss_arr)
# Indices of the five smallest losses (unordered).
loss_arr.argpartition(5)[:5]

In [None]:
# Durations of the trials whose status is 'ok', and their distribution.
durs = []
for res in estim.trials.results:
    if res['status'] == 'ok':
        durs.append(res['duration'])
sns.distplot(durs)

In [None]:
# Fraction of successful trials with a duration below 200 (the trial_timeout
# passed to HyperoptEstimator). numpy replaces the deprecated scipy aliases.
np.mean(np.array(durs) < 200)

In [None]:
# Take the winning learner out of the search result and show the whole result.
best = estim.best_model()
mh = best['learner']
print(best)

In [None]:
# Switch on probability estimates when the best learner is an SVC; the setting
# takes effect at the next fit. The public `svm.SVC` name is used because the
# private `svm.classes` module path is not available in newer scikit-learn.
if isinstance(mh, svm.SVC):
    mh.probability = True

In [None]:
# Refit the best learner on the complete data set.
mh.fit(Xtot,rtot)

In [None]:
# Score on the held-out split.
# NOTE(review): `mh` was just fitted on Xtot, which contains these rows.
mh.score(Xt,rt)

In [None]:
# Mean 10-fold CV score of `mh` using only the held-out split.
model_selection.cross_val_score(mh, Xt, rt, cv=10).mean()

In [None]:
# Mean 10-fold CV score of `mh` using only the training split.
model_selection.cross_val_score(mh, X, r, cv=10).mean()

In [None]:
# Mean 10-fold CV score of `mh` on the complete data set.
model_selection.cross_val_score(mh, Xtot, rtot, cv=10).mean()

In [None]:
# (samples, features) of the training split.
X.shape

In [None]:
def calc_pred(fp, c, model, clf, clf2=None, print_wrong=True, print_right=False):
    """Classify one image file and optionally display it with its probabilities.

    Parameters
    ----------
    fp : str
        Path of the image file.
    c : int
        Expected class; compared with the prediction to decide what to display.
    model : object with a ``predict`` method
        Feature extractor applied to the preprocessed image.
    clf : classifier
        Supplies the prediction via ``predict``. ``predict_proba`` is only
        called when the image is actually displayed.
    clf2 : classifier, optional
        Second classifier whose probability is reported next to ``clf``'s.
    print_wrong : bool
        Display the image when the prediction differs from ``c``.
    print_right : bool
        Display the image when the prediction equals ``c``.

    Returns
    -------
    The value returned by ``clf.predict`` for this image.
    """
    img = image.load_img(fp, target_size=input_shape[:-1])
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    # one row holding the flattened extractor output
    features = model.predict(x).flatten().reshape(1, -1)

    y_pred = clf.predict(features)

    def print_img():
        # Probabilities are computed here, not up front, so that silent calls
        # (both print flags False) skip the extra predict_proba work.
        probs = [clf.predict_proba(features).flatten()[1]]
        display(img)
        if clf2 is not None:
            probs.append(clf2.predict_proba(features).flatten()[1])
        print('Classification probability:')
        print(probs)
        print('File:'+fp)

    is_correct = bool(np.all(y_pred == c))
    if (print_right and is_correct) or (print_wrong and not is_correct):
        print_img()
    # Reuse the prediction instead of running clf.predict a second time.
    return y_pred

# Spot-check each cleaned folder with the tuned learner `mh`, also reporting the
# probability from `m`. Per folder the loop stops at the 50th file, after
# printing the counter. calc_pred's defaults display misclassified images only.
for d, label in (('cleaned-v1/not-phos/', 0), ('cleaned-v1/phos/', 1)):
    n = 0
    for n, f in enumerate(os.listdir(d), start=1):
        if not n % 50:
            print(n)
            break
        calc_pred(d+f, label, model, mh, clf2=m)

# Walk the test folder twice with `m`: the first pass displays the images
# predicted as class 1, the second pass those predicted as class 0.
d = '../phosifier_test/'
for label in (1, 0):
    for f in os.listdir(d):
        calc_pred(d+f, label, model, m, print_right=True, print_wrong=False)

# Sort every file in 'all/' by the prediction of `m`.
# Class indices follow the training data: 0 = not-phos, 1 = phos.
phos = []
not_phos = []
d = 'all/'
n = 0
for n, f in enumerate(os.listdir(d), start=1):
    # progress report every 10 entries
    if not n % 10:
        print(n)
    try:
        pred = calc_pred(d+f, 1, model, m, print_right=False, print_wrong=False)
    except IsADirectoryError:
        # sub-directories of 'all/' are skipped
        continue
    # Class 1 is the phos class, so a truthy prediction belongs in `phos`.
    if pred:
        phos.append(f)
    else:
        not_phos.append(f)

# Alternative 1: move the sorted files from `d` into the two target folders.
import shutil
for f in phos:
    shutil.move(d+f, 'phos/'+f)
    
for f in not_phos:
    shutil.move(d+f, 'not-phos/'+f)

# Alternative 2: copy instead of move, leaving the files in `d` in place.
# NOTE(review): run only one of the two alternatives — after the move above the
# source files are no longer under `d`.
import shutil
for f in phos:
    shutil.copy2(d+f, 'phos/'+f)
    
for f in not_phos:
    shutil.copy2(d+f, 'not-phos/'+f)

# Number of files collected in each list.
len(phos)

len(not_phos)

In [None]:
# Marker that the notebook ran to the end.
print('done')