<a href="https://colab.research.google.com/github/ICRAR/PHYS5511/blob/master/2019/week05/keras_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A baseline solution to the Galaxy Zoo competition, adapted from [this kernel](https://www.kaggle.com/helmehelmuto/keras-cnn/notebook).

# Machine setup
Make sure to set the runtime type to "GPU" (Runtime → Change runtime type).

# Mount the Google Drive filesystem

In [0]:
# Mount Google Drive at /content/drive so the cells below can read the data
# stored under root_path.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import sys
from zipfile import ZipFile
import numpy as np
import os.path as osp
import pandas as pd
from sklearn.model_selection import train_test_split
from skimage.transform import resize
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
%matplotlib inline



# Data preprocessing (no unzip required)

In [0]:
# please modify this root_path
root_path = '/content/drive/My Drive/PHYS5512/data/galaxy_zoo'
training_solution_file = osp.join(root_path, 'training_solutions_rev1.csv')
df = pd.read_csv(training_solution_file)

# Fixed seed: without it every run draws a different 80/20 split, so the
# arrays cached to disk further down would not match a later re-run.
SPLIT_SEED = 42
df_train, df_test = train_test_split(df, test_size=.2, random_state=SPLIT_SEED)
df_train.shape, df_test.shape

In [0]:
# Preprocessing geometry; each tuple is (extent along axis 0, extent along axis 1).
# Size assumed for the source images when the crop offsets are computed.
ORIG_SHAPE = (424, 424)
# Size of the window cut out of each source image.
CROP_SIZE = (256, 256)
# Size the cropped window is resized to; also used as the model's input size.
IMG_SHAPE = (64, 64)


def get_image(path, x1, y1, shape, crop_size):
    """Read one image, cut a window out of it and resize that window.

    Args:
        path: Whatever ``plt.imread`` accepts; callers in this notebook pass
            either a filename or an open file object.
        x1: Start index of the window along the first image axis.
        y1: Start index of the window along the second image axis.
        shape: Target size handed to ``resize``.
        crop_size: Window extent along the first and second axis.

    Returns:
        The resized window, exactly as returned by ``resize`` (no additional
        scaling is applied here).
    """
    image = plt.imread(path)
    rows = slice(x1, x1 + crop_size[0])
    cols = slice(y1, y1 + crop_size[1])
    return resize(image[rows, cols], shape)
    
def get_all_images(dataframe, shape=IMG_SHAPE, crop_size=CROP_SIZE):
    """Load, centre-crop and resize every image listed in ``dataframe``.

    Images are read directly from ``images_training_rev1.zip`` under
    ``root_path``, so the archive does not have to be unpacked first.

    Args:
        dataframe: Table whose first column holds the galaxy ID and whose
            remaining columns hold the target values.
        shape: Size each cropped image is resized to.
        crop_size: Size of the central window cut out before resizing.

    Returns:
        ``(x_batch, y_batch)``: the stacked images and the target columns of
        ``dataframe``, both in the row order of ``dataframe``.
    """
    # Centre the window using the crop size that was actually requested; the
    # global CROP_SIZE is only the default value of the parameter.
    x1 = (ORIG_SHAPE[0] - crop_size[0]) // 2
    y1 = (ORIG_SHAPE[1] - crop_size[1]) // 2

    sel = dataframe.values
    # IDs come back as floats when the frame is converted to one array, so go
    # through int to get the bare number used in the file names.
    ids = sel[:, 0].astype(int).astype(str)
    y_batch = sel[:, 1:]
    x_batch = []
    filename = osp.join(root_path, 'images_training_rev1.zip')
    with ZipFile(filename) as archive:
        for i in tqdm(ids):
            # Close each member handle as soon as its image has been decoded
            # instead of leaving one open handle per image.
            with archive.open('images_training_rev1/{0}.jpg'.format(i)) as fn:
                x_batch.append(get_image(fn, x1, y1, shape=shape, crop_size=crop_size))
    return np.array(x_batch), y_batch
        
# Decode the images of both splits into arrays; every image of each split is
# read from the zip archive, so this is the slow step of the preprocessing.
X_train, y_train = get_all_images(df_train)
X_test, y_test = get_all_images(df_test)

In [0]:
# Show the shape of each array, one per line.
print(*(arr.shape for arr in (X_train, y_train, X_test, y_test)), sep='\n')

In [0]:
def _as_uint8(images):
    """Quantise float images scaled to [0, 1] into uint8 values in [0, 255].

    Arrays that are already uint8 are returned untouched, so re-running this
    cell cannot multiply the data by 255 a second time (which would wrap
    around). Values are rounded to the nearest level rather than truncated,
    and clipped so nothing outside [0, 255] can overflow the cast.
    """
    if images.dtype == np.uint8:
        return images
    return np.clip(np.rint(images * 255), 0, 255).astype(np.uint8)


X_train = _as_uint8(X_train)
X_test = _as_uint8(X_test)

In [0]:
# Cache the preprocessed arrays under root_path as <name>.npy so they can be
# reloaded later without decoding the images again.
for lb, arr in zip(['X_train', 'y_train', 'X_test', 'y_test'], [X_train, y_train, X_test, y_test]):
  npfn = osp.join(root_path, '%s.npy' % lb)
  np.save(npfn, arr)

In [0]:
# Report the on-disk size of the saved X_train.npy file.
!du -sh '{root_path}/X_train.npy'

In [0]:
# Reload the four cached arrays from root_path, in the order they are unpacked.
X_train, y_train, X_test, y_test = [
    np.load(osp.join(root_path, stem + '.npy'))
    for stem in ('X_train', 'y_train', 'X_test', 'y_test')
]


In [0]:
# Sanity check: print the array shapes and show three randomly picked
# training images side by side.
print(X_train.shape, X_test.shape)
ind_check = np.random.choice(len(X_train), 3)
fig, axes = plt.subplots(1, 3, figsize=(15, 6))
for ax, idx in zip(axes, ind_check):
    ax.imshow(X_train[idx])

# Build the model

In [0]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization, GlobalMaxPooling2D
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
  """Keras metric: square root of the mean squared difference.

  The mean is taken over every element of the tensors, so the result is a
  single scalar.
  """
  squared_error = K.square(y_pred - y_true)
  return K.sqrt(K.mean(squared_error))

# Three convolutional stages followed by a fully connected head.
# Each stage is two 3x3 convolutions and a single ReLU; the first two stages
# end in 2x2 max-pooling, the last in a global max-pool. A BatchNormalization
# layer could be placed before each stage's ReLU but is not used here.
model = Sequential([
    # Stage 1
    Conv2D(512, (3, 3), input_shape=(IMG_SHAPE[0], IMG_SHAPE[1], 3)),
    Conv2D(256, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Stage 2
    Conv2D(256, (3, 3)),
    Conv2D(128, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Stage 3
    Conv2D(128, (3, 3)),
    Conv2D(128, (3, 3)),
    Activation('relu'),
    GlobalMaxPooling2D(),

    # Head: three Dropout -> Dense(128) -> ReLU blocks ...
    Dropout(0.25),
    Dense(128),
    Activation('relu'),
    Dropout(0.25),
    Dense(128),
    Activation('relu'),
    Dropout(0.25),
    Dense(128),
    Activation('relu'),

    # ... and one sigmoid output per target column (37 in total).
    Dropout(0.25),
    Dense(37),
    Activation('sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adamax',
    metrics=[root_mean_squared_error],
)
model.summary()

# Training

In [0]:
batch_size = 64
small_train_set = 2000
small_val_set = 100
nb_epochs = 5

# The cached arrays hold uint8 pixels (they were multiplied by 255 before
# being saved), so undo that scaling before training. The dtype guard makes
# the cell safe to re-run: data that is already float is not divided again.
if np.issubdtype(X_train.dtype, np.integer):
    X_train = X_train.astype(np.float32) / 255
if np.issubdtype(X_test.dtype, np.integer):
    X_test = X_test.astype(np.float32) / 255

# Train on a small subset only, validating on the first few held-out images.
history = model.fit(X_train[:small_train_set], y_train[:small_train_set],
                    epochs=nb_epochs, batch_size=batch_size,
                    validation_data=(X_test[:small_val_set], y_test[:small_val_set]))

In [0]:
from matplotlib.ticker import MaxNLocator

# Plot every series recorded by model.fit: the loss and the RMSE metric, each
# for the training and the validation data.
fig, ax = plt.subplots(figsize=(10, 6))
ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # epochs are whole numbers
for k, v in history.history.items():
    # One x value per recorded epoch, taken from the series itself so the
    # plot still works if fewer than nb_epochs epochs were recorded.
    xvals = np.arange(1, len(v) + 1)
    ax.plot(xvals, v, label=k if 'val_' in k else 'train_%s' % k)

ax.legend(loc='best', fontsize=14)
fig.suptitle('Loss and RMSE per epoch', fontsize=16)
# The curves are binary cross-entropy (the loss) and RMSE (the metric), not MSE.
ax.set_ylabel('Loss / RMSE', fontsize=14)
ax.set_xlabel('Epoch', fontsize=14)

# Test Prediction Submission (change me to get it working)

In [0]:
import os
from tqdm import tqdm

def test_image_generator(ids, shape=IMG_SHAPE, image_dir='../input/44352/images_test_rev1/'):
    """Load and preprocess a batch of test images.

    Args:
        ids: File names (including extension) of the images to load.
        shape: Size each cropped image is resized to.
        image_dir: Directory containing the files named in ``ids``. The
            default is the location used by the original Kaggle kernel; pass
            a different directory when the images live elsewhere.

    Returns:
        Array with one preprocessed image per entry of ``ids``, in order.
    """
    # Same central crop as used for the training images.
    x1 = (ORIG_SHAPE[0] - CROP_SIZE[0]) // 2
    y1 = (ORIG_SHAPE[1] - CROP_SIZE[1]) // 2
    # Honour the ``shape`` argument rather than always using IMG_SHAPE.
    return np.array([
        get_image(osp.join(image_dir, i), x1, y1, shape=shape, crop_size=CROP_SIZE)
        for i in ids
    ])

# Location of the unpacked test images as used by the original Kaggle kernel.
# test_image_generator reads from the same location by default, so keep the
# two in sync when adapting this cell.
test_dir = '../input/44352/images_test_rev1/'

# Sorted for a deterministic order; anything that is not a JPEG is skipped.
val_files = sorted(f for f in os.listdir(test_dir) if f.lower().endswith('.jpg'))
N_val = len(val_files)
if N_val == 0:
    raise FileNotFoundError('no .jpg files found in %s' % test_dir)

# Predict batch by batch so only batch_size images are held in memory at once.
val_predictions = []
for i in tqdm(range(0, N_val, batch_size)):
    upper = min(i + batch_size, N_val)  # the last batch may be shorter
    X = test_image_generator(val_files[i:upper])
    val_predictions.append(model.predict(X))

# Stack the list of per-batch outputs directly. Converting it with np.array
# first fails when the last batch is shorter than the others.
Y_pred = np.vstack(val_predictions)

# Keep the IDs numeric and the predictions as floats: stacking string IDs
# next to the predictions would turn every value into a string and make the
# GalaxyID sort lexicographic instead of numeric.
ids = [int(osp.splitext(v)[0]) for v in val_files]
submission_df = pd.DataFrame(Y_pred, columns=df.columns[1:])
submission_df.insert(0, df.columns[0], ids)
submission_df = submission_df.sort_values(by=['GalaxyID'])
submission_df.to_csv('sample_submission.csv', index=False)