In [10]:
''' Loads Grey Galaxy Images from Directory and Saves X_train,X_test,y_train,y_test to local directory'''
import pandas as pd
import numpy as np
from skimage.io import imread
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [11]:
# Upper bound on how many galaxy images are loaded; passed to image_loader
# as its `images` argument further down the notebook.
number_of_images = 20000

In [12]:
#Import Galaxy Labels
def label_loader(file):
    """Read the galaxy label table and key its rows by galaxy ID.

    Parameters
    ----------
    file : str or file-like
        Anything ``pandas.read_csv`` accepts. The CSV must contain a
        ``GalaxyID`` column.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``GalaxyID`` moved from a column to the index.
    """
    return pd.read_csv(file).set_index('GalaxyID')

In [13]:
#Import training images as centre-cropped greyscale arrays with a trailing channel axis
def image_loader(images,y_all,
                 image_dir='/Users/LiamRoberts/desktop/Brainstation/DemoProject/images_training_rev1'):
    """Load and centre-crop greyscale images for the first rows of ``y_all``.

    Parameters
    ----------
    images : int
        Maximum number of images to load. When ``y_all`` has fewer rows than
        this, every row is loaded.
    y_all : pandas.DataFrame
        Label table. Each index value ``ID`` is used to build the file name
        ``<image_dir>/<ID>.jpg``.
    image_dir : str, optional
        Directory holding the ``.jpg`` files. The default is the
        machine-specific path this notebook was originally written against;
        pass your own directory to run it elsewhere.

    Returns
    -------
    X : numpy.ndarray
        ``float32`` array of shape ``(n, rows, cols, 1)`` holding the cropped
        images, where ``n`` is the number of images actually loaded.
    y : numpy.ndarray
        ``y_all.values[:n]``, i.e. the label rows matching ``X``.

    Raises
    ------
    ValueError
        If ``images`` is smaller than 1, if ``y_all`` has no rows, or if the
        cropped images do not all share one shape (raised by ``np.stack``).
    """
    num_train_images = int(images)
    if num_train_images < 1:
        raise ValueError(f'images must be a positive integer, got {images!r}')

    # Slice the index up front rather than counting and breaking inside the loop.
    selected_ids = y_all.index[:num_train_images]
    if len(selected_ids) == 0:
        raise ValueError('y_all contains no rows to load images for')

    cropped = []
    for ID in selected_ids:
        #import image based on Galaxy ID
        full = imread(f'{image_dir}/{ID}.jpg',as_gray=True)
        rows,cols = full.shape
        #crop to the central 40% of each axis (30%..70%)
        cropped.append(full[int(0.3*rows):int(0.7*rows),int(0.3*cols):int(0.7*cols)])

    # Take the output dimensions from the crops themselves. int(0.4*rows) can
    # differ from int(0.7*rows) - int(0.3*rows) (e.g. rows == 9), and the
    # number of loaded images can be smaller than the number requested.
    X = np.stack(cropped).astype(np.float32)[..., np.newaxis]
    y = y_all.values[:len(cropped)]

    return X,y

In [14]:
def format_images(X,y,scale=True,test_size=0.2,random_state=None):
    """Split images and labels into train/test sets and optionally standardise X.

    Parameters
    ----------
    X : numpy.ndarray
        Image array whose first axis indexes samples, e.g. ``(n, rows, cols, 1)``.
    y : numpy.ndarray
        Label array with the same first-axis length as ``X``.
    scale : bool, optional
        When true (the default), fit a ``StandardScaler`` on the training
        images only and apply it to both splits. When false, the pixel values
        are returned unscaled.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to 0.2.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. Pass an integer for a reproducible
        shuffle; ``None`` (the default) gives a different split on every call.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The four arrays. ``X_train`` and ``X_test`` keep the per-sample shape
        of ``X``.
    """
    #Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X,y,test_size=test_size,shuffle=True,random_state=random_state)

    if scale:
        train_shape = X_train.shape
        test_shape = X_test.shape

        #Unroll X: the scaler needs 2-D input; -1 keeps this valid for any
        #number of trailing axes/channels
        flat_train = X_train.reshape(train_shape[0],-1)
        flat_test = X_test.reshape(test_shape[0],-1)

        #Standard Scale X using statistics from the training split only
        scaler = StandardScaler()
        scaler.fit(flat_train)

        #Reshape X back to the original per-sample layout
        X_train = scaler.transform(flat_train).reshape(train_shape)
        X_test = scaler.transform(flat_test).reshape(test_shape)

    print('X_train shape:',X_train.shape)
    print('X_test shape:',X_test.shape)
    print('y_train shape:',y_train.shape)
    print('y_test shape:',y_test.shape)

    return X_train, X_test, y_train, y_test

In [15]:
#Write Files for use in model
def save_values(X_train,X_test,y_train,y_test):
    """Write the four split arrays to the current working directory.

    Each array is handed to ``np.save`` under a fixed ``*_greyscaled`` file
    stem, labels first and then images. Nothing is returned.
    """
    outputs = (
        ('y_train_greyscaled', y_train),
        ('y_test_greyscaled', y_test),
        ('X_train_greyscaled', X_train),
        ('X_test_greyscaled', X_test),
    )
    for stem, array in outputs:
        np.save(stem, array)

In [16]:
# Read the label table; the CSV path is relative to the notebook's working directory.
labels = label_loader('training_solutions_rev1.csv')

In [17]:
# Load up to `number_of_images` cropped images with their label rows, then
# split them into train/test sets (format_images prints the resulting shapes).
X,y = image_loader(number_of_images,labels)
X_train, X_test, y_train, y_test = format_images(X,y)

X_train shape: (16000, 169, 169, 1)
X_test shape: (4000, 169, 169, 1)
y_train shape: (16000, 37)
y_test shape: (4000, 37)


In [18]:
# Persist the four split arrays with np.save (via save_values) for the modelling step.
save_values(X_train,X_test,y_train,y_test)