# Normalizing image data in preparation for classification:

### Data can often be manipulated in such a way that the underlying information isn't altered, but the data is better prepared for input into machine learning algorithms. Such manipulation is referred to as "pre-processing", and usually involves scaling (or "standardizing") the data, or applying the same operation to each data point. Pre-processing data usually results in better performance for machine learning algorithms.

### This notebook:
- Opens each cutout
- Flattens each image, in preparation for the classifier
- Applies 5 different data normalization techniques to each flattened image
- Saves all flattened data to .csv, for correct input into MuyGPyS
- Plots a few cutouts so you can visualize what each normalizing technique does to the data

In [None]:
import numpy as np
import os
import astropy.io.fits as fits
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

### Getting list of all cutout paths

In [None]:
# Directory holding the FITS cutouts; only '.fits' entries are kept.
cutout_path = 'cutouts/'
cutout_files = [name for name in os.listdir(cutout_path) if name.endswith('.fits')]

# Tally objects by filename prefix ('gal...' vs 'star...').
gals = sum(1 for name in cutout_files if name.startswith('gal'))
stars = sum(1 for name in cutout_files if name.startswith('star'))

print("Found", len(cutout_files), "cutouts:", stars, "stars, and", gals, "gals")

In [None]:
# os.listdir returns entries in arbitrary, filesystem-dependent order, so the
# list is sorted first; otherwise the fixed seed would not give the same
# shuffled order on every machine.
cutout_files.sort()
random.Random(128).shuffle(cutout_files)

### Flatten and save the un-normalized "raw" image data for every cutout. 

For each cutout:
- Open the cutout
- Set truth labels (0=Star, 1=Galaxy)
- Flatten the data (turns 20x20 pixel image from shape (20, 20) to shape (400,))
- Append the maximum and minimum pixel value (for use later)
- Save to .csv

In [None]:
# Per-cutout extreme pixel values, filled in while the cutouts are read
min_pixel, max_pixel = [], []

# Table that receives one row (label + flattened pixels) per cutout
gal_star = pd.DataFrame({})

# Table that receives one column of plotting data per technique
plot_data = pd.DataFrame({})

In [None]:
# Flatten every cutout into one labelled row: [label, pixel_0, ..., pixel_N].
# Rows are collected in a list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole table on
# every call.
rows = []
for file in tqdm(cutout_files):
    image = fits.getdata(cutout_path + str(file))

    # Truth label: 0 = star, 1 = galaxy. The 'gal' prefix is the same one used
    # when the cutouts were counted, and it also covers 'galaxy...' names.
    if file.startswith('star'):
        obj_id = 0
    elif file.startswith('gal'):
        obj_id = 1
    else:
        # Without this branch an unrecognised file would silently inherit the
        # label of the previous cutout.
        raise ValueError("Cannot determine star/galaxy label for cutout: " + str(file))

    data_flattened = image.flatten()

    # Remember the per-image extremes; the global min/max are derived later.
    min_pixel.append(np.min(data_flattened))
    max_pixel.append(np.max(data_flattened))

    rows.append(np.append(obj_id, data_flattened))

gal_star = pd.DataFrame(rows)

# Keep the first five rows (label + pixels, flattened) for the comparison plot.
plot_data = pd.concat([plot_data, pd.DataFrame({'raw': gal_star[:5].to_numpy().flatten()})], axis=1)
gal_star.to_csv('raw_image_data.csv', header=False, index=False)

### Calculate the minimum and maximum pixel value across all images

In [None]:
# Global extremes: smallest per-image minimum and largest per-image maximum.
min_pixel_all = np.asarray(min_pixel).min()
max_pixel_all = np.asarray(max_pixel).max()

## Define each data normalization technique function:

### Technique 1: 

#### For each cutout:
- Take the log of each pixel value
- Find the minimum of the logged pixel values across the image
- Subtract that minimum value from each logged pixel

In [None]:
def norm_1(data):
    """Log-scale one image and shift it so its smallest logged value is 0.

    Parameters
    ----------
    data : array-like
        Pixel values of a single (flattened) cutout.

    Returns
    -------
    The base-10 logarithm of every pixel minus the minimum of those
    logarithms.

    Notes
    -----
    ``np.log10`` yields NaN for negative pixels and -inf for zero pixels;
    such values are passed through unchanged and affect the minimum.
    """
    log_pixels = np.log10(data)
    return log_pixels - np.min(log_pixels)

### Technique 2:

#### For each cutout:
- Calculate minimum pixel value across image
- Calculate maximum pixel value across image
- Scale all data between (0, 1) with:<br>
$data\_norm = \frac{data - min}{max - min}$

In [None]:
def norm_2(data):
    """Min-max scale one image to the range [0, 1].

    Computes ``(data - min) / (max - min)`` using the minimum and maximum of
    this image only.

    Parameters
    ----------
    data : array-like
        Pixel values of a single (flattened) cutout. Must not be empty
        (``np.min`` raises ``ValueError`` on an empty input).

    Returns
    -------
    The scaled pixel values. A constant image (max == min) is returned as all
    zeros instead of the NaNs that a 0/0 division would produce.
    """
    min_data = np.min(data)
    shifted = data - min_data
    pixel_range = np.max(data) - min_data

    if pixel_range == 0:
        # Every pixel equals the minimum, so `shifted` is already all zeros;
        # multiplying by 0.0 only promotes integer input to float.
        return shifted * 0.0

    return shifted / pixel_range

### Technique 3:

The same as technique 2, but now `min` and `max` are the minimum and maximum pixel value over ALL images

#### For each cutout:
- Scale all data between (0, 1) with:<br>
$data\_norm = \frac{data - min\_all}{max\_all - min\_all}$

In [None]:
def norm_3(data):
    """Min-max scale one image using the extremes over ALL images.

    Reads the module-level ``min_pixel_all`` and ``max_pixel_all`` and returns
    ``(data - min_pixel_all) / (max_pixel_all - min_pixel_all)``.
    """
    global_range = max_pixel_all - min_pixel_all
    return (data - min_pixel_all) / global_range

### Technique 4: 

#### For each cutout:
- Find the minimum pixel value in the image
- Subtract that value off of each pixel
- Divide each pixel in the image by the maximum value over ALL images

In [None]:
def norm_4(data):
    """Subtract the image's own minimum, then divide by the global maximum.

    The divisor is the module-level ``max_pixel_all`` (maximum pixel value
    over all images); the subtracted minimum is taken from ``data`` only.
    """
    shifted = data - np.min(data)
    return shifted / max_pixel_all

### Technique 5:

The same as technique 4, but now we take the log of each value first

#### For each cutout:
- Log each pixel (and the maximum pixel value)
- Find the minimum pixel value in the image
- Subtract that value off of each pixel
- Divide each pixel in the image by the log of the maximum value over ALL images

In [None]:
def norm_5(data):
    """Log-space variant of ``norm_4``.

    Takes the base-10 log of every pixel, subtracts the minimum of those
    logged values, and divides by the base-10 log of the module-level
    ``max_pixel_all``.
    """
    log_pixels = np.log10(data)
    shifted = log_pixels - np.min(log_pixels)
    return shifted / np.log10(max_pixel_all)

## Applying normalization techniques:

### Run each normalization technique on each cutout:

In [None]:
techniques = [norm_1, norm_2, norm_3, norm_4, norm_5]

# For every technique: normalize each cutout row, keep the first five rows for
# plotting, and write the full normalized table to its own .csv file.
for num, technique in enumerate(techniques, start=1):
    # Collect rows in a list and build the DataFrame once; DataFrame.append
    # was removed in pandas 2.0 and copied the whole table on every call.
    norm_rows = []
    for _, row in tqdm(gal_star.iterrows(), total=gal_star.shape[0], desc='Technique ' + str(num), leave=True):
        # Work on the numpy values so the slices below are purely positional:
        # element 0 is the truth label, the rest are the pixel values.
        values = row.to_numpy()
        obj_id = values[:1]
        raw_data = values[1:]

        # Run the raw pixels through the current normalization technique
        norm_data = technique(raw_data)

        # Re-attach the label in front of the normalized pixels
        norm_rows.append(np.append(obj_id, norm_data))

    gal_star_norm = pd.DataFrame(norm_rows)

    # Append plot data for first 5 images
    plot_data = pd.concat([plot_data, pd.DataFrame({'norm_' + str(num): gal_star_norm[:5].to_numpy().flatten()})], axis=1)

    # Save to .csv
    gal_star_norm.to_csv('norm_' + str(num) + '_image_data.csv', header=False, index=False)

### Plot for first 5 objects to visualize what each normalization method is doing

In [None]:
technique_names = ['Raw Data', 'Technique 1', 'Technique 2', 'Technique 3', 'Technique 4', 'Technique 5']
num_objects_to_plot = 5

# Truth labels stored in the first element of every row (0 = star, 1 = galaxy).
label_names = {0.0: 'Star', 1.0: 'Galaxy'}

fig, axes = plt.subplots(nrows=num_objects_to_plot, ncols=len(technique_names), sharex=True, sharey=True,
                         constrained_layout=True, figsize=(18, 15))

# One column per technique, one row per object.
for technique, title in enumerate(technique_names):
    data_technique = plot_data.iloc[:, technique]
    # Each object occupies 401 consecutive values: 1 label + 400 pixels.
    new_data_technique = np.reshape(data_technique.values, (num_objects_to_plot, 401))
    for idx, flat_row in enumerate(new_data_technique):
        ax = axes[idx][technique]

        # Drop the label, restore the 20x20 image and rescale it to [0, 1]
        # so every panel uses the same display range.
        new_data = np.reshape(flat_row[1:], (20, 20))
        new_data = (new_data - np.amin(new_data)) / (np.amax(new_data) - np.amin(new_data))
        im = ax.imshow(new_data, cmap='gray', vmin=0, vmax=1)

        # Column headers go above the top row only.
        if idx == 0:
            ax.annotate(title, xy=(0.5, 1.1),
                xycoords='axes fraction', size='xx-large', ha='center', va='baseline')

        # Object-type labels go left of the first column only.
        if technique == 0:
            name = label_names.get(flat_row[0], 'Unknown')
            ax.annotate(name, xy=(-.2, 0.6), xycoords='axes fraction',
                size='xx-large', ha='right', va='center')

# Every panel shares the [0, 1] range, so a single colorbar serves the whole
# grid; creating one inside the loop would add a new colorbar per panel.
fig.colorbar(im, orientation="horizontal", ax=axes)