# Preprocessing Images

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import cv2
import random
import time
from datetime import datetime
import seaborn as sns
from sklearn.utils import check_random_state


from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

## Loading and labelling the images

In [None]:
import os

# Root folder of the dataset: one sub-folder per cloud class, named after its label.
data_dir = "resources/cloud-images/CCSN_v2"

# Identifying folders/labels.
# Only directories are kept, which drops stray files such as .DS_Store (metadata
# that macOS auto-generates for a folder). The names are sorted because
# os.listdir returns entries in arbitrary order, and the position of a label in
# this list is later used as its integer class id.
cloud_labels = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(data_dir + "/" + entry)
)

# Maps image file name -> [{'label': <class folder name>}].
labeled_data = {}

for label in cloud_labels:
    # Sorted so that the insertion order (and therefore the sample order used
    # downstream) does not depend on the file system.
    for image_name in sorted(os.listdir(data_dir + "/" + label)):
        # Hidden files (e.g. .DS_Store inside a class folder) are not images;
        # cv2.imread would return None for them and break the preprocessing.
        if image_name.startswith("."):
            continue
        labeled_data[image_name] = [{'label': label}]

# Show one sample image as a sanity check.
sample_path = data_dir + "/Ac/Ac-N001.jpg"
sample_image = cv2.imread(sample_path)
if sample_image is None:
    # cv2.imread reports a missing/unreadable file by returning None, not by raising.
    raise FileNotFoundError("Could not read sample image: " + sample_path)
# OpenCV loads images as BGR while matplotlib expects RGB.
image = cv2.cvtColor(sample_image, cv2.COLOR_BGR2RGB)
plt.imshow(image)
plt.show()

## Preprocessing the Images

In [None]:
# Images are being converted to grayscale, resized to 32x32, and flattened into
# 1-D pixel vectors. The vectors go into train_images and the matching integer
# class ids (position of the label in cloud_labels) go into train_labels.
# TODO:
#   - [] find the image colour channel with the most contrast when turned to grayscale and employ that
#   - [] use a built in function to add more contrast to the images


train_images = []
train_labels = []

# Label name -> integer class id, built once instead of scanning cloud_labels
# with .index() for every image.
label_to_index = {name: idx for idx, name in enumerate(cloud_labels)}

# Timestamps around the loop are used later to report the processing duration.
start_datetime = datetime.now()

for (i, image_file) in enumerate(labeled_data):
    tmp_label = labeled_data[image_file][0]['label']
    # read image
    path = 'resources/cloud-images/CCSN_v2/' + tmp_label + '/' + image_file
    print(path)
    image = cv2.imread(path)
    if image is None:
        # cv2.imread returns None (it does not raise) when the file is missing
        # or is not a decodable image, e.g. a stray .DS_Store file.
        print('Skipping unreadable image:', path)
        continue
    # Preview the original. OpenCV loads BGR, matplotlib expects RGB, so convert
    # for display only; otherwise red and blue appear swapped.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.show()
    # make images gray
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # label image using the annotations
    label = label_to_index[tmp_label]
    # resize image
    image = cv2.resize(image, (32, 32))
    # flatten the image
    pixels = image.flatten()
    # Append the flattened image and its class id to the training lists
    train_images.append(pixels)
    train_labels.append(label)
    print('Loaded...', '\U0001F483', 'Image', str(i+1), 'is a', tmp_label)
    # Preview the processed image. A 2-D array is drawn with a false-colour map
    # unless a gray colormap is requested; fixed limits avoid per-image rescaling.
    plt.imshow(image, cmap='gray', vmin=0, vmax=255)
    plt.show()

end_datetime = datetime.now()

In [None]:
# Turn the collected pixel vectors and class ids into NumPy arrays.
train_images, train_labels = np.array(train_images), np.array(train_labels)

# X holds the features and y the labels; both refer to the arrays above.
X = train_images
y = train_labels

In [None]:
# Shuffle samples and labels together with a permutation drawn from a random
# state seeded with 0, so the resulting order is the same on every run.
random_state = check_random_state(0)
permutation = random_state.permutation(len(X))
X, y = X[permutation], y[permutation]

# Keep one row per sample and collapse everything else into a single feature axis.
X = X.reshape((len(X), -1))

# Number of samples.
print(len(X))

In [None]:
# Report how long the image loading/preprocessing loop took.
print('Image Processing Duration: ' + str(end_datetime-start_datetime))

# Splitting the data: 30% of the samples are held out for testing.
test_size = 0.3

# A fixed seed keeps the train/test split, and any score computed from it,
# reproducible between runs; with random_state=None every run evaluated on a
# different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=0)

In [None]:
# LogisticRegression is not among the imports at the top of the notebook, so it
# is imported here; without this the cell fails with a NameError.
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression fitted with the saga solver.
# NOTE(review): the features are raw, unscaled grayscale pixel values; the
# scikit-learn docs state that saga's fast convergence is only guaranteed on
# features with approximately the same scale -- consider scaling X first.
clf = LogisticRegression(C=0.01, penalty="l1", solver="saga", tol=0.1)

print(clf)

clf.fit(X_train, y_train)
# Percentage of model coefficients that are exactly zero.
sparsity = np.mean(clf.coef_ == 0) * 100
# Score of the fitted model on the held-out test split.
score = clf.score(X_test, y_test)
print("Sparsity with L1 penalty: %.2f%%" % sparsity)
print("Test score with L1 penalty: %.4f" % score)