In [None]:
!pip install pyspark==3.4.2

In [1]:
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
import findspark
# NOTE(review): pyspark is already imported in the cell above, and
# findspark.init() is normally called before the first pyspark import —
# confirm this cell is still needed (and move it up if it is).
findspark.init()

In [None]:
!pip install grpcio-status

In [2]:
# Create (or reuse) the Spark session for the "Images" application on the
# standalone master.
spark = (
    SparkSession.builder
    .appName("Images")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

In [None]:
# Show the session object as the cell output.
spark

In [None]:
# Spark configuration with the shuffle service and dynamic allocation
# both set to "false".
config = (
    SparkConf()
    .set("spark.shuffle.service.enabled", "false")
    .set("spark.dynamicAllocation.enabled", "false")
)

In [None]:
# SparkContext's first positional parameter is `master`, so the configuration
# has to be passed by keyword. A context already exists (created by the
# SparkSession above) and only one can be active at a time, so reuse it via
# getOrCreate() instead of constructing a second one.
# NOTE: the settings in `config` only take effect if a new context is created here.
sc = SparkContext.getOrCreate(conf=config)

In [None]:
# Show the Spark context as the cell output.
sc

In [None]:
import pandas as pd
import numpy as np
from PIL import Image
from io import BytesIO
import re
# import tensorflow as tf
import cv2

I will use Spark to load the images from HDFS as raw bytes (via `binaryFiles`), and then preprocess them for the neural network.

In [None]:
# HDFS folders that hold the images to load.
images_folders = [
    'hdfs://jose-virtualbox:9000/CA1/Ireland/',
    'hdfs://jose-virtualbox:9000/CA1/Honduran/',
]

In [None]:
# Read every file under the listed folders as (path, raw bytes) pairs;
# the folders are passed as a single comma-separated string.
images_rdd = spark.sparkContext.binaryFiles(
    ','.join(images_folders)
)

In [None]:
# Show the RDD object as the cell output (this does not print its contents).
images_rdd

In [None]:
def extract_data(data):
    """Split one ``(path, content)`` record into ``(name, label, content)``.

    The file name (last ``/``-separated component of the path) is expected to
    look like ``<label>_<name>.<ext>``.

    Parameters
    ----------
    data : tuple
        A ``(file_path, image_data)`` pair.

    Returns
    -------
    tuple
        ``(name, label, image_data)``; ``image_data`` is passed through untouched.

    Raises
    ------
    ValueError
        If the file name has no ``_`` separating the label from the name.
    """
    file_path, image_data = data

    file_name = file_path.split('/')[-1]
    # Strip only the final extension so dots inside the name are preserved.
    stem = file_name.rsplit('.', 1)[0]
    # Split on the first underscore only, so names containing underscores stay whole.
    label, separator, name = stem.partition('_')
    if not separator:
        raise ValueError(
            "File name %r does not match the '<label>_<name>.<ext>' pattern" % file_name
        )

    return name, label, image_data

In [None]:
# Turn every (path, bytes) record into a (Name, Label, Data) row.
imageDf = (
    images_rdd
    .map(extract_data)
    .toDF(["Name", "Label", "Data"])
)

In [None]:
# Print the first rows of the Spark DataFrame as a quick sanity check.
imageDf.show()

In [None]:
# Convert the Spark DataFrame into a pandas DataFrame for local processing.
pandasImagesDF = imageDf.toPandas()

In [None]:
def createNpArrayFromBytes(data):
    """Decode encoded image bytes into a NumPy array.

    ``data`` is wrapped in an in-memory stream, opened with PIL, and the
    resulting image is handed to ``np.asarray``; that array is returned.
    """
    return np.asarray(Image.open(BytesIO(data)))

In [None]:
# Replace the raw bytes in "Data" with the decoded NumPy array of each image.
pandasImagesDF["Data"] = pandasImagesDF["Data"].apply(createNpArrayFromBytes)

In [None]:
# Show the DataFrame as the cell output.
pandasImagesDF

Save the DataFrame to a CSV file so that the images do not have to be reprocessed each time I start the project.

In [None]:
# Write the DataFrame to disk, omitting the index column.
# NOTE(review): "Data" holds NumPy arrays at this point and CSV stores them as
# text; NumPy abbreviates large arrays with "..." when converting them to
# text, so the pixel data may not survive the round trip — confirm, and
# consider a binary format instead.
pandasImagesDF.to_csv("./Dataset/images.csv", index=False)

Read the CSV previously saved in the project, to save time.

In [None]:
# Reload the saved copy; this replaces the in-memory pandasImagesDF.
pandasImagesDF = pd.read_csv("./Dataset/images.csv")

In [None]:
# Order the rows by the "Name" column (the original index labels are kept).
pandasImagesDF = pandasImagesDF.sort_values(by="Name")

In [None]:
# Print column dtypes and non-null counts.
pandasImagesDF.info()

In [None]:
def convert_string_to_np(string_repr):
    """Rebuild a ``uint8`` array from the textual form of a ``bytes`` object.

    Parameters
    ----------
    string_repr : str
        Text such as ``"b'\\xff\\xd8AB\\n'"``, i.e. what ``str()``/``repr()``
        produces for a ``bytes`` value.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``uint8`` array with one entry per byte.

    Notes
    -----
    The text is parsed as a Python bytes literal, so every byte is recovered:
    printable characters and escapes like ``\\n`` or ``\\\\`` as well as the
    ``\\xNN`` ones. If the text is not a valid bytes literal, the function
    falls back to collecting only the ``\\xNN`` escape sequences found in it.
    """
    import ast  # local import: only this function needs it

    try:
        parsed = ast.literal_eval(string_repr)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, bytes):
        byte_data = parsed
    else:
        # Fallback: extract hexadecimal escapes and convert them to bytes.
        hex_values = re.findall(r'\\x([0-9a-fA-F]{2})', string_repr)
        byte_data = bytes.fromhex(''.join(hex_values))

    return np.frombuffer(byte_data, dtype=np.uint8)

In [None]:
# Convert the text stored in "Data" back into uint8 arrays, row by row.
decoded_column = pandasImagesDF["Data"].apply(convert_string_to_np)
pandasImagesDF["Data"] = decoded_column

In [None]:
# A Series has no .split() method and the original statement ended in a stray
# ".", so the cell could not run. Split each string entry on newlines and
# leave entries that are not strings (e.g. already-converted arrays) untouched.
pandasImagesDF["Data"] = pandasImagesDF["Data"].apply(
    lambda value: value.split("\n") if isinstance(value, str) else value
)

In [None]:
# Show the DataFrame as the cell output.
pandasImagesDF

In [None]:
def _to_float(value):
    """Convert one "Data" entry to floating point.

    Scalars (including numeric strings with "," separators) become a Python
    float, exactly as before. Array-like entries, for which ``float(str(...))``
    always raised ``ValueError``, become ``float32`` arrays instead.
    """
    if np.ndim(value) == 0:
        return float(str(value).replace(",", ""))
    return np.asarray(value, dtype=np.float32)


pandasImagesDF["Data"] = pandasImagesDF["Data"].apply(_to_float)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import io

In [None]:
# Preview one sample converted to float32 (the result is displayed, not stored).
# NOTE(review): [9] on a Series looks up the index label 9, and the frame was
# sorted earlier without resetting the index — confirm this is the intended row.
np.array(pandasImagesDF["Data"][9]).astype("float32")

In [None]:
# Determine the dimensions of the image
width =  225
height =  255

# Convert the byte array to a numpy array of uint8 data type
image_data = np.array(pandasImagesDF["Data"][0], dtype=np.float32)

# Reshape the array to match the dimensions of the image
image_data = image_data.reshape((height, width))

# Create an image from the byte array
image = Image.fromarray(image_data)

# Display the image
image.show()

In [None]:
# NOTE(review): no notebook-level variable named `array` is defined in the
# visible cells (it only exists as a local inside createNpArrayFromBytes), so
# this would raise NameError on a fresh kernel — confirm what was meant here.
array

In [None]:
# Side length, in pixels, that images are resized to in the cells below.
IMG_SIZE = 200

In [None]:
# Alias for the "Data" column (a pandas Series with one entry per image).
image_array = pandasImagesDF["Data"]

In [None]:
# Show the entry with index label 0.
image_array[0]

In [None]:
from skimage.transform import resize

In [None]:
# Resize every image to IMG_SIZE x IMG_SIZE with skimage, one entry at a time.
new_array = image_array.map(
    lambda img: resize(img, (IMG_SIZE, IMG_SIZE))
)

In [None]:
# Show the resized images as the cell output.
new_array

In [None]:
from matplotlib import pyplot as plt
# Draw the entry labelled 0 without smoothing between pixels.
first_image = image_array[0]
plt.imshow(first_image, interpolation='nearest')
plt.show()

In [None]:
# cv2.resize works on a single image array, not on a pandas Series, so apply
# it to each entry of the column separately.
new_array = image_array.map(
    lambda img: cv2.resize(img, (IMG_SIZE, IMG_SIZE))
)

Import the Keras functionality needed to implement the CNN.

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten
from keras import utils
from sklearn.metrics import accuracy_score

For now there are only 20 photos per category, so the training set will have 28 images and the test set 12. This is only for testing; afterwards I'll collect more images to improve the CNN.

In [None]:
# Hold out the last 12 rows for testing; everything before them is training data.
train_rows = pandasImagesDF.iloc[:-12]
test_rows = pandasImagesDF.iloc[-12:]

# Features come from the column at position 2 and targets from position 1
# (presumably "Data" and "Label", given the Name/Label/Data column order).
X_train, y_train = train_rows.iloc[:, 2].values, train_rows.iloc[:, 1].values
X_test, y_test = test_rows.iloc[:, 2].values, test_rows.iloc[:, 1].values


In [None]:
# Show the training features as the cell output.
X_train

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the labels. The encoder is fitted on the training labels only.
encoder = OneHotEncoder(sparse_output=False)
y_train = np.array(y_train).reshape(-1, 1)
y_train = encoder.fit_transform(y_train)

# Reuse the categories learned from the training labels: re-fitting on the
# test labels could assign columns differently (or produce a different number
# of columns) than the training encoding.
y_test = np.array(y_test).reshape(-1, 1)
y_test = encoder.transform(y_test)

In [None]:
# Turn both feature arrays into single-column 2-D arrays (one row per sample).
X_train, X_test = X_train.reshape(-1, 1), X_test.reshape(-1, 1)

In [None]:
# Show the reshaped training features as the cell output.
X_train

In [None]:
# Length of the data stored for the sample in row 7.
len(X_train[7][0])

In [None]:
def _as_feature_tensor(samples):
    """Stack per-sample arrays into one float32 array of shape (n, features, 1).

    An object array whose entries are themselves arrays cannot be reshaped
    into a numeric tensor directly, so each entry is flattened and the rows
    are stacked first. Already-numeric input is only reshaped, which keeps
    the cell safe to re-run.
    """
    samples = np.asarray(samples)
    if samples.dtype == object:
        rows = [np.asarray(img, dtype=np.float32).ravel() for img in samples.ravel()]
        samples = np.asarray(rows, dtype=np.float32)
    return samples.astype(np.float32).reshape(len(samples), -1, 1)


# The feature length is taken from the data instead of the hard-coded 1080.
X_train = _as_feature_tensor(X_train)
X_test = _as_feature_tensor(X_test)

# Labels stay 2-D (samples, classes): the hard-coded 1080 never matched the
# one-hot width, and a softmax output layer expects one row per sample.
y_train = np.reshape(y_train, (y_train.shape[0], -1))
y_test = np.reshape(y_test, (y_test.shape[0], -1))

In [None]:
# Check the shape of the training features after reshaping.
X_train.shape

Because pixel values range from 0 to 255, we have to normalize them.

In [None]:
# Scale both feature arrays by 1/255, writing the result back into the same
# array. Running this cell again divides the values a second time.
for _pixels in (X_train, X_test):
    np.true_divide(_pixels, 255, out=_pixels)

In [None]:
# The softmax layer needs one unit per one-hot label column; the previous
# hard-coded 10 did not match the categories learned by the encoder.
num_classes = len(encoder.categories_[0])

# Small CNN: one convolution block followed by a dense classifier head.
model = Sequential()
# NOTE(review): input_shape expects 1080x1080 single-channel images — confirm
# that the arrays passed to fit() actually have that shape.
model.add(Conv2D(25,kernel_size=(3,3),strides=(1,1),padding="valid", activation="relu", input_shape=(1080,1080,1)))
# NOTE(review): a 1x1 pooling window leaves the feature map unchanged —
# confirm whether a larger window (e.g. 2x2) was intended.
model.add(MaxPool2D(pool_size=(1,1)))
model.add(Flatten())

model.add(Dense(100, activation="relu"))
model.add(Dense(num_classes, activation="softmax"))

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Check the shape of the training labels.
y_train.shape

In [None]:
# The original statement was missing the function name after "np." and could
# not be parsed. np.asarray is assumed, matching the conversion in the next cell.
z = np.asarray(X_train)

In [None]:
# Make sure all four data splits are NumPy arrays before training.
x_train, Y_train, x_test, Y_test = map(
    np.asarray, (X_train, y_train, X_test, y_test)
)

In [None]:
# Train for 10 epochs in batches of 14, evaluating on the held-out split
# after each epoch.
model.fit(
    x_train,
    Y_train,
    epochs=10,
    batch_size=14,
    validation_data=(x_test, Y_test),
)