In [18]:
import sys
sys.path.append('..')
print(sys.path)
from DeepSolarEye.dl_logic.model import regression_ResNet

import tensorflow as tf
import pandas as pd
import numpy as np
import os
import cv2
from datetime import datetime
import random
import shutil

from tensorflow.keras.callbacks import EarlyStopping

['/Users/peterbitman/Desktop/Deep_Solar_Eye/notebooks', '/Users/peterbitman/code/pemabi/04-Decision-Science/01-Project-Setup/data-context-and-setup', '/Users/peterbitman/Desktop/Deep_Solar_Eye/notebooks', '/Users/peterbitman/.pyenv/versions/3.10.6/lib/python310.zip', '/Users/peterbitman/.pyenv/versions/3.10.6/lib/python3.10', '/Users/peterbitman/.pyenv/versions/3.10.6/lib/python3.10/lib-dynload', '', '/Users/peterbitman/.pyenv/versions/3.10.6/envs/Deep_Solar_Eye/lib/python3.10/site-packages', '..', '..', '..']


In [30]:
def train_test_val_split(train_ratio=0.64, test_ratio=0.2, val_ratio=0.16):
    """
    Function to randomly split data into three dirs 'train', 'test', 'val'.
    If any 'train, test, val' folders already exist, they are overwritten.
    Default splits are set to 64% train, 20% test, and 16% val of total files.
    """
    input_folder="../raw_data/PanelImages"
    output_folder = "../raw_data/"

    # Check if the input folder exists
    if not os.path.exists(input_folder):
        raise FileNotFoundError(f"Input folder '{input_folder}' not found.")

    # Create output folders if they don't exist
    for folder in ['train_data', 'test_data', 'val_data']:
        os.makedirs(os.path.join(output_folder, folder), exist_ok=True)

    # List files in the input folder and filter out files that do not end with .jpg
    image_files = [file for file in os.listdir(input_folder) if file.lower().endswith('.jpg')]

    # Shuffle the list of image files
    random.shuffle(image_files)

    # Calculate the number of files for each split
    num_files = len(image_files)
    num_train = int(train_ratio * num_files)
    num_test = int(test_ratio * num_files)
    num_val = int(val_ratio * num_files)

    # Split the files into train, test, and val sets by slicing list of filenames
    train_files = image_files[:num_train]
    test_files = image_files[num_train:num_train+num_test]
    val_files = image_files[num_train+num_test:]

    # Copy files to their respective folders
    for files, folder in zip([train_files, test_files, val_files], ['train_data', 'test_data', 'val_data']):
        for file in files:
            shutil.copy(os.path.join(input_folder, file), os.path.join(output_folder, folder, file))

    print(f'train size: {num_train}\ntest_size: {num_test}\nval_size: {num_val}')


In [7]:
def preprocess_data(size=('full', 'noon', '15_mins')) -> (pd.DataFrame, np.ndarray):
    """
    Preprocesses images from file. Returns metadata in a dataframe, and a np array of image data.
    Use 'size' kwarg to decide what split of the dataset will be processed and returned.
    'full' = c. 45k images
    'noon' = c. 3.7k images
    '15_mins' = c. 1k images

    Returns: Metadata Dataframe, Tensor np.ndarray
    """

    folder_path = "../raw_data/Deep_Solar_Eye_noon"
    image_data = [] # initialise an empty array to stack the images
    metadata = []
    # Regular expression pattern to extract date and intensity values from the filename
    # Regular expression pattern to extract date and intensity values from the filename
    minute_range = np.arange(0, 15, 1)
    # Convert the numpy array to a list of strings
    minute_range_strings = [str(num) for num in minute_range]
    read_count = 0

    # capped at 1000 for now
    for filename in os.listdir(folder_path):
        if not filename.endswith(".jpg"):
            continue
        split_name = filename.split('_')
        hour = split_name[4]
        minute = split_name[6]
        # put in the break
        if size in ['noon', '15_mins'] and hour != '12':
            continue
        if size == '15_mins' and minute not in minute_range_strings:
            continue
        read_count += 1
        weekday = split_name[1]
        month = split_name[2]
        day = split_name[3]
        second = split_name[8]
        year = split_name[9]
        datetime_obj = datetime.strptime(f"{month} {day} {year} {hour}:{minute}:{second}", "%b %d %Y %H:%M:%S")
        age_loss = split_name[11]
        irradiance_level = split_name[13][:-4]

        # append metadata to list
        filename_info = [month, weekday, day, hour, minute, second, year, datetime_obj, age_loss, irradiance_level]

        metadata.append(filename_info)
    df = pd.DataFrame(metadata, columns=['Month', 'Day', 'Date', 'Hour', 'Minute', 'Second', 'Year',
                                        'Datetime', 'Percentage Loss', 'Irradiance Level'])

    df = df.astype({'Month': str, 'Day': str, 'Date': int, 'Hour': int, 'Minute': int, 'Second': int, 'Year': int,

                                       'Datetime': 'datetime64[ns]', 'Percentage Loss': float, 'Irradiance Level': float})
    return df

In [8]:
df = preprocess_data('noon')
x_df = df[['Hour', 'Irradiance Level']]
y_df = df[['Percentage Loss']]

In [9]:
path_imgs = ('../raw_data/Deep_Solar_Eye_noon/*.jpg')
images = tf.data.Dataset.list_files(path_imgs, shuffle=False)

def load_and_process_image(path_imgs):
    img = tf.io.read_file(path_imgs)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [224, 224])
    img = img / 255
    return img


#def load_images(path_imgs):
#  return tf.image.decode_image(tf.io.read_file(path_imgs))

#images_ds = images.map(load_images).batch(16)
images_ds = images.map(load_and_process_image).batch(16)

In [10]:
df_ds = tf.data.Dataset.from_tensor_slices(df[['Hour', 'Irradiance Level']])
y_ds= tf.data.Dataset.from_tensor_slices(df[['Percentage Loss']])

x_ds = tf.data.Dataset.zip((images_ds, df_ds))
all_ds = tf.data.Dataset.zip((x_ds, y_ds))

df_ds = tf.data.Dataset.from_tensor_slices(df[['Hour', 'Irradiance Level']].values.astype(np.float32))
df_ds = df_ds.batch(16)
y_ds = tf.data.Dataset.from_tensor_slices(df[['Percentage Loss']].values.astype(np.float32))
y_ds = y_ds.batch(16)

x_ds = tf.data.Dataset.zip((images_ds, df_ds))
all_ds = tf.data.Dataset.zip((x_ds, y_ds))

In [13]:
DATASET_SIZE = sum(_ for _ in all_ds)
DATASET_SIZE

TypeError: unsupported operand type(s) for +: 'int' and 'tuple'

In [None]:
train_size = int(0.7 * DATASET_SIZE)
val_size = int(0.15 * DATASET_SIZE)
test_size = int(0.15 * DATASET_SIZE)

full_dataset = tf.data.TFRecordDataset(FLAGS.input_file)
full_dataset = full_dataset.shuffle()
train_dataset = full_dataset.take(train_size)
test_dataset = full_dataset.skip(train_size)
val_dataset = test_dataset.skip(test_size)
test_dataset = test_dataset.take(test_size)

In [11]:
model = regression_ResNet(model_name='ResNet50', input_shape=(224, 224, 3),input_time_irradiance=(2,), num_units=512, pretrained=True)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])


In [12]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    restore_best_weights=True
)

history = model.fit(all_ds,
            epochs=2,
            batch_size=16,
            callbacks=early_stopping
           )

Epoch 1/2
Epoch 2/2

KeyboardInterrupt: 