# Data treatment and Neural Network Training using IPMA data
João Oliveira and Edgar Mendes

In [1]:
#-----------------------Imports

#Data request libraries
import requests
import json

#Mathematics libraries
import matplotlib.pyplot as plt
import pydot
import ipyplot 
import numpy as np

#Time variables libraries
from datetime import datetime, timedelta
import time
import datetime
import pytz
from datetime import date
from datetime import datetime

#File management libraries
import io
from io import BytesIO
import os
import shutil
from PIL import Image, ImageDraw

#Progress bar libraries
from ipywidgets import IntProgress
from IPython.display import display

#Image augmentation libraries
import imageio
import imgaug as ia
import imgaug.augmenters as iaa

#Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn import svm
import tensorflow as tf
import tensorflow.keras as keras
from keras import layers
from keras import callbacks
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Flatten, Dropout, MaxPooling2D
from keras.utils.vis_utils import plot_model

#-----------------------Constants

# Crop box of each district on the radar image, in the 4-tuple form that is
# later handed to Image.crop(); every box spans 200x200 pixels.
boxVianaDoCastelo = (570, 428, 770, 628)
boxLeiria = (574, 902, 774, 1102)
boxAveiro = (603, 687, 803, 887)
# NOTE(review): the Beja and Faro boxes differ by a single pixel — looks like
# a copy/paste slip, confirm the intended coordinates.
boxBeja = (735, 1546, 935, 1746)
boxBraga = (645, 463, 845, 663)
boxBraganca = (953, 401, 1153, 601)
boxCasteloBranco = (817, 886, 1017, 1086)
boxPortalegre = (829, 1012, 1029, 1212)
boxPorto = (611, 562, 811, 762)
boxSantarem = (597, 1026, 797, 1226)
boxCoimbra = (645, 792, 845, 992)
boxEvora = (740, 1183, 940, 1383)
boxFaro = (736, 1546, 936, 1746)
boxGuarda = (859, 712, 1059, 912)
boxLisboa = (513, 1149, 713, 1349)
boxSetubal = (559, 1193, 759, 1393)
boxVilaReal = (770, 527, 970, 727)
boxViseu = (740, 682, 940, 882)

# Station data: identifier of the weather station used for each district.
idVianaDoCastelo = 1240610
idLeiria = 1210718
idAveiro = 1210702
idBeja = 1200562
idBraga = 6212124
idBraganca = 1200575
idCasteloBranco = 1200570
idPortalegre = 1200571
idPorto = 1240903
idSantarem = 1210734
idCoimbra = 1210707
idEvora = 1200558
idFaro = 1200554
idGuarda = 1210683
idLisboa = 7240919
idSetubal = 1210770
idVilaReal = 1240566
idViseu = 1240675

# Station ids and crop boxes listed in the same district order.
_station_ids = (
    idVianaDoCastelo, idLeiria, idAveiro, idBeja, idBraga, idBraganca,
    idCasteloBranco, idPortalegre, idPorto, idSantarem, idCoimbra, idEvora,
    idFaro, idGuarda, idLisboa, idSetubal, idVilaReal, idViseu,
)
_station_boxes = (
    boxVianaDoCastelo, boxLeiria, boxAveiro, boxBeja, boxBraga, boxBraganca,
    boxCasteloBranco, boxPortalegre, boxPorto, boxSantarem, boxCoimbra, boxEvora,
    boxFaro, boxGuarda, boxLisboa, boxSetubal, boxVilaReal, boxViseu,
)

# All monitored station ids as an array, and the station id -> crop box lookup.
ids = np.array(_station_ids)
station_box_dict = dict(zip(_station_ids, _station_boxes))

# Augmentation operators (probability 1.0, i.e. applied on every call)
hflip = iaa.Fliplr(p=1.0)
vflip = iaa.Flipud(p=1.0)

# Neural-network constants
N_CHANNELS = 4
N_CLASSES = 100
IMAGE_SIZE = 200

#-----------------------Functions

#Data request functions

def get_data(url):
    """Issue an HTTP GET to ``url`` and return the decoded JSON body.

    When the server does not answer with status 200, a message carrying the
    status code is printed and ``None`` is returned, so callers have to be
    prepared for a ``None`` result.
    """
    reply = requests.get(f"{url}")
    if reply.status_code != 200:
        print(f"Hello there, there's a {reply.status_code} error with your request.")
        return None
    # Reference: https://www.educative.io/answers/how-to-make-api-calls-in-python
    return reply.json()
        
def normalize_precipitation_value(precipitation_value):
    """Express a precipitation amount as an integer percentage of 240.

    240 is the reference maximum taken from
    https://www.ipma.pt/pt/oclima/extremos.clima/. The result is rounded to a
    whole number because the dataset labels are integer values.
    """
    reference_maximum = 240
    percentage = (precipitation_value / reference_maximum) * 100
    return int(round(percentage, 0))

def list_dataset_folders():
    """Return the names of the dataset folders found in the working directory.

    A dataset folder is any directory directly inside ``os.getcwd()`` whose
    name ends with "dataset". Names are returned in ``os.listdir`` order.
    """
    base_dir = os.getcwd()
    return [
        entry
        for entry in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, entry)) and entry.endswith("dataset")
    ]

def remove_black_pixels(image):
    """Return an RGBA copy of ``image`` in which black pixels are transparent.

    Every pixel whose red, green and blue components are all 0 becomes
    (0, 0, 0, 0); all other pixels keep their colour and alpha. The input
    image is not modified.

    The mask is computed with NumPy instead of a Python loop over
    ``image.getdata()``, which is far cheaper on a full radar image.
    """
    # np.array() copies the pixel data, so the buffer is safe to edit in place.
    rgba = np.array(image.convert("RGBA"))
    # True where R, G and B are all zero, whatever the alpha value is.
    is_black = (rgba[..., :3] == 0).all(axis=-1)
    rgba[is_black] = 0
    # A (height, width, 4) uint8 array is interpreted as an RGBA image.
    return Image.fromarray(rgba)

#Manage dictionaries and arrays

def create_zero_dict(input_dict):
    """Return a new dict with the same keys as ``input_dict``, all mapped to 0."""
    return dict.fromkeys(input_dict, 0)

def get_dict_keys(input_dict):
    """Return the keys of ``input_dict`` as a list, in the dict's own order."""
    return [*input_dict.keys()]

def dict_to_array1D(input_dict):
    """Flatten a two-level dict into a 1-D NumPy array of its innermost values.

    ``input_dict`` maps keys to dicts; the values of those inner dicts are
    returned in iteration order (outer dict first, then inner dict). An empty
    input yields an empty float array.
    """
    # Gather the values first and append once: np.append copies the whole
    # array on every call, so calling it per value is quadratic.
    values = [
        value
        for inner_dict in input_dict.values()
        for value in inner_dict.values()
    ]
    return np.append(np.empty(0), values)

def array2D_to_array1D(array_2D):
    """Return ``array_2D`` flattened to one dimension (see ``numpy.ravel``)."""
    return np.ravel(array_2D)

def array1D_to_array2D(array_1D):
    """Return ``array_1D`` reshaped into a single column, i.e. shape (n, 1)."""
    return np.reshape(array_1D, (-1, 1))

#Dataset labels array management

def count_how_many_occurences_of_each_value(arr):
    """Return a dict mapping each distinct item of ``arr`` to its number of occurrences.

    Keys appear in order of first occurrence.
    """
    tally = {}
    for item in arr:
        tally[item] = tally.get(item, 0) + 1
    return tally

def get_objective_hour(now=None):
    """Return the hour two hours before ``now`` as a zero-padded "HH:00" string.

    The result has the same "%H:%M" layout as the hour keys stored in the
    precipitation JSON files, e.g. "07:00" when called at 09:30.

    Args:
        now: datetime used as reference; defaults to the current local time.

    Returns:
        The target hour as "HH:00". Going back two hours wraps around
        midnight (01:00 -> "23:00").
    """
    # Imported locally: at module level the name ``datetime`` is rebound from
    # the module to the class, so ``datetime.datetime.now()`` would fail.
    from datetime import datetime as _datetime, timedelta as _timedelta

    if now is None:
        now = _datetime.now()
    # timedelta arithmetic handles the wrap around midnight.
    target = now - _timedelta(hours=2)
    return f"{target.hour:02d}:00"

def extract_hour_data(json_obj,objective_hour):
    """Collect the value stored under ``objective_hour`` for every day that has it.

    ``json_obj`` maps a day to a mapping of hour -> value. Days that lack
    ``objective_hour`` are skipped; values come back in ``json_obj`` order.
    """
    return [
        json_obj[day][objective_hour]
        for day in json_obj
        if objective_hour in json_obj[day]
    ]

In [2]:
DADOS_ULTIMAS_3_HORAS = "https://api.ipma.pt/open-data/observation/meteorology/stations/obs-surface.geojson" # e.g. at 9PM it holds the hourly data from 5PM to 7PM for every station
local_tz = pytz.timezone('Europe/Lisbon') # Local time zone

# Augmentation recipe of each dataset folder:
# (horizontal flip, vertical flip, rotation angle in degrees or None).
_DATASET_VARIANTS = {
    '135_rotated_dataset': (False, False, 135),
    '135_rotated_horizontal_and_vertical_flip_dataset': (True, True, 135),
    '135_rotated_horizontal_flip_dataset': (True, False, 135),
    '135_rotated_vertical_flip_dataset': (False, True, 135),
    '45_rotated_dataset': (False, False, 45),
    '45_rotated_horizontal_and_vertical_flip_dataset': (True, True, 45),
    '45_rotated_horizontal_flip_dataset': (True, False, 45),
    '45_rotated_vertical_flip_dataset': (False, True, 45),
    'dataset': (False, False, None),
    'horizontal_and_vertical_flip_dataset': (True, True, None),
    'horizontal_flip_dataset': (True, False, None),
    'vertical_flip_dataset': (False, True, None),
}


def _augment_region(region, flip_horizontal, flip_vertical, angle):
    """Return ``region`` after the requested flips followed by the rotation."""
    if flip_horizontal or flip_vertical:
        pixels = np.array(region)
        if flip_horizontal:
            pixels = hflip.augment_image(pixels)
        if flip_vertical:
            pixels = vflip.augment_image(pixels)
        region = Image.fromarray(pixels)
    if angle is not None:
        region = region.rotate(angle)
    return region


def _record_precipitation(filename, date_str, hour_str, value):
    """Store one observation in a station JSON file, creating the file if needed."""
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    if os.path.isfile(filename):
        with open(filename, 'r') as file:
            precipitation_data = json.load(file)
    else:
        precipitation_data = {}
    precipitation_data.setdefault(date_str, {})[hour_str] = value
    with open(filename, 'w') as file:
        json.dump(precipitation_data, file, indent=4)


def get_images_and_data_from_ipma():
    """Download the latest IPMA observations and radar crops into every dataset folder.

    For each observation of a monitored station (``ids``) the function:

    * converts the observation time from UTC to ``local_tz``;
    * normalises the accumulated precipitation (negative readings are clamped
      to 0 first);
    * crops the radar image of that timestamp to the station box, applies the
      augmentation of each dataset folder and saves it as
      ``<dataset>/images/<station>/<date>/<date>T<HHMM>.png``;
    * stores the normalised value in ``<dataset>/precipitation/<station>.json``
      under ``[date][hour]``.

    Only the observation being processed is written to the station file, so
    values from one station can no longer end up in another station's file,
    and every stored value is normalised. Observations whose radar image cannot
    be downloaded are skipped entirely, so a label is never stored without its
    image.

    Relies on the global progress bar ``f``, advanced once per feature.
    Returns ``None``; prints a confirmation message when finished.
    """
    data = get_data(DADOS_ULTIMAS_3_HORAS)
    if data is None:
        # get_data() has already reported the HTTP error.
        return

    specific_datasets = list_dataset_folders()
    # One download and clean-up per radar image: every station observed at the
    # same timestamp shares the same picture.
    radar_cache = {}

    for feature in data['features']:
        f.value += 1
        station_data = feature['properties']
        id_estacao = station_data['idEstacao']
        if id_estacao not in ids:
            continue

        # Convert the station time (UTC) to the local time zone.
        date_time_utc = datetime.fromisoformat(station_data['time'])
        date_time = date_time_utc.replace(tzinfo=pytz.utc).astimezone(local_tz)
        date_str, hour_str = date_time.strftime('%Y-%m-%d %H:%M').split()
        precipitation = normalize_precipitation_value(max(0, station_data['precAcumulada']))

        url_image = f"https://www.ipma.pt/resources.www/transf/radar/por/pcr-{date_time_utc.strftime('%Y-%m-%d')}T{date_time_utc.strftime('%H%M')}.png"
        if url_image not in radar_cache:
            response = requests.get(url_image)
            if response.status_code == 200:
                # Black pixels are made transparent before cropping.
                radar_cache[url_image] = remove_black_pixels(Image.open(io.BytesIO(response.content)))
            else:
                print(f"Radar image unavailable ({response.status_code}): {url_image}")
                radar_cache[url_image] = None
        image = radar_cache[url_image]
        if image is None:
            continue

        # Crop the radar image to the station's district.
        region = image.crop(station_box_dict[id_estacao])
        file_stamp = date_time.strftime('%Y-%m-%dT%H%M')

        for specific_dataset in specific_datasets:
            # Create the station and day folders when they do not exist yet.
            day_dir = f"{specific_dataset}/images/{id_estacao}/{date_str}"
            os.makedirs(day_dir, exist_ok=True)

            # Folders without a known recipe get no image but still get the label.
            variant = _DATASET_VARIANTS.get(specific_dataset)
            if variant is not None:
                _augment_region(region, *variant).save(f"{day_dir}/{file_stamp}.png")

            _record_precipitation(
                f"{specific_dataset}/precipitation/{id_estacao}.json",
                date_str,
                hour_str,
                precipitation,
            )
    print("Dados atualizados com sucesso!")

In [14]:
# get_images_and_data_from_ipma() advances the global progress bar ``f`` once
# per feature, so the bar has to exist under that exact name before the call.
# NOTE(review): 495 is presumably the number of features the endpoint returned
# when this was written — confirm, or derive it from the response.
progress_bar_length = 495
f = IntProgress(min=0, max=progress_bar_length) # instantiate the bar
display(f) # display the bar
get_images_and_data_from_ipma()

IntProgress(value=0, max=495)

Dados atualizados com sucesso!


# Data treatment

In [3]:
# Build the label array (``data_array``, shape (n, 1)) and the image tensor
# (``images_array``, shape (n, IMAGE_SIZE, IMAGE_SIZE, 4)) for one hour of the
# day, across every dataset folder and station.
#
# Each label is paired with its image by name
# (<dataset>/images/<station>/<date>/<date>T<HH>00.png) and a sample is kept
# only when both exist, so row i of both arrays always describes the same
# observation. Collecting labels and images in two separate passes depends on
# os.listdir order and on no file being missing.
specific_datasets = list_dataset_folders()

#objective_hour = get_objective_hour()
objective_hour = '12:00'
images_end_with ='T'+ objective_hour[:2]+'00.png'

# One progress step per (dataset, station) pair.
progress_bar_length = len(specific_datasets) * len(ids)
progress_bar = IntProgress(min=0, max=progress_bar_length) # instantiate the bar
display(progress_bar) # display the bar

# Samples are gathered in plain lists and converted once at the end;
# np.append would copy the whole image tensor for every new image.
labels = []
images = []
for dataset in specific_datasets:
    for station_id in ids:
        # The file handle gets its own name so the global progress bar ``f``
        # used by get_images_and_data_from_ipma() is not overwritten.
        with open(f"{dataset}/precipitation/{station_id}.json") as json_file:
            station_data = json.load(json_file)
        station_images_dir = os.path.join(dataset, 'images', str(station_id))
        for day, hours in station_data.items():
            if objective_hour not in hours:
                continue
            image_path = os.path.join(station_images_dir, day, day + images_end_with)
            if not os.path.isfile(image_path):
                # A label without its image would shift every later pair.
                continue
            with Image.open(image_path) as image:
                images.append(np.array(image))
            labels.append(hours[objective_hour])
        progress_bar.value += 1

data_array = array1D_to_array2D(np.array(labels, dtype=float))
if images:
    images_array = np.array(images, dtype=float)
else:
    images_array = np.empty((0, IMAGE_SIZE, IMAGE_SIZE, 4))
print(f"Number of images: {len(images_array)}")

IntProgress(value=0, max=1512)

Number of images: 1512


In [4]:
# Balance the dataset: keep the same number of samples for every distinct
# label, namely as many as the rarest label has. The first occurrences found
# in ``data_array`` are the ones kept.
label_values = array2D_to_array1D(data_array)
how_many_of_each_value = count_how_many_occurences_of_each_value(label_values)
# default=0 keeps an empty dataset from raising in min().
smallest_value = min(how_many_of_each_value.values(), default=0)
dict_counter = create_zero_dict(how_many_of_each_value)

# Total number of samples in the balanced dataset.
how_many_different_values = len(how_many_of_each_value) * smallest_value

f = IntProgress(min=0, max=how_many_different_values) # instantiate the bar
display(f) # display the bar

# Record the indexes to keep and slice once at the end; np.append would copy
# the whole image tensor for every selected sample.
selected_indexes = []
for index, label in enumerate(label_values):
    # Stop as soon as the balanced dataset is complete. The check has to look
    # at the number of SELECTED samples, not at len(images_array).
    if len(selected_indexes) == how_many_different_values:
        break
    if dict_counter[label] < smallest_value:
        dict_counter[label] += 1
        selected_indexes.append(index)
        f.value += 1

selected_indexes = np.array(selected_indexes, dtype=int)
new_data_array = array1D_to_array2D(data_array[selected_indexes])
new_images_array = images_array[selected_indexes]
print(f"Number of images: {len(new_data_array)}")

IntProgress(value=0, max=432)

Number of images: 432



#  Model Training

In [5]:
# Optional manual shuffle, currently disabled (train_test_split is called below):
# perm = np.random.permutation(len(new_data_array))
# new_images_array = new_images_array[perm]
# new_data_array = new_data_array[perm]

# Hold out 15% of the balanced samples for validation; random_state fixes the split.
train_images, val_images, train_values, val_values = train_test_split(new_images_array, new_data_array, test_size=0.15, random_state=10)

# NOTE(review): presumably this normalises along axis 1 of the
# (n, height, width, channels) tensor rather than scaling pixels to [0, 1] —
# confirm this is the intended preprocessing.
train_images = tf.keras.utils.normalize(train_images, axis=1)
val_images = tf.keras.utils.normalize(val_images, axis=1)

In [6]:
from tensorflow.keras.applications import EfficientNetB7
from tensorflow.keras.applications import ResNet50

# Hyper-parameters. Only n_epochs is read by live code (the model.fit call);
# the others are referenced solely by the commented-out Sequential model below.
n_neuronios = 16
filter_size = 3 
max_pool_size = (2,2) 
n_epochs = 200 
n_strides = 1
dropout_value = 0.25 

# NOTE(review): ResNet50() with default arguments downloads pretrained weights
# (see the cell output) and this model is replaced by a later cell that
# rebuilds ``model`` for (200, 200, 4) inputs — presumably safe to drop; confirm.
model = ResNet50()

# model = tf.keras.models.Sequential()
# model.add(layers.Conv2D(n_neuronios, max_pool_size, activation='relu', input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, N_CHANNELS)))
# model.add(tf.keras.layers.Conv2D(n_neuronios, filter_size, strides=n_strides, padding='same', activation='relu'))
# model.add(tf.keras.layers.BatchNormalization())
# model.add(tf.keras.layers.Dropout(dropout_value))
# model.add(tf.keras.layers.MaxPooling2D(pool_size=max_pool_size))
# model.add(tf.keras.layers.BatchNormalization())
# model.add(tf.keras.layers.Conv2D(N_NEURONIOS*4, FILTER_SIZE, padding='same', activation='relu'))
# model.add(tf.keras.layers.BatchNormalization())
# model.add(tf.keras.layers.MaxPooling2D(pool_size=MAX_POOL_SIZE))
# model.add(tf.keras.layers.BatchNormalization())
# model.add(tf.keras.layers.Conv2D(N_NEURONIOS*4, FILTER_SIZE, padding='same', activation='relu'))
# model.add(tf.keras.layers.BatchNormalization())
# model.add(tf.keras.layers.MaxPooling2D(pool_size=MAX_POOL_SIZE))
# model.add(tf.keras.layers.BatchNormalization())
# model.add(tf.keras.layers.Flatten())
# model.add(tf.keras.layers.Dense(n_neuronios*2, activation='relu'))
# model.add(tf.keras.layers.BatchNormalization())
# model.add(tf.keras.layers.Dropout(dropout_value))
# model.add(tf.keras.layers.Dense(N_CLASSES, activation='softmax'))

# #model.summary()
# #plot_model(model, show_shapes=True, show_layer_names=True, to_file='model.png')

# model.compile(
#     loss='sparse_categorical_crossentropy',
#     optimizer='adam',
#     metrics=['accuracy']
# )


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5


In [7]:
# List the compute devices TensorFlow can see (only a CPU in the captured output).
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 313031923247350500
xla_global_id: -1
]


In [8]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications import ResNet50
input_shape = (200, 200, 4) # input shape with 200x200 pixels and 4 channels for RGBA

# create the ResNet50 model: no pretrained weights (weights=None), built for
# the RGBA input shape above and with a 100-class classification head
model = ResNet50(include_top=True, weights=None, input_shape=input_shape, classes=100)

# define the input tensor with the specified shape
input_tensor = tf.keras.layers.Input(shape=input_shape)

# pass the input tensor through the ResNet50 model
output_tensor = model(input_tensor)

# create the final model
model = tf.keras.models.Model(inputs=input_tensor, outputs=output_tensor)

  input_shape = imagenet_utils.obtain_input_shape(


In [9]:
# Accuracy has to be requested explicitly: plot_hist() reads the "accuracy"
# and "val_accuracy" series from the training history, and they are only
# recorded when the metric is listed here.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [10]:
# Train for n_epochs epochs, evaluating on the held-out validation split after
# each one; ``history`` keeps the per-epoch values that plot_hist() draws.
history = model.fit(train_images, train_values, epochs= n_epochs, validation_data=(val_images, val_values))

Epoch 1/200
Epoch 2/200

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\joao\AppData\Local\Temp\ipykernel_18176\2303919647.py", line 1, in <module>
    history = model.fit(train_images, train_values, epochs= n_epochs, validation_data=(val_images, val_values))
  File "C:\ProgramData\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
    return fn(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1650, in fit
    tmp_logs = self.train_function(iterator)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\util\traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\eager\polymorphic_function\polymorphic_function.py", line 880, in __call__
    result = self.

TypeError: object of type 'NoneType' has no len()

In [None]:
import matplotlib.pyplot as plt


def plot_hist(history):
    """Plot training and validation accuracy per epoch.

    ``history`` is the object returned by ``model.fit``; its ``history`` dict
    must contain the "accuracy" and "val_accuracy" series.
    """
    for series_name in ("accuracy", "val_accuracy"):
        plt.plot(history.history[series_name])
    plt.title("model accuracy")
    plt.ylabel("accuracy")
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()


plot_hist(history)