In [68]:
import os
import numpy as np, array
import pandas as pd
import tensorflow as tf
from keras import Sequential, layers, models, utils
from keras.utils import Sequence
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, Dense, Flatten, LSTM, Embedding, RepeatVector
from keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from PIL import Image
from numpy import concatenate
import csv

In [69]:
# Path:

base_path = "D:/UIT/DACN/"
csv_folder_path = os.path.join(base_path, "train_csv/")
img_folder_path = os.path.join(base_path, "train_img/")
img_path_test = ""
img_path = ""

class SequenceGenerator(Sequence):
    def __init__(self, df, sequence_col, target_col, batch_size):
        self.df = df
        self.sequence_col = sequence_col
        self.target_col = target_col
        self.batch_size = batch_size

    def __len__(self):
        return len(self.df) // self.batch_size

    def __getitem__(self, idx):
        batch_start = idx * self.batch_size
        batch_end = (idx + 1) * self.batch_size
        sequences = self.df.iloc[batch_start:batch_end][self.sequence_col].values
        targets = self.df.iloc[batch_start:batch_end][self.target_col].values
        return sequences, targets

    def on_epoch_end(self):
        np.random.shuffle(self.df.index)


In [77]:
# Build & Train model

def train_model(df, path):
    # Split data into training and validation sets
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    train_df["Rain (mm)"] = train_df["Rain (mm)"].astype(str)
    val_df["Rain (mm)"] = val_df["Rain (mm)"].astype(str)
    batch_size = 8
    
    img_gen =  ImageDataGenerator(
        rescale=1 / 255.0,
        rotation_range=20,
        zoom_range=0.05,
        width_shift_range=0.05,
        height_shift_range=0.05,
        shear_range=0.05,
        horizontal_flip=True,
        fill_mode="nearest",)
    
    train_generator = img_gen.flow_from_dataframe(
        dataframe=train_df,
        directory=path,
        x_col="IMG_Path",
        target_size=(224, 224),
        batch_size=batch_size,
        class_mode="categorical",
        subset='training',
        shuffle=True,
        seed=42)

    valid_generator = img_gen.flow_from_dataframe(
        dataframe=train_df,
        directory=path,
        x_col="IMG_Path",
        target_size=(224, 224),
        batch_size=batch_size,
        class_mode="categorical",
        subset='validation',
        shuffle=True,
        seed=42)
    
    seq_gen = SequenceGenerator(train_df, 'Sequences', 'Rain (mm)', batch_size=batch_size)
    seq_embedding = Embedding(input_dim=100, output_dim=128, input_length=10)
    
    # Define CNN model
    img_input = Input(shape=(224, 224, 3))
    x = Conv2D(32, (3, 3), activation='relu')(img_input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    img_output = Dense(128, activation='relu')(x)
    
    # Define sequence input
    seq_input = Input(shape=(3,))
    seq_embedding_output = seq_embedding(seq_input)
    
    # Concatenate CNN and sequence inputs
    concat_input = concatenate([img_output, seq_embedding_output])
    # define LSTM layer
    lstm_output = LSTM(128)(concat_input)
    # Define output layer
    output = Dense(1)(lstm_output)

    # Define model
    model = Model(inputs=[img_input, seq_input], outputs=output)

    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')

    model.fit(x = [train_generator, seq_gen],
              validation_data = [valid_generator, seq_gen],
              steps_per_epoch = train_generator.n//train_generator.batch_size,
              validation_steps = valid_generator.n//valid_generator.batch_size,
              epochs=5)

In [71]:
# Function:

def get_csv_file_name(file_path):
    csv_file = os.path.basename(file_path)
    csv_name = csv_file[:-4]
    return csv_name

def find_img_folder_matched(csv_name):
    for img_folder in os.listdir(img_folder_path):
        full_img_folder_path = os.path.join(img_folder_path, img_folder)   
        if os.path.isdir(full_img_folder_path) and img_folder == csv_name:
            matching_img_folder = full_img_folder_path
            break
    return matching_img_folder

def preprocess_image(image_path, target_size):
    image = Image.open(image_path)
    image = image.resize(target_size)
    # Perform any additional preprocessing steps as needed (e.g., normalization)
    return np.array(image)  # Convert PIL image to numpy array

def preprocess_sequence(sequence):
    # Perform any preprocessing steps (e.g., normalization)
    return np.array(sequence)

def prepare_data_to_train(file, path, file_count, csv_file_paths):
    label_file_path = os.path.join(path, file)             # file csv được chọn ban đầu đặt làm file target (đường dẫn)
    target_file_path = csv_file_paths[file_count - 1]
    target_file_name = get_csv_file_name(target_file_path)  # từ đường dẫn file target -> tên file (time file)
    img_folder_matched = find_img_folder_matched(target_file_name)  # từ tên file (time file) -> folder chứa ảnh tương ứng
    
    # Đọc file CSV và chỉ đọc các cột được chỉ định vào DataFrame
    columns_to_read = ["City", "Rain (mm)"]                      
    df = pd.read_csv(label_file_path, usecols=columns_to_read)
    
    num_row = df.shape[0]
    sequences = np.empty((num_row, 3))
    for i in range(3):  # Vòng lặp từ i=0 đến i=2
        # Đọc file CSV
        file_index = file_count + i - 3 
        new_df = pd.read_csv(csv_file_paths[file_index])
        # Gán giá trị vào cột i của ma trận sequences
        sequences[:, i] = new_df["Rain (mm)"]

    # Tạo DataFrame từ ma trận sequences với tên cột là "Sequences"
    sequences_df = pd.DataFrame(sequences)
    sequences_df["Sequences"] = sequences_df.apply(lambda row: row.tolist(), axis=1)

    df.rename(columns={"City": "IMG_Path"}, inplace=True)
    # Áp dụng hàm lambda vào cột "IMG_Path"
    df["IMG_Path"] = df["IMG_Path"].apply(lambda city_name: os.path.join(img_folder_matched, f"{city_name}.png"))
    
    # Ghép DataFrame sequences_df vào giữa 2 cột của DataFrame df
    df = pd.concat([df.iloc[:, :1], sequences_df, df.iloc[:, 1:]], axis=1)
    pd.set_option("display.max_colwidth", None)
    # df.rename(columns={"Rain (mm)": "Rain_mm"}, inplace=True)
    
    global img_path
    img_path = img_folder_matched
    
    return df.drop(columns=df.columns[1:4])


In [78]:
# Train:

csv_file_paths = []
file_count = 0
df=pd.DataFrame()


for csv_file in os.listdir(csv_folder_path):
    if csv_file.endswith(".csv"):
        csv_paths = os.path.join(csv_folder_path, csv_file)
        csv_file_paths.append(csv_paths)   

        if file_count >= 3:
            df = prepare_data_to_train(csv_file, csv_folder_path, file_count, csv_file_paths)
            train_model(df, img_path)
            # df.to_csv(output_csv_path, index=False)
            # print(df.to_string(index=False))
        file_count += 1
        

KeyError: 'class'

In [82]:
# Evaluate

csv_test_path = "D:/UIT/DACN/test_csv/2024-01-15-11-02-05.csv"
columns_to_keep = ["City", "Rain (mm)"]
test_df = pd.read_csv(csv_test_path, usecols=columns_to_keep)
test_df['City'] = test_df['City'].apply(lambda x: x + '.png')
test_df['Rain (mm)'] = test_df['Rain (mm)'].astype(str)

train_datagen = ImageDataGenerator(
        rescale=1 / 255.0,
        rotation_range=20,
        zoom_range=0.05,
        width_shift_range=0.05,
        height_shift_range=0.05,
        shear_range=0.05,
        horizontal_flip=True,
        fill_mode="nearest",
        validation_split=0.20)

valid_generator = train_datagen.flow_from_dataframe(
        dataframe=test_df,
        directory=img_path_test,
        x_col="City",
        y_col="Rain (mm)",
        target_size=(224, 224),
        batch_size=8,
        class_mode="categorical",
        subset='validation',
        shuffle=True,
        seed=42)

score = model.evaluate(valid_generator)
print('\nTest loss:', score[0])
print('Test accuracy:', score[1])


Found 15 validated image filenames belonging to 2 classes.

Test loss: 0.44794896245002747
Test accuracy: 1.0


In [80]:
# Predict:

test_datagen = ImageDataGenerator(
        rescale=1 / 255.0,
        rotation_range=20,
        zoom_range=0.05,
        width_shift_range=0.05,
        height_shift_range=0.05,
        shear_range=0.05,
        horizontal_flip=True,
        fill_mode="nearest",
        validation_split=0.20)        

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=img_path_test,
    x_col="City",
    target_size=(224, 224),
    batch_size=1,
    class_mode=None,
    shuffle=False,)

predict = model.predict(test_generator, steps=len(test_generator.filenames))

Found 76 validated image filenames.


In [62]:
result_predict = predict.argmax(axis=-1)
print(result_predict)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


In [76]:
real_result = np.array(test_df['Rain (mm)'].astype(int))
print(np.array_str(real_result, precision=0, suppress_small=True))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


In [79]:
# Classification report

labels = test_df['Rain (mm)'].astype(int).tolist()

predicted_labels = np.argmax(predict, axis=1)

print(classification_report(labels, predicted_labels, zero_division=1))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        75
           1       1.00      0.00      0.00         1

    accuracy                           0.99        76
   macro avg       0.99      0.50      0.50        76
weighted avg       0.99      0.99      0.98        76

