In [1]:
import gc
import glob
import os
import shutil
import subprocess
import time
from contextlib import contextmanager
from typing import Iterator, List, Optional, Tuple

import numpy as np
import pandas as pd
import rasterio
from osgeo import gdal
from osgeo_utils import gdal_merge
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
@contextmanager
def cwd(path: str) -> Iterator[None]:
    """
    Temporarily change the process working directory to ``path``.

    The directory that was current on entry is restored when the ``with``
    block exits, including when the block raises.

    Args:
        path: Directory to switch to for the duration of the block.

    Raises:
        OSError: Propagated from ``os.chdir`` when ``path`` cannot be entered.
    """
    previous_dir = os.getcwd()
    # Kept outside the try: if this fails the directory never changed, so
    # there is nothing to restore.
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous_dir)

def listdir_fullpath(d: str) -> List[str]:
    """
    List the entries of directory ``d`` as paths prefixed with ``d``.

    Entries are ordered by name and each one is joined onto ``d`` with
    ``os.path.join``, so the results are absolute only when ``d`` is.
    """
    entry_names = os.listdir(d)
    entry_names.sort()
    paths: List[str] = []
    for entry_name in entry_names:
        paths.append(os.path.join(d, entry_name))
    return paths


class Preprocessing:
    """
    Converts the JP2 band images of ``.SAFE`` products into one stacked
    GeoTIFF per product.

    For every ``<name>.SAFE`` folder found in ``caminho`` a sibling
    ``<name>.TIF`` folder receives the output. (Presumably Sentinel-2
    products, given the ``GRANULE/*/IMG_DATA`` layout and the B08/B8A band
    names - confirm against the data.)
    """

    def __init__(self, caminho: str) -> None:
        """
        Index the contents of ``caminho``.

        Args:
            caminho: Directory where the ``.SAFE`` products are located.
        """
        self.caminho: str = caminho  # Directory where .SAFE images are located
        self.abs_caminho: str = os.path.abspath(self.caminho)
        # Every entry name in the directory; narrowed to .SAFE entries by
        # _modify_img_diretorio().
        self.imgs_diretorio: List[str] = os.listdir(self.caminho)
        # Output folder name of each product: same name with .SAFE -> .TIF.
        self.diretorios_tif: List[str] = sorted(
            diretorio.replace('.SAFE', '.TIF')
            for diretorio in self.imgs_diretorio
            if diretorio.endswith('.SAFE')
        )
        self.caminho_completo_lista: List[str] = [
            item for item in listdir_fullpath(self.caminho) if item.endswith('.SAFE')
        ]

    def merge_tif_files(self, nome_TIF: str, output_name: str) -> None:
        """
        Stack the band TIFFs of one product into a single file with gdal_merge.

        Runs inside ``nome_TIF``: removes a previous ``output_name``, merges
        every ``*B*.tif`` file (one output band per input, LZW-compressed
        BigTIFF) and then deletes the per-band inputs.

        Args:
            nome_TIF: Folder holding the per-band TIFF files.
            output_name: File name of the merged raster, relative to ``nome_TIF``.

        Raises:
            IndexError: If no B08 or no B8A file is present.
            RuntimeError: If gdal_merge did not produce ``output_name``; the
                band files are kept in that case so nothing is lost.
        """
        with cwd(nome_TIF):
            if os.path.exists(output_name):
                os.remove(output_name)
            # NOTE(review): '*B*.tif' also matches any other .tif in this folder
            # whose name contains an upper-case 'B'.
            arquivos_tif: List[str] = self._sort_files(glob.glob('*B*.tif'), sufix='.tif')
            parameters = ['', '-o', output_name] + arquivos_tif + ['-separate', '-co', 'BIGTIFF=YES', '-co', 'COMPRESS=LZW']
            gdal_merge.main(parameters)
            # The output was removed above, so its presence proves this merge
            # wrote it. Only then is it safe to delete the inputs.
            if not os.path.exists(output_name):
                raise RuntimeError(
                    f"gdal_merge did not create {output_name!r} in {nome_TIF!r}; band files were kept"
                )
            for arquivo in arquivos_tif:
                os.remove(arquivo)

    def _create_tif_folder(self) -> None:
        """
        (Re)create one empty ``.TIF`` folder per ``.SAFE`` product.

        Folder names are relative, so the caller must already be inside
        ``self.caminho``. Any existing folder of the same name is deleted first.
        Also narrows ``self.imgs_diretorio`` to the ``.SAFE`` entries.
        """
        for tif_dir in self.diretorios_tif:
            if os.path.exists(tif_dir):
                shutil.rmtree(f'./{tif_dir}')
            os.makedirs(tif_dir)
        self._modify_img_diretorio()

    def _modify_img_diretorio(self) -> None:
        """Keep only the ``.SAFE`` entries in ``self.imgs_diretorio``."""
        self.imgs_diretorio: List[str] = [diretorio for diretorio in self.imgs_diretorio if diretorio.endswith('.SAFE')]

    def jp2_to_tif(self, tif_dim: List[str] = ['10980', '10980'], output_name: str = 'merge.tif', create_folder: bool = True) -> None:
        """
        Convert the JP2 bands of every product to Float32 GeoTIFF with
        gdal_translate and merge them into one raster per product.

        Args:
            tif_dim: Output width and height passed to ``-outsize``, as strings.
                The default list is only read, never mutated.
            output_name: File name of the merged raster inside each ``.TIF`` folder.
            create_folder: When True, wipe and recreate the ``.TIF`` folders first.

        Raises:
            subprocess.CalledProcessError: If gdal_translate exits with an error.
            IndexError: If a product has no B08 or no B8A band file.
            RuntimeError: If the merge step produced no output file.
        """
        with cwd(self.caminho):
            if create_folder:
                self._create_tif_folder()
            # Filter here as well: with create_folder=False the attribute still
            # lists every directory entry (e.g. the .TIF folders of an earlier
            # run), and those have no JP2 files to convert.
            produtos = [diretorio for diretorio in self.imgs_diretorio if diretorio.endswith('.SAFE')]
            for diretorio in produtos:
                files: List[str] = self._sort_files(
                    glob.glob(os.path.join(diretorio, 'GRANULE', '*', 'IMG_DATA', '*B*.jp2')),
                    sufix='.jp2',
                )
                nome_TIF: str = diretorio.replace('.SAFE', '.TIF')
                # No-op after _create_tif_folder(); needed when create_folder=False
                # and the folder was never created.
                os.makedirs(nome_TIF, exist_ok=True)

                for input_path in files:
                    band_stem = os.path.splitext(os.path.basename(input_path))[0]
                    output_path: str = os.path.join(nome_TIF, band_stem + '.tif')
                    if os.path.exists(output_path):
                        os.remove(output_path)
                    cmd: List[str] = ['gdal_translate', input_path, '-ot', 'Float32', '-of', 'Gtiff', '-outsize', tif_dim[0], tif_dim[1], output_path, '-co', 'BIGTIFF=YES']
                    # check=True: a failed conversion would otherwise go unnoticed
                    # and leave the merged stack one band short.
                    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)

                self.merge_tif_files(nome_TIF, output_name=output_name)

    @staticmethod
    def _first_with_suffix(names: List[str], ending: str) -> str:
        """Return the first name that ends with ``ending``; IndexError if none does."""
        for name in names:
            if name.endswith(ending):
                return name
        raise IndexError(f"no file name ends with {ending!r}")

    def _sort_files(self, lista: list, sufix: str) -> List[str]:
        """
        Sort file names alphabetically, then move band B8A right after B08.

        Args:
            lista: File names or paths; the list itself is not modified.
            sufix: File extension including the dot, e.g. ``'.tif'``.

        Returns:
            A new sorted list with the B8A entry immediately after the B08 entry.

        Raises:
            IndexError: If no entry ends with ``B08<sufix>`` or ``B8A<sufix>``.
        """
        ordered = sorted(lista)
        b08_name = self._first_with_suffix(ordered, f"B08{sufix}")
        b8a_name = self._first_with_suffix(ordered, f"B8A{sufix}")
        # Take B8A out first and only then look up B08: an index computed
        # before the removal is off by one whenever B8A sorts ahead of B08.
        ordered.remove(b8a_name)
        ordered.insert(ordered.index(b08_name) + 1, b8a_name)
        return ordered


class Modelos(Preprocessing):
    """
    Thin layer over ``Preprocessing`` that exposes ``start`` as the single
    entry point for the JP2 -> GeoTIFF conversion.
    """

    def __init__(self, caminho: str) -> None:
        """Hand ``caminho`` unchanged to ``Preprocessing.__init__``."""
        super().__init__(caminho=caminho)

    def start(self, create_folder: bool) -> None:
        """
        Run the preprocessing stage of the .jp2 files.

        Delegates to ``jp2_to_tif`` with its default size and output name;
        ``create_folder`` is passed through as given.
        """
        recreate_folders = create_folder
        self.jp2_to_tif(create_folder=recreate_folders)

In [None]:
def salvar_mascara_tiff(mascara: np.ndarray, banda_exemplo_path: str, output_path: str) -> None:
    """
    Save a mask as a single-band, LZW-compressed float32 TIFF that reuses the
    georeferencing profile of a reference raster.

    Args:
        mascara: Mask values. Anything ``np.asarray`` accepts (array, Series,
            one-column DataFrame). It may already be ``(height, width)`` or
            hold ``height * width`` values in row-major order, in which case
            it is reshaped to the reference raster's shape.
        banda_exemplo_path: Raster whose profile (size, CRS, transform, ...)
            is copied to the output.
        output_path: Destination file.

    Raises:
        ValueError: If the number of mask values differs from the number of
            pixels of the reference raster.
    """
    # Only metadata is needed from the reference; close it before writing.
    with rasterio.open(banda_exemplo_path) as src:
        profile = src.profile
        altura, largura = src.height, src.width

    dados = np.asarray(mascara, dtype=np.float32)
    if dados.shape != (altura, largura):
        # Flat per-pixel predictions (e.g. an (N, 1) table) arrive here.
        if dados.size != altura * largura:
            raise ValueError(
                f"mask has {dados.size} values but {banda_exemplo_path!r} has "
                f"{altura} x {largura} pixels"
            )
        dados = dados.reshape(altura, largura)

    profile.update(
        dtype=rasterio.float32,
        count=1,
        compress='lzw'
    )

    with rasterio.open(output_path, 'w', **profile) as dst:
        dst.write(dados, 1)


class ModelPreprocessing(Modelos):
    """
    Training-data preparation and per-image inference for the cloud-detection
    classifiers.

    The static methods load the labelled pixel samples and harmonise the
    label codes of the different reference algorithms; ``infer_model`` fits a
    scaler + PCA + classifier pipeline and applies it to one merged raster.
    """

    # For each reference algorithm: the spellings of its name found in the
    # 'modelo' column, and the mapping from its own label codes to the shared
    # label codes.
    _FMASK_LABELS = (('Fmask',), {0: 0, 1: 1, 2: 2, 4: 3})
    _KAPPA_LABELS = (('Kappamask', 'KappaMask'), {1: 0, 2: 2, 3: 4, 4: 3})
    _SEN2COR_LABELS = (('Sen2Core', 'Sen2Cor'), {2: 2, 3: 2, 4: 0, 5: 0, 6: 1, 8: 3, 9: 3, 10: 4})

    @staticmethod
    def load_and_prepare_data() -> pd.DataFrame:
        """
        Load every ``*_pixels.csv`` file of the current working directory into
        one DataFrame.

        Files are read in name order so the row order is reproducible. The
        columns ``band_14``..``band_16``, ``Unnamed: 0``, ``x``, ``y`` and
        ``datetime`` are dropped, and the result gets a fresh 0..n-1 index.

        Raises:
            FileNotFoundError: If the working directory has no ``*_pixels.csv`` file.
            KeyError: If one of the dropped columns is missing.
        """
        csv_files = sorted(name for name in os.listdir() if name.endswith('_pixels.csv'))
        if not csv_files:
            raise FileNotFoundError(f"no '*_pixels.csv' file found in {os.getcwd()!r}")
        unused_columns = [f'band_{i}' for i in range(14, 17)] + ['Unnamed: 0', 'x', 'y', 'datetime']
        # ignore_index: every CSV restarts at 0, which would leave duplicate labels.
        pixels = pd.concat([pd.read_csv(name) for name in csv_files], ignore_index=True)
        return pixels.drop(columns=unused_columns)

    @staticmethod
    def _remap(pixels: pd.DataFrame, names: Tuple[str, ...], mapping: dict) -> pd.Series:
        """Return the remapped 'GrndTruth' of the rows whose 'modelo' is in ``names``."""
        rows = pixels['modelo'].isin(names)
        return pixels.loc[rows, 'GrndTruth'].replace(mapping)

    @staticmethod
    def fmask_replace(pixels: pd.DataFrame) -> pd.Series:
        """
        Replaces the labels for the Fmask model.

        Returns only the rows whose ``modelo`` is ``'Fmask'``, with their
        original index.
        """
        return ModelPreprocessing._remap(pixels, *ModelPreprocessing._FMASK_LABELS)

    @staticmethod
    def kappa_replace(pixels: pd.DataFrame) -> pd.Series:
        """
        Replaces the labels for the Kappa model.

        Returns only the rows whose ``modelo`` is ``'Kappamask'`` or
        ``'KappaMask'``, with their original index.
        """
        return ModelPreprocessing._remap(pixels, *ModelPreprocessing._KAPPA_LABELS)

    @staticmethod
    def sen2cor_replace(pixels: pd.DataFrame) -> pd.Series:
        """
        Replaces the labels for the Sen2Cor model.

        Returns only the rows whose ``modelo`` is ``'Sen2Core'`` or
        ``'Sen2Cor'``, with their original index.
        """
        return ModelPreprocessing._remap(pixels, *ModelPreprocessing._SEN2COR_LABELS)

    @staticmethod
    def replace_labels(pixels: pd.DataFrame) -> pd.DataFrame:
        """
        Replaces the labels in the pixels DataFrame.

        Each row's ``GrndTruth`` is remapped with the table of its own
        ``modelo`` and written back to that same row, whatever order the rows
        are in. Rows of any other ``modelo`` keep their label. The input frame
        is not modified; a new frame is returned.
        """
        labels = pixels['GrndTruth'].to_numpy(copy=True)
        groups = (
            ModelPreprocessing._FMASK_LABELS,
            ModelPreprocessing._KAPPA_LABELS,
            ModelPreprocessing._SEN2COR_LABELS,
        )
        for names, mapping in groups:
            # Positional boolean mask: index labels may repeat, so aligning
            # on the index is not reliable here.
            rows = pixels['modelo'].isin(names).to_numpy()
            if rows.any():
                labels[rows] = pixels.loc[rows, 'GrndTruth'].replace(mapping).to_numpy()
        return pixels.assign(GrndTruth=labels)

    @staticmethod
    def prepare_features_and_labels(pixels: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Prepares features and labels for modeling.

        Drops the ``modelo`` column, removes rows with a missing feature value
        and returns the remaining features with their matching labels.
        """
        pixels = pixels.reset_index(drop=True).drop(columns='modelo')
        X = pixels.drop(columns=['GrndTruth']).dropna()
        y = pixels.loc[X.index, 'GrndTruth']
        return X, y

    @staticmethod
    def split_data(X: pd.DataFrame, y: pd.Series, random_state: Optional[int] = 42) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """
        Splits the data into training (75 %) and testing (25 %) sets.

        Args:
            X: Feature table.
            y: Labels aligned with ``X``.
            random_state: Seed of the shuffle, so the split is reproducible;
                pass None for a different split on every call.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=0.75, test_size=0.25, random_state=random_state
        )
        return X_train, X_test, y_train, y_test

    @staticmethod
    def _open_pixel_table(raster_path: str, memmap_path: str) -> np.ndarray:
        """
        Open the raster's pixels as a read-only ``(rows * cols, bands)``
        float32 memory map stored in ``memmap_path``.

        When ``memmap_path`` does not exist it is first filled band by band
        from ``raster_path`` (no other visible code writes that file).
        """
        dataset = gdal.Open(raster_path)
        if dataset is None:
            raise FileNotFoundError(f"GDAL could not open {raster_path!r}")
        cols = dataset.RasterXSize
        rows = dataset.RasterYSize
        bands = dataset.RasterCount
        shape = (rows, cols, bands)

        if not os.path.exists(memmap_path):
            # Fill a temporary file and rename it at the end, so an interrupted
            # run never leaves a half-written table under the final name.
            partial_path = memmap_path + '.part'
            writer = np.memmap(partial_path, dtype=np.float32, mode='w+', shape=shape)
            for band_index in range(bands):
                writer[:, :, band_index] = dataset.GetRasterBand(band_index + 1).ReadAsArray()
            writer.flush()
            del writer
            os.replace(partial_path, memmap_path)
        del dataset
        gc.collect()

        cube = np.memmap(memmap_path, dtype=np.float32, mode='r', shape=shape)
        # One row per pixel, in row-major image order.
        return cube.reshape(-1, bands)

    def infer_model(self, diretorio: str, model: object, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Fit ``model`` on the training pixels and classify every pixel of the
        merged raster stored in ``diretorio``.

        The classifier runs at the end of a StandardScaler -> PCA(11)
        pipeline that is fitted on ``X``/``y``. Raster values are converted
        with ``(value - 1000) / 10000`` and clipped at 0 before prediction,
        10000 pixels at a time, so the full image is never held in memory.

        Side effects inside ``diretorio``: writes ``predictions.csv`` and, when
        it is missing, ``transposed_array.dat``.

        Args:
            diretorio: Folder containing ``merge.tif``.
            model: Unfitted or fitted classifier; it is (re)fitted in place by
                the pipeline.
            X: Training features. Its column names select the raster bands
                (``band_1`` is the first band of ``merge.tif``).
            y: Training labels aligned with ``X``. Required; the default only
                keeps the old positional signature valid.

        Returns:
            DataFrame with one ``predictions`` column, one row per pixel in
            row-major image order.

        Raises:
            ValueError: If ``y`` is not given.
            FileNotFoundError: If ``diretorio`` or its ``merge.tif`` is missing.
            KeyError: If ``X`` has a column that is not a band of the raster.
        """
        if y is None:
            raise ValueError("infer_model needs the training labels: pass them as 'y'")
        if not os.path.exists(diretorio):
            raise FileNotFoundError(f"image directory not found: {diretorio!r}")

        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Data normalization
            ('pca', PCA(n_components=11)),  # Dimensionality reduction with PCA
            ('classifier', model)  # Classifier
        ])
        feature_names = list(X.columns)
        chunk_size = 10000
        output_file = 'predictions.csv'

        with cwd(diretorio):
            pixel_table = self._open_pixel_table('./merge.tif', 'transposed_array.dat')
            num_pixels, bands = pixel_table.shape
            raster_columns = [f'band_{i+1}' for i in range(bands)]

            started_at = time.time()
            pipeline.fit(X, y)

            with open(output_file, 'w') as f:
                f.write('predictions\n')
                for chunk_start in range(0, num_pixels, chunk_size):
                    chunk_end = min(chunk_start + chunk_size, num_pixels)
                    values = (np.asarray(pixel_table[chunk_start:chunk_end]) - 1000) / 10000
                    values = values.clip(min=0)
                    # Keep exactly the training features, in training order;
                    # the raster may carry bands the training table dropped.
                    chunk = pd.DataFrame(values, columns=raster_columns)[feature_names]
                    f.writelines(f"{prediction}\n" for prediction in pipeline.predict(chunk))

            elapsed = time.time() - started_at
            print("Tempo total para detecção de nuvens:", elapsed, "segundos")
            del pixel_table
            gc.collect()
            return pd.read_csv(output_file)


def main(obj: ModelPreprocessing, diretorio: str) -> Tuple[List[str], dict]:
    """
    Main function to train models and make predictions.

    Loads the labelled pixels from the current working directory, then fits
    and applies each classifier to the raster in ``diretorio``.

    Args:
        obj: Object whose ``infer_model`` runs the inference.
        diretorio: Folder containing the ``merge.tif`` to classify.

    Returns:
        The model names and a dict mapping each name to its predictions
        DataFrame.
    """
    pixels = ModelPreprocessing.load_and_prepare_data()
    pixels = ModelPreprocessing.replace_labels(pixels)
    X, y = ModelPreprocessing.prepare_features_and_labels(pixels)

    models = {
        'Voting_XGB_AdaBoost': VotingClassifier(estimators=[
            ('xgb', XGBClassifier(random_state=42)),
            ('adaboost', AdaBoostClassifier(random_state=42))
        ], voting='soft'),
        'ExtraTrees': ExtraTreesClassifier(random_state=42),
        'XGBClassifier': XGBClassifier(random_state=42),
        'Voting_LGBM_AdaBoost': VotingClassifier(estimators=[
            ('lgbm', LGBMClassifier(random_state=42)),
            ('adaboost', AdaBoostClassifier(random_state=42))
        ], voting='soft'),
        'LGBMClassifier': LGBMClassifier(random_state=42)
    }

    results = {}
    for model_name, model in models.items():
        # No separate model.fit(X, y) here: the pipeline inside infer_model
        # refits the same estimator, so an earlier fit would be thrown away.
        results[model_name] = obj.infer_model(diretorio, model, X, y)

    return list(results.keys()), results


if __name__ == "__main__":
    # Root folder; each sub-folder in it is a directory where .SAFE images are.
    # Can be overridden with the SAFE_IMAGES_ROOT environment variable.
    ponto_de_montagem = os.environ.get('SAFE_IMAGES_ROOT', '/media/jean/90D8B801D8B7E41E/Ubuntu/')
    for pasta in sorted(os.listdir(ponto_de_montagem)):
        destino = os.path.join(os.path.abspath(ponto_de_montagem), pasta)
        # Plain files at the root cannot be listed as an image directory.
        if not os.path.isdir(destino):
            continue
        obj = ModelPreprocessing(destino)
        obj.start(create_folder=True)
        for diretorio in obj.diretorios_tif:
            caminho_completo = os.path.join(obj.caminho, diretorio)
            nomes_modelos, resultados = main(obj, caminho_completo)

            banda_path = os.path.join(destino, diretorio, 'merge.tif')
            with rasterio.open(banda_path) as banda:
                formato_mascara = (banda.height, banda.width)

            for nome in nomes_modelos:
                output_path = os.path.join(destino, diretorio, f"{nome}.tif")
                # Predictions come back as one value per pixel; fold them into
                # the raster's (height, width) grid before writing.
                mascara = np.asarray(resultados[nome]).reshape(formato_mascara)
                salvar_mascara_tiff(mascara, banda_path, output_path)
                gc.collect()