In [2]:
import os
import re
import datetime
import rasterio
import numpy as np
import multiprocessing as mp
from multiprocessing import Pool
import matplotlib.pyplot as plt
import pickle

In [3]:
directory_2020_m = "data/Busia/mVIs_Narok_North_East_monthly_2020"

# Get a list of all the TIFF files in the directory
tiff_files_2020_m = sorted([os.path.join(directory_2020_m, file) for file in os.listdir(directory_2020_m) if file.endswith(".tif")])

In [4]:
num_processes = mp.cpu_count()

# Define a function to read data from a single file
def read_data(file):
    with rasterio.open(file) as src:
        data = src.read()
        data = np.nan_to_num(data, nan=np.nan, posinf=np.nan, neginf=np.nan)
        return data

In [6]:
with Pool(num_processes) as pool:
    # Apply the read_data() function to each file in parallel
    data_arrays_2020_m = pool.map(read_data, tiff_files_2020_m)

# Stack the data arrays along a new axis to create the datacube
datacube_2020_m = np.stack(data_arrays_2020_m, axis=1)
print(datacube_2020_m.shape)

(3, 12, 11289, 9467)


In [7]:
NDVI_stack_2020 = datacube_2020_m[0,:,:,:]
NDMI_stack_2020 = datacube_2020_m[1, :, :, :]
GVI_stack_2020 = datacube_2020_m[2, :, :, :]

print(NDVI_stack_2020.shape,NDMI_stack_2020.shape,GVI_stack_2020.shape)

(12, 11289, 9467) (12, 11289, 9467) (12, 11289, 9467)


In [8]:
def replace_nan_with_mean(pixel):
    time_series = pixel[:]
    valid_values = time_series[np.logical_not(np.isnan(time_series))]
    
    if np.isnan(valid_values).all():
        # Return the original time series if it's fully NaN
        return time_series
    
    mean_value = np.nanmean(valid_values)
    time_series[np.isnan(time_series)] = mean_value
    
    return time_series

In [9]:
t, r, c = NDVI_stack_2020.shape

# Reshape the stack to a 2D array for parallel processing
NDVI_stack_2020_reshaped = NDVI_stack_2020.reshape(t, -1)
NDMI_stack_2020_reshaped = NDMI_stack_2020.reshape(t, -1)
GVI_stack_2020_reshaped = GVI_stack_2020.reshape(t, -1)

# Determine the number of processes to use

# Create a pool of workers
pool = Pool(num_processes)

# Apply the function to each pixel in parallel
processed_pixels_ndvi = pool.map(replace_nan_with_mean, NDVI_stack_2020_reshaped.T)
processed_pixels_ndmi = pool.map(replace_nan_with_mean, NDMI_stack_2020_reshaped.T)
processed_pixels_gvi = pool.map(replace_nan_with_mean, GVI_stack_2020_reshaped.T)

# Close the pool to release resources
pool.close()
pool.join()

# Reshape the processed pixels back to the original shape
NDVI_stack_2020_processed = np.array(processed_pixels_ndvi).T.reshape(t, r, c)
NDMI_stack_2020_processed = np.array(processed_pixels_ndmi).T.reshape(t, r, c)
GVI_stack_2020_processed = np.array(processed_pixels_gvi).T.reshape(t, r, c)

In [10]:
GVI_stack_2020_processed[:,4000,3000]

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
      dtype=float32)

In [11]:
stacked_vi_2020 = np.concatenate((NDVI_stack_2020_processed, NDMI_stack_2020_processed, GVI_stack_2020_processed), axis=0)
print(stacked_vi_2020.shape)

(36, 11289, 9467)


In [12]:
stacked_vi_2020_2d = np.moveaxis(stacked_vi_2020, 0, -1)

# Print the shape of the resulting stacked tuples
print(stacked_vi_2020_2d.shape)

(11289, 9467, 36)


In [13]:
stacked_vi_2020_2d_2 = np.moveaxis(stacked_vi_2020, 0, -1)

# Print the shape of the resulting stacked tuples
print(stacked_vi_2020_2d_2.shape)

(11289, 9467, 36)


In [14]:
stacked_vi_2020_2d_2[4000,3000,:]

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], dtype=float32)

In [15]:
stacked_vi_2020_2d_ml_m = stacked_vi_2020_2d_2.reshape(r*c,t*3)

print(stacked_vi_2020_2d_ml_m.shape)

(106872963, 36)


In [51]:
# stacked_vi_2022_2d_ml_m_3 = stacked_vi_2022_2d.reshape(r*c,t*3)

# print(stacked_vi_2022_2d_ml_m_2.shape)

(52333392, 36)


In [16]:
# # Load the model from the file
with open('RF_Narok_NE_18_22.pkl', 'rb') as f:
    RF_Narok_NE_18_22 = pickle.load(f)

In [17]:
stacked_vi_2020_2d_ml_m[:,1]

array([nan, nan, nan, ..., nan, nan, nan], dtype=float32)

In [22]:
unseen_nan_mask_1 = (stacked_vi_2020_2d_ml_m == -9999).all(axis=1) ## or (stacked_vi_2022_2d_ml_m_3 == -9999).all(axis=1)
print(unseen_nan_mask_1)

[ True  True  True ...  True  True  True]


In [21]:
nan_mask = np.isnan(stacked_vi_2020_2d_ml_m).all(axis=1)
print(nan_mask)

[False False False ... False False False]


In [20]:
# Create a placeholder label for fully NaN time series
placeholder_value = -9999  # Set the desired placeholder label for fully NaN time series
stacked_vi_2020_2d_ml_m[np.isnan(stacked_vi_2020_2d_ml_m)] = placeholder_value
# Predict labels for the unseen data, considering only non-NaN time series
predicted_labels = np.where(nan_mask, placeholder_value, RF_Narok_NE_18_22.predict(stacked_vi_2020_2d_ml_m))

In [23]:
print(predicted_labels.shape)
print(stacked_vi_2020_2d_ml_m.shape)

# Load the input GeoTIFF file
with rasterio.open(tiff_files_2020_m[0]) as src:
    profile = src.profile  # Get the profile of the input GeoTIFF
    r, c = src.shape  # Get the spatial dimensions of the input GeoTIFF

    # Read the input data as a NumPy array
    input_data = src.read()

# Reshape the predicted labels array to match the spatial dimensions of the input data
predicted_labels_2d = predicted_labels.reshape((r, c))
print(predicted_labels_2d.shape)

(106872963,)
(106872963, 36)
(11289, 9467)


In [24]:
# Add the predicted labels as a new band to the input data
output_data = np.zeros((r, c), dtype=input_data.dtype)

# Assign the predicted labels to the corresponding pixels in the output array
output_data = np.where(input_data[0, :, :] != 0, predicted_labels_2d, output_data)
print(output_data.shape)

# Update the profile of the input GeoTIFF to reflect the changes
profile.update(count=1)

# Save the output data as a new GeoTIFF file
with rasterio.open('Narok_NE_2020_1.tif', 'w', **profile) as dst:
    dst.write(output_data, 1)

(11289, 9467)


In [56]:

# Create a mask for lines with fully -9999 values
masked_lines = (stacked_vi_2022_2d_ml_m_3 == -9999).all(axis=1)

# Create a placeholder label for masked lines
placeholder_value = -9999

# Create a copy of the input data to preserve the original values
masked_stacked_vi_2d_ml = np.copy(stacked_vi_2022_2d_ml_m_3)

# Replace the values in masked lines with the placeholder value
masked_stacked_vi_2d_ml[masked_lines] = placeholder_value

# Predict labels for the non-masked lines
predicted_labels = RF_Busia_18_22.predict(masked_stacked_vi_2d_ml)

# Create an output array with the same shape as the input data
output_data = np.zeros_like(stacked_vi_2022_2d_ml_m_3, dtype=np.float32)

# Assign predicted labels to the non-masked lines
output_data[~masked_lines] = predicted_labels

# Load the input GeoTIFF
with rasterio.open('tiff_files_2022_m[0]') as src:
    # Read the input data as a NumPy array
    input_data = src.read(1)

    # Create a mask for non-zero pixels in the input data
    non_zero_mask = (input_data != 0)

    # Assign the predicted labels to the corresponding pixels in the output array
    output_data[non_zero_mask] = output_data[non_zero_mask]

    # Update the profile of the input GeoTIFF to reflect the changes
    profile = src.profile

# Save the output data as a new GeoTIFF file
with rasterio.open('output_2.tif', 'w', **profile) as dst:
    dst.write(output_data, 1)


ValueError: shape mismatch: value array of shape (52333392,) could not be broadcast to indexing result of shape (8736011,36)

In [60]:
unseen_nan_mask_1 = (stacked_vi_2022_2d_ml_m_3 == -9999).all(axis=1) ## or (stacked_vi_2022_2d_ml_m_3 == -9999).all(axis=1)
print(unseen_nan_mask_1)
# Create a placeholder label for fully NaN time series
placeholder_value = -9999  # Set the desired placeholder label for fully NaN time series
stacked_vi_2022_2d_ml_m_2[np.isnan(stacked_vi_2022_2d_ml_m_2)] = placeholder_value
# Predict labels for the unseen data, considering only non-NaN time series
predicted_labels = np.where(unseen_nan_mask_1, placeholder_value, RF_Busia_18_22.predict(stacked_vi_2022_2d_ml_m_2))

[ True  True  True ...  True  True  True]


In [61]:
print(predicted_labels.shape)
print(stacked_vi_2022_2d_ml_m_3.shape)

# Load the input GeoTIFF file
with rasterio.open(tiff_files_2022_m[0]) as src:
    profile = src.profile  # Get the profile of the input GeoTIFF
    r, c = src.shape  # Get the spatial dimensions of the input GeoTIFF

    # Read the input data as a NumPy array
    input_data = src.read()

# Reshape the predicted labels array to match the spatial dimensions of the input data
predicted_labels_2d = predicted_labels.reshape((r, c))
print(predicted_labels_2d.shape)

(52333392,)
(52333392, 36)
(8952, 5846)


In [62]:
# Add the predicted labels as a new band to the input data
output_data = np.zeros((r, c), dtype=input_data.dtype)

# Assign the predicted labels to the corresponding pixels in the output array
output_data = np.where(input_data[0, :, :] != 0, predicted_labels_2d, output_data)
print(output_data.shape)

# Update the profile of the input GeoTIFF to reflect the changes
profile.update(count=1)

# Save the output data as a new GeoTIFF file
with rasterio.open('output_3.tif', 'w', **profile) as dst:
    dst.write(output_data, 1)  # Write the output data to the first band of the output GeoTIFF

(8952, 5846)


In [38]:
nan_mask = np.isnan(stacked_vi_2022_2d_ml_m_2).all(axis=1)

# Create a placeholder label for fully NaN rows
placeholder_value = -9999

# Predict labels for non-NaN rows
predicted_labels = RF_Busia_18_22.predict(stacked_vi_2022_2d_ml_m_2[~nan_mask])

# Create an output array with the same shape as the input data
output_data = np.full(stacked_vi_2022_2d_ml_m_2.shape, placeholder_value, dtype=np.float32)

# Assign predicted labels to non-NaN rows
output_data[~nan_mask, :] = predicted_labels.reshape(-1, 1)

In [41]:
# Load the input GeoTIFF
with rasterio.open(tiff_files_2022_m[0]) as src:
    # Read the input data as a NumPy array
    input_data = src.read(1)

    # Create a mask for non-zero pixels in the input data
    non_zero_mask = (input_data != 0)

    # Assign the predicted labels and placeholder value to the corresponding pixels in the output array
    output_data[non_zero_mask] = output_data[non_zero_mask]

    # Update the profile of the input GeoTIFF to reflect the changes
    profile = src.profile

# Save the output data as a new GeoTIFF file
with rasterio.open('Busia_2022_7.tif', 'w', **profile) as dst:
    dst.write(output_data, 1)

IndexError: boolean index did not match indexed array along dimension 0; dimension is 52333392 but corresponding boolean dimension is 8952

In [25]:
Busia_2022_nan_mask_2 = np.isnan(stacked_vi_2022_2d_ml_m_2).all(axis=1)

# Create a placeholder label for fully NaN time series
placeholder_value_2 = -9999  # Set the desired placeholder label for fully NaN time series
# stacked_vi_2022_2d_ml_m[np.isnan(stacked_vi_2022_2d_ml_m)] = placeholder_value
# Predict labels for the unseen data, considering only non-NaN time series
predicted_labels_Busia_2 = np.where(Busia_2022_nan_mask_2, placeholder_value_2, RF_Busia_18_22.predict(stacked_vi_2022_2d_ml_m_2))

In [26]:
# Load the input GeoTIFF file
with rasterio.open(tiff_files_2022_m[0]) as src:
    profile = src.profile  # Get the profile of the input GeoTIFF
    r, c = src.shape  # Get the spatial dimensions of the input GeoTIFF

    # Read the input data as a NumPy array
    input_data = src.read()

# Reshape the predicted labels array to match the spatial dimensions of the input data
predicted_labels_2d_2 = predicted_labels_Busia_2.reshape((r, c))
print(predicted_labels_2d_2.shape)

(8952, 5846)


In [27]:
# Add the predicted labels as a new band to the input data
output_data = np.zeros((r, c), dtype=input_data.dtype)

# Assign the predicted labels to the corresponding pixels in the output array
output_data = np.where(input_data[0, :, :] != 0, predicted_labels_2d_2, output_data)
print(output_data.shape)

# Update the profile of the input GeoTIFF to reflect the changes
profile.update(count=1)

# Save the output data as a new GeoTIFF file
with rasterio.open('Busia_2022_3.tif', 'w', **profile) as dst:
    dst.write(output_data, 1)  # Write the output data to the first band of the output GeoTIFF

(8952, 5846)


In [31]:
predicted_labels_2d_3 = np.clip(predicted_labels_2d_2, 0, 3)

# Create a mask for pixels where the predicted labels are NaN
nan_mask = np.isnan(predicted_labels_2d_3)

# Set the placeholder value (-9999) for NaN pixels
predicted_labels_2d_3[nan_mask] = -9999

In [35]:
predicted_labels_2d_3

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])