In [1]:
from sklearn.model_selection import train_test_split
import xarray as xr
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from skimage.measure import block_reduce
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import sem
from scipy.stats import t

data_rain = xr.open_mfdataset(r"C:\Users\iarla\OneDrive\Documents\MSc_Project\HadUK_data\12km_Day_Rain\*.nc", parallel=False)
data_tmax = xr.open_mfdataset(r"C:\Users\iarla\OneDrive\Documents\MSc_Project\HadUK_data\12km_Day_tmax\*.nc", parallel=False)
data_tmin = xr.open_mfdataset(r"C:\Users\iarla\OneDrive\Documents\MSc_Project\HadUK_data\12km_Day_tmin\*.nc", parallel=False)

In [2]:
rain = np.array(data_rain['rainfall'])
tmax = np.array(data_tmax['tasmax'])
tmin = np.array(data_tmin['tasmin'])

min_length = len(tmax)
rainfall = rain[:min_length]
tmax = tmax[:min_length]
tmin = tmin[:min_length]

In [3]:
print(tmin.shape, rainfall.shape, tmax.shape)

(23011, 112, 82) (23011, 112, 82) (23011, 112, 82)


In [4]:
# Create a dictionary to hold all features (original and derived)
data = {
    'Max_Temp': tmax.flatten(),
    'Min_Temp': tmin.flatten(),
    'Rainfall': rainfall.flatten()
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)

In [5]:
# List of columns to check for NaN values
columns_to_check = ['Max_Temp', 'Min_Temp', 'Rainfall']

# Initialize an empty list to store filtered chunks
filtered_chunks = []

# Define the chunk size
chunk_size = 100000

# Calculate the total number of chunks
num_chunks = len(df) // chunk_size + 1

# Iterate over chunks of the DataFrame
for i in range(num_chunks):
    # Calculate start and end indices for the current chunk
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, len(df))
    
    # Get the current chunk
    chunk = df.iloc[start_idx:end_idx]
    
    # Create a boolean mask indicating rows with NaN values in the specified columns for the current chunk
    nan_mask = chunk[columns_to_check].isnull().any(axis=1)
    
    # Filter the current chunk to keep rows without NaN values in the specified columns
    filtered_chunk = chunk[~nan_mask]
    
    # Append the filtered chunk to the list
    filtered_chunks.append(filtered_chunk)

# Concatenate the filtered chunks into a single DataFrame
df_filtered = pd.concat(filtered_chunks)

In [6]:
df_filtered

Unnamed: 0,Max_Temp,Min_Temp,Rainfall
852,11.177692,9.770614,0.000534
932,12.017380,8.694025,0.000894
933,11.211850,9.471471,0.000205
934,11.144086,9.588857,0.000101
1015,11.088012,9.736920,0.000097
...,...,...,...
211332586,2.908440,1.526325,1.768308
211332668,2.916356,1.815674,2.320380
211332669,2.411658,1.598756,2.755709
211332751,2.012478,0.998100,2.761733


In [7]:
# Check for NaN values in df_filtered
nan_values_exist = df_filtered.isna().any().any()

# Print the result
if nan_values_exist:
    print("There are NaN values in df_filtered.")
else:
    print("There are no NaN values in df_filtered.")

There are no NaN values in df_filtered.


In [8]:
def label_drought(rainfall, consecutive_days, threshold):
    """Label consecutive days with drought (less than threshold mm of rainfall for at least consecutive_days)."""
    drought_label = np.zeros_like(rainfall, dtype=bool)
    for i in range(rainfall.shape[1]):
        for j in range(rainfall.shape[2]):
            t = 0
            while t < rainfall.shape[0]:
                if rainfall[t, i, j] < threshold:
                    consecutive_count = 1
                    while consecutive_count < consecutive_days and t + consecutive_count < rainfall.shape[0]:
                        if rainfall[t + consecutive_count, i, j] < threshold:
                            consecutive_count += 1
                        else:
                            break
                    if consecutive_count == consecutive_days:
                        drought_label[t:t+consecutive_days, i, j] = True
                    t += consecutive_count
                else:
                    t += 1
    return drought_label

def label_heatwave(max_temperature, consecutive_days, threshold):
    """Label consecutive days with heatwave (max temperature is threshold or higher for at least consecutive_days)."""
    heatwave_label = np.zeros_like(max_temperature, dtype=bool)
    for i in range(max_temperature.shape[1]):
        for j in range(max_temperature.shape[2]):
            t = 0
            while t < max_temperature.shape[0]:
                if max_temperature[t, i, j] >= threshold:
                    consecutive_count = 1
                    while consecutive_count < consecutive_days and t + consecutive_count < max_temperature.shape[0]:
                        if max_temperature[t + consecutive_count, i, j] >= threshold:
                            consecutive_count += 1
                        else:
                            break
                    if consecutive_count == consecutive_days:
                        heatwave_label[t:t+consecutive_days, i, j] = True
                    t += consecutive_count
                else:
                    t += 1
    return heatwave_label

def label_tropical_night(min_temperature, threshold):
    """Label days with tropical night (day where min temp does not fall below threshold)."""
    return min_temperature >= threshold

def label_downpour(rainfall, threshold):
    """Label days with downpour (a day with more than threshold mm of rainfall)."""
    return rainfall > threshold

# Define thresholds and parameters for each event type
consecutive_drought_days = 15
heatwave_threshold = 28
tropical_night_threshold = 20
downpour_threshold = 100

# Label each day based on extreme weather events
drought_label = label_drought(rainfall, consecutive_drought_days, 0.2)
heatwave_label = label_heatwave(tmax, 3, heatwave_threshold)
tropical_night_label = label_tropical_night(tmin, tropical_night_threshold)
downpour_label = label_downpour(rainfall, downpour_threshold)

# Combine the labels into a single array
# Let's use numerical labels for simplicity: 0 - Normal, 1 - Drought, 2 - Heatwave, 3 - Tropical Night, 4 - Downpour
labels = np.zeros_like(rainfall, dtype=int)  # Initialize with normal days
labels[drought_label] = 1
labels[heatwave_label] = 2
labels[tropical_night_label] = 3
labels[downpour_label] = 4

# Check the distribution of labels
label_counts = np.bincount(labels.flatten())
print("Label distribution:", label_counts)

# Print the shape of the labels array
print("Labels shape:", labels.shape)

MemoryError: Unable to allocate 806. MiB for an array with shape (23011, 112, 82) and data type int32

In [None]:
print("Shape of df_filtered:", df_filtered.shape)
print("Shape of 'Rainfall' column:", df_filtered['Rainfall'].shape)
print("Shape of 'Max_Temp' column:", df_filtered['Max_Temp'].shape)
print("Shape of 'Min_Temp' column:", df_filtered['Min_Temp'].shape)

In [None]:
threshold = 1

# Temperature features
max_temp = np.max(tmax, axis=(1, 2))  # Maximum daily temperature
min_temp = np.min(tmin, axis=(1, 2))  # Minimum daily temperature
temp_range = max_temp - min_temp  # Daily temperature range
mean_temp = np.mean((tmax + tmin) / 2, axis=(1, 2))  # Mean daily temperature
temp_std = np.std((tmax + tmin) / 2, axis=(1, 2))  # Standard deviation of daily temperature

# Rainfall features
total_rainfall = np.sum(rainfall, axis=(1, 2))  # Total daily rainfall
max_rainfall = np.max(rainfall, axis=(1, 2))  # Maximum daily rainfall
rain_days = np.sum(rainfall > threshold, axis=(1, 2))  # Number of rain days
avg_rain_intensity = total_rainfall / rain_days  # Average rainfall intensity
rainfall_std = np.std(rainfall, axis=(1, 2))  # Standard deviation of daily rainfall

In [None]:
import pandas as pd

# Create a dictionary to hold all features (original and derived)
all_data = {
    'Max_Temp': max_temp.flatten(),
    'Min_Temp': min_temp.flatten(),
    'Temp_Range': temp_range.flatten(),
    'Mean_Temp': mean_temp.flatten(),
    'Temp_Std': temp_std.flatten(),
    'Total_Rainfall': total_rainfall.flatten(),
    'Max_Rainfall': max_rainfall.flatten(),
    'Rain_Days': rain_days.flatten(),
    'Avg_Rain_Intensity': avg_rain_intensity.flatten(),
    'Rainfall_Std': rainfall_std.flatten(),
    'Original_Max_Temp': max_temp.flatten(),
    'Original_Min_Temp': min_temp.flatten()
}

# Create a DataFrame from the dictionary
all_df = pd.DataFrame(all_data)

# Print the first few rows of the DataFrame
print(all_df.head())


In [None]:
# Define a list of arrays
arrays = [
    max_temp.flatten(),
    min_temp.flatten(),
    temp_range.flatten(),
    mean_temp.flatten(),
    temp_std.flatten(),
    total_rainfall.flatten(),
    max_rainfall.flatten(),
    rain_days.flatten(),
    avg_rain_intensity.flatten(),
    rainfall_std.flatten(),
    rainfall.flatten(),
    tmin.flatten(),
    tmax.flatten
]

# Create a dictionary to hold the lengths of the flattened arrays
array_lengths = {}

# Iterate over the arrays and store their lengths
for i, arr in enumerate(arrays):
    array_lengths[f'Array_{i+1}'] = len(arr)

# Print the lengths of the flattened arrays
for name, length in array_lengths.items():
    print(f"{name}: {length}")

In [None]:
# List of columns to check for NaN values
columns_to_check = ['Max_Temp', 'Min_Temp', 'Temp_Range', 'Mean_Temp', 'Temp_Std', 'Total_Rainfall', 'Max_Rainfall', 'Rain_Days', 'Avg_Rain_Intensity', 'Rainfall_Std', 'Original_Max_Temp', 'Original_Min_Temp']

# Create a boolean mask indicating rows with NaN values in the specified columns
nan_mask = all_df[columns_to_check].isnull().any(axis=1)

# Filter the DataFrame to keep rows without NaN values in the specified columns
df_filtered = all_df[~nan_mask]

In [None]:
df_filtered