In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Dropout
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist


# Read the raw observations from disk
data = pd.read_csv('file.csv')

# Build one timestamp per row from the year number and the full month name
year_month = data['year'].astype(str) + '-' + data['Month']
data['date'] = pd.to_datetime(year_month, format='%Y-%B')

# The separate year / month columns are redundant once 'date' exists
data = data.drop(columns=['year', 'Month'])

# Calendar quarter (1-4) of every row
data['quarter'] = data['date'].dt.quarter
print(data)

# Variables used in the analysis below
columns = ['Temperature', 'Rainfall', 'ghg', 'ch4']

# Replace the gaps in each variable with that variable's own mean.
# Note: the means are taken over every row, i.e. before any train/test split.
data = data.fillna({name: data[name].mean() for name in columns})

# Same quarter number again, under the capitalised name the correlation loop
# reads; a later cell drops both 'quarter' and 'Quarter'.
data['Quarter'] = data['date'].dt.quarter

# Correlation matrix of the four variables, computed separately per calendar
# quarter and stored under the keys 'Q1' .. 'Q4'.
corr_dict = {}

for quarter in range(1, 5):
    # Rows of this quarter, restricted to the analysed variables
    in_quarter = data['Quarter'] == quarter
    quarter_data = data.loc[in_quarter, columns]

    # Pairwise correlations between the variables. `corr_matrix` is kept as a
    # top-level name because a later plotting cell reads it after the loop.
    corr_matrix = quarter_data.corr()
    corr_dict[f'Q{quarter}'] = corr_matrix

# Plot the correlation matrices as lower-triangle heatmaps, one panel per quarter
variable_labels = ['Temperature', 'Rainfall', 'GHG', 'CH4']
fig, axes = plt.subplots(2, 2)

for ax, (quarter, quarter_corr) in zip(axes.flat, corr_dict.items()):
    # Hide only the strict upper triangle (k=1) so the diagonal stays visible,
    # as the plot is meant to include it. The builtin `bool` is used because
    # the `np.bool` alias was removed in NumPy 1.24.
    mask = np.triu(np.ones(quarter_corr.shape, dtype=bool), k=1)

    # Plot the heatmap with diagonal values included
    sns.heatmap(quarter_corr, annot=True, cmap='coolwarm', mask=mask,
                xticklabels=variable_labels,
                yticklabels=variable_labels,
                ax=ax)

    # Set the title and axis labels
    ax.set_title(f'Correlation Matrix for {quarter} from 1901-2016')
    ax.set_xlabel('Variables')
    ax.set_ylabel('Variables')

# Adjust the spacing between the subplots
fig.subplots_adjust(hspace=0.5, wspace=0.3)

# Show the plot
plt.show()





FileNotFoundError: [Errno 2] No such file or directory: 'file.csv'

In [None]:
# Drop the helper date / quarter columns that were only needed for the
# per-quarter correlation analysis. errors='ignore' keeps this cell
# re-runnable: without it a second run raises KeyError because the columns
# are already gone.
data = data.drop(columns=['date', 'Quarter', 'quarter'], errors='ignore')
print(data)



In [None]:
# Hold out the final 20% of the rows (row order is kept) as the test set
n_rows = len(data)
test_start = int(n_rows * 0.8)
train_data = data.iloc[:test_start, :]
test_data = data.iloc[test_start:, :]

# From the remaining rows, keep the last 20% for validation
val_start = int(len(train_data) * 0.8)
val_data = train_data.iloc[val_start:, :]
train_data = train_data.iloc[:val_start, :]

# Fit the min-max scaler on the training rows only, then apply the same
# scaling to the other two splits. From here on the splits are NumPy arrays.
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_data)
val_data, test_data = (scaler.transform(part) for part in (val_data, test_data))

def create_dataset(data, target, time_steps):
    """Slice `data` into overlapping windows paired with the value that follows each.

    Window ``k`` is ``data[k:k + time_steps]`` and its label is
    ``target[k + time_steps]``; ``len(data) - time_steps`` pairs are produced
    (none when that difference is not positive).

    Args:
        data: Sliceable sequence of observations.
        target: Indexable sequence the labels are read from.
        time_steps: Number of consecutive observations per window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays holding the windows and their labels.
    """
    n_windows = len(data) - time_steps
    windows = [data[start:start + time_steps] for start in range(n_windows)]
    labels = [target[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(labels)


# Define the time steps (window length) and the number of features
time_steps = 5
num_features = train_data.shape[1]

# Build the windowed datasets with the create_dataset helper defined above
# instead of repeating its loop by hand. Each sample is `time_steps`
# consecutive rows; its label is column 0 of the row that follows the window.
X_train, y_train = create_dataset(train_data, train_data[:, 0], time_steps)
X_test, y_test = create_dataset(test_data, test_data[:, 0], time_steps)

# Print the shapes of the datasets
print("Training set shape: ", X_train.shape, y_train.shape)
print("Validation set shape: ", val_data.shape)
print("Testing set shape: ", X_test.shape, y_test.shape)


In [None]:
# Window the validation split the same way as the training and test splits
# (label = column 0 of the row after each window). The validation rows were
# split off and scaled earlier but never used; monitoring on the test windows
# instead would let the test set influence training decisions.
X_val, y_val = create_dataset(val_data, val_data[:, 0], time_steps)

# Create the model: four stacked GRU layers, each followed by dropout, and a
# single-unit dense output. Only the last GRU collapses the sequence
# (no return_sequences) so the dense layer receives one vector per sample.
model = Sequential()
model.add(GRU(256, return_sequences=True, input_shape=(None, num_features)))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(GRU(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(GRU(32))
model.add(Dropout(0.5))
model.add(Dense(units=1))

# Print the model summary (summary() prints by itself and returns None, so
# wrapping it in print() would add a stray "None" line)
model.summary()

# Compile the model
model.compile(optimizer='adam', loss='mean_absolute_error')

# Train the model, monitoring the loss on the held-out validation windows
history = model.fit(X_train, y_train, epochs=400, batch_size=64,
                    validation_data=(X_val, y_val), verbose=1)

# Predict the test data
y_pred = model.predict(X_test)


In [None]:
# The labels were taken from column 0 of the scaled arrays, so the matching
# entries of the scaler fitted on the training split undo the scaling.
# (Fitting a fresh scaler on the already-scaled training array, as was done
# here before, yields an identity mapping and replaces the real scaler.)
target_col = 0


def inverse_scale_target(scaled_values):
    """Map scaled target values back to the target's original units.

    MinMaxScaler applies ``X_scaled = X * scale_ + min_`` per column; this
    inverts that relation for the target column only.

    Args:
        scaled_values: Array-like of scaled target values, any shape.

    Returns:
        Float array of shape (n, 1) in original units.
    """
    scaled_values = np.asarray(scaled_values, dtype=float).reshape(-1, 1)
    return (scaled_values - scaler.min_[target_col]) / scaler.scale_[target_col]


# Inverse scale the test data. The scaled labels are re-read from test_data
# and the predictions recomputed, so re-running this cell does not apply the
# inverse transform twice to the reassigned y_test / y_pred.
y_test = inverse_scale_target(test_data[time_steps:, target_col])
y_pred = inverse_scale_target(model.predict(X_test))

# Calculate the evaluation metrics (errors normalised by the range of the
# true values)
mae = mean_absolute_error(y_test, y_pred)
nmae_test = mae / (np.max(y_test) - np.min(y_test))
mse = mean_squared_error(y_test, y_pred)
nrmse_test = np.sqrt(mse) / (np.max(y_test) - np.min(y_test))

# Calculate the evaluation metrics for training set; both the true values and
# the predictions are inverse-scaled so they are compared in the same units.
# y_train itself is left untouched so the training cell can be re-run.
y_train_true = inverse_scale_target(y_train)
y_train_pred = inverse_scale_target(model.predict(X_train))
mae_train = mean_absolute_error(y_train_true, y_train_pred)
nmae_train = mae_train / (np.max(y_train_true) - np.min(y_train_true))
mse_train = mean_squared_error(y_train_true, y_train_pred)
nrmse_train = np.sqrt(mse_train) / (np.max(y_train_true) - np.min(y_train_true))

# Calculate the average NMAE and NRMSE
nmae_avg = np.mean([nmae_train, nmae_test])
nrmse_avg = np.mean([nrmse_train, nrmse_test])


# Print the evaluation metrics
print("NMAE Train: ", nmae_train)
print("NMAE Test: ", nmae_test)
print("Average NMAE: ", nmae_avg)
print("NRMSE TEST:",nrmse_test)
print("NRMSE Train: ", nrmse_train)
print("Average NRMSE: ", nrmse_avg)


In [None]:
# Draw the per-epoch loss curves recorded by model.fit on one set of axes
loss_curves = (('loss', 'Training Loss'), ('val_loss', 'Validation Loss'))
for history_key, curve_label in loss_curves:
    plt.plot(history.history[history_key], label=curve_label)

plt.title('Training and Validation Losses')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
import seaborn as sns
# Create a heatmap of the fourth-quarter correlation matrix. The matrix is
# looked up in corr_dict by key rather than through `corr_matrix`, which is
# only the leftover loop variable from the per-quarter loop (it holds the last
# quarter computed), and the title now says which quarter is shown.
sns.heatmap(corr_dict['Q4'], annot=True, cmap='coolwarm')
plt.title('Correlation Matrix (Q4)')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Convert the actual and predicted values to 1D numpy arrays
y_test = y_test.ravel()
y_pred = y_pred.ravel()

# Define the quarters
quarters = ['Q1', 'Q2', 'Q3', 'Q4']

# Recover the calendar date of every test label. The 'date' column was dropped
# from `data` earlier, so the dates are rebuilt from the source file the same
# way as when the data was loaded. The test labels are the last len(y_test)
# rows of the data set (the test split is the tail of the data and the first
# `time_steps` rows of it only serve as input to the first window).
raw_dates = pd.read_csv('file.csv')
all_dates = pd.to_datetime(raw_dates['year'].astype(str) + '-' + raw_dates['Month'],
                           format='%Y-%B')
assert len(all_dates) == len(data), "file.csv no longer matches the rows in `data`"
test_dates = all_dates.iloc[len(all_dates) - len(y_test):].reset_index(drop=True)
test_quarters = test_dates.dt.quarter.to_numpy()

# Create a figure with subplots for each quarter
fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(15, 10), sharex=True)

# Loop through each quarter and plot the actual and predicted values
for i, quarter in enumerate(quarters):
    # Select the test samples whose date falls in the current quarter, instead
    # of assuming every fourth sample belongs to the same quarter
    in_quarter = test_quarters == i + 1

    # Plot the actual and predicted values for the current quarter against
    # their real dates.
    # NOTE(review): the plotted target is column 0 of the model data — confirm
    # that this column is Rainfall, as the labels below state.
    axs[i].plot(test_dates[in_quarter], y_test[in_quarter], label='Actual')
    axs[i].plot(test_dates[in_quarter], y_pred[in_quarter], label='Predicted')
    axs[i].set_title(f'Quarterly Rainfall ({quarter})')
    axs[i].set_ylabel('Rainfall (mm)')

# Add a legend and x-axis label to the bottom plot
axs[-1].set_xlabel('Year')
axs[-1].legend()

# Adjust the spacing between subplots and save the figure
plt.tight_layout()
plt.savefig('actual_vs_predicted_rainfall.png')

