In [None]:
# imports all dependencies
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from pandas._libs.tslibs.timestamps import Timestamp
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [None]:
## Data initialisation. The `use` argument selects the mode: 0 = raw, 1 = autoregressive, ...

class data_functions:
  """Load a flux CSV and prepare it for plotting or autoregressive modelling.

  The CSV must contain a 'Date' column (parsed to datetimes and used as the
  index) and an 'Average_Flux' column.

  Args:
    file_path: Path of the CSV file to read.
    use: Preparation mode. 0 keeps the raw values. 1 builds the
      autoregressive table: shifted copies of 'Average_Flux' are added, the
      flux columns are replaced by log10(|value| + 1e-10) and every column
      name receives a 'log_' prefix.
    back_log: Only used when use == 1. Lag columns are created for shifts of
      1 .. back_log - 1 rows, so the default of 11 yields 10 lag columns.

  Attributes:
    file_path: The path that was read.
    use: The mode that was requested.
    df: The prepared DataFrame, indexed by 'Date', with every row that
      contains a missing value removed.
  """

  # Offset added before taking the logarithm so a zero flux does not give -inf.
  LOG_EPSILON = 1e-10

  def __init__(self, file_path, use, back_log=11):
    self.file_path = file_path
    self.use = use

    # Read the CSV and index it by the parsed 'Date' column.
    frame = pd.read_csv(self.file_path)
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame = frame.set_index('Date')

    if self.use == 1:
      frame = self._add_log_lags(frame, back_log)

    # The first back_log - 1 rows have no complete lag history (shift() fills
    # them with NaN); they are removed here together with any other row that
    # has a missing value.
    self.df = frame.dropna()

  def _add_log_lags(self, frame, back_log):
    """Return frame with lag columns added, flux columns logged and 'log_' prefixes."""
    lag_columns = {
        f'Average_Flux_{i}_days_ago': frame['Average_Flux'].shift(i)
        for i in range(1, back_log)
    }
    frame = frame.assign(**lag_columns)

    # Vectorised log10(|value| + eps) over the current value and all lags.
    flux_columns = ['Average_Flux', *lag_columns]
    frame[flux_columns] = np.log10(frame[flux_columns].abs() + self.LOG_EPSILON)

    # Every column of the frame receives the prefix, as before.
    return frame.add_prefix('log_')

  def graph(self):
    """Plot the flux series as a line chart and print its maximum and minimum.

    Plots 'Average_Flux' when use == 0 and 'log_Average_Flux' when use == 1;
    any other mode does nothing. The extremes are also stored in
    self.max_value and self.min_value.
    """
    column_by_use = {0: 'Average_Flux', 1: 'log_Average_Flux'}
    column = column_by_use.get(self.use)
    if column is None:
      return

    series = self.df[column]
    series.plot(kind='line', figsize=(16, 8), title=column, color='black')
    self.max_value = series.max()
    self.min_value = series.min()
    print(self.max_value, self.min_value)

  def split(self):
    """Split the prepared frame into features and target.

    Returns:
      (0, 0) when use == 0, because the raw frame has no log target column.
      Otherwise (X, y), where y is the 'log_Average_Flux' column and X is
      every other column; both are also stored on the instance.
    """
    if self.use == 0:
      return 0, 0

    self.X = self.df.drop(columns='log_Average_Flux')
    self.y = self.df['log_Average_Flux']
    return self.X, self.y

In [None]:
## Model: __init__ sets the parameters and builds the network; train() fits it.
class NN_model:
  """Small dense regression network with training, plotting and saving helpers.

  Args:
    opt: Optimizer choice. 1 selects Adam; any other value selects SGD.
    learn_rate: Learning rate handed to the chosen optimizer.
    dataframe: Full prepared DataFrame. A min-max scaled copy is stored in
      self.dataframe; the frame passed in is not modified.
    X: Feature table. It is split and trained on exactly as passed.
    y: Target values, split alongside X. The plotting, validation and saving
      helpers expect the target column to be named 'log_Average_Flux' and to
      carry a datetime index.

  Attributes:
    scaler: MinMaxScaler fitted on every column of dataframe.
    X_train, X_test, y_train, y_test: 80/20 split of X and y.
    model: The compiled Keras model.
    y_pred: Test-set predictions, available after predict().
    df_graph: Date-sorted frame of actual and predicted values, built by
      graph(), validate() and save().
  """

  # Default output directory used by save().
  RESULTS_DIR = '/content/drive/MyDrive/4th year/diss/models and results/'

  def __init__(self, opt, learn_rate, dataframe, X, y):
    self.scaler = MinMaxScaler()
    self.X = X
    self.y = y

    # MinMaxScaler rescales each column independently to [0, 1], so a single
    # fit over the whole frame gives the same values as scaling column by
    # column, and leaves the scaler holding the parameters of every column.
    # The result is written to a new frame so the caller's data is untouched.
    scaled_values = self.scaler.fit_transform(dataframe)
    self.dataframe = pd.DataFrame(
        scaled_values, index=dataframe.index, columns=dataframe.columns)
    # NOTE(review): X and y are not rescaled here, so the network is trained
    # on them as passed. Confirm whether scaled inputs were intended.

    # Fix the TensorFlow seed so weight initialisation is repeatable.
    random_seed = 30
    tf.random.set_seed(random_seed)

    # Split into training and testing sets.
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        self.X, self.y, test_size=0.2, random_state=42)

    # Two hidden layers of 10 ReLU units and a linear single-value output.
    # The input shape is given as a tuple (one entry: the feature count).
    n_features = self.X_train.shape[1]
    self.model = tf.keras.Sequential([
        tf.keras.Input(shape=(n_features,)),
        tf.keras.layers.Dense(10, activation="relu"),
        tf.keras.layers.Dense(10, activation="relu"),
        tf.keras.layers.Dense(1, activation=None)
    ])

    self.learning_rate = learn_rate

    # Optimizer selection: 1 = Adam, anything else = stochastic gradient descent.
    if opt == 1:
      self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
    else:
      self.optimizer = tf.keras.optimizers.SGD(learning_rate=self.learning_rate)

    # Mean absolute error is both the training loss and the reported metric.
    self.model.compile(loss="mae", optimizer=self.optimizer, metrics=["mae"])

  def train(self, epochs=400, batch_size=150):
    """Fit the model, evaluate it on the test split and plot the loss curves.

    Args:
      epochs: Number of training epochs.
      batch_size: Number of samples per gradient update.

    The test split is also used as the validation data during fitting. The
    fit history is stored in self.history and the evaluation result in
    self.loss.
    """
    self.history = self.model.fit(
        self.X_train, self.y_train,
        epochs=epochs, batch_size=batch_size,
        validation_data=(self.X_test, self.y_test), shuffle=True)
    self.loss = self.model.evaluate(self.X_test, self.y_test)

    plt.plot(self.history.history['loss'], label='Training Loss')
    plt.plot(self.history.history['val_loss'], label='Validation Loss', linestyle='--')
    plt.title('Loss Over Time')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

  def predict(self):
    """Predict the test split and store the result in self.y_pred."""
    self.y_pred = self.model.predict(self.X_test)

  def _ensure_predictions(self):
    """Run predict() if no predictions have been stored yet."""
    if not hasattr(self, 'y_pred'):
      self.predict()

  def _build_results(self):
    """Build self.df_graph: actual and predicted values, sorted by the index."""
    self._ensure_predictions()
    results = pd.DataFrame(self.y_test)
    # Flatten so a column-shaped prediction array becomes one value per row.
    results['y_prediction'] = np.asarray(self.y_pred).ravel()
    self.df_graph = results.sort_index()
    return self.df_graph

  @staticmethod
  def _bound_for_index(bound, index):
    """Return bound as a Timestamp that can be compared with index.

    A naive bound is treated as UTC when the index is timezone-aware, and a
    timezone on the bound is dropped when the index is naive; mixing aware
    and naive values in a comparison raises a TypeError.
    """
    stamp = pd.Timestamp(bound)
    if getattr(index, 'tz', None) is None:
      return stamp.tz_localize(None) if stamp.tzinfo is not None else stamp
    return stamp.tz_localize('UTC') if stamp.tzinfo is None else stamp

  def graph(self, start_date='1900-01-01', end_date='2020-12-31'):
    """Plot actual against predicted log average flux over a date range.

    Args:
      start_date: First date to include (anything pd.Timestamp accepts).
      end_date: Last date to include. To plot everything, pass bounds that
        lie outside the data.

    The dates in the title are clamped to the first and last date present.
    """
    results = self._build_results()
    index = results.index
    start = self._bound_for_index(start_date, index)
    end = self._bound_for_index(end_date, index)

    # Title shows the range actually covered by the data.
    start_date_str = max(start, index[0]).strftime('%Y-%m-%d')
    end_date_str = min(end, index[-1]).strftime('%Y-%m-%d')

    # Keep only the rows inside the requested range.
    df_subset = results[(index >= start) & (index <= end)]

    plt.figure(figsize=(20, 12))
    plt.plot(df_subset.index, df_subset['log_Average_Flux'], label='Real Flux', color='black')
    plt.plot(df_subset.index, df_subset['y_prediction'], label='Flux prediction', linestyle='--', color='red', alpha=0.8)
    plt.xlabel('Date')
    plt.ylabel('Log Average Flux')
    plt.title('Log Average Flux and Flux Prediction between: ' + start_date_str + ' and ' + end_date_str)
    plt.legend()
    plt.show()

  def scatter_graph(self):
    """Scatter-plot the actual test values against the predicted values."""
    self._ensure_predictions()
    plt.scatter(self.y_test, self.y_pred, color='green', marker='x')
    plt.xlabel('Log Actual Values')
    plt.ylabel('Log Predicted Values')
    plt.title('Log Average Actual vs Predicted Values')
    plt.show()

  def validate(self):
    """Print and return PE = 1 - MSE / variance for the test predictions.

    The variance is that of the actual values as computed by Series.var().
    """
    results = self._build_results()
    mse = mean_squared_error(results['log_Average_Flux'], results['y_prediction'])
    variance = results['log_Average_Flux'].var()
    PE = 1 - (mse / variance)
    print(PE)
    return PE

  def save(self, results_file_name, model_file_name, base_dir=None):
    """Write the results table to CSV and save the trained model.

    Args:
      results_file_name: File name (without '.csv') for the results table.
      model_file_name: Folder name for the model, created inside the
        'saved models' sub-folder of base_dir.
      base_dir: Output directory; defaults to RESULTS_DIR.
    """
    if base_dir is None:
      base_dir = self.RESULTS_DIR

    results = self._build_results()
    final_file_path = os.path.join(base_dir, results_file_name + '.csv')
    results.to_csv(final_file_path, index=True)

    # Create the model folder (and missing parents); an existing folder is fine.
    folder_path = os.path.join(base_dir, 'saved models', model_file_name)
    os.makedirs(folder_path, exist_ok=True)
    # NOTE(review): newer Keras releases may require a '.keras' or '.h5'
    # file name here instead of a folder - confirm against the installed version.
    self.model.save(folder_path)
    #print(final_file_path)



In [None]:
# Input CSV path and run settings.
FLUX_CSV_PATH = '/content/drive/MyDrive/4th year/diss/Data_for_FYP/Energetic particle sensor/GOES  9, 10, 11 /full/G9_10_11_1995-2010.csv'
USE_AUTOREGRESSIVE = 1   # data_functions mode: 1 = autoregressive
OPTIMIZER_ADAM = 1       # NN_model optimizer flag: 1 = Adam
LEARNING_RATE = 0.0001

# Load and prepare the data, then separate features from the target.
data = data_functions(FLUX_CSV_PATH, USE_AUTOREGRESSIVE)
df_ave = data.df
X, y = data.split()
# Optional: data.graph() plots the prepared series.

# Build the model, fit it and predict the test split.
m = NN_model(OPTIMIZER_ADAM, LEARNING_RATE, df_ave, X, y)
m.train()
m.predict()

In [None]:
# Plot actual against predicted values, then write the results and the model to disk.
m.graph()
m.save('results 1', 'model 1')

In [None]:
# Scatter plot of actual against predicted test values.
m.scatter_graph()