In [1]:
# Import required packages
import os
import datetime
import sys
import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from keras.preprocessing.sequence import TimeseriesGenerator  
from keras.models import Sequential
from sklearn.linear_model import LinearRegression
from keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler
from statistics import mean
from ast import literal_eval
from scipy.stats import sem
from matplotlib import container
import smogn
from imblearn.over_sampling import SMOTENC

ModuleNotFoundError: ignored

In [4]:
# Show the current working directory; the driver cell below derives its data and output paths from os.getcwd().
os.getcwd()

'/content'

In [None]:
"""
The code in this cell uses the functions defined below to test use case 2.
This is done by oversampling and setting runs 17 and 25 as the test runs.
"""
# NOTE: open_dataframe, training_sampler, label_maker, make_oversample_df and
# startPipeline are defined in later cells; run those cells before this one.

# Working directory WITH a trailing separator: the code below and the pipeline
# classes build paths by plain string concatenation (dir_path + dir_used + name),
# which silently produces e.g. "/contentdata/data.csv" without it.
dir_path = os.path.join(os.getcwd(), '')
# Load the dataframe from the csv file
df = open_dataframe(dir_path + "data/data.csv")
features =  ['blue_change', 'saturation_change', 'green_change', 'pressureE_change', 'hue_change',
              'luminance_percent_change', 'humidity_change', 'temperature', 'hydrazine_batches',
              'yeo-johnson_green_cumulative', 'best_temperature_std_dev_cumulative']
new_runs = ['Crocus-017', 'Crocus-025']
historic_runs = ['Crocus-014', 'Crocus-015', 'Crocus-016', 'Crocus-018', 'Crocus-019', 'Crocus-020',
     'Crocus-022', 'Crocus-023']
# Apply smogn to over/under sample the training runs
training_runs = training_sampler(df.loc[historic_runs], historic_runs)
# Give every repeated (oversampled) run a unique label: run, run_1, run_2, ...
run_labels = list(label_maker(training_runs))
# Initialise the variables that control the time ahead that is predicted, window size, and downsample rate
dir_used = 'use_case2/'
time_ahead = [1800, 3600, 7200]
time_used = 7200
downsample_rate = 90

# Make sure the output directory exists before anything is written into it
dir_check = dir_path + dir_used
os.makedirs(dir_check, exist_ok=True)
# make_oversample_df(df, infile, training_runs, new_runs, run_labels, dir): the base
# path is the second positional argument and must be passed explicitly.
make_oversample_df(df, dir_path, training_runs, new_runs, run_labels, dir_used)
df_filepath = os.path.join(dir_path, dir_used, "dataframe.csv")
# open_dataframe's only parameter is named `infile`, so pass the path positionally.
# The name `df` is deliberately reused for the oversampled dataframe.
df = open_dataframe(df_filepath)
# Start the pipeline used to predict product formation
a = startPipeline(dir_path, df, new_runs, run_labels, features, time_ahead, time_used, downsample_rate, dir_used, repeats=10)
a.pipeline()





In [None]:
class startPipeline:
  """Calls and links the two main predictive objects, predictValues and predictAhead.

  Args:
    dir_path: base directory as a string. Paths are built by concatenation, so
      it must end with a path separator.
    df: dataframe indexed by run id (crocus_id).
    new_runs: list of run ids used as test runs.
    historic_runs: list of (possibly oversample-labelled) run ids used for training.
    features: list of feature column names.
    time_ahead: list of look-ahead times to predict.
    time_used: window size passed to predictAhead as t1.
    downsample_rate: time between instances, passed to predictAhead as t3.
    dir_used: output sub-directory (string ending with a separator).
    repeats: number of times each look-ahead model is trained and evaluated.
  """
  def __init__(self, dir_path, df, new_runs, historic_runs, features, time_ahead, time_used, downsample_rate, dir_used, repeats=1):
    self.dir_path = dir_path
    self.df = df
    self.crocus_ls = sorted(set(df.index))
    self.new_runs = new_runs
    self.historic_runs = historic_runs
    self.features = features
    self.time_ahead = time_ahead
    self.time_used = time_used
    self.downsample_rate = downsample_rate
    # Plain concatenation: file names are later appended to this string directly
    self.base_dir = dir_path + dir_used
    self.repeats = repeats
    self.dir_used = dir_used

  def pipeline(self):
    """Run the instantaneous predictions, then the look-ahead predictions.

    Instantaneous predictions are obtained for all of the runs, including the
    training data, and saved to file. They are then added as a 'predictions'
    column and used as an extra feature when training the look-ahead models.
    One results csv is written per new run and per look-ahead time.
    """
    runs_ls = self.historic_runs + self.new_runs
    # Initialise the predictValues object for later use (needed when every run is already finished)
    self.a = predictValues(self.df, ['empty'], ['empty'], self.features, self.base_dir, runs_ls)
    # Collect the runs whose values have already been predicted
    lstm_file = self.base_dir + "predictions_lstm.csv"
    if os.path.isfile(lstm_file):
      preds_df = pd.read_csv(lstm_file, index_col='crocus_id')
      finished_ids = set(preds_df.index)
    else:
      finished_ids = set()

    for run in runs_ls:
      if run in finished_ids:
        # If a run has already been predicted, then skip
        continue
      # Remove the current test run (and its oversampled copies, "<run>_<n>") from the training set
      training_runs = [x for x in self.historic_runs if x.split('_')[0] != run.split('_')[0]]
      # Make product predictions for this run - these are saved to file and assigned later
      self.a = predictValues(self.df, run, training_runs, self.features, self.base_dir, runs_ls)
      self.a.get_predictions()

    # make_predictions_dataframe() returns the predictions run by run in SORTED
    # run order, so the rows must be selected in that same order; otherwise the
    # values are attached to the wrong runs. The copy avoids writing into a view of self.df.
    runs_df = self.df.loc[sorted(runs_ls)].copy()
    runs_df['predictions'] = self.a.make_predictions_dataframe()
    # Build a new feature list instead of appending to self.features, so the
    # caller's list is not mutated and re-running pipeline() does not add duplicates.
    ahead_features = list(self.features)
    if 'predictions' not in ahead_features:
      ahead_features.append('predictions')

    for new_run in self.new_runs:
      # get mean absolute error for the instantaneous predictions
      mae = error_at_t_zero(self.dir_path, self.dir_used, new_run)
      # save this value to results file
      new_df = pd.DataFrame({'new_runs': [new_run], 'historic_runs': [self.historic_runs], 'features': 'na', 'time_ahead': 0, 'time_used': 'na', 'downsample_rate': 'na', 'predictions': 'na', 'mean_err': mae, 'errors': 'na'})
      new_df.to_csv(self.base_dir + new_run + 'results0.csv')
      # Iterate through each time in the list of time aheads to predict
      for time_ahead in self.time_ahead:
        df_ls = []
        # Repeat according to specified number of repeats
        for _ in range(self.repeats):
          # Make and return the ahead predictions and their mean absolute errors
          ahead_predictions, mean_err = predictAhead(runs_df, self.time_used, time_ahead, self.downsample_rate, self.historic_runs, [new_run], ahead_features, self.base_dir).get_predictions()
          new_df = pd.DataFrame({'new_runs': [new_run], 'historic_runs': [self.historic_runs], 'features': [ahead_features], 'time_ahead': [time_ahead], 'time_used': [self.time_used], 'downsample_rate': [self.downsample_rate], 'predictions': [ahead_predictions], 'mean_err': [mean_err]})
          df_ls.append(new_df)
        # concatenate and save the results
        new_df = pd.concat(df_ls)
        new_df.to_csv(self.base_dir + new_run + 'results' + str(time_ahead) + '.csv')
        

In [None]:
class predictValues:
  """Predicts the current ("instantaneous") product value for one held-out run.

  A linear regression and an LSTM are trained on ``train_runs`` and applied to
  ``pred_run``. Their predictions are appended to ``predictions_lr.csv`` and
  ``predictions_lstm.csv`` under ``dir_base``; ``make_predictions_dataframe``
  later stitches both files together into ``all_predictions.csv``.

  Args:
    df: dataframe indexed by run id, containing 'time', 'Product' and every
      column named in ``features``.
    pred_run: id of the run to predict.
    train_runs: list of run ids to train on.
    features: list of feature column names.
    dir_base: output directory as a string; file names are appended by plain
      concatenation, so it must end with a path separator.
    all_runs: every run id that make_predictions_dataframe should include.
  """
  # LSTM look-back window in samples. The linear regression covers the first
  # LOOKBACK_SAMPLES rows of each run, for which the LSTM has no full window.
  LOOKBACK_SAMPLES = 20 * 6
  # Offset (in samples) between the end of a window and its label; 0 = predict the current value
  LOOKAHEAD_SAMPLES = 0 * 6
  # Training is repeated from scratch while the final epoch loss is above this value
  MAX_FINAL_LOSS = 10

  def __init__(self, df, pred_run, train_runs, features, dir_base, all_runs):
    self.df = df
    self.crocus_ls = sorted(set(df.index))
    self.pred_run = pred_run
    self.train_runs = train_runs
    self.runs_ls = self.train_runs + [self.pred_run]
    self.features = features
    self.base_dir = dir_base
    self.all_runs = all_runs

  def get_predictions(self):
    """This function calls all other functions in the correct order"""
    self.test_train_split()
    self.linear_regression()
    self.make_lstm_sequence()
    self.train_lstm()
    self.evaluate_model()

  def test_train_split(self):
    """Split the data into train and test sets and min-max scale the features."""
    self.train_df = self.df.loc[self.train_runs]
    # List selection keeps a DataFrame even when the run has a single row
    self.test_df = self.df.loc[[self.pred_run]]
    # One frame per TRAINING run. Grouping self.train_df (rather than a global
    # dataframe holding every run) keeps the held-out run out of LSTM training.
    self.training_dfs = [run_df for _, run_df in self.train_df.groupby(level=0, sort=True)]
    # get x and y
    self.x_train = self.train_df[self.features]
    self.y_train = np.array(self.train_df['Product'])
    self.x_test = self.test_df[self.features]
    self.y_test = np.array(self.test_df['Product'])
    # scale the x data; the scaler is fitted on the training data only
    ss = MinMaxScaler()
    self.x_train_scaled = np.array(ss.fit_transform(self.x_train))
    self.x_test_scaled = np.array(ss.transform(self.x_test))

  def _append_predictions(self, filename, new_df):
    """Append new_df to the csv `filename` in base_dir, creating the file if needed."""
    pred_file = self.base_dir + filename
    if os.path.isfile(pred_file):
      new_df = pd.concat([pd.read_csv(pred_file, index_col=0), new_df])
    new_df.to_csv(pred_file)

  def linear_regression(self):
    """Fit a linear regression and save its predictions for the test run.

    The NN requires 20 minutes of data, so linear regression is used for the
    first 20 minutes of each run. Predictions for the whole test run are saved
    to predictions_lr.csv.
    """
    regressor = LinearRegression(fit_intercept=True)
    regressor.fit(self.x_train_scaled, self.y_train)
    self.y_pred = regressor.predict(self.x_test_scaled)
    new_df = pd.DataFrame({
      'crocus_id': [self.pred_run] * len(self.y_pred),
      'time': self.test_df['time'].to_numpy(),
      'predictions': self.y_pred,
    })
    self._append_predictions('predictions_lr.csv', new_df)

  def make_lstm_sequence(self):
    """Scale the data and build the sliding-window sequences for the LSTM.

    Overwrites self.x_test / self.y_test / self.y_train with windowed arrays;
    self.x_train stays the unwindowed feature frame and the windowed training
    input is stored in self.X_train.
    """
    sc = MinMaxScaler(feature_range=(0, 1))
    sc.fit(self.x_train)
    self.lookback_window_samples = self.LOOKBACK_SAMPLES
    self.lookahead_label_offset = self.LOOKAHEAD_SAMPLES
    X_test, y_test = sequence_creator(self.test_df, self.lookback_window_samples, sc, self.features, self.lookahead_label_offset)
    self.x_test, self.y_test = np.array(X_test), np.array(y_test)
    X_train = []
    y_train = []
    # Windows are built per run so that no window spans two different runs
    for single_training_run in self.training_dfs:
      xt, yt = sequence_creator(single_training_run, self.lookback_window_samples, sc, self.features, self.lookahead_label_offset)
      X_train.extend(xt)
      y_train.extend(yt)

    self.X_train = np.asarray(X_train)
    self.y_train = np.asarray(y_train)

  def train_lstm(self):
    """Train the LSTM, restarting from a fresh model while the final loss is too high.

    NOTE(review): there is no cap on the number of restarts, so this loops
    forever if the loss never drops to MAX_FINAL_LOSS or below.
    """
    final_loss = float('inf')
    while final_loss > self.MAX_FINAL_LOSS:
      self.regressor = Sequential()
      self.regressor.add(
        LSTM(50, activation='relu', return_sequences=True, input_shape=(self.lookback_window_samples, self.x_train.shape[1])))
      self.regressor.add(LSTM(50, activation='relu', return_sequences=True))
      self.regressor.add(LSTM(50, activation='sigmoid', return_sequences=False))
      self.regressor.add(Dense(50))
      self.regressor.add(Dropout(0.2))
      self.regressor.add(Dense(1))
      self.regressor.compile(optimizer='adam', loss='mean_squared_error')
      history = self.regressor.fit(self.X_train, self.y_train, epochs=10, batch_size=128, shuffle=True)
      final_loss = history.history['loss'][-1]
      # `nan > threshold` is False and would end the loop, so treat NaN as a failed run
      if np.isnan(final_loss):
        final_loss = float('inf')

  def evaluate_model(self):
    """Predict the test run with the trained LSTM, save the predictions and store the MAE in self.mae."""
    self.predicted_yield = self.regressor.predict(self.x_test).reshape(-1)
    # The first lookback window has no LSTM prediction, so the time axis starts after it
    time_val = self.test_df['time'].iloc[self.lookback_window_samples:].to_numpy()
    new_df = pd.DataFrame({
      'crocus_id': [self.pred_run] * len(self.predicted_yield),
      'time': time_val,
      'predictions': self.predicted_yield,
    })
    self._append_predictions('predictions_lstm.csv', new_df)
    self.mae = np.mean(np.abs(self.predicted_yield - self.y_test))

  def make_predictions_dataframe(self):
    """Combine the saved LR and LSTM predictions for every run in all_runs.

    For each run the first LOOKBACK_SAMPLES values come from
    predictions_lr.csv and the remainder from predictions_lstm.csv. The result
    is written to all_predictions.csv.

    Returns:
      1-D numpy array of predictions, concatenated run by run in SORTED run
      order. Callers must select their rows in that same order.
    """
    lr_df = pd.read_csv(self.base_dir + 'predictions_lr.csv', header=0, index_col='crocus_id')
    lstm_df = pd.read_csv(self.base_dir + 'predictions_lstm.csv', header=0, index_col='crocus_id')
    df_ls = []
    for run in sorted(self.all_runs):
      run_preds = list(lr_df.loc[[run], 'predictions'].iloc[:self.LOOKBACK_SAMPLES])
      run_preds.extend(lstm_df.loc[[run], 'predictions'])
      df_ls.append(pd.DataFrame({'crocus_id': [run] * len(run_preds), 'predictions': run_preds}))

    save_df = pd.concat(df_ls)
    save_df.to_csv(self.base_dir + 'all_predictions.csv')
    return save_df['predictions'].values



In [None]:
class predictAhead:
  """
  This class contains all the functions and variables needed to predict future product values.

  Args:
    df: dataframe indexed by run id, containing 'time', 'Product' and every
      column named in ``features``.
    t1: the time to be used (window size).
    t2: the time ahead to predict.
    t3: the time between instances (downsample rate).
    training_runs: list of run ids to train on.
    test_runs: list of run ids to evaluate on.
    features: list of feature column names.
    base_dir: output directory (stored; not read by the methods of this class).
  """
  # Runs that are much longer than the others; only their rows after
  # LONG_RUN_START are kept. Oversampled copies ("<run>_<n>") are matched too.
  LONG_RUNS = ('Crocus-023', 'Crocus-025')
  LONG_RUN_START = 7200
  # Training is repeated from scratch while the final epoch loss is above this value
  MAX_FINAL_LOSS = 10

  def __init__(self, df, t1:int, t2:int, t3:int, training_runs:list, test_runs:list, features:list, base_dir):
    self.df = df
    self.t1 = t1  # window size
    self.t2 = t2  # time ahead to predict
    self.t3 = t3  # downsample rate
    self.training_runs = training_runs
    self.test_runs = test_runs
    self.features = features
    self.crocus_ls = sorted(set(df.index))
    self.base_dir = base_dir

  def _seq_length(self):
    """Number of instances in one input window."""
    return int(self.t1 / self.t3 - 1)

  def get_predictions(self):
    """Run every step in the correct order.

    Returns:
      (predictions, mean absolute error) for the test runs.
    """
    # The original "already predicted" check compared a list against a list of
    # run-id strings, so it was always False; it has been removed. make_model()
    # both builds and fits the model, so there is no separate fit step.
    self.prepare_dataframe()
    self.split_windows()
    self.fix_arrays()
    self.make_model()
    self.evaluate_model()
    return self.preds, self.mean_err

  def prepare_dataframe(self):
    """Downsample the data, rebase each run's time to zero and equalise run lengths."""
    # Keep every (t3 / 10)-th row; never let the step fall to 0
    step = max(1, int(self.t3 / 10))
    df = self.df.iloc[::step, :].copy()
    # Make every run's time start at zero. Assigning the whole column (instead
    # of chained indexing on df.loc[run]) ensures the change is actually stored.
    run_start = df.groupby(level=0)['time'].transform('first')
    df['time'] = df['time'].to_numpy() - run_start.to_numpy()
    # Make all long runs start at LONG_RUN_START
    run_frames = []
    for run in self.crocus_ls:
      run_df = df.loc[[run]]
      if run.split('_')[0] in self.LONG_RUNS:
        run_df = run_df[run_df['time'] > self.LONG_RUN_START]
      run_frames.append(run_df)
    # Make all runs the same length by truncating to the shortest one
    shortest_length = min(len(run_df) for run_df in run_frames)
    self.df = pd.concat([run_df.iloc[:shortest_length] for run_df in run_frames])

  def _make_windows(self, frame, runs, seq_length, ahead):
    """Build sliding windows of features and the 'Product' windows shifted `ahead` instances later."""
    x_windows, y_windows = [], []
    for run in runs:
      # Look the run up once instead of once per window
      run_df = frame.loc[[run]]
      feature_values = run_df[self.features].to_numpy()
      product_values = run_df['Product'].to_numpy()
      for i in range(len(run_df) - seq_length - ahead - 1):
        x_windows.append(feature_values[i:i + seq_length])
        y_windows.append(product_values[i + ahead:i + seq_length + ahead])
    return x_windows, y_windows

  def split_windows(self):
    """Split into train and test dataframes and build the sliding windows for both."""
    self.train_df = self.df.loc[self.training_runs]
    self.test_df = self.df.loc[self.test_runs]
    self.train_crocus_ls = sorted(set(self.train_df.index))
    seq_length = self._seq_length()  # instances to use
    ahead = int(self.t2 / self.t3)   # how many instances ahead to predict
    self.x_train, self.y_train = self._make_windows(self.train_df, self.training_runs, seq_length, ahead)
    self.x_test, self.y_test = self._make_windows(self.test_df, self.test_runs, seq_length, ahead)

  def fix_arrays(self):
    """Scale and reshape into arrays suitable for input into the ML model."""
    seq_length = self._seq_length()
    n_features = len(self.features)
    x_train, y_train = np.array(self.x_train), np.array(self.y_train)
    x_test, y_test = np.array(self.x_test), np.array(self.y_test)
    # scale: each window is flattened to one row; the scaler is fitted on the training windows only
    x_scaler = MinMaxScaler()
    x_train = x_scaler.fit_transform(x_train.reshape(x_train.shape[0], -1))
    x_test = x_scaler.transform(x_test.reshape(x_test.shape[0], -1))
    # reshape back to (windows, seq_length, features); targets get a trailing axis of size 1
    self.x_train = x_train.reshape(x_train.shape[0], seq_length, n_features)
    self.y_train = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)
    self.x_test = x_test.reshape(x_test.shape[0], seq_length, n_features)
    self.y_test = y_test.reshape(y_test.shape[0], y_test.shape[1], 1)

  def _build_model(self):
    """Return a freshly initialised, compiled LSTM model."""
    model = Sequential()
    model.add(
      LSTM(50, activation='relu', return_sequences=True, input_shape=(self._seq_length(), len(self.features))))
    model.add(LSTM(50, activation='relu', return_sequences=True))
    model.add(LSTM(50, activation='sigmoid', return_sequences=False))
    model.add(Dense(50))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

  def make_model(self):
    """Define and train the model, restarting from scratch while the final loss is too high.

    NOTE(review): the model emits one value per window while y_train holds a
    whole window of targets, and evaluate_model scores against the last target
    only - confirm the training target is the intended one.
    NOTE(review): there is no cap on the number of restarts.
    """
    final_loss = float('inf')
    while final_loss > self.MAX_FINAL_LOSS:
      self.model = self._build_model()
      self.model.summary()
      history = self.model.fit(self.x_train, self.y_train, epochs=20, verbose=True)
      final_loss = history.history['loss'][-1]
      # `nan > threshold` is False and would end the loop, so treat NaN as a failed run
      if np.isnan(final_loss):
        final_loss = float('inf')

  def evaluate_model(self):
    """Make predictions and evaluate the model by mean absolute error.

    Each prediction is compared with the LAST value of its target window.
    Stores the raw predictions in self.preds and the error in self.mean_err.
    """
    preds = self.model.predict(self.x_test)
    self.preds = preds
    last_actual = np.asarray(self.y_test)[:, -1, 0]
    abs_errors = np.abs(last_actual - np.asarray(preds).reshape(-1))
    self.mean_err = float(np.mean(abs_errors))


In [None]:
def open_dataframe(infile):
  """Read the csv at `infile` into a dataframe indexed by 'crocus_id'.

  The first row is used as the header and the 'datetime' column is parsed as dates.
  """
  csv_path = os.fspath(infile)
  return pd.read_csv(csv_path, header=0, index_col='crocus_id', parse_dates=["datetime"])

In [None]:
def sequence_creator(xd, sample_lookback, sc, features, label_lookahead=0):
  """Build sliding-window inputs and their labels for one run.

  Args:
    xd: dataframe of a single run containing the `features` columns and 'Product'.
    sample_lookback: number of rows in each input window.
    sc: fitted scaler exposing `transform`; applied to the feature columns.
    features: list of feature column names.
    label_lookahead: offset in rows between the end of a window and its label.

  Returns:
    (x_runs, y_runs): a list of scaled windows covering rows [i - sample_lookback, i)
    and a list of the 'Product' values at row i + label_lookahead. Both lists are
    empty when the run is too short for a single window.
  """
  window_ends = range(sample_lookback, len(xd) - label_lookahead)
  if len(window_ends) == 0:
    return [], []
  # Scale the whole run once and slice windows out of the result, instead of
  # re-scaling the same rows for every overlapping window.
  scaled = sc.transform(xd[features])
  products = xd["Product"]
  x_runs = [scaled[i - sample_lookback:i] for i in window_ends]
  y_runs = [products.iloc[i + label_lookahead] for i in window_ends]
  return x_runs, y_runs




In [None]:
def error_at_t_zero(path_dir, base_dir, run):
  """Get the mean absolute error of the instantaneous (t=0) predictions for one run.

  Compares the 'predictions' saved in all_predictions.csv with the 'Product'
  column of dataframe.csv; both files are read from path_dir + base_dir.

  Args:
    path_dir: base directory as a string ending with a path separator.
    base_dir: output sub-directory as a string ending with a path separator.
    run: run id; any "_<n>" oversampling suffix is stripped first.

  Returns:
    The mean absolute error over the whole run, as a float.

  Raises:
    ValueError: if the numbers of predicted and measured values differ.
  """
  run = run.split('_')[0]
  out_dir = path_dir + base_dir
  pred_df = pd.read_csv(out_dir + "all_predictions.csv", index_col='crocus_id')
  predicted = pred_df.loc[[run], 'predictions'].to_numpy()
  # Measured values come from the dataframe the predictions were generated from,
  # which make_oversample_df saves next to them as dataframe.csv.
  data_df = pd.read_csv(out_dir + "dataframe.csv", index_col='crocus_id')
  actual = data_df.loc[[run], 'Product'].to_numpy()
  if len(predicted) != len(actual):
    raise ValueError(
      "Run %s has %d predictions but %d measured values" % (run, len(predicted), len(actual)))
  # Compare position by position on plain arrays: every row of a run shares the
  # same index label, so label-based alignment is not meaningful here.
  return float(np.mean(np.abs(predicted - actual)))


 Errors for Crocus-025 in 25_test/: 
 LSTM: 0.670349974186891 
 Linear regression: 3.2661723757158936 
 Overall: 0.776699816516519 




0.776699816516519

In [None]:
def label_maker(training_runs):
  """Yield a unique label for every entry of training_runs.

  The first occurrence of a run is yielded unchanged; the n-th repeat is
  yielded as "<run>_<n>" (n counted from 1).
  """
  repeat_count = {}
  for run in training_runs:
    previous = repeat_count.get(run)
    if previous is None:
      # first time this run is seen: keep its name as it is
      repeat_count[run] = 0
      yield run
      continue
    repeat_count[run] = previous + 1
    yield "%s_%d" % (run, repeat_count[run])

In [None]:
def training_sampler(df, historic_runs):
  """Select runs for over/undersampling using smogn.

  Builds one summary row per run (final and first 'Product' value and final
  'green_cumulative' value), passes the summary table to smogn.smoter with
  'Product' as the target, and returns the sorted 'crocus_id' column of the result.
  """
  summary_rows = []
  for run in historic_runs:
    run_df = df.loc[run]
    product = run_df['Product']
    summary_rows.append({
      'crocus_id': run,
      'Product': product.iloc[-1],
      'Start_product': product.iloc[0],
      'green': run_df['green_cumulative'].iloc[-1],
    })
  summary_df = pd.DataFrame(summary_rows)
  smogn_output = smogn.smoter(
      data=summary_df,
      y='Product',
      rel_thres=0.1,
      rel_coef=0.75,
      rel_xtrm_type='low',
      rel_method='auto',
      replace=True
  )
  return sorted(smogn_output['crocus_id'])

In [None]:
def make_oversample_df(df, infile, training_runs, new_runs, run_labels, dir):
  """Build and save the dataframe used for over/undersampled training.

  Every entry of training_runs is copied from df and re-indexed with the
  matching entry of run_labels, so a run that was sampled several times appears
  once per label. The new runs are appended under their own ids. The result is
  written to <infile>/<dir>/dataframe.csv with the index named 'crocus_id'.

  Args:
    df: source dataframe indexed by run id.
    infile: base directory the output sub-directory lives in.
    training_runs: run ids to copy, repeats allowed.
    new_runs: run ids to append unchanged.
    run_labels: one unique label per entry of training_runs.
    dir: output sub-directory (the name shadows the builtin but is kept for callers).
  """
  relabelled = []
  labelled_runs = list(zip(training_runs, run_labels)) + [(run, run) for run in new_runs]
  for run, label in labelled_runs:
    # List selection always yields a DataFrame, even for a single-row run;
    # the copy keeps the relabelling from touching df.
    run_df = df.loc[[run]].copy()
    run_df.index = [label] * len(run_df)
    relabelled.append(run_df)
  new_df = pd.concat(relabelled).rename_axis('crocus_id')
  # Use the `dir` argument rather than a notebook-level global
  df_filepath = os.path.join(infile, dir, "dataframe.csv")
  os.makedirs(os.path.dirname(df_filepath), exist_ok=True)
  new_df.to_csv(df_filepath)

