<a href="https://colab.research.google.com/github/Jeffrey-Ede/Miscellaneous/blob/master/intellisense.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Solution to Mining Problem

In [1]:
# #Install auto-sklearn with build dependencies

!sudo apt-get install build-essential swig 
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install 
!pip install auto-sklearn==0.10.0

Reading package lists... Done
Building dependency tree       
Reading state information... Done
build-essential is already the newest version (12.4ubuntu1).
swig is already the newest version (3.0.12-1).
0 upgraded, 0 newly installed, 0 to remove and 13 not upgraded.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   207  100   207    0     0    605      0 --:--:-- --:--:-- --:--:--   605
Collecting scikit-learn<0.25.0,>=0.24.0
  Using cached https://files.pythonhosted.org/packages/e2/4c/6111b9a325f29527d7f262e2ee8c730d354b47a728d955e186dacad57a0d/scikit_learn-0.24.1-cp36-cp36m-manylinux2010_x86_64.whl
[31mERROR: auto-sklearn 0.10.0 has requirement scikit-learn<0.23,>=0.22.0, but you'll have scikit-learn 0.24.1 which is incompatible.[0m
Installing collected packages: scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.

In [2]:
#Libraries that are likely to be useful

import numpy as np
import pandas as pd
import scipy

import tensorflow as tf

import os
import itertools

In [3]:
#Download data from my Google Drive (probably best if I keep a copy as I don't control repo access)
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1IM9LTwud70QjHGlNQDbbCl7DSaTg88Hc' -O intellisense.zip && mkdir -p ~/data/intellisense

#Decompress datasets and other files
import zipfile

with zipfile.ZipFile('intellisense.zip', 'r') as zip_ref:
  zip_ref.extractall('intellisense')

#Specify dataset files
data_loc = "intellisense/testproject.dst.sag.lite-master/data/"
train_filename = data_loc + "sag_data_train.csv"
test_filename = data_loc + "sag_data_test.csv"

--2021-02-01 14:14:33--  https://docs.google.com/uc?export=download&id=1IM9LTwud70QjHGlNQDbbCl7DSaTg88Hc
Resolving docs.google.com (docs.google.com)... 64.233.188.102, 64.233.188.139, 64.233.188.113, ...
Connecting to docs.google.com (docs.google.com)|64.233.188.102|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-10-a8-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/uo52jr9la7cdudli7bb4givopli0bure/1612188825000/14151923763028593793/*/1IM9LTwud70QjHGlNQDbbCl7DSaTg88Hc?e=download [following]
--2021-02-01 14:14:36--  https://doc-10-a8-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/uo52jr9la7cdudli7bb4givopli0bure/1612188825000/14151923763028593793/*/1IM9LTwud70QjHGlNQDbbCl7DSaTg88Hc?e=download
Resolving doc-10-a8-docs.googleusercontent.com (doc-10-a8-docs.googleusercontent.com)... 108.177.125.132, 2404:6800:4008:c01::84
Connecting to doc-10-a8-docs.googleusercontent.com (doc-10-a8-

In [22]:
NUM_ADVANCE_ROWs = 5 #Predict 5 min in advance
NUM_INPUT_ROWS = 16 #Use 16 min of data

NUM_OUTPUT_COLS = 2 #Outputs are first two columns after the times column

min_chunk_rows = NUM_INPUT_ROWS + NUM_ADVANCE_ROWs

from sklearn.preprocessing import QuantileTransformer

#Load and preprocess data
def load_data(data_filename, qts=[]):

  is_qts = len(qts)
  
  #Load data from file
  df = pd.read_csv(data_filename, header=0)
  headers = list(df.columns.values)
  data = np.float32(df.values[:, 1:]) #Strip time column 

  #Strip any columns that are not finite or have very small values
  bad_row_idxs = [i for i, x in enumerate(data) if np.sum(1 - np.isfinite(x)) != 0 or np.sum(x < 0.01) != 0]


  #Can only create input data where there are chunks of min_chunk_rows contiguous rows
  good_data_inputs = []
  good_data_outputs = []
  prev_bad_idx = -1
  for i in bad_row_idxs:
    if i - prev_bad_idx > min_chunk_rows:
      good_data_chunk = data[prev_bad_idx+1: i]
      good_data_chunk = np.stack([good_data_chunk[j:-min_chunk_rows+j] for j in range(min_chunk_rows)], axis=-1)

      good_data_inputs.append( good_data_chunk[:,:,:NUM_INPUT_ROWS] )
      good_data_outputs.append( good_data_chunk[:,:NUM_OUTPUT_COLS,-1] )

    prev_bad_idx = i

  good_data_inputs = np.concatenate(tuple(good_data_inputs), axis=0)
  good_data_outputs = np.concatenate(tuple(good_data_outputs), axis=0)

  #Quantile normalization usually helps to avoid issues caused by with outliers
  if not is_qts:
    qts = []
  for i in range(data.shape[1]):
    if is_qts:
      qt = qts[i]
    else:
      qt = QuantileTransformer(random_state=0)
      qt.fit_transform(good_data_inputs[:,i:i+1,0])
      qts.append(qt)

    good_data_inputs[:,i:i+1] = np.stack([qt.transform(good_data_inputs[:,i:i+1,j]) for j in range(NUM_INPUT_ROWS)], axis=-1)

    if i < NUM_OUTPUT_COLS:
      good_data_outputs[:,i:i+1] = qt.transform(good_data_outputs[:,i:i+1])

  #Easiest to flatten after fitting quantile normalization
  good_data_inputs = good_data_inputs.flatten().reshape(len(good_data_inputs), -1)

  if not is_qts:
    return good_data_inputs, good_data_outputs, qts
  else:
    return good_data_inputs, good_data_outputs


train_x, train_y, qts = load_data(train_filename)
test_x, test_y = load_data(test_filename, qts=qts)

print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)

(130997, 96) (130997, 2)
(26441, 96) (26441, 2)


In [23]:
TRAIN_TIME = 60 #Training time in min for each regressor

from autosklearn.regression import AutoSklearnRegressor


#Train separate regressors for each output to avoid any issues associated with different
#error scales in multiregression
for i, qt in enumerate(qts[:2]):

  #Fit regression model to data using automatic machine learning
  automl = AutoSklearnRegressor(time_left_for_this_task=int(60*TRAIN_TIME), per_run_time_limit=int(0.1*60*TRAIN_TIME), n_jobs=-1)

  automl.fit(train_x, train_y[:,i:i+1])
  print(automl.show_models())

  pred_test_y = automl.predict(test_x)

  print(pred_test_y.shape)

  #Print RMS performance
  rms = np.sqrt(np.mean((
      qt.inverse_transform(np.expand_dims(pred_test_y, -1)) - qt.inverse_transform(test_y[:,i:i+1])
  )**2))

  if i == 0:
    print("Bearing Pressure RMS:", rms, "kPa")
  elif i == 1:
    print("Power Draw RMS:", rms, "MW")



  self.InputValidator.validate_target(y)
  y = self.validate_target(y, is_classification)


[(0.420000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'select_rates_regression', 'regressor:__choice__': 'gradient_boosting', 'feature_preprocessor:select_rates_regression:alpha': 0.2137780451968174, 'feature_preprocessor:select_rates_regression:mode': 'fdr', 'feature_preprocessor:select_rates_regression:score_func': 'f_regression', 'regressor:gradient_boosting:early_stop': 'valid', 'regressor:gradient_boosting:l2_regularization': 0.1823129544555166, 'regressor:gradient_boosting:learning_rate': 0.1776880286017515, 'regressor:gradient_boosting:loss': 'least_squares', 'regressor:gradient_boosting:max_bins': 255, 'regressor:gr

  self.InputValidator.validate_target(y)
  y = self.validate_target(y, is_classification)


[(0.540000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'median', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'select_rates_regression', 'regressor:__choice__': 'gradient_boosting', 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.0016605093251402941, 'feature_preprocessor:select_rates_regression:alpha': 0.080181965489516, 'feature_preprocessor:select_rates_regression:mode': 'fwe', 'feature_preprocessor:select_rates_regression:score_func': 'f_regression', 'regressor:gradient_boosting:early_stop': 'off', 'regressor:gradient_boosting:l2_regularization': 6.9216369968771745e-06, 'regressor:gradient_boosting:learning_rate': 0.09997