In [1]:
## Import all libraries
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from autofeat import AutoFeatClassifier

In [2]:
## Load the raw data set and discard the irrelevant columns in one pass
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# where exactly this file exists - consider a configurable data directory.
data = pd.read_csv(
    r'C:\Users\Ardon\Documents\Thesis\processminer-sheet-break-rare-event-dataset.csv'
).drop(columns=['DateTime', 'Grade&Bwt', 'EventPress'])

In [3]:
## Helpers to move the label Y back (or forward) in time
def sign(x):
    """Return -1 when ``x`` is negative and 1 otherwise (zero counts as positive)."""
    return -1 if x < 0 else 1


def shift_back(df, back, labelcol='SheetBreak'):
    """Spread each positive label over ``abs(back)`` neighbouring rows.

    The label column is shifted one row at a time, ``abs(back)`` times, and
    accumulated. A negative ``back`` shifts labels towards earlier rows (the
    rows *before* an event become positive); a positive ``back`` shifts them
    towards later rows. Rows whose original label equals 1 are then removed,
    and every remaining accumulated value greater than 0 is set to 1.

    The input frame is never modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column ``labelcol``.
    back : int
        Number of rows to shift by; the sign selects the direction.
    labelcol : str, default 'SheetBreak'
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        New frame with the shifted label as its first column, followed by the
        other columns in their original order. The original index labels of
        the surviving rows are kept (no re-indexing is done here).

    Raises
    ------
    KeyError
        If ``labelcol`` is not a column of ``df``.
    """
    step = sign(back)
    labels = df[labelcol].copy()
    for _ in range(abs(back)):
        # Shift the *accumulated* labels by one row per pass; the row that has
        # no neighbour becomes NaN after the shift and is treated as 0.
        labels = labels + labels.shift(step).fillna(0)

    # Assemble the result on a fresh frame so the caller's ``df`` is left
    # untouched (inserting into ``df`` itself made a second call fail because
    # the temporary column already existed).
    shifted = df.drop(columns=labelcol)
    shifted.insert(loc=0, column=labelcol, value=labels)
    # Remove the rows of the events themselves, identified by the original labels.
    shifted = shifted.drop(index=df.index[df[labelcol] == 1])
    # Accumulation can produce values above 1; collapse them to a binary flag.
    shifted.loc[shifted[labelcol] > 0, labelcol] = 1
    return shifted

In [4]:
## Move Y back two time steps, then renumber the remaining rows from 0
data_time_back = shift_back(data, -2).reset_index(drop=True)

In [5]:
## Split the frame into the feature columns (X) and the single-column label frame (y)
label_mask = data_time_back.columns == 'SheetBreak'
X = data_time_back.loc[:, ~label_mask]
y = data_time_back.loc[:, label_mask]

In [143]:
# Feature engineering and selection with 1 step, and 5 feature selection runs.
set_seed = 123
# Actually seed NumPy's global RNG: assigning `set_seed` alone has no effect, so
# any randomness drawn from the global generator was previously unseeded.
np.random.seed(set_seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
# y is a single-column DataFrame; flatten the label splits to 1-d arrays
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
# run autofeat (featsel_runs spelled out so the cell matches its header and the
# logged "Feature selection run 1/5")
afc = AutoFeatClassifier(verbose=1, feateng_steps=1, featsel_runs=5)
# fit on the training split only; the test split is merely transformed
X_train_tr = afc.fit_transform(X_train, y_train)
X_test_tr = afc.transform(X_test)
print("autofeat new features:", len(afc.new_feat_cols_))
# NOTE(review): the source file is named a rare-event dataset, so plain accuracy
# may be dominated by the majority class - consider reporting further metrics.
print("autofeat Acc. on training data:", accuracy_score(y_train, afc.predict(X_train_tr)))
print("autofeat Acc. on test data:", accuracy_score(y_test, afc.predict(X_test_tr)))
# apply the fitted transformation to the full feature frame and export it with the label
X_1step_5runs_2 = afc.transform(X)
data_1step_5runs_2 = pd.concat([y, X_1step_5runs_2], axis=1)
data_1step_5runs_2.to_csv('data_1step_5runs.csv')

[AutoFeat] The 1 step feature engineering process could generate up to 413 features.
[AutoFeat] With 16258 data points this new feature matrix would use about 0.03 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 267 transformed features from 59 original features - done.
[feateng] Generated altogether 270 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 231 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 118 features after 5 feature selection runs
[featsel] 102 features after correlation filtering
[featsel] 68 features after noise filtering
[AutoFeat] Computing 45 new features.
[AutoFeat]    45/   45 new features ...done.
[AutoFeat] Final dataframe with 104 feature columns

In [8]:
# Feature engineering and selection with 2 steps, and 1 feature selection run.
# NOTE: the recorded run of this cell printed an elapsed time of about 93568 s.
start = time.time()
set_seed = 12345
# Actually seed NumPy's global RNG: assigning `set_seed` alone has no effect, so
# any randomness drawn from the global generator was previously unseeded.
np.random.seed(set_seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
# y is a single-column DataFrame; flatten the label splits to 1-d arrays
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
# run autofeat
afc_2 = AutoFeatClassifier(verbose=1, feateng_steps=2, featsel_runs=1)
# fit on the training split only; the test split is merely transformed
X_train_tr = afc_2.fit_transform(X_train, y_train)
X_test_tr = afc_2.transform(X_test)
print("autofeat new features:", len(afc_2.new_feat_cols_))
# NOTE(review): the source file is named a rare-event dataset, so plain accuracy
# may be dominated by the majority class - consider reporting further metrics.
print("autofeat Acc. on training data:", accuracy_score(y_train, afc_2.predict(X_train_tr)))
print("autofeat Acc. on test data:", accuracy_score(y_test, afc_2.predict(X_test_tr)))
# apply the fitted transformation to the full feature frame and export it with the label
X_2steps_1run = afc_2.transform(X)
data_2steps_1run = pd.concat([y, X_2steps_1run], axis=1)
data_2steps_1run.to_csv('data_2steps_1run.csv')
stop = time.time()
# elapsed wall-clock time of the whole cell, in seconds
print(stop-start)

[AutoFeat] The 2 step feature engineering process could generate up to 85491 features.
[AutoFeat] With 16258 data points this new feature matrix would use about 5.56 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 267 transformed features from 59 original features - done.
[feateng] Step 2: first combination of features
[feateng]           41500/          52975 feature tuples combined

  ret = umr_sum(x, axis, dtype, out, keepdims)


[feateng]           50600/          52975 feature tuples combined

  x = um.multiply(x, x, out=x)


[feateng] Generated 52691 feature combinations from 52975 original feature tuples - done.
[feateng] Generated altogether 53040 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level


  sqr = np.multiply(arr, arr, out=arr)


[feateng] Generated a total of 41023 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/1
[featsel] 2 features after 1 feature selection runs
[featsel] 2 features after correlation filtering
[featsel] 2 features after noise filtering
[AutoFeat] Computing 2 new features.
[AutoFeat]     2/    2 new features ...done.
[AutoFeat] Final dataframe with 61 feature columns (2 new).
[AutoFeat] Training final classification model.
[AutoFeat] Trained model: largest coefficients:
[-0.01120464]
0.031250 * exp(UpprHdTmpRL)/BleachedGWDFlow
0.000295 * 1/(CT1BLADEPSI*RSBWSCANAVG)
[AutoFeat] Final score: 0.9352
[AutoFeat] Computing 2 new features.
[AutoFeat]     2/    2 new features ...done.
autofeat new features: 2
autofeat Acc. on training data: 0.9352318858408168
autofeat Acc. on test data: 0.9306273062730628
[AutoFeat] Computing 2 new features.
[AutoFeat]     2/    2 new features ...done.
93567.92763996124
