<span style="color:blue"> This notebook automates data analytics phases of CRISP-DM methodology for vibration data analysis use case in manufacturing domain. The goal of the notebook is to assist a manufacturing domain expert to perform data analysis tasks with less difficulties. </span>
    
<span style="color:blue"> To operate this notebook, only the following cell needs to be executed by the user. Upon executing the cell, all the code cells will be hidden for a less complex appearance of the notebook. But the cells can be toggled back to view the code by clicking the '[show code]' button on the right side. To execute the following cell, click on it and press Shift+Enter</span>

<span style="color:blue">Some cells takes a while to process the dataset, construct features and finding out feature importance. Please wait till the cells finish execution and moves to the next user prompt.</span>

In [2]:
#importing necessary libraries
import os
import numpy as np 
import pandas as pd 
import scipy
from scipy.stats import entropy
from sklearn.decomposition import PCA
from IPython.display import Javascript
from ipywidgets import widgets
from sklearn.decomposition import PCA
import magic
import dtale
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
h2o.init()
display(Javascript("Jupyter.notebook.execute_cell_range(2,6)"))

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.17" 2022-10-18; OpenJDK Runtime Environment (build 11.0.17+8-post-Ubuntu-1ubuntu220.04); OpenJDK 64-Bit Server VM (build 11.0.17+8-post-Ubuntu-1ubuntu220.04, mixed mode, sharing)
  Starting server from /home/nayela/.local/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpo9rx6xwx
  JVM stdout: /tmp/tmpo9rx6xwx/h2o_nayela_started_from_python.out
  JVM stderr: /tmp/tmpo9rx6xwx/h2o_nayela_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Berlin
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.2
H2O_cluster_version_age:,2 months and 16 days
H2O_cluster_name:,H2O_from_python_nayela_oas0ra
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.887 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


<IPython.core.display.Javascript object>

In [3]:
%%HTML 
<script>
    function luc21893_refresh_cell(cell) {
        if( cell.luc21893 ) return;
        cell.luc21893 = true;
        console.debug('New code cell found...' );
        
        var div = document.createElement('DIV');            
        cell.parentNode.insertBefore( div, cell.nextSibling );
        div.style.textAlign = 'right';
        var a = document.createElement('A');
        div.appendChild(a);
        a.href='#'
        a.luc21893 = cell;
        a.setAttribute( 'onclick', "luc21893_toggle(this); return false;" );

        cell.style.visibility='hidden';
        cell.style.position='absolute';
        a.innerHTML = '[show code]';        
                
    }
    function luc21893_refresh() {                
        if( document.querySelector('.code_cell .input') == null ) {            
            // it apeears that I am in a exported html
            // hide this code
            var codeCells = document.querySelectorAll('.jp-InputArea')
            codeCells[0].style.visibility = 'hidden';
            codeCells[0].style.position = 'absolute';                        
            for( var i = 1; i < codeCells.length; i++ ) {
                luc21893_refresh_cell(codeCells[i].parentNode)
            }
            window.onload = luc21893_refresh;
        }                 
        else {
            // it apperas that I am in a jupyter editor
            var codeCells = document.querySelectorAll('.code_cell .input')
            for( var i = 0; i < codeCells.length; i++ ) {
                luc21893_refresh_cell(codeCells[i])
            }            
            window.setTimeout( luc21893_refresh, 1000 )
        }        
    }
    
    function luc21893_toggle(a) {
        if( a.luc21893.style.visibility=='hidden' ) {
            a.luc21893.style.visibility='visible';        
            a.luc21893.style.position='';
            a.innerHTML = '[hide code]';
        }
        else {
            a.luc21893.style.visibility='hidden';        
            a.luc21893.style.position='absolute';
            a.innerHTML = '[show code]';
        }
    }
    
    luc21893_refresh()
</script>


# Phase 1: Business Understanding

## 1.1 Define Business Objectives 

In [4]:
# initialize lists to store iteration data for meta-learning
business_obj = []
feature_set = []
user_feedback = []
ml_model_error = []
feature_recommendation = []

print('Select a business objective')
def dropdown_event_handler(change):
    display(Javascript("Jupyter.notebook.execute_cell_range(6,8)"))
dropdown=widgets.Dropdown(
    value=None,
    options=[('Reduce maintenance costs'), ('Lower machine downtime')]
)
display(dropdown)
dropdown.observe(dropdown_event_handler, names='value')

Select a business objective


Dropdown(options=('Reduce maintenance costs', 'Lower machine downtime'), value=None)

<IPython.core.display.Javascript object>

## 1.2 Determine Data Mining Goals
This task maps the selected business objective to a data mining goal by using a demo knowledge base.

In [1]:
business_objective=dropdown.value
if business_objective == 'Reduce maintenance costs':
    dm_goal = 'RUL Prediction'
elif business_objective == 'Lower machine downtime':
    dm_goal = 'RUL Prediction'
print('Data Mining goal for the defined business objective is',dm_goal)
display(Javascript("Jupyter.notebook.execute_cell_range(8,11)"))

NameError: name 'dropdown' is not defined


# Phase 2: Data Understanding

## 2.1 Collect Data 
This task requires loading data into the data analysis tool. 

<span style="color:blue"> Provide the path to data directory file in the text field below. An example relative datapath in case data resides under the currect directory in a folder named 'data' is: ./data</span>

In [6]:
text = widgets.Text(
    placeholder='Provide dataset path here',
    disabled=False
)
display(text)

def callback(wdgt):
    display(Javascript("Jupyter.notebook.execute_cell_range(11,13)"))

text.on_submit(callback)

Text(value='', placeholder='Provide dataset path here')

<span style="color:blue"> The following input is applicable if the datafile contains metadata. If no metadata is included in the data file, enter 0. </span>

In [None]:
metadata_rows = widgets.Text(
    placeholder='Enter number of rows containing metadata',
    disabled=False
)
display(metadata_rows)

def callback2(wdgt):
    display(Javascript("Jupyter.notebook.execute_cell_range(13,14)"))

metadata_rows.on_submit(callback2)

In [7]:
columns_included = widgets.Text(
    placeholder='If the column names are included enter 1, or 0',
    disabled=False
)
display(columns_included)

def callback3(wdgt):
    display(Javascript("Jupyter.notebook.execute_cell_range(14,15)"))

columns_included.on_submit(callback3)

Text(value='', placeholder='If the column names are included enter 1, or 0')

<IPython.core.display.Javascript object>

In [None]:
# extracts data rows only and detects column separator for further processing
skip_rows = int(metadata_rows.value)
columns_included = int(columns_included.value)
dataset_path=text.value
common_delimiters= [',',';','\t',' ','|',':']
for filename in os.listdir(dataset_path):
    blob = open(os.path.join(dataset_path, filename), 'rb').read()
    m = magic.Magic(mime_encoding=True)
    encoding = m.from_buffer(blob)
    with open(os.path.join(dataset_path, filename), encoding=encoding) as f:
        sample_lines = [next(f).rstrip() for x in range(2)]

        for d in common_delimiters:
            ref = sample_lines[0].count(d)
            if ref > 0:
                if all([ ref == sample_lines[i].count(d) for i in range(1,2)]):
                    sep=d
    raw_data = pd.read_csv(os.path.join(dataset_path, filename), encoding=encoding, sep=sep,skiprows=skip_rows)
if columns_included == 1:
    display(Javascript("Jupyter.notebook.execute_cell_range(15,16)"))
else:
    display(Javascript("Jupyter.notebook.execute_cell_range(16,18)"))

In [None]:
columns = raw_data.columns
display(Javascript("Jupyter.notebook.execute_cell_range(19,24)"))

<span style="color:blue"> Enter the column names of the dataset separated by space in the following text field. Column names should be defined within a single word. If necessary use '_' or '-' to define column names.</span>

In [9]:
columns = widgets.Text(
    placeholder='Enter the column names here'
)
display(columns)

def callback2(wdgt):
    display(Javascript("Jupyter.notebook.execute_cell_range(18,19)"))

columns.on_submit(callback2)
#B1_x B1_y B2_x B2_y B3_x B3_y B4_x B4_y (for Nasa data)

Text(value='', placeholder='Enter the column names here')

<IPython.core.display.Javascript object>

In [None]:
columns=columns.value.split(" ")
raw_data.columns = columns
display(Javascript("Jupyter.notebook.execute_cell_range(19,24)"))


## 2.2-2.4 Describe Data, Explore Data, Verify Data Quality 

<span style="color:blue"> These three tasks can be executed using the D-Tale tool below. </span>
    
<span style="color:blue">    1. Use 'Describe’ option from the main menu to get summary of the dataset including some statistical information and plot of the data such as histogram, Q-Q plot, etc. </span>
    
<span style="color:blue">    2.  ’Charts’, ’Correlations' and 'Predictive Power Source' options can be used to explore the dataset. 'Charts’ provides a way to create 13 different kinds of visualizations with a few clicks including line plot, bar plot, scatter plot, etc. 'Correlations' and 'Predictive Power Source' finds out relationships between the features. </span>
    
<span style="color:blue">    3. ’Missing Analysis’ visualizes the missing values present in the dataset and ’Highlight Outliers’ detects the cells of numeric columns that exceed the lower or upper bounds defined by D-Tale.
</span>

In [11]:
# instructions about dtale here
d = dtale.show(raw_data)
d



# Phase 3: Data Preprocessing

## 3.2-3.5 Clean Data, Construct Data Features, Integrate Data, Format Data
These tasks cleans dataset if there is any erroneus/duplicate data, then creates necessary features, formats and integrates them if necessary

<span style="color:blue"> Enter the time format of the data file name in the following text field.</span>

<span style="color:blue"> Also enter the label/categorical features specific to the use case data set in next to the following cell before pressing enter</span>

In [12]:
print('Example time format: Enter %Y-%m-%d_%H-%M-%S if file name is 2022-03-45_12-06-24-txt')

time_format = widgets.Text(
    placeholder='Enter time format of filename',
    disabled=False
)
display(time_format)

def callback4(wdgt):
    display(Javascript("Jupyter.notebook.execute_cell_range(24,30)"))

time_format.on_submit(callback4)

Example time format: Enter %Y-%m-%d_%H-%M-%S if file name is 2022-03-45_12-06-24-txt


Text(value='', placeholder='Enter time format of filename')

<IPython.core.display.Javascript object>

In [None]:
# Constructs hand-crafted features
time_format= str(time_format.value)
# Root Mean Squared Sum
def calculate_rms(df):
    result = []
    for col in df:
        r = np.sqrt((df[col]**2).sum() / len(df[col]))
        result.append(r)
    return result

# extract peak-to-peak features
def calculate_p2p(df):
    return np.array(df.max().abs() + df.min().abs())

# extract shannon entropy (cut signals to 500 bins)
def calculate_entropy(df):
    ent = []
    for col in df:
        ent.append(entropy(pd.cut(df[col], 500).value_counts()))
    return np.array(ent)
# extract clearence factor
def calculate_clearence(df):
    result = []
    for col in df:
        r = ((np.sqrt(df[col].abs())).sum() / len(df[col]))**2
        result.append(r)
    return result

def create_features(dataset_path, columns, time_format):
    
    ''' Creates 12 time features from each datafile. The features are mean, standard deviation, skewness, kurtosis, entropy, rms, max, peak-to-peak,
    crest factor, clearance factor, shape factor and impulse.
    Args:
        path to dataset and column names
    Returns:
        dataframe with created features         
    '''
    
    time_features = ['mean','std','skew','kurtosis','entropy','rms','max','p2p', 'crest', 'clearence', 'shape', 'impulse']
    
    # initialize
    column_names = [c+'_'+tf for c in columns for tf in time_features]
    data = pd.DataFrame(columns=column_names) # Creating empty dataframe with renaming columns with relevant feature name
        
        
    for filename in os.listdir(dataset_path):
        
        # read dataset and clean data
        raw_data = pd.read_csv(os.path.join(dataset_path, filename), encoding=encoding, sep=sep, skiprows=skip_rows)
        
        if (raw_data.dtypes == 'object').all():
            raw_data=raw_data.applymap(lambda x: str(x.replace(',','.')))
            raw_data = raw_data.apply(pd.to_numeric)
        missing_columns=raw_data.columns[raw_data.isnull().any()]
        for column in missing_columns:
            raw_data[column] = raw_data[column].fillna(raw_data[column].mean()) # replace null values with average of the column value
        
        # time features
        mean_abs = np.array(raw_data.abs().mean())
        std = np.array(raw_data.std())
        skew = np.array(raw_data.skew())
        kurtosis = np.array(raw_data.kurtosis())
        entropy = calculate_entropy(raw_data)
        rms = np.array(calculate_rms(raw_data))
        max_abs = np.array(raw_data.abs().max())
        p2p = calculate_p2p(raw_data)
        crest = max_abs/rms
        clearence = np.array(calculate_clearence(raw_data))
        shape = rms / mean_abs
        impulse = max_abs / mean_abs
        
        mean_abs = pd.DataFrame(mean_abs.reshape(1,len(columns)), columns=[c+'_mean' for c in columns])
        std = pd.DataFrame(std.reshape(1,len(columns)), columns=[c+'_std' for c in columns])
        skew = pd.DataFrame(skew.reshape(1,len(columns)), columns=[c+'_skew' for c in columns])
        kurtosis = pd.DataFrame(kurtosis.reshape(1,len(columns)), columns=[c+'_kurtosis' for c in columns])
        entropy = pd.DataFrame(entropy.reshape(1,len(columns)), columns=[c+'_entropy' for c in columns])
        rms = pd.DataFrame(rms.reshape(1,len(columns)), columns=[c+'_rms' for c in columns])
        max_abs = pd.DataFrame(max_abs.reshape(1,len(columns)), columns=[c+'_max' for c in columns])
        p2p = pd.DataFrame(p2p.reshape(1,len(columns)), columns=[c+'_p2p' for c in columns])
        crest = pd.DataFrame(crest.reshape(1,len(columns)), columns=[c+'_crest' for c in columns])
        clearence = pd.DataFrame(clearence.reshape(1,len(columns)), columns=[c+'_clearence' for c in columns])
        shape = pd.DataFrame(shape.reshape(1,len(columns)), columns=[c+'_shape' for c in columns])
        impulse = pd.DataFrame(impulse.reshape(1,len(columns)), columns=[c+'_impulse' for c in columns])
        
        # remove extension from filename 
        if filename.endswith('.txt'):
            filename = os.path.splitext(filename)[0]  
        mean_abs.index = [filename]
        std.index = [filename]
        skew.index = [filename]
        kurtosis.index = [filename]
        entropy.index = [filename]
        rms.index = [filename]
        max_abs.index = [filename]
        p2p.index = [filename]
        crest.index = [filename]
        clearence.index = [filename]
        shape.index = [filename]
        impulse.index = [filename] 
        
        # concat
        merge = pd.concat([mean_abs, std, skew, kurtosis, entropy, rms, max_abs, p2p,crest,clearence, shape, impulse], axis=1)
        data = pd.concat([data, merge]) # For each datafile appends one row of 48 features            
        cols = [c+'_'+tf for c in columns for tf in time_features]
        data = data[cols]
        
    data.index = pd.to_datetime(data.index, format = time_format)
    data = data.sort_index()
    data=data.reset_index().rename({'index':'time'}, axis = 'columns')
    
    return data

data_features = create_features(dataset_path,columns,time_format)

In [None]:
#This cell needs to be edited by the domain expert to include label/categorical feature into the feature set

# nasa label features
B1_state = list()
B2_state = list()
B3_state = list()
B4_state = list()
cnt = 0

for row in data_features["time"]:
    cnt += 1
    # B1
    if cnt<=151:
        B1_state.append("early")
    if 151 < cnt <=600:
        B1_state.append("suspect")
    if 600 < cnt <=1499:
        B1_state.append("normal")
    if 1499 < cnt <=2098:
        B1_state.append("suspect")
    if 2098 < cnt <= 2156:
        B1_state.append("imminent_failure")
    #B2
    if cnt<=500:
        B2_state.append("early")
    if 500 < cnt <=2000:
        B2_state.append("normal")
    if 2000 < cnt <=2120:
        B2_state.append("suspect")
    if 2120< cnt <=2156:
        B2_state.append("imminet_failure")

    #B3
    if cnt<=500:
        B3_state.append("early")
    if 500 < cnt <= 1790:
        B3_state.append("normal")
    if 1790 < cnt <=2120:
        B3_state.append("suspect")
    if 2120 < cnt <=2156:
        B3_state.append("Inner_race_failure")
    #B4
    if cnt<=200:
        B4_state.append("early")
    if 200 < cnt <=1000:
        B4_state.append("normal")
    if 1000 < cnt <= 1435:
        B4_state.append("suspect")
    if 1435 < cnt <=1840:
        B4_state.append("Inner_race_failure")
    if 1840 < cnt <=2156:
        B4_state.append("Stage_two_failure")
#controlling the counts
from collections import Counter
print(Counter(B1_state))
print(Counter(B2_state))
print(Counter(B3_state))
print(Counter(B4_state))

data_features["B1_state"] = B1_state
data_features["B2_state"] = B2_state
data_features["B3_state"] = B3_state
data_features["B4_state"] = B4_state

# data_features.head()
B1_cols = [col for col in data_features.columns if "B1" in col]
B2_cols = [col for col in data_features.columns if "B2" in col]
B3_cols = [col for col in data_features.columns if "B3" in col]
B4_cols = [col for col in data_features.columns if "B4" in col]

B1 = data_features[B1_cols]
B2 = data_features[B2_cols]
B3 = data_features[B3_cols]
B4 = data_features[B4_cols]
cols = ['Bx_mean','Bx_std','Bx_skew','Bx_kurtosis','Bx_entropy','Bx_rms','Bx_max','Bx_p2p','Bx_crest', 'Bx_clearence', 'Bx_shape', 'Bx_impulse',
        'By_mean','By_std','By_skew','By_kurtosis','By_entropy','By_rms','By_max','By_p2p','By_crest', 'By_clearence', 'By_shape', 'By_impulse',
        'label']
B1.columns = cols
B2.columns = cols
B3.columns = cols
B4.columns = cols
data_features = pd.concat([B1,B2,B3,B4], axis=0, ignore_index=True)
data_features['label_encoded']=data_features['label']


<span style="color:blue"> The final feature dataset is again displayed here with D-Tale for enabling the user to explore the final feature set. </span>

In [None]:
#encode categorical labels
gle = LabelEncoder()
genre_labels = gle.fit_transform(data_features['label_encoded'])
data_features['label_encoded']=genre_labels
d = dtale.show(data_features,datetime_is_numeric=True)
d

## 3.1 Select Input Data
This step selects relevant features for further analysis

<span style="color:blue"> To find feature importance, the user needs to enter the target feature from the following drop-down. </span>

In [None]:
print('Select target feature')
def dropdown_select_target(change):
     display(Javascript("Jupyter.notebook.execute_cell_range(30,35)"))
feature_dropdown=widgets.Dropdown(
    value=None,
    options=data_features.columns
)
display(feature_dropdown)
feature_dropdown.observe(dropdown_select_target, names='value')

In [18]:
# RFECV feature selection
target_feature = str(feature_dropdown.value)
x_features=data_features.drop(['label'], axis=1)
x_features=x_features.drop(['label_encoded'], axis=1)
clf = RandomForestClassifier()
rfecv = RFECV(estimator=clf, cv=KFold(n_splits=10), min_features_to_select=2,
              scoring='accuracy')
selector=rfecv.fit(x_features.values, data_features[target_feature])
selector.ranking_

array([ 3,  1, 18,  2,  8,  1, 19, 12, 14,  9,  7, 15, 10,  1, 17,  5, 13,
        4, 20, 16, 22, 11,  6, 21])

In [19]:
rfecv_features = x_features[x_features.columns[rfecv.get_support(1)]]
print('Selected RFECV features:\n')
print(rfecv_features.columns)
rfecv_features['label'] = data_features['label']
rfecv_features.to_csv('rfecv_features.csv')

Selected RFECV features:

Index(['Bx_std', 'Bx_rms', 'By_std'], dtype='object')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [20]:
# PCA 
X = x_features
pca = PCA(n_components = 0.95)
pca.fit(X)
print(pca.n_components_)
explained_variance = pca.explained_variance_ratio_
columns = ['pca_comp_%i' % i for i in range(pca.n_components_)]
print('PCA columns generated:\n')
pca_features  = pd.DataFrame(pca.transform(X), columns=columns, index=X.index)
pca_features['label'] = data_features["label"]
pca_features.to_csv('pca_features.csv')

3
PCA columns generated:



<span style="color:blue">The following cell provides feature set recommendation to the user using previous user and ML model history. The user can select feature set as per the recommendation or can arbitrarily select a feature set. </span>
 
<span style="color:blue">In case of the very first iteration of the data analysis procedure, there will be no feature recommendation due to lack of data. </span>

In [21]:
# generates feature recommendation from previous user data for the similar business objective
try:
    recommendation_df=pd.read_csv("./recommendation_df.csv")
except FileNotFoundError: 
    recommendation_df = None
    
if recommendation_df is not None:
    recommendation_new = recommendation_df.loc[recommendation_df['Feature Recommendation'] == 'Yes']
    recommendation_new = recommendation_df.loc[recommendation_df['Business Objective'] == business_objective]
    min_error = recommendation_new.ML_model_error.min()
    recommendation = recommendation_df.loc[recommendation_df['ML_model_error'] == min_error, 'Feature_set']
    try: recommendation
    except NameError: recommendation = None
    if recommendation is not None:
        for item in recommendation:
            print('Recommended Feature Set from previous run is')
            print(item)
else :    
    try: recommendation
    except NameError: recommendation = None
    if recommendation is not None:
        for item in recommendation:
            print('Recommended Feature Set from previous iteration of same run is')
            print(item)

print('Select feature set')
def dropdown_select_featureset(change):
    display(Javascript("Jupyter.notebook.execute_cell_range(35,43)"))
featureset_dropdown=widgets.Dropdown(
    value=None,
    options=[('RFECV', 1), ('PCA', 2)]
)
display(featureset_dropdown)
featureset_dropdown.observe(dropdown_select_featureset, names='value')

Select feature set


Dropdown(options=(('RFECV', 1), ('PCA', 2)), value=None)

Executing shutdown due to inactivity...


2023-01-12 19:01:53,941 - INFO     - Executing shutdown due to inactivity...


Executing shutdown...


2023-01-12 19:01:53,964 - INFO     - Executing shutdown...


Exception on /shutdown [GET]
Traceback (most recent call last):
  File "/home/nayela/.local/lib/python3.8/site-packages/flask/app.py", line 2525, in wsgi_app
    response = self.full_dispatch_request()
  File "/home/nayela/.local/lib/python3.8/site-packages/flask/app.py", line 1822, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/home/nayela/.local/lib/python3.8/site-packages/flask/app.py", line 1820, in full_dispatch_request
    rv = self.dispatch_request()
  File "/home/nayela/.local/lib/python3.8/site-packages/flask/app.py", line 1796, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "/home/nayela/.local/lib/python3.8/site-packages/dtale/app.py", line 435, in shutdown
    shutdown_server()
  File "/home/nayela/.local/lib/python3.8/site-packages/dtale/app.py", line 421, in shutdown_server
    raise RuntimeError("Not running with the Werkzeug Server")
RuntimeError: Not running with the Werkzeug Server


2023-01-12 19:01:53,965 - ERROR    - Exception on /shutdown [GET]
Traceback (most recent call last):
  File "/home/nayela/.local/lib/python3.8/site-packages/flask/app.py", line 2525, in wsgi_app
    response = self.full_dispatch_request()
  File "/home/nayela/.local/lib/python3.8/site-packages/flask/app.py", line 1822, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/home/nayela/.local/lib/python3.8/site-packages/flask/app.py", line 1820, in full_dispatch_request
    rv = self.dispatch_request()
  File "/home/nayela/.local/lib/python3.8/site-packages/flask/app.py", line 1796, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "/home/nayela/.local/lib/python3.8/site-packages/dtale/app.py", line 435, in shutdown
    shutdown_server()
  File "/home/nayela/.local/lib/python3.8/site-packages/dtale/app.py", line 421, in shutdown_server
    raise RuntimeError("Not running with the Werkzeug Server")
RuntimeError: Not 

<IPython.core.display.Javascript object>

# Phase 4: Modeling

## 4.1 - 4.5 Select Modeling Technique, Test Design, Build Model, Assess Model

<span style="color:blue">The modeling phase is fully automated. The user can analyze the models and their parameters from the leaderboard below which appears after the models are trained.</span>



In [None]:
# initialize automl and start traininjg
if featureset_dropdown.value == 1:
    df = h2o.import_file('./rfecv_features.csv')
    df=df.drop('C1')
    print(df)
    train, test = df.split_frame(ratios=[0.8], seed = 1)
    x = train.columns 
    #print(x)
    y = 'label'
    x.remove(y)
if featureset_dropdown.value == 2:
    df = h2o.import_file('./pca_features.csv')
    df=df.drop('C1')
    train, test = df.split_frame(ratios=[0.8], seed = 1)
    x = train.columns
    y = 'label'
    x.remove(y)
aml = H2OAutoML(max_runtime_secs = 120)
aml.train(x = x, y = y, training_frame = train)

In [None]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)
best_model = aml.get_best_model()
#best_model.explain(train)

In [None]:
best_model = aml.get_best_model()
best_model.explain(test)

# Phase 5: Evaluation

## 5.1 - 5.3 Determine Next Steps, Review Process, Evaluate Results

In [None]:
print('Does the best model satisfy your business objective?')
def dropdown_decision(change):
    display(Javascript("Jupyter.notebook.execute_cell_range(43,45)"))
decision_dropdown=widgets.Dropdown(
    value=None,
    options=['Yes','No']
)
display(decision_dropdown)
decision_dropdown.observe(dropdown_decision, names='value')

In [None]:
# create and update df for feature recommendation
print(business_obj)
print(feature_set)
print(feature_recommendation)
print(ml_model_error)

business_obj.append(business_objective)
if featureset_dropdown.value == 1:
    feature_set.append('RFECV')
if featureset_dropdown.value == 2: 
    feature_set.append('PCA')
if featureset_dropdown.value == 3: 
    feature_set.append('RFECV & PCA')
user_feedback.append(str(decision_dropdown.value))
if str(decision_dropdown.value) == 'Yes':
    feature_recommendation.append('Yes')
else:
    feature_recommendation.append('No')
ml_model_error.append(best_model.auc())
recommendation_df = pd.DataFrame({'Business Objective': business_obj, 'Feature_set': feature_set, 'User_feedback': user_feedback, 'ML_model_error': ml_model_error,
                                 'Feature Recommendation': feature_recommendation})
print(recommendation_df)


In [None]:
decision = str(decision_dropdown.value)
if decision == 'Yes':
    recommendation = recommendation_df.loc[recommendation_df['Feature Recommendation'] == 'Yes', 'Feature_set']
    if len(recommendation)>=2:
        new_recommendation = recommendation_df.loc[recommendation_df['Feature Recommendation'] == 'Yes']
        min_error = new_recommendation.ML_model_error.max()
        recommendation = recommendation_df.loc[recommendation_df['ML_model_error'] == min_error, 'Feature_set']
    print('Do you want to explore different data features?')
    def dropdown_review(change):
        display(Javascript("Jupyter.notebook.execute_cell_range(45,46)"))
    review_dropdown=widgets.Dropdown(
        value=None,
        options=['Yes','No']
        )
    display(review_dropdown)
    review_dropdown.observe(dropdown_review, names='value')
if decision == 'No':
    print('To explore different feature sets, please navigate to' +"\033[1m" + " Select feature set "+ "\033[0m" + 'step at the end of phase 3')
    display(Javascript("Jupyter.notebook.execute_cell_range(32,34)"))

In [None]:
review_decision = str(review_dropdown.value)
if review_decision == 'Yes':
    print('To explore different feature sets, please navigate to' +"\033[1m" + " Select feature set "+ "\033[0m" + 'step at the end of phase 3')
    display(Javascript("Jupyter.notebook.execute_cell_range(34,35)"))
else:
    display(Javascript("Jupyter.notebook.execute_cell_range(46,47)"))

In [None]:
def run_cells(ev):
    display(Javascript("Jupyter.notebook.execute_cell_range(47,48)"))

save_button = widgets.Button(description="Save Model")
save_button.on_click(run_cells)
display(save_button)

In [None]:
# Save the model
recommendation_df.to_csv('recommendation_df.csv')
model_id = aml.leader.model_id
print('Final model saved at')
aml.download_mojo(genmodel_name=model_id, path="./"+model_id+".zip")