# Predictive Maintenance Model
### Created with H2O Automatic Machine Learning

This notebook ingests a dataset and trains many machine learning models, intelligently searching the hyper-parameter space for optimal values. A leaderboard of the trained models is maintained. Finally, an ensemble is created by stacking together some of the base learners, and the result is added to the leaderboard. The best model is deployed to production. 


In [1]:
%%capture
import h2o
from h2o.automl import H2OAutoML

import os
import plotly
# cufflinks is installed on the fly; presumably it is what provides the
# DataFrame.iplot() call used for the variable-importance chart -- TODO confirm
!pip install cufflinks
import cufflinks
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
# Enable Plotly rendering inside the notebook
plotly.offline.init_notebook_mode(connected=True)

# The Plotly API key is read from the environment (KeyError if the variable is unset)
myPlotlyKey = os.environ['SECRET_ENV_BRETTS_PLOTLY_KEY']
py.sign_in(username='bretto777',api_key=myPlotlyKey)


# Suppress unwanted warnings (this silences every warning category for the rest of the session)
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np

In [16]:
%%capture
h2o.init(max_mem_size="4g", nthreads=1)

In [34]:
#h2o.no_progress()  # uncomment to silence H2O progress bars
# Import the predictive-maintenance data set (with engineered features) from Amazon S3
h2oDF = h2o.import_file("https://s3-us-west-1.amazonaws.com/dsclouddata/predictive-maintenance/DataWithFeatures.csv")

# Split into Train/Test. ratios=[0.3] sends roughly 30% of the rows to `train`
# and the remaining ~70% to `test`. The fixed seed makes the split repeatable,
# so the leaderboard is comparable from one run of the notebook to the next.
# NOTE(review): training on only ~30% of the data is unusual -- confirm the ratio is intended.
SPLIT_SEED = 42
train, test = h2oDF.split_frame(ratios=[0.3], seed=SPLIT_SEED)



In [49]:
# Response column; it is plotted and also used as the colour index of the matrix
splomTarget = '18 - GT Turbine decay state coefficient.'
splomColumns = [
    '3 - Gas Turbine shaft torque (GTT) [kN m]',
    '8 - HP Turbine exit temperature (T48) [C]',
    '10 - GT Compressor outlet air temperature (T2) [C]',
    '16 - Fuel flow (mf) [kg/s]',
    splomTarget,
]

# Pull the H2O frame into pandas and keep only the plotted columns
splomDF = h2oDF.as_data_frame(use_pandas=True)[splomColumns]

# Scatter-plot matrix of a random 10% sample of the rows, histograms on the diagonal
splomSample = splomDF.sample(frac=0.1)
splom = ff.create_scatterplotmatrix(
    splomSample,
    diag='histogram',
    index=splomTarget,
    height=800, width=800,
    size=4,
    marker=dict(symbol='circle'),
)
py.iplot(splom)

In [36]:
# Response column first, then every remaining training column is a predictor
y = "18 - GT Turbine decay state coefficient."
x = list(train.columns)
x.remove(y)

In [37]:
# Run AutoML with a 300-second (5-minute) time budget; early stopping is
# configured with stopping_rounds=5 and a tolerance of 0.001
autoModel = H2OAutoML(
    stopping_tolerance=0.001,
    stopping_rounds=5,
    max_runtime_secs=300,
)
# `train` fits the models; `test` is passed as the leaderboard frame
autoModel.train(training_frame=train, leaderboard_frame=test, x=x, y=y)

## Leaderboard
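Display the best models, sorted by ascending mean residual deviance (the response is numeric, so the leaderboard reports regression metrics rather than AUC)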

In [38]:
# Leaderboard produced by the AutoML run; the bare name below renders it as the cell output
leaders = autoModel.leaderboard
leaders

model_id,mean_residual_deviance,rmse,mae,rmsle
StackedEnsemble_0_AutoML_20171027_031002,2e-06,0.00131,0.000806,0.000659
GBM_grid_0_AutoML_20171027_031002_model_1,2e-06,0.001312,0.000806,0.00066
GBM_grid_0_AutoML_20171027_031002_model_0,2e-06,0.001411,0.000871,0.00071
GBM_grid_0_AutoML_20171027_031002_model_2,2e-06,0.001483,0.000959,0.000746
DRF_0_AutoML_20171027_031002,4e-06,0.002031,0.001331,0.001022
XRT_0_AutoML_20171027_031002,5e-06,0.002181,0.001484,0.001098
GLM_grid_0_AutoML_20171027_031002_model_1,4.8e-05,0.006937,0.005919,0.00349
GLM_grid_0_AutoML_20171027_031002_model_0,4.8e-05,0.006937,0.005919,0.00349




## Variable Importance - Best Model

In [45]:
# Model id stored in row 1, column 0 of the leaderboard (row 0 is the top entry).
# NOTE(review): the section title says "Best Model" but this reads the second row -- confirm intent
leaders[1, 0]

u'DRF_0_AutoML_20171027_031002'

In [46]:
# Variable importances of the model in leaderboard row 1, as a pandas DataFrame
importances = h2o.get_model(leaders[1, 0]).varimp(use_pandas=True)
importances

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,13 - GT Compressor outlet air pressure (P2) [bar],0.152645,1.0,0.244423
1,5 - Gas Generator rate of revolutions (GGn) [rpm],0.139532,0.914094,0.223425
2,16 - Fuel flow (mf) [kg/s],0.075243,0.492926,0.120482
3,15 - Turbine Injecton Control (TIC) [%],0.065708,0.430462,0.105214
4,8 - HP Turbine exit temperature (T48) [C],0.055112,0.361048,0.088248
5,10 - GT Compressor outlet air temperature (T2) [C],0.037964,0.248708,0.06079
6,11 - HP Turbine exit pressure (P48) [bar],0.037049,0.242715,0.059325
7,3 - Gas Turbine shaft torque (GTT) [kN m],0.017059,0.111756,0.027316
8,7 - Port Propeller Torque (Tp) [kN],0.011809,0.077362,0.018909
9,6 - Starboard Propeller Torque (Ts) [kN],0.008638,0.056587,0.013831


In [50]:
# Fetch the variable importances of the model in leaderboard row 1 again
rankedModel = h2o.get_model(leaders[1, 0])
importances = rankedModel.varimp(use_pandas=True)

# One mean relative importance per variable, charted from largest to smallest
importances = importances.loc[:, ['variable', 'relative_importance']]
importances = importances.groupby('variable').mean()
importanceOrder = importances.sort_values(by="relative_importance", ascending=False)
importanceOrder.iplot(kind='bar', colors='#5AC4F2', theme='white')

## Leaderboard ROC Curves

In [22]:
# Plot one validation ROC curve per leaderboard model, plus the chance diagonal.
# NOTE(review): ROC curves are only defined for binary classifiers, while the
# response selected above is numeric -- confirm this section applies to this data set.

# The original cell plotted leaderboard rows 0-8; never index past the rows that exist
MAX_ROC_MODELS = 9
rocModelCount = min(MAX_ROC_MODELS, leaders.nrows)

axisTitleFont = dict(family='Arial, sans-serif', size=15, color='grey')
layout = go.Layout(autosize=False, width=725, height=575,
                   xaxis=dict(title='False Positive Rate', titlefont=axisTitleFont),
                   yaxis=dict(title='True Positive Rate', titlefont=axisTitleFont))

rocTraces = []
for rank in range(rocModelCount):
    # Column 0 of the leaderboard holds the model id (column 1 is a metric value)
    rankedModel = h2o.get_model(leaders[rank, 0])
    # roc() yields the false-positive rates followed by the true-positive rates
    rocPoints = np.array(rankedModel.roc(valid=True))

    # The leader and the runner-up are drawn bold; every other model is a thin, pale line
    if rank == 0:
        traceName, traceColor, traceWidth = 'Leader', 'rgb(26, 58, 126)', 3
    elif rank == 1:
        traceName, traceColor, traceWidth = 'Model 1', 'rgb(135, 160, 216)', 3
    else:
        traceName, traceColor, traceWidth = 'Model ' + str(rank), 'rgb(156, 190, 241)', 1

    rocTraces.append(go.Scatter(x = rocPoints[0], y = rocPoints[1], mode = 'lines', name = traceName,
                                line = dict(color = traceColor, width = traceWidth)))

# Diagonal reference line: the curve of a model that guesses at random
traceChanceLine = go.Scatter(x = [0,1], y = [0,1], mode = 'lines+markers', name = 'chance', line = dict(color = ('rgb(136, 140, 150)'), width = 4, dash = 'dash'))

# Every model trace is included (the original figure silently left out model 6)
fig = go.Figure(data=rocTraces + [traceChanceLine], layout=layout)


py.iplot(fig)

## Confusion Matrix

In [13]:
# Confusion matrix of the AutoML leader, converted to a pandas DataFrame
leaderConfusion = autoModel.leader.confusion_matrix()
cm = leaderConfusion.table.as_data_frame()

# Render the matrix as a Plotly table with a fixed size and a larger font
confusionMatrix = ff.create_table(cm)
confusionMatrix.layout.width=800
confusionMatrix.layout.height=300
confusionMatrix.layout.font.size=17
py.iplot(confusionMatrix)

## Business Impact Matrix

Weighting Predictions With a Dollar Value
- Correctly predicting GOOD: +\$500
- Correctly predicting BAD: +\$800
- Incorrectly predicting GOOD: -\$1000
- Incorrectly predicting BAD: -\$100

In [23]:
# Weight each confusion-matrix cell with a dollar value.
# `cm` is indexed as cm.loc[actual_row, predicted_column]: row 0 is actual BAD and
# row 1 is actual GOOD, matching the layout of `data_matrix` below.
# NOTE(review): the markdown above lists +$500 for a correct GOOD and +$800 for a
# correct BAD, the reverse of the two "correct" weights used here -- confirm which is intended.

# Actual BAD, predicted BAD
CorrectPredictBad = cm.loc[0,'BAD']
CorrectPredictBadImpact = 500
cm1 = CorrectPredictBad*CorrectPredictBadImpact

# Actual GOOD, predicted BAD
IncorrectPredictBad = cm.loc[1,'BAD']
IncorrectPredictBadImpact = -100
cm2 = IncorrectPredictBad*IncorrectPredictBadImpact

# Actual BAD, predicted GOOD
IncorrectPredictGood = cm.loc[0,'GOOD']
IncorrectPredictGoodImpact = -1000
cm3 = IncorrectPredictGood*IncorrectPredictGoodImpact

# Actual GOOD, predicted GOOD. This must read row 1; the original read row 0,
# which double-counted the "incorrectly predicted GOOD" cell.
CorrectPredictGood = cm.loc[1,'GOOD']
CorrectPredictGoodImpact = 800
cm4 = CorrectPredictGood*CorrectPredictGoodImpact


# Rows are actual classes, columns are predicted classes; the last row holds the totals
data_matrix = [['Business Impact', '($) Predicted BAD', '($) Predicted GOOD', '($) Total'],
               ['($) Actual BAD', cm1, cm3, '' ],
               ['($) Actual GOOD', cm2, cm4, ''],
               ['($) Total', cm1+cm2, cm3+cm4, cm1+cm2+cm3+cm4]]

impactMatrix = ff.create_table(data_matrix, height_constant=20, hoverinfo='weight')
impactMatrix.layout.height=300
impactMatrix.layout.width=800
impactMatrix.layout.font.size=17
py.iplot(impactMatrix)

In [None]:
# Persist the AutoML leader; no explicit path is passed, and the value returned
# by save_model is shown as the cell output
bestModel = autoModel.leader
h2o.save_model(model=bestModel)

In [None]:
def approve_loan(Loan_Amount,Term,Interest_Rate,Employment_Years,Home_Ownership,Annual_Income,Verification_Status,Loan_Purpose,State,
                 Debt_to_Income,Delinquent_2yr,Revolving_Cr_Util,Total_Accounts,Longest_Credit_Length,
                 model_path='DRF_model_1496459915419_4'):
    """Score a single loan application with a saved H2O model.

    NOTE(review): the inputs are loan attributes, not the turbine measurements
    used in the rest of this notebook -- confirm this function belongs here.

    Parameters
    ----------
    Loan_Amount ... Longest_Credit_Length
        One scalar per feature; each becomes a column of a one-row frame
        under the same name as the parameter.
    model_path : str, optional
        Path of the saved H2O model to load. Defaults to the path that was
        previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    str
        A single line with the predicted label and the two class probabilities.
    """
    # connect to the model scoring service
    h2o.connect()

    # open the downloaded model
    loanModel = h2o.load_model(path=model_path)

    # define a one-row feature vector to evaluate with the model
    newData = pd.DataFrame({'Loan_Amount' : Loan_Amount,
                            'Term' : Term,
                            'Interest_Rate' : Interest_Rate,
                            'Employment_Years' : Employment_Years,
                            'Home_Ownership' : Home_Ownership,
                            'Annual_Income' : Annual_Income,
                            'Verification_Status' : Verification_Status,
                            'Loan_Purpose' : Loan_Purpose,
                            'State' : State,
                            'Debt_to_Income' : Debt_to_Income,
                            'Delinquent_2yr' : Delinquent_2yr,
                            'Revolving_Cr_Util' : Revolving_Cr_Util,
                            'Total_Accounts' : Total_Accounts,
                            'Longest_Credit_Length' : Longest_Credit_Length}, index=[0])

    # evaluate the feature vector using the model
    predictions = loanModel.predict(h2o.H2OFrame(newData))
    predictionsOut = h2o.as_list(predictions, use_pandas=False)

    # Element 1 is read as the scored row (element 0 is presumably the header row -- TODO confirm);
    # its columns are assumed to be [label, P(bad), P(good)] -- verify against the model's output
    scoredRow = predictionsOut[1]
    prediction = scoredRow[0]
    probabilityBad = scoredRow[1]
    probabilityGood = scoredRow[2]
    return "Prediction: " + str(prediction) + " |Probability of Bad Loan: " + str(probabilityBad) + " |Probability of Good Loan: " + str(probabilityGood)

In [None]:
# Example application: the loan itself
Loan_Amount = 5000
Term = "60 months"
Interest_Rate = 13
Loan_Purpose = "credit_card"

# The applicant
Employment_Years = 5
Home_Ownership = "RENT"
Annual_Income = 75000
Verification_Status = "VERIFIED - income"
State = "CA"

# Credit history
# NOTE(review): Debt_to_Income and Delinquent_2yr are passed as strings while the
# other numeric inputs are numbers -- confirm the model expects that
Debt_to_Income = "16.12"
Delinquent_2yr = "0"
Revolving_Cr_Util = 37
Total_Accounts = 6
Longest_Credit_Length = 97

# Score the example; the returned summary string is the cell output
approve_loan(
    Loan_Amount=Loan_Amount,
    Term=Term,
    Interest_Rate=Interest_Rate,
    Employment_Years=Employment_Years,
    Home_Ownership=Home_Ownership,
    Annual_Income=Annual_Income,
    Verification_Status=Verification_Status,
    Loan_Purpose=Loan_Purpose,
    State=State,
    Debt_to_Income=Debt_to_Income,
    Delinquent_2yr=Delinquent_2yr,
    Revolving_Cr_Util=Revolving_Cr_Util,
    Total_Accounts=Total_Accounts,
    Longest_Credit_Length=Longest_Credit_Length,
)