In [1]:
cd "~/IBM_Attrition_DataChallenge/"  

/Users/naveenmirapuri/IBM_Attrition_DataChallenge


In [2]:
import pandas as pd
import numpy as np

# to be used for graphing
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls


# to be used for machine learning
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, log_loss, classification_report)
import xgboost
 

# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

First read in the raw data. Then as a standard practice, return the first few rows to check the dataframe is correct.

In [3]:
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')         # reads in dataset using pandas
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


We can realize there are four columns that are the same/arbitrary for each person, so we will remove these below.

In [4]:
df.drop(['EmployeeCount', 'Over18', 'StandardHours'], axis="columns", inplace=True)

We will check for missing data to ensure that no rows have to be removed/edited. If some values are missing, there are multiple ways to correct the issue. We could use a deletion method (listwise, pairwise, etc.) to remove rows or variables with missing data; however, this is not best practice because it could result in losing lots of information. We could also try to fill in the data using some imputation method. Imputation methods can range from simple mean/median/mode calculations to more complex techniques like the 'nearest neighbors' method, which extracts an estimate only from rows with similar traits.

In [5]:
df.isnull().values.any()

False

There are no null values! This means we don't have to bother with missing data and can move on to the next step.

Next, we will remove any columns with the same value in each row, as they are the same for everyone and will provide no additional information.

We will now check that each employee is only entered in the dataset once. When calculating attrition, it would be fine if an employee was counted twice, say if attrition changes after some promotion/work-life change. However, when gathering general statistics on IBM employees (number of workers, distribution of jobs, etc.), we want to avoid double counting. We will do this through noticing that there is an Employee Number column with unique identifiers.

In [6]:
df["EmployeeNumber"].is_unique

True

Great! Because the column is the same before and after duplicates are removed, there are no employee recurrences to worry about! We can now eliminate EmployeeNumber, as it is an arbitratry variable as well.

In [7]:
df.drop(['EmployeeNumber'], axis="columns", inplace=True)

We also have to eliminate correlated data from the dataframe. Correlated data are variables that have a strong association with eachother and will thus tend to follow related patterns (note: not causation). This is an important step in cleaning the data, as we do not want multiple independent variables conveying the same information to the model. Correlated data can make models unstable and introduce variance/ovrfitting to the dataset by double-counting.

We can only check correlations between numeric values, so we will filter out other types.

In [8]:
#numerics = df.select_dtypes([np.number]); # this is no longer needed as corr already implements a filtering function

We will now use the corr method to return correlation values for all the variables. A correlation close to +/- 1 represents that the variables on each side are highly correlated. As shown, there is a diagonal of 1s where each variable intersects wiith itself and the values in the upper right triangle are perfectly mirrored across the diagonal. The corr function uses Pearson's rank-based measure of association. 

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# This will display the correlation heatmap
plt.figure(figsize=(30, 30))
#sns.heatmap(df.corr(), annot=True, cmap="RdYlGn", annot_kws={"size":15})

<Figure size 2160x2160 with 0 Axes>

<Figure size 2160x2160 with 0 Axes>

We will use the same method used to generate the matrix again to identify the upper right portion. With the upper right portion, we will be able to check if any correlation values are above a certain threshold and remove them if they are. We only use the upper right because, as mentioned earlier, the matrix is perfeectly reflected across the diagonal. The code below uses a standard 0.77 cutoff value

In [10]:
# calculates correlations and isolates one triangle
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# stores columns that have a threshold higher than 0.77 and should thus be dropped
to_drop = [column for column in upper.columns if any(upper[column] > 0.77)]
print(to_drop)  # displays which columns will be dropped due to high correlation

['MonthlyIncome', 'PerformanceRating', 'TotalWorkingYears']


In [11]:
df = df.drop(df[to_drop], axis=1) # drops the columns found above from the dataframe
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,OverTime,PercentSalaryHike,RelationshipSatisfaction,StockOptionLevel,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,Yes,11,1,0,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,No,23,4,1,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,Yes,15,2,0,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,Yes,11,3,0,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,No,12,4,1,3,3,2,2,2,2


Before feeding the data into the model, we must first replace all of the categorical variables with dummies. Dummies are created when a categorical column is converted into multiple integer columns of binary values. For example, a column of Gender with category values of Male and Female, would now become two columns in the dataframe, one of 'GenderMale' and 'GenderFemale', each with either a 1 or 0 to indicate whether the row contains that gender. This is done to allow the model to read in and make inferences on categorical data.


We will first have to round up all of the categorical columns, then generate the dummies. We will exclude Attrition when assigning dummies, as we don't want multiple columns of the predictive column.

In [12]:
dfTemp = df.drop(['Attrition'], axis=1)  # drop attrition from the temp datafram to avoid multiple


Common practice here is to loop through the dataframe to identify which columns are categorical and need dummies

In [13]:
categorical = []
for col, value in dfTemp.iteritems():
    if value.dtype == 'object':
        categorical.append(col)

In [14]:
dummies = pd.get_dummies(df[categorical])  # once we have the category names, we can generate the dummies and preview
dummies.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,0,0,1,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
1,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
4,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0


In [15]:
# with the dummies created, we will remove the existing categorical columns from the df and append the new dummy 
# versions to the end
df = pd.concat([df.drop(categorical, axis=1),dummies], axis=1)
df.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,41,Yes,1102,1,2,2,94,3,2,4,...,0,0,0,1,0,0,0,1,0,1
1,49,No,279,8,1,3,61,2,2,2,...,0,0,1,0,0,0,1,0,1,0
2,37,Yes,1373,2,2,4,92,2,1,3,...,0,0,0,0,0,0,0,1,0,1
3,33,No,1392,3,4,4,56,3,1,3,...,0,0,1,0,0,0,1,0,0,1
4,27,No,591,2,1,1,40,3,1,2,...,0,0,0,0,0,0,1,0,1,0


Lastly in data preparation, we finish with our target. Before trying to build a model, it is important to check the distribution of the data you are trying to solve for. If the data is skewed heavily, then perhaps even a simple 'always yes' or 'always no' model would perform better than any program you could create. Thus, we will map and count the values for each entry of attrition to understand more about the skew.

In [16]:
attr_map = {'Yes':1, 'No':0}
attr_real = df["Attrition"].apply(lambda x: attr_map[x])

In [17]:
df["Attrition"].value_counts()


No     1233
Yes     237
Name: Attrition, dtype: int64

We can see that there are substantially more No values than Yes in terms of attrition. In fact, if you were to create a model which only predicted No, you would be correct over 83% of the time! This is sure to be larger than the models some people create, so it is important to be wary of this number when moving forward. The usual fix for skewed data is to take the logarithm of all values, but seeing as Attrition is a categorical variable, this won't work. Instead, we will have to use a program to create synthetic, realistic minority cases (Yes cases, such that the dataset will be balanced. We will get into which exact libraries to call in a few lines

For now, lets start building the model and get to work instantiating the hyperparameters for the program. We will be using the automated ML framework sklearn to construct our model




In [18]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

In [19]:
# initiating the basic hyperparameters into the sklearn data structure for holding training/testing data and parameters
train, test, target_train, target_val = train_test_split(df.drop(['Attrition'], axis=1), 
                                                         attr_real, 
                                                         train_size= 0.75,
                                                         random_state=0);

Seriously though, this is where we are going to be fixing the skewness mentioned earlier. We will use a program called SMOTE specialized for classification tasks. As explained, SMOTE creates inference cases to expand the minority portion of the dataset.

In [20]:
from imblearn.over_sampling import SMOTE
regularizer = SMOTE(random_state=0)
train_regularized, target_regularized = regularizer.fit_sample(train,target_train)

In [21]:
seed = 0   # We set our random seed to zero for reproducibility
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 1000,
#     'warm_start': True, 
    'max_features': 0.3,
    'max_depth': 4,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'random_state' : seed,
    'verbose': 0
}

rf = RandomForestClassifier(**rf_params)

rf.fit(train_regularized, target_regularized)
print("Fitting of Random Forest finished")

Fitting of Random Forest finished


In [22]:
rf_predictions = rf.predict(test)
print("Predictions finished")

Predictions finished


In [23]:
print("Accuracy score: {}".format(accuracy_score(target_val, rf_predictions)))
print("="*80)
print(classification_report(target_val, rf_predictions))

Accuracy score: 0.8614130434782609
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       310
           1       0.57      0.50      0.53        58

    accuracy                           0.86       368
   macro avg       0.74      0.71      0.73       368
weighted avg       0.85      0.86      0.86       368



We can now use our random forrest design to identify which features are the most important! According to the graph below, we see that overtime, Job Level, and Stock Option Level are the three most important features. See read.me for more analysis!

In [24]:
trace = go.Scatter(
    y = rf.feature_importances_,
    x = df.drop(['Attrition'], axis=1).columns.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 13,
        #size= rf.feature_importances_,
        #color = np.random.randn(500), #set color equal to a variable
        color = rf.feature_importances_,
        colorscale='Portland',
        showscale=True
    ),
    text = df.drop(['Attrition'], axis=1).columns.values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Random Forest Feature Importance',
    hovermode= 'closest',
     xaxis= dict(
         ticklen= 5,
         showgrid=False,
        zeroline=False,
        showline=False
     ),
    yaxis=dict(
        title= 'Feature Importance',
        showgrid=False,
        zeroline=False,
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

----END OF CODE-----------

In [25]:
df.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,41,Yes,1102,1,2,2,94,3,2,4,...,0,0,0,1,0,0,0,1,0,1
1,49,No,279,8,1,3,61,2,2,2,...,0,0,1,0,0,0,1,0,1,0
2,37,Yes,1373,2,2,4,92,2,1,3,...,0,0,0,0,0,0,0,1,0,1
3,33,No,1392,3,4,4,56,3,1,3,...,0,0,1,0,0,0,1,0,0,1
4,27,No,591,2,1,1,40,3,1,2,...,0,0,0,0,0,0,1,0,1,0


In [26]:
#type(df['Attrition'])

Below is a basic implementation of the automated library h2o, unfortunately, if you wish to explore below, you will
see that the results were not great, so the previous model was preferred

In [27]:
import h2o
from h2o.estimators import H2ODeepLearningEstimator
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_111"; Java(TM) SE Runtime Environment (build 1.8.0_111-b14); Java HotSpot(TM) 64-Bit Server VM (build 25.111-b14, mixed mode)
  Starting server from /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/q_/x63jnvxs4r72xyfgkgfvlgc80000gn/T/tmpqadq12w1
  JVM stdout: /var/folders/q_/x63jnvxs4r72xyfgkgfvlgc80000gn/T/tmpqadq12w1/h2o_naveenmirapuri_started_from_python.out
  JVM stderr: /var/folders/q_/x63jnvxs4r72xyfgkgfvlgc80000gn/T/tmpqadq12w1/h2o_naveenmirapuri_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.2
H2O_cluster_version_age:,8 days
H2O_cluster_name:,H2O_from_python_naveenmirapuri_9i7lvx
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.556 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [28]:
dfH2O = h2o.H2OFrame(df)
#df.to_csv('dataframeFinal.csv')
#dfH2O = h2o.import_file("dataframeFinal.csv")
dfH2O['Attrition'] = dfH2O['Attrition'].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [29]:
 dl = H2ODeepLearningEstimator(distribution="tweedie",
                               hidden=[1],
                               epochs=1000,
                               train_samples_per_iteration=-1,
                               reproducible=True,
                               activation="Tanh",
                               single_node_mode=False,
                               balance_classes=False,
                               force_load_balance=False,
                               seed=23123,
                               tweedie_power=1.5,
                               score_training_samples=0,
                               score_validation_samples=0,
                               stopping_rounds=0)

In [30]:
dl.train(x=list(range(3)),
          y="Attrition",
          training_frame=dfH2O)

H2OResponseError: ModelBuilderErrorV3  (water.exceptions.H2OModelBuilderIllegalArgumentException):
    timestamp = 1599893004285
    error_url = '/3/ModelBuilders/deeplearning'
    msg = 'Illegal argument(s) for DeepLearning model: DeepLearning_model_python_1599892999421_1.  Details: ERRR on field: _distribution: tweedie distribution is not allowed for classification.\n\nFor more information visit:\n  http://jira.h2o.ai/browse/TN-2'
    dev_msg = 'Illegal argument(s) for DeepLearning model: DeepLearning_model_python_1599892999421_1.  Details: ERRR on field: _distribution: tweedie distribution is not allowed for classification.\n\nFor more information visit:\n  http://jira.h2o.ai/browse/TN-2'
    http_status = 412
    values = {'messages': [{'_log_level': 5, '_field_name': '_keep_cross_validation_models', '_message': 'Only for cross-validation.'}, {'_log_level': 5, '_field_name': '_keep_cross_validation_predictions', '_message': 'Only for cross-validation.'}, {'_log_level': 5, '_field_name': '_keep_cross_validation_fold_assignment', '_message': 'Only for cross-validation.'}, {'_log_level': 5, '_field_name': '_fold_assignment', '_message': 'Only for cross-validation.'}, {'_log_level': 5, '_field_name': '_quantile_alpha', '_message': 'Quantile (alpha) is only used for Quantile regression.'}, {'_log_level': 5, '_field_name': '_max_after_balance_size', '_message': 'Balance classes is false, hide max_after_balance_size'}, {'_log_level': 5, '_field_name': '_max_hit_ratio_k', '_message': 'Max K-value for hit ratio is only applicable to multi-class classification problems.'}, {'_log_level': 5, '_field_name': '_max_confusion_matrix_size', '_message': 'Only for multi-class classification problems.'}, {'_log_level': 5, '_field_name': '_max_after_balance_size', '_message': 'Only used with balanced classes'}, {'_log_level': 5, '_field_name': '_class_sampling_factors', '_message': 'Class sampling factors is only applicable if balancing classes.'}, {'_log_level': 5, '_field_name': '_score_validation_samples', '_message': 'score_validation_samples requires a validation frame.'}, {'_log_level': 5, '_field_name': '_regression_stop', '_message': 'regression_stop is used only with regression.'}, {'_log_level': 5, '_field_name': '_score_validation_sampling', '_message': 'score_validation_sampling requires classification and a validation frame.'}, {'_log_level': 5, '_field_name': '_hidden_dropout_ratios', '_message': 'hidden_dropout_ratios requires a dropout activation function.'}, {'_log_level': 5, '_field_name': '_replicate_training_data', '_message': 'replicate_training_data is only valid with cloud size greater than 1.'}, {'_log_level': 5, '_field_name': '_rate', '_message': 'rate is not used with adaptive_rate.'}, {'_log_level': 5, '_field_name': '_rate_annealing', '_message': 'rate_annealing is not used with adaptive_rate.'}, {'_log_level': 5, '_field_name': '_rate_decay', '_message': 'rate_decay is not used with adaptive_rate.'}, {'_log_level': 5, '_field_name': '_momentum_start', '_message': 'momentum_start is not used with adaptive_rate.'}, {'_log_level': 5, '_field_name': '_momentum_ramp', '_message': 'momentum_ramp is not used with adaptive_rate.'}, {'_log_level': 5, '_field_name': '_momentum_stable', '_message': 'momentum_stable is not used with adaptive_rate.'}, {'_log_level': 5, '_field_name': '_initial_weight_scale', '_message': 'initial_weight_scale is not used if initial_weight_distribution == UniformAdaptive.'}, {'_log_level': 1, '_field_name': '_distribution', '_message': 'tweedie distribution is not allowed for classification.\n\nFor more information visit:\n  http://jira.h2o.ai/browse/TN-2'}, {'_log_level': 5, '_field_name': '_elastic_averaging_moving_rate', '_message': 'Elastic averaging is required for this parameter.'}, {'_log_level': 5, '_field_name': '_elastic_averaging_regularization', '_message': 'Elastic averaging is required for this parameter.'}], 'algo': 'DeepLearning', 'parameters': {'_train': {'name': 'py_1_sid_8f34', 'type': 'Key'}, '_valid': None, '_nfolds': 0, '_keep_cross_validation_models': True, '_keep_cross_validation_predictions': False, '_keep_cross_validation_fold_assignment': False, '_parallelize_cross_validation': True, '_auto_rebalance': True, '_seed': 23123, '_fold_assignment': 'AUTO', '_categorical_encoding': 'AUTO', '_max_categorical_levels': 10, '_distribution': 'tweedie', '_tweedie_power': 1.5, '_quantile_alpha': 0.5, '_huber_alpha': 0.9, '_ignored_columns': ['Department_Sales', 'JobRole_Sales Representative', 'MaritalStatus_Married', 'EnvironmentSatisfaction', 'HourlyRate', 'YearsAtCompany', 'JobLevel', 'MonthlyRate', 'BusinessTravel_Travel_Rarely', 'BusinessTravel_Non-Travel', 'JobRole_Manufacturing Director', 'JobInvolvement', 'JobSatisfaction', 'EducationField_Marketing', 'MaritalStatus_Divorced', 'JobRole_Healthcare Representative', 'BusinessTravel_Travel_Frequently', 'YearsWithCurrManager', 'JobRole_Research Scientist', 'EducationField_Other', 'EducationField_Technical Degree', 'JobRole_Research Director', 'EducationField_Life Sciences', 'JobRole_Manager', 'OverTime_No', 'Education', 'RelationshipSatisfaction', 'WorkLifeBalance', 'MaritalStatus_Single', 'JobRole_Laboratory Technician', 'Department_Human Resources', 'JobRole_Human Resources', 'EducationField_Human Resources', 'OverTime_Yes', 'Gender_Male', 'EducationField_Medical', 'Department_Research & Development', 'PercentSalaryHike', 'TrainingTimesLastYear', 'YearsSinceLastPromotion', 'YearsInCurrentRole', 'Gender_Female', 'JobRole_Sales Executive', 'NumCompaniesWorked', 'StockOptionLevel', 'DistanceFromHome'], '_ignore_const_cols': True, '_weights_column': None, '_offset_column': None, '_fold_column': None, '_check_constant_response': True, '_is_cv_model': False, '_score_each_iteration': False, '_max_runtime_secs': 0.0, '_stopping_rounds': 0, '_stopping_metric': 'AUTO', '_stopping_tolerance': 0.0, '_response_column': 'Attrition', '_balance_classes': False, '_max_after_balance_size': 5.0, '_class_sampling_factors': None, '_max_confusion_matrix_size': 20, '_checkpoint': None, '_pretrained_autoencoder': None, '_custom_metric_func': None, '_custom_distribution_func': None, '_export_checkpoints_dir': None, '_gainslift_bins': -1, '_overwrite_with_best_model': True, '_autoencoder': False, '_use_all_factor_levels': True, '_standardize': True, '_activation': 'Tanh', '_hidden': [1], '_epochs': 1000.0, '_train_samples_per_iteration': -1, '_target_ratio_comm_to_comp': 0.05, '_adaptive_rate': True, '_rho': 0.99, '_epsilon': 1e-08, '_rate': 0.005, '_rate_annealing': 1e-06, '_rate_decay': 1.0, '_momentum_start': 0.0, '_momentum_ramp': 1000000.0, '_momentum_stable': 0.0, '_nesterov_accelerated_gradient': True, '_input_dropout_ratio': 0.0, '_hidden_dropout_ratios': None, '_l1': 0.0, '_l2': 0.0, '_max_w2': 3.4028235e+38, '_initial_weight_distribution': 'UniformAdaptive', '_initial_weight_scale': 1.0, '_initial_weights': None, '_initial_biases': None, '_loss': 'Automatic', '_score_interval': 5.0, '_score_training_samples': 0, '_score_validation_samples': 0, '_score_duty_cycle': 0.1, '_classification_stop': 0.0, '_regression_stop': 1e-06, '_quiet_mode': False, '_score_validation_sampling': 'Uniform', '_diagnostics': True, '_variable_importances': True, '_fast_mode': True, '_force_load_balance': False, '_replicate_training_data': True, '_single_node_mode': False, '_shuffle_training_data': False, '_missing_values_handling': 'MeanImputation', '_sparse': False, '_col_major': False, '_average_activation': 0.0, '_sparsity_beta': 0.0, '_max_categorical_features': 2147483647, '_reproducible': True, '_export_weights_and_biases': False, '_elastic_averaging': False, '_elastic_averaging_moving_rate': 0.9, '_elastic_averaging_regularization': 0.001, '_mini_batch_size': 1}, 'error_count': 2}
    exception_msg = 'Illegal argument(s) for DeepLearning model: DeepLearning_model_python_1599892999421_1.  Details: ERRR on field: _distribution: tweedie distribution is not allowed for classification.\n\nFor more information visit:\n  http://jira.h2o.ai/browse/TN-2'
    stacktrace =
        water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for DeepLearning model: DeepLearning_model_python_1599892999421_1.  Details: ERRR on field: _distribution: tweedie distribution is not allowed for classification.

For more information visit:
  http://jira.h2o.ai/browse/TN-2
        water.exceptions.H2OModelBuilderIllegalArgumentException.makeFromBuilder(H2OModelBuilderIllegalArgumentException.java:19)
        hex.ModelBuilder.trainModelOnH2ONode(ModelBuilder.java:318)
        water.api.ModelBuilderHandler.handle(ModelBuilderHandler.java:51)
        water.api.ModelBuilderHandler.handle(ModelBuilderHandler.java:16)
        water.api.RequestServer.serve(RequestServer.java:470)
        water.api.RequestServer.doGeneric(RequestServer.java:301)
        water.api.RequestServer.doPost(RequestServer.java:227)
        javax.servlet.http.HttpServlet.service(HttpServlet.java:707)
        javax.servlet.http.HttpServlet.service(HttpServlet.java:790)
        org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:865)
        org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:535)
        org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:255)
        org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1317)
        org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:203)
        org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:473)
        org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:201)
        org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1219)
        org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:144)
        org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:126)
        org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)
        water.webserver.jetty9.Jetty9ServerAdapter$LoginHandler.handle(Jetty9ServerAdapter.java:130)
        org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:126)
        org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)
        org.eclipse.jetty.server.Server.handle(Server.java:531)
        org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:352)
        org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:260)
        org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:281)
        org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:102)
        org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:118)
        org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:333)
        org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:310)
        org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:168)
        org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:126)
        org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:366)
        org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:762)
        org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:680)
        java.lang.Thread.run(Thread.java:745)
    parameters = {'__meta': {'schema_version': 3, 'schema_name': 'DeepLearningParametersV3', 'schema_type': 'DeepLearningParameters'}, 'model_id': None, 'training_frame': {'__meta': {'schema_version': 3, 'schema_name': 'FrameKeyV3', 'schema_type': 'Key<Frame>'}, 'name': 'py_1_sid_8f34', 'type': 'Key<Frame>', 'URL': '/3/Frames/py_1_sid_8f34'}, 'validation_frame': None, 'nfolds': 0, 'keep_cross_validation_models': True, 'keep_cross_validation_predictions': False, 'keep_cross_validation_fold_assignment': False, 'parallelize_cross_validation': True, 'distribution': 'tweedie', 'tweedie_power': 1.5, 'quantile_alpha': 0.5, 'huber_alpha': 0.9, 'response_column': {'__meta': {'schema_version': 3, 'schema_name': 'ColSpecifierV3', 'schema_type': 'VecSpecifier'}, 'column_name': 'Attrition', 'is_member_of_frames': None}, 'weights_column': None, 'offset_column': None, 'fold_column': None, 'fold_assignment': 'AUTO', 'categorical_encoding': 'AUTO', 'max_categorical_levels': 10, 'ignored_columns': ['Department_Sales', 'JobRole_Sales Representative', 'MaritalStatus_Married', 'EnvironmentSatisfaction', 'HourlyRate', 'YearsAtCompany', 'JobLevel', 'MonthlyRate', 'BusinessTravel_Travel_Rarely', 'BusinessTravel_Non-Travel', 'JobRole_Manufacturing Director', 'JobInvolvement', 'JobSatisfaction', 'EducationField_Marketing', 'MaritalStatus_Divorced', 'JobRole_Healthcare Representative', 'BusinessTravel_Travel_Frequently', 'YearsWithCurrManager', 'JobRole_Research Scientist', 'EducationField_Other', 'EducationField_Technical Degree', 'JobRole_Research Director', 'EducationField_Life Sciences', 'JobRole_Manager', 'OverTime_No', 'Education', 'RelationshipSatisfaction', 'WorkLifeBalance', 'MaritalStatus_Single', 'JobRole_Laboratory Technician', 'Department_Human Resources', 'JobRole_Human Resources', 'EducationField_Human Resources', 'OverTime_Yes', 'Gender_Male', 'EducationField_Medical', 'Department_Research & Development', 'PercentSalaryHike', 'TrainingTimesLastYear', 'YearsSinceLastPromotion', 'YearsInCurrentRole', 'Gender_Female', 'JobRole_Sales Executive', 'NumCompaniesWorked', 'StockOptionLevel', 'DistanceFromHome'], 'ignore_const_cols': True, 'score_each_iteration': False, 'checkpoint': None, 'stopping_rounds': 0, 'max_runtime_secs': 0.0, 'stopping_metric': 'AUTO', 'stopping_tolerance': 0.0, 'gainslift_bins': -1, 'custom_metric_func': None, 'custom_distribution_func': None, 'export_checkpoints_dir': None, 'balance_classes': False, 'class_sampling_factors': None, 'max_after_balance_size': 5.0, 'max_confusion_matrix_size': 20, 'max_hit_ratio_k': 0, 'activation': 'Tanh', 'hidden': [1], 'epochs': 1000.0, 'train_samples_per_iteration': -1, 'target_ratio_comm_to_comp': 0.05, 'seed': 23123, 'adaptive_rate': True, 'rho': 0.99, 'epsilon': 1e-08, 'rate': 0.005, 'rate_annealing': 1e-06, 'rate_decay': 1.0, 'momentum_start': 0.0, 'momentum_ramp': 1000000.0, 'momentum_stable': 0.0, 'nesterov_accelerated_gradient': True, 'input_dropout_ratio': 0.0, 'hidden_dropout_ratios': None, 'l1': 0.0, 'l2': 0.0, 'max_w2': 3.4028235e+38, 'initial_weight_distribution': 'UniformAdaptive', 'initial_weight_scale': 1.0, 'initial_weights': None, 'initial_biases': None, 'loss': 'Automatic', 'score_interval': 5.0, 'score_training_samples': 0, 'score_validation_samples': 0, 'score_duty_cycle': 0.1, 'classification_stop': 0.0, 'regression_stop': 1e-06, 'quiet_mode': False, 'score_validation_sampling': 'Uniform', 'overwrite_with_best_model': True, 'autoencoder': False, 'use_all_factor_levels': True, 'standardize': True, 'diagnostics': True, 'variable_importances': True, 'fast_mode': True, 'force_load_balance': False, 'replicate_training_data': True, 'single_node_mode': False, 'shuffle_training_data': False, 'missing_values_handling': 'MeanImputation', 'sparse': False, 'col_major': False, 'average_activation': 0.0, 'sparsity_beta': 0.0, 'max_categorical_features': 2147483647, 'reproducible': True, 'export_weights_and_biases': False, 'mini_batch_size': 1, 'elastic_averaging': False, 'elastic_averaging_moving_rate': 0.9, 'elastic_averaging_regularization': 0.001, 'pretrained_autoencoder': None}
    messages = [{'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'keep_cross_validation_models', 'message': 'Only for cross-validation.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'keep_cross_validation_predictions', 'message': 'Only for cross-validation.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'keep_cross_validation_fold_assignment', 'message': 'Only for cross-validation.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'fold_assignment', 'message': 'Only for cross-validation.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'quantile_alpha', 'message': 'Quantile (alpha) is only used for Quantile regression.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'max_after_balance_size', 'message': 'Balance classes is false, hide max_after_balance_size'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'max_hit_ratio_k', 'message': 'Max K-value for hit ratio is only applicable to multi-class classification problems.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'max_confusion_matrix_size', 'message': 'Only for multi-class classification problems.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'max_after_balance_size', 'message': 'Only used with balanced classes'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'class_sampling_factors', 'message': 'Class sampling factors is only applicable if balancing classes.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'score_validation_samples', 'message': 'score_validation_samples requires a validation frame.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'regression_stop', 'message': 'regression_stop is used only with regression.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'score_validation_sampling', 'message': 'score_validation_sampling requires classification and a validation frame.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'hidden_dropout_ratios', 'message': 'hidden_dropout_ratios requires a dropout activation function.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'replicate_training_data', 'message': 'replicate_training_data is only valid with cloud size greater than 1.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'rate', 'message': 'rate is not used with adaptive_rate.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'rate_annealing', 'message': 'rate_annealing is not used with adaptive_rate.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'rate_decay', 'message': 'rate_decay is not used with adaptive_rate.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'momentum_start', 'message': 'momentum_start is not used with adaptive_rate.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'momentum_ramp', 'message': 'momentum_ramp is not used with adaptive_rate.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'momentum_stable', 'message': 'momentum_stable is not used with adaptive_rate.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'initial_weight_scale', 'message': 'initial_weight_scale is not used if initial_weight_distribution == UniformAdaptive.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'ERRR', 'field_name': 'distribution', 'message': 'tweedie distribution is not allowed for classification.\n\nFor more information visit:\n  http://jira.h2o.ai/browse/TN-2'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'elastic_averaging_moving_rate', 'message': 'Elastic averaging is required for this parameter.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'elastic_averaging_regularization', 'message': 'Elastic averaging is required for this parameter.'}]
    error_count = 2


In [None]:
#predictors = list(df.drop(['Attrition'], axis=1))
#response = "Attrition"
#trainH2O = h2o.H2OFrame(train_regularized)
#testH2O = h2o.H2OFrame(test)

#trainH2O[response] = trainH2O[response].asfactor()
#testH2O[response] = testH2O[response].asfactor()


#df = h2o.H2OFrame(df)
#df["Attrition"] = df["Attrition"].asfactor()

In [None]:
#train, valid = df.split_frame(ratios=[0.8], seed=1234)

In [None]:
#h2o_Model = H2ORandomForestEstimator(ntrees=30,
#                                     max_depth=15,
#                                     min_rows=10,
#                                     calibrate_model=True,
#                                     calibration_frame=valid,
#                                     binomial_double_trees=True)

#h2o_Model.train(x=predictors,
#               y=response,
#               training_frame=train,
#               validation_frame=valid)

perf = h2o_Model.model_performance()
perf