In [123]:
## importing the relevant packages:

# clear the workspace
%reset -f

# print list of files in directory
import os
print(os.listdir())

# print/display all plots inline
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# the base packages
import collections # for the Counter function
import csv # for reading/writing csv files
import pandas as pd, numpy as np, time, gc, bisect

# the various packages/modules used across processing (sklearn), modelling (lightgbm) and bayesian optimization (hyperopt, bayes_opt)
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing, decomposition
from sklearn.cross_validation import cross_val_score, StratifiedKFold, StratifiedShuffleSplit
from sklearn.base import TransformerMixin
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import TSNE

from bayes_opt import BayesianOptimization
from tqdm import tqdm
from hyperopt import hp, tpe, STATUS_OK, fmin, Trials
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample

# modelling algorithms
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier

# Evaluation of the model
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from timeit import default_timer as timer

# Exporting packages for SHAP/LIME
import shap
import lime
import lime.lime_tabular

# missing value imputation
from fancyimpute import KNN, MICE #, NuclearNormMinimization

# define the global variables used later
MAX_EVALS = 10 # number of iterations/parameter sets created towards tuning
N_FOLDS = 5 # number of cv folds
randomseed = 1 # the value for the random state used at various points in the pipeline

['.ipynb_checkpoints', '.svn', 'archive', 'branches', 'FEATURE_ENGINEERING.ipynb', 'Missing Data Imputation.ipynb', 'MODEL INTERPRETER.ipynb', 'MODEL_SELECTION_TUNING_TEST_2017Dec_model3.ipynb', 'MODEL_TOP_FEATURES_DISTRIBUTION_Graphs.ipynb', 'OTHER_MODELS.ipynb', 'preparation', 'PREPARATION.ipynb', 'salary_bonus_2017.csv', 'snippets', 'tags', 'test2.csv', 'test3.csv', 'test_final.csv', 'train_dec2017.csv', 'train_final.csv', 'train_june2017.csv', 'trunk', 'Turnover2018tillapril.xlsx', 'valid_final.csv', 'wrong', 'X_train.csv', 'X_valid.csv']


In [124]:
#### MAIN CLASSES ####

## Two defined for now ##
# 1. DataFrame Imputer
#    - for imputing missing values
# 2. Prepare Data
#    - for sourcing, processing, and returning the train/test datasets

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.
        Columns of dtype object are imputed with the most frequent value 
        in column.
        Columns of other types are imputed with mean of column.
        """
        
    def fit(self, X, y=None):
        X.groupby(['pay scale group', 'abinbev entity2'])
        self.fill = pd.Series([X[c].value_counts().index[0] if X[c].dtype == np.dtype('O') else X[c].mean() for c in X], 
                              index=X.columns)
        X.groupby('abinbev entity2')
        self.fill = pd.Series([X[c].value_counts().index[0] if X[c].dtype == np.dtype('O') else X[c].mean() for c in X], 
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
    
    def num_missing(self):
        return sum(self.isnull())
    
    def imputer_mean(self, column):
        x = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
        return x.fit_transform(self[[column]]).ravel()
    
    def imputer_median(self, column):
        x = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)
        return x.fit_transform(self[[column]]).ravel()
    
    def imputer_mode(self, column):
        x = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0)
        return x.fit_transform(self[[column]]).ravel()
    
    def fancy_impute(X, which_method):
        """ currently supported algorithms are KNN, NNM and MICE from the fancyimpute package
        which_method = ['KNN', 'NNM', 'MICE']
        """
        if which_method == 'NNM': X = NuclearNormMinimization().complete(X) # NNM method
        if which_method == 'KNN': X = KNN(k=7, verbose=False).complete(X) # KNN method
        if which_method == 'MICE':
            X_complete_df = X.copy()
            mice = MICE(verbose=False)
            X_complete = mice.complete(np.asarray(X.values, dtype=float))
            X_complete_df.loc[:, X.columns] = X_complete[:][:]
            X = X_complete_df
        return X

class prepare_data():
    
    def __init__(self):
        """ To prepare data,
                1. read in data
                2. pre-processing/cleaning
                3. creating helper objects for later steps
                4. processing for modelling
                5. function return objects are the train, valid, response, categ cols/indices, feature names
        """
    
    def labelEncoder(train_df, valid_df, cat_columns, test_df = None):
        categorical_names = {}
        for feature in tqdm(cat_columns):
            le = preprocessing.LabelEncoder()
            le.fit(train_df[feature].astype(str))
            train_df[feature] = le.transform(train_df[feature].astype(str))
            if test_df is not None : test_df[feature] = test_df[feature].map(lambda i: 'No Data' if i not in le.classes_ else i)
            valid_df[feature] = valid_df[feature].map(lambda i: 'No Data' if i not in le.classes_ else i)
            le_classes = le.classes_.tolist()
            bisect.insort_left(le_classes, 'No Data')
            le.classes_ = le_classes
            if test_df is not None : test_df[feature] = le.transform(test_df[feature].astype(str))
            valid_df[feature] = le.transform(valid_df[feature].astype(str))
            categorical_names[feature] = le.classes_
        if test_df is not None :
            return train_df, test_df, valid_df, categorical_names
        if test_df is None :
            return train_df, valid_df, categorical_names
    
    ## function to get frequency count of elements in a vector/list
    def freq_count(input_vector):
        return collections.Counter(input_vector)
    
    def categ_feats(train_df, valid_df, test_df = None):
        x = list(train_df.dtypes)
        x_1 = [1 if x == 'O' else 0 for x in x]
        categorical_idx = [i for i, x in enumerate(x_1) if x == 1]

        # Get feature names and their values for categorical data (needed for LIME)
        cat_columns = train_df.select_dtypes(include=['object']).columns.values
        
        if test_df is not None:
            train_df, test_df, valid_df, categorical_names = prepare_data.labelEncoder(train_df, valid_df, cat_columns, test_df)
            return train_df, test_df, valid_df, categorical_names, categorical_idx
        elif test_df is None:
            train_df, valid_df, categorical_names = prepare_data.labelEncoder(train_df, valid_df, cat_columns)
            return train_df, valid_df, categorical_names, categorical_idx

    def create(input_file_path, input_file_path_2, response, cols_to_remove = ['id'], random_seed = 1):
        train = pd.read_csv(input_file_path, na_values=['No Data', ' ', 'UNKNOWN'])
        test = pd.read_csv(input_file_path_2, na_values=['No Data', ' ', 'UNKNOWN'])
        
        train = pd.DataFrame(train)
        test = pd.DataFrame(test)
        
        train.drop(cols_to_remove, axis = 1, inplace = True)
        test = pd.DataFrame(data = test[train.columns])
        
        print(train.shape, '\n')
        train.dropna(thresh=0.5*(train.shape[0]), axis=1, inplace = True)
        train.dropna(thresh=0.4*(train.shape[1]), axis=0, inplace = True)
        print(train.shape, '\n')
        test = test[train.columns]
        test.dropna(thresh=0.5*(test.shape[0]), axis=1, inplace = True)
        train = train[test.columns]
        
        # calling the missing value imputation function
        #print(train.apply(DataFrameImputer.num_missing, axis=0), '\n')
        imputer_object = DataFrameImputer()
        imputer_object.fit(train)
        train = imputer_object.transform(train)
        test = imputer_object.transform(test)
        
        print(prepare_data.freq_count(train[response]), '\n')

        # shuffle the dataframes so that the training is done in a random order.
        train = shuffle(train)
        test = shuffle(test)
        
        # creating the response vector
        y_train = train[response].values
        X_train = train.drop([response], axis = 1)
        y_valid = test[response].values
        X_valid = test.drop([response], axis = 1)
        
        ##  segment for usage if doing the train/test split ##
        ##  not essential if doing tuning using cross-validation ##
        X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, random_state = random_seed)
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)
        X_valid = pd.DataFrame(X_valid)
        
        X_train, X_test, X_valid, categ_names, categ_idx = prepare_data.categ_feats(X_train, X_test, X_valid)
        
        # returning as pandas dataframes to retain feature names for LIME and feature importance plots
        X_train = pd.DataFrame(data=X_train, columns=X_train.columns.values)
        X_test = pd.DataFrame(data=X_test, columns=X_test.columns.values)
        X_valid = pd.DataFrame(data=X_valid, columns=X_valid.columns.values)
        
        return X_train, X_test, X_valid, y_train, y_test, y_valid, categ_names, categ_idx
    
    def create_without_split(input_file_path, input_file_path_2, response, cols_to_remove = ['id'], random_seed = 1):
        train = pd.read_csv(input_file_path, na_values=['No Data', ' ', 'UNKNOWN'])
        test = pd.read_csv(input_file_path_2, na_values=['No Data', ' ', 'UNKNOWN'])
        
        train = pd.DataFrame(train)
        test = pd.DataFrame(test)
        
        train.drop(cols_to_remove, axis = 1, inplace = True)
        test = pd.DataFrame(data = test[train.columns])
        
        print(train.shape, '\n')
        train.dropna(thresh=0.5*(train.shape[0]), axis=1, inplace = True)
        train.dropna(thresh=0.4*(train.shape[1]), axis=0, inplace = True)
        print(train.shape, '\n')
        test = test[train.columns]
        test.dropna(thresh=0.5*(test.shape[0]), axis=1, inplace = True)
        train = train[test.columns]
                
        # calling the missing value imputation function
        #print(train.apply(DataFrameImputer.num_missing, axis=0), '\n')
#         imputer_object = DataFrameImputer()
#         imputer_object.fit(train)
#         train = imputer_object.transform(train)
#         test = imputer_object.transform(test)
        
        print(prepare_data.freq_count(train[response]), '\n')

        # shuffle the dataframes so that the training is done in a random order.
        train = shuffle(train)
        test = shuffle(test)
        
        # creating the response vector
        y_train = train[response].values
        X_train = train.drop([response], axis = 1)
        y_valid = test[response].values
        X_valid = test.drop([response], axis = 1)
        
        X_train = pd.DataFrame(X_train)
        X_valid = pd.DataFrame(X_valid)
        
        X_train, X_valid, categ_names, categ_idx = prepare_data.categ_feats(X_train, X_valid)
        
        feat_names = X_train.columns.values
        feat_names2 = X_valid.columns.values

        X_train = DataFrameImputer.fancy_impute(X_train, which_method='MICE')
        X_valid = DataFrameImputer.fancy_impute(X_valid, which_method='MICE')
        
        # returning as pandas dataframes to retain feature names for LIME and feature importance plots
        X_train = pd.DataFrame(data=X_train, columns=feat_names)
        X_valid = pd.DataFrame(data=X_valid, columns=feat_names2)
        
        return X_train, X_valid, y_train, y_valid, categ_names, categ_idx, feat_names

In [125]:
# create data function call
# CV approach
# train and valid features/response dataframes returned
# categorical column names/indices and all feature names also returned

X_train, X_valid, y_train, y_valid, categ_names, categ_idx, feat_names = prepare_data.create_without_split(input_file_path='train_final.csv',
                                                                  input_file_path_2='test_final.csv', response = 'label',
                                cols_to_remove = ['global id', 'original hire date', 'original id', 'pers. subarea text',
                                                  'manager global id', 'personnel number manager', 
                                                  'short text of organizational unit', 'position text', 
                                                  'physical work location-description', 'physical work location-city',
                                                  'position start date', 'manager position desc', 'costcenter description',
                                                  'local entity description', 'appraiser id'])

(6102, 77) 

(6085, 75) 

Counter({0: 5796, 1: 289}) 



100%|██████████████████████████████████████████████████████████████████████████████████| 41/41 [00:02<00:00, 18.37it/s]


In [126]:
print(X_train.shape)
#print(X_test.shape)
print(X_valid.shape)

print(y_train.shape)
#print(y_test.shape)
print(y_valid.shape)

print(collections.Counter(y_train))
#print(collections.Counter(y_test))
print(collections.Counter(y_valid))

(6085, 74)
(5897, 74)
(6085,)
(5897,)
Counter({0: 5796, 1: 289})
Counter({0: 5616, 1: 281})


## FEATURE ENGINEERING MODULE

1. PCA
2. ICA
3. tSVD
4. GRP
5. SRP
6. Binning
7. Deviation Encoding features
8. Salary related features
9. ...

In [127]:
class feat_eng():
    
    def __init__():
        """ this module contains several functions for creating new features. find below a brief description of each """
    
    def scalers(train, valid, which_method):
        if which_method == 'ss':
            sc = StandardScaler()
            sc.fit(train)
            train = pd.DataFrame(sc.transform(train))
            valid = pd.DataFrame(sc.transform(valid))
            return train, valid # scale all variables to zero mean and unit variance, required for PCA and related
        if which_method == 'mm':
            mm = MinMaxScaler()
            mm.fit(train)
            train = pd.DataFrame(mm.transform(train))
            valid = pd.DataFrame(mm.transform(valid))
            return train, valid # use this method to iterate
        
    def pca_feats(train, valid, n = .95):
            train, valid = feat_eng.scalers(train, valid, which_method='mm')
            pca_fit = decomposition.PCA(n_components=n)
            pca_fit.fit(train)
            pca_train = pd.DataFrame(pca_fit.transform(train))
            pca_valid = pd.DataFrame(pca_fit.transform(valid))
            pca_cols = list(set(list(pca_train)))
            pca_cols = ['pca_' + str(s) for s in pca_cols]
            pca_train.columns = pca_cols
            pca_valid.columns = pca_cols
            return pca_train, pca_valid
        
    def ica_feats(train, valid, n = 5):
            train, valid = feat_eng.scalers(train, valid, which_method='mm')
            ica_fit = decomposition.FastICA(n_components=n)
            ica_fit.fit(train)
            ica_train = pd.DataFrame(ica_fit.transform(train))
            ica_valid = pd.DataFrame(ica_fit.transform(valid))
            ica_cols = list(set(list(ica_train)))
            ica_cols = ['ica_' + str(s) for s in ica_cols]
            ica_train.columns = ica_cols
            ica_valid.columns = ica_cols
            return ica_train, ica_valid
        
    def tsvd_feats(train, valid, n = 5):
            train, valid = feat_eng.scalers(train, valid, which_method='mm')
            tsvd_fit = decomposition.TruncatedSVD(n_components=n)
            tsvd_fit.fit(train)
            tsvd_train = pd.DataFrame(tsvd_fit.transform(train))
            tsvd_valid = pd.DataFrame(tsvd_fit.transform(valid))
            tsvd_cols = list(set(list(tsvd_train)))
            tsvd_cols = ['tsvd_' + str(s) for s in tsvd_cols]
            tsvd_train.columns = tsvd_cols
            tsvd_valid.columns = tsvd_cols
            return tsvd_train, tsvd_valid
        
    def grp_feats(train, valid, n = 5):
            train, valid = feat_eng.scalers(train, valid, which_method='mm')
            grp_fit = GaussianRandomProjection(n_components=n, eps=0.1)
            grp_fit.fit(train)
            grp_train = pd.DataFrame(grp_fit.transform(train))
            grp_valid = pd.DataFrame(grp_fit.transform(valid))
            grp_cols = list(set(list(grp_train)))
            grp_cols = ['grp_' + str(s) for s in grp_cols]
            grp_train.columns = grp_cols
            grp_valid.columns = grp_cols
            return grp_train, grp_valid
    
    def srp_feats(train, valid, n = 5):
            train, valid = feat_eng.scalers(train, valid, which_method='mm')
            srp_fit = SparseRandomProjection(n_components=n, dense_output=True, eps=0.1)
            srp_fit.fit(train)
            srp_train = pd.DataFrame(srp_fit.transform(train))
            srp_valid = pd.DataFrame(srp_fit.transform(valid))
            srp_cols = list(set(list(srp_train)))
            srp_cols = ['srp_' + str(s) for s in srp_cols]
            srp_train.columns = srp_cols
            srp_valid.columns = srp_cols
            return srp_train, srp_valid
        
    def return_combined(train, valid, list_objects = ['pca', 'ica', 'tsvd', 'grp', 'srp', 'tsne']):
        if 'pca' in list_objects:
            train = pd.concat([train.reset_index(drop=True), pca_train], axis=1)
            valid = pd.concat([valid.reset_index(drop=True), pca_valid], axis=1)
        if 'ica' in list_objects:
            train = pd.concat([train.reset_index(drop=True), ica_train], axis=1)
            valid = pd.concat([valid.reset_index(drop=True), ica_valid], axis=1)
        if 'tsvd' in list_objects:
            train = pd.concat([train.reset_index(drop=True), tsvd_train], axis=1)
            valid = pd.concat([valid.reset_index(drop=True), tsvd_valid], axis=1)
        if 'grp' in list_objects:
            train = pd.concat([train.reset_index(drop=True), grp_train], axis=1)
            valid = pd.concat([valid.reset_index(drop=True), grp_valid], axis=1)
        if 'srp' in list_objects:
            train = pd.concat([train.reset_index(drop=True), srp_train], axis=1)
            valid = pd.concat([valid.reset_index(drop=True), srp_valid], axis=1)
        return train, valid

In [128]:
## calling the various feat engineering functions and adding those features
## pca, ica, tsvd, grp, srp
pca_train, pca_valid = feat_eng.pca_feats(train=X_train, valid=X_valid, n=.95)
ica_train, ica_valid = feat_eng.ica_feats(train=X_train, valid=X_valid, n=10)
tsvd_train, tsvd_valid = feat_eng.tsvd_feats(train=X_train, valid=X_valid, n=10)
grp_train, grp_valid = feat_eng.grp_feats(train=X_train, valid=X_valid, n=10)
srp_train, srp_valid = feat_eng.srp_feats(train=X_train, valid=X_valid, n=10)

## scale the data
X_train, X_valid = feat_eng.scalers(train=X_train, valid=X_valid, which_method='mm')

## return the final datasets with the added features
X_train, X_valid = feat_eng.return_combined(train = X_train, valid = X_valid)

In [None]:
X_train['response'] = y_train
X_valid['response'] = y_valid

X_train.to_csv('X_train.csv', index=False)
X_valid.to_csv('X_valid.csv', index=False)

In [None]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()

# Import a sample binary outcome train/test set into H2O
h2o_train = h2o.import_file("X_train.csv", header=1)
h2o_test = h2o.import_file("X_valid.csv", header=1)

# Identify the response and set of predictors
y = "response"
x = list(h2o_train.columns)  #if x is defined as all columns except the response, then x is not required
x.remove(y)

# For binary classification, response should be a factor
h2o_train[y] = h2o_train[y].asfactor()
h2o_test[y] = h2o_test[y].asfactor()

In [None]:
# Run AutoML for n seconds
n = 30
aml = H2OAutoML(max_runtime_secs = n, stopping_metric='mean_per_class_error', sort_metric='mean_per_class_error',
                class_sampling_factors=[1, 0.2], balance_classes = False)
aml.train(x = x, y = y, training_frame = h2o_train)

# Print Leaderboard (ranked by xval metrics)
print(aml.leaderboard)

# (Optional) Evaluate performance on a test set
perf = aml.leader.model_performance(h2o_test)
print(perf.auc())

In [None]:
pred = aml.predict(h2o_test)[:,2]
pred = pred.as_data_frame().as_matrix()
predict = np.where(pred > 0.1, 1, 0)
y_test=y_valid

recall_score = sklearn.metrics.recall_score(y_pred=predict, y_true=y_test)
precision_score = sklearn.metrics.precision_score(y_pred=predict, y_true=y_test)
f1_score = sklearn.metrics.f1_score(y_pred=predict, y_true=y_test)
auc_score = roc_auc_score(y_test, pred)
tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_pred=predict, y_true=y_test).ravel()
print(sklearn.metrics.confusion_matrix(y_pred=predict, y_true=y_test), '\n')
print('recall score is: ', recall_score)
print('precision score is: ', precision_score)
print('f1_score is: ', f1_score)
print('accuracy score: ', sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predict))
print('The final AUC after taking the best params and num_rounds when it stopped is {:.4f}.'.format(auc_score), '\n')

In [115]:
h2o.cluster().shutdown()

H2O session _sid_9353 closed.


In [103]:
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

In [81]:
### ONE-CLASS METHODS ###

from sklearn.model_selection import GridSearchCV

class oneclass_models():
    
    def __init__():
        """ this class contains several modelling algorithms for one-class classification/anomaly detection """

    def data_prepare(X_train, X_valid):
        Negatives=X_train[X_train['response']==0]
        Positives=X_train[X_train['response']==1]
        Negatives.drop(['response'], axis=1, inplace=True)
        Positives.drop(['response'], axis=1, inplace=True)
        print(Negatives.shape)
        print(Positives.shape)
        
        X_v = X_valid.drop(['response'], axis=1, inplace=False)
        
        return Positives, Negatives, X_v
        
    def uni_svm(X_train, X_valid):
        """ one-class svm by training separately on positives and negatives """
        
        Positives, Negatives, X_v = oneclass_models.data_prepare(X_train, X_valid)
        
        # Set the parameters by cross-validation
        params = [{'kernel': ['rbf', 'linear', 'poly'],
                   'gamma': [0.01, 0.1, 0.5],
                   'nu': [0.01, 0.1, 0.5]}]

        clf_P = GridSearchCV(sklearn.svm.OneClassSVM(), cv=5, param_grid=params, scoring='accuracy', verbose=1)
        clf_N = GridSearchCV(sklearn.svm.OneClassSVM(), cv=5, param_grid=params, scoring='accuracy', verbose=1)
        clf_P.fit(X=Positives, y=np.full(len(Positives),1))
        clf_N.fit(X=Negatives, y=np.full(len(Negatives),1))
        clf_AD_P = sklearn.svm.OneClassSVM(gamma=clf_P.best_params_['gamma'],
                                      kernel=clf_P.best_params_['kernel'], nu=clf_P.best_params_['nu'])
        clf_AD_P.fit(Positives)
        clf_AD_N = sklearn.svm.OneClassSVM(gamma=clf_N.best_params_['gamma'],
                                      kernel=clf_N.best_params_['kernel'], nu=clf_N.best_params_['nu'])
        clf_AD_N.fit(Negatives)

        valid_pred_P=clf_AD_P.predict(X_v)
        valid_pred_N=clf_AD_N.predict(X_v)
        
        return valid_pred_P, valid_pred_N, clf_AD_P, clf_AD_N
    
    def score_table(valid_pred_P, valid_pred_N):
        table = pd.DataFrame({'P': valid_pred_P,
                              'N': -1*valid_pred_N,
                              'O': y_valid})
        table['P_N'] = np.where((table['P'] == 1) & (table['N'] == -1), 1, 0)

        print(sklearn.metrics.accuracy_score(y_pred=table['P_N'], y_true=table['O']))
        print(sklearn.metrics.precision_score(y_pred=table['P_N'], y_true=table['O']))
        print(sklearn.metrics.recall_score(y_pred=table['P_N'], y_true=table['O']))
        
        return table        

In [82]:
p, n, clf_p, clf_n = oneclass_models.uni_svm(X_train=X_train, X_valid=X_valid)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(5753, 139)
(283, 139)
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:    2.7s finished


Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 10.1min finished


In [83]:
table=oneclass_models.score_table(valid_pred_N=n, valid_pred_P=p)

0.24792267254536204
0.04371704745166959
0.708185053380783


In [85]:
table

Unnamed: 0,P,N,O,P_N
0,1,-1,0,1
1,1,-1,0,1
2,1,-1,0,1
3,1,-1,0,1
4,-1,1,0,0
5,1,-1,0,1
6,1,-1,0,1
7,1,-1,0,1
8,1,-1,0,1
9,1,-1,0,1


In [86]:
IFA=IsolationForest(n_estimators=200, max_features=0.3)
IFA.fit(Negatives)

IsolationForest(bootstrap=False, contamination=0.1, max_features=0.3,
        max_samples='auto', n_estimators=200, n_jobs=1, random_state=None,
        verbose=0)

In [88]:
Negatives.shape

(5753, 139)

In [89]:
train_IFA=IFA.predict(Negatives)
test_IFA=IFA.predict(Positives)

In [90]:
def Train_Accuracy(Mat):
   
   Sum=0
   for i in Mat:
    
        if(i==1):
        
           Sum+=1.0
            
   return(Sum/len(Mat)*100)

def Test_Accuracy(Mat):
   
   Sum=0
   for i in Mat:
    
        if(i==-1):
        
           Sum+=1.0
            
   return(Sum/len(Mat)*100)

In [91]:
print("Training: Isolation Forest: ",(Train_Accuracy(train_IFA)),"%")
print("Test: Isolation Forest: ",(Test_Accuracy(test_IFA)),"%")

Training: Isolation Forest:  89.98783243525116 %
Test: Isolation Forest:  10.247349823321555 %
