In [1]:
# general packages
import pandas as pd
import numpy as np

# for model statistics
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.model_selection import train_test_split

In [2]:
# # # DATA # # #
# load iris flower data
from sklearn.datasets import load_iris

iris = load_iris()

x = iris.data
y = iris.target

# split data into train and test sets
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=.33, random_state=7)

In [7]:
print("IRIS DATA")
pd.DataFrame(x).describe()

IRIS DATA


Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [8]:
print('IRIS TARGET DATA')
pd.DataFrame(y).describe()

IRIS TARGET DATA


Unnamed: 0,0
count,150.0
mean,1.0
std,0.819232
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,2.0


In [24]:
import hashlib

# Cross Validation
def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]

# how to use
# use part of a table as index
irisdata = pd.DataFrame(iris.data).reset_index()
xtrain, xtest = split_train_test_by_id(irisdata, 0.2, 'index')
irisdata2 = pd.DataFrame(iris.target).reset_index()
ytrain, ytest = split_train_test_by_id(irisdata2, 0.2, 'index')

ytrain = ytrain.iloc[:,1:]
xtrain = xtrain.iloc[:,1:]
ytest = ytest.iloc[:,1:]
xtest = xtest.iloc[:,1:]


# print('xtest_size:', test_set.shape[0], '\nxtrain_size:', train_set.shape[0])

In [25]:
# # # DECISION TREE # # #
from sklearn import tree

#create tree object
# algorithms are gini or entropy
model1 = tree.DecisionTreeClassifier(criterion='entropy') # entropy tends to produce more balanced trees

# x(predictor) y(target) x_test(predictor) of test_dataset
model1.fit(xtrain, ytrain)
# model1.score(x, y)

# make predictions for test data
y_pred1 = model1.predict(xtest)
predictions1 = [round(v) for v in y_pred1]
ac1 = accuracy_score(ytest, predictions1)
loss1 = hamming_loss(ytest, predictions1)
print("Model Scores")
print("  Accuracy: {}%".format(ac1*100.0))
print("  Loss: {}".format(loss1))

# feature imporance
print('\nFeature Importance')
for name, score in zip(iris['feature_names'], model1.feature_importances_):
    print("  ", name, ':\t', round(score, 2))

# # visualize decision tree
# # to convert dot file to png on terminal:$ dot tree.dot -Tpng -o tree.png
# from sklearn.tree import export_graphviz

# export_graphviz(
#     model,
#     out_file="/home/rlougee/Desktop/iris_tree.dot",
#     feature_names=iris.feature_names[2:],
#     class_names=iris.target_names,
#     rounded=True,
#     filled=True
# )

Model Scores
  Accuracy: 92.85714285714286%
  Loss: 0.07142857142857142

Feature Importance
   sepal length (cm) :	 0.01
   sepal width (cm) :	 0.0
   petal length (cm) :	 0.33
   petal width (cm) :	 0.65


In [28]:
# # # RANDOM FOREST # # #
from sklearn.ensemble import RandomForestClassifier

# create random forest object
model2 = RandomForestClassifier(n_estimators=1000)

# Train the model using the training sets and check score
model2.fit(xtrain, ytrain)

# make predictions for test data
y_pred2 = model2.predict(xtest)
predictions2 = [round(v) for v in y_pred2]
ac2 = accuracy_score(ytest, predictions2)
loss2 = hamming_loss(ytest, predictions2)
print("Model Scores")
print("  Accuracy: {}%".format(round(ac2*100.0),2))
print("  Loss: {}".format(round(loss2, 2)))

# feature imporance
print('\nFeature Importance')
for name, score in zip(iris['feature_names'], model2.feature_importances_):
    print("  ", name, ':\t', round(score, 2))

  


Model Scores
  Accuracy: 93.0%
  Loss: 0.07

Feature Importance
   sepal length (cm) :	 0.09
   sepal width (cm) :	 0.02
   petal length (cm) :	 0.46
   petal width (cm) :	 0.43


In [29]:
# # # GBM # # #
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.ensemble import GradientBoostingRegression

# create GBM object
model3 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1)

# train the model
model3.fit(xtrain, ytrain)

# make predictions for test data
y_pred3 = model3.predict(xtest)
predictions3 = [round(v) for v in y_pred3]
ac3 = accuracy_score(ytest, predictions3)
loss3 = hamming_loss(ytest, predictions3)
print("Model Scores")
print("  Accuracy: {}%".format(round(ac3*100.0, 2)))
print("  Loss: {}".format(round(loss3, 2)))

# feature imporance
print('\nFeature Importance')
for name, score in zip(iris['feature_names'], model3.feature_importances_):
    print("  ", name, ':\t', round(score, 2))

  y = column_or_1d(y, warn=True)


Model Scores
  Accuracy: 92.86%
  Loss: 0.07

Feature Importance
   sepal length (cm) :	 0.03
   sepal width (cm) :	 0.19
   petal length (cm) :	 0.16
   petal width (cm) :	 0.13


In [6]:
# # # XGBOOST # # #
from xgboost import XGBClassifier

# create XGB model
model4 = XGBClassifier()

# train XGB model
model4.fit(x, y)

# make predictions for test data
y_pred4 = model4.predict(xtest)
predictions4 = [round(v) for v in y_pred4]
ac4 = accuracy_score(ytest, predictions4)
loss4 = hamming_loss(ytest, predictions4)
print("Model Scores")
print("Accuracy: {}%".format(ac4*100.0))
print("Loss: {}".format(loss4))

# feature imporance
print('\nFeature Importance')
for name, score in zip(iris['feature_names'], model4.feature_importances_):

    print("  ", name, ':\t', round(score, 2))

Model Scores
Accuracy: 100.0%
Loss: 0.0

Feature Importance
   sepal length (cm) :	 0.18
   sepal width (cm) :	 0.11
   petal length (cm) :	 0.41
   petal width (cm) :	 0.3


  if diff:


In [None]:
# # # PARAMETER TUNING # # #
# 1) choose high learning rate
    # (0.05 - 0.30) generally 0.10 works
    # AND determine optimum number of tress (xgboost cv)
# 2) tune tree-specific parameters
    # max_depth (3-10) 5 is good
    # min_child_weight 1
    # gamma (0.1 and 0.2 are ok) 0
    # subsample, colsample_bytree (0.5-0.9) 0.8
    # scale_pos_weight 1
# 3) tune regularization parameters
    # lambda, alpha
# 4) lower learning rate and decide on optimal parameters

In [None]:
#Choose all predictors except target & IDcols
predictors = [x for x in train.columns if x not in [target, IDcol]]

xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [None]:
# # # CV FUNCTION # # #

def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))
                    
#     feat_imp = pd.Series(alg.get_fscore()).sort_values(ascending=False)
#     feat_imp.plot(kind='bar', title='Feature Importances')
#     plt.ylabel('Feature Importance Score')

In [None]:
clf = XGBFeatureImportances(predictors)
clf.fit(xgb1, train)
importances = clf.feature_importances_