In [1]:
import scipy as sp
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from scipy.stats import randint as sp_randint
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import RandomizedSearchCV
from time import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Requires the imbalanced-learn package (provides the `imblearn` module).
# Install with: conda install -c glemaitre imbalanced-learn
from imblearn.over_sampling import SMOTE
# Load the tab-delimited train/test tables. Variable selection happened
# upstream: 50 variables were chosen by baseline random-forest importance.
train = pd.read_csv('training_self1_0127.txt', sep='\t')
test = pd.read_csv('testing_self1_0127.txt', sep='\t')

In [3]:
# Split off the label columns, drop the member identifier, and min-max scale
# the remaining feature columns.
train_Ycat = train['Ycat']
train_PERT = train['PERT_Y']
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally,
# so the original `drop('Ycat', 1)` form raises a TypeError there.
train = train.drop(['Ycat', 'MemberID', 'PERT_Y'], axis=1)

test_Ycat = test['Ycat']
test_PERT = test['PERT_Y']
test = test.drop(['Ycat', 'PERT_Y', 'MemberID'], axis=1)

# Fit the scaler on the training features only, then apply that same
# transform to the test features.
scaler = MinMaxScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

# Wrap the scaled values back into DataFrames (default integer column labels).
train_std = pd.DataFrame(train)
test_std = pd.DataFrame(test)

# Binary training target: 1 where PERT_Y == 'Y', otherwise 0.
res = [1 if x == 'Y' else 0 for x in train_PERT]

In [4]:
# Oversample the training data with SMOTE (fixed random_state).
sm = SMOTE(random_state=1234)
# Newer imbalanced-learn releases expose the resampling call as
# `fit_resample`; older ones only provide `fit_sample`. Prefer the new name
# and fall back to the old one so the cell runs on either.
_smote_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_res, y_res = _smote_resample(train_std, res)

In [5]:
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params', each an indexable sequence with one
        entry per candidate (this notebook passes
        ``random_search.cv_results_``).
    n_top : int, default 3
        Ranks 1..n_top are printed; candidates tied on a rank are all shown.

    Returns
    -------
    None. The summary is written to stdout.
    """
    # Coerce to an array so the rank comparison is element-wise even when the
    # ranks arrive as a plain list; `list == int` is a single False, which
    # previously made the function print nothing at all.
    ranks = np.asarray(results['rank_test_score'])
    for i in range(1, n_top + 1):
        for candidate in np.flatnonzero(ranks == i):
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            

In [6]:
# Search space handed to the randomized hyper-parameter search below: list
# entries are discrete choices, scipy frozen distributions are continuous or
# integer ranges. The same layout could be reused for other models.
# NOTE(review): for GradientBoostingClassifier, 'auto' and 'sqrt' may resolve
# to the same max_features setting -- confirm against the installed
# scikit-learn documentation.
param_dist = dict(
    loss=['deviance', 'exponential'],
    learning_rate=sp.stats.expon(scale=.1),
    min_samples_split=sp_randint(2, 11),
    min_samples_leaf=sp_randint(1, 11),
    max_depth=sp_randint(1, 5),
    max_features=['auto', 'sqrt', 'log2'],
)


# Optimize Hyper-parameters by F1 Score

In [7]:
# Base gradient-boosting classifier (fixed random_state) whose hyper-parameters
# are tuned by the randomized search in the next cell.
gbc=GradientBoostingClassifier(random_state=1234)

In [8]:
# Randomized search: draw n_iter_search candidate settings from param_dist and
# score each one by cross-validated F1.
# `np.random.seed` returns None, so the seed value is kept in its own variable
# and also passed to the search explicitly, rather than relying only on the
# global NumPy random state.
seed = 1234
np.random.seed(seed)
n_iter_search = 20
random_search = RandomizedSearchCV(gbc, param_distributions=param_dist,
                                   n_iter=n_iter_search, scoring='f1',
                                   random_state=seed, verbose=100)

# Time the full search and report the wall-clock cost.
start = time()
random_search.fit(X_res, y_res)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

# Summarise the best-ranked candidates.
report(random_search.cv_results_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] max_features=auto, learning_rate=0.0212598657618, min_samples_split=10, min_samples_leaf=5, loss=deviance, max_depth=2 
[CV]  max_features=auto, learning_rate=0.0212598657618, min_samples_split=10, min_samples_leaf=5, loss=deviance, max_depth=2, score=0.941331, total= 5.9min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.0min remaining:    0.0s
[CV] max_features=auto, learning_rate=0.0212598657618, min_samples_split=10, min_samples_leaf=5, loss=deviance, max_depth=2 
[CV]  max_features=auto, learning_rate=0.0212598657618, min_samples_split=10, min_samples_leaf=5, loss=deviance, max_depth=2, score=0.939715, total= 6.5min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 12.5min remaining:    0.0s
[CV] max_features=auto, learning_rate=0.0212598657618, min_samples_split=10, min_samples_leaf=5, loss=deviance, max_depth=2 
[CV]  max_features=auto, learning_rate=0.0212598657618, min_samples_split=10, min_samples_le

In [9]:
# Result recorded from the randomized search run:
#   RandomizedSearchCV took 9750.01 seconds for 20 candidates parameter settings.
#   Model with rank: 1
#   Mean validation score: 0.995 (std: 0.002)
#   Parameters: {'max_features': 'auto', 'learning_rate': 0.24318158771707576,
#                'min_samples_split': 4, 'min_samples_leaf': 5,
#                'loss': 'deviance', 'max_depth': 3}
# The rank-1 setting is pinned here by hand -- presumably so the final model
# can be rebuilt without repeating the long search.
best_params = {
    'loss': 'deviance',
    'learning_rate': 0.24318158771707576,
    'min_samples_leaf': 5,
    'min_samples_split': 4,
    'max_features': 'auto',
    'max_depth': 3,
}
gbc = GradientBoostingClassifier(random_state=1234, **best_params)


In [10]:
# Fit the tuned model on the SMOTE-resampled training data.
gbc.fit(X_res, y_res)

# Score the original (non-resampled) training rows.
predicted_probs = gbc.predict_proba(train_std)
a = predicted_probs[:, 1]
b = predicted_probs[:, 0]
# The original cell also evaluated np.mean(a) and np.mean(b) here, but the
# results were neither stored nor displayed, so those statements are dropped.

# Second-column probabilities, highest first.
a1 = sorted(a, reverse=True)
a2 = pd.DataFrame(a1)

# Cut-off rank: four times the number of 'Y' labels in the training data.
n_flagged = train_PERT.value_counts()["Y"] * 4
# Keep the cut-off probability in a variable so it can be reused at full
# precision instead of being copied by hand from the displayed output.
threshold = a2.iloc[n_flagged - 1, 0]
# Same final expression as before, so the displayed cell output is unchanged.
a2.iloc[n_flagged - 1, :]

0    0.20641
Name: 6679, dtype: float64

In [11]:
#Synchronize with Jeff's Scoring Functions
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score, average_precision_score, brier_score_loss, \
    fbeta_score, confusion_matrix
from creonmetrics import labeled_metric, assumed_metric, pu_score, \
    pr_one_unlabeled, brier_score_partial_loss

# Test-split labels and the model's predicted class probabilities.
y_true = test_Ycat.values
y_prob = gbc.predict_proba(test_std)

# Hard 0/1 predictions: 1 where the second-column probability is at least
# 0.20641 (the cut-off value displayed by the training-score ranking cell).
y_pred = (y_prob[:, 1] >= 0.20641).astype(int)

In [12]:
import collections

# Collect every evaluation metric under a descriptive key, then print them in
# alphabetical key order.
# NOTE(review): labeled_metric / assumed_metric / pu_score / pr_one_unlabeled
# come from the project-local `creonmetrics` module; their exact handling of
# y_true and of the two-column y_prob is not visible here -- confirm there.
# NOTE(review): the ROC-AUC and average-precision entries are fed the hard
# 0/1 predictions (y_pred), not probabilities -- confirm this is intended.
data = dict(
    labeled_acc=labeled_metric(y_true, y_pred, accuracy_score),
    labeled_prec=labeled_metric(y_true, y_pred, precision_score),
    labeled_recall=labeled_metric(y_true, y_pred, recall_score),
    labeled_f1=labeled_metric(y_true, y_pred, f1_score),
    labeled_roc_auc=labeled_metric(y_true, y_pred, roc_auc_score),
    labeled_avg_prec=labeled_metric(y_true, y_pred, average_precision_score),
    labeled_brier=labeled_metric(y_true, y_prob, brier_score_loss),
    labeled_brier_pos=labeled_metric(y_true, y_prob, brier_score_partial_loss, label=1),
    labeled_brier_neg=labeled_metric(y_true, y_prob, brier_score_partial_loss, label=0),
    confusion_matrix_lab=labeled_metric(y_true, y_pred, confusion_matrix),
    pr_one_unlabeled=pr_one_unlabeled(y_true, y_pred),
    assumed_brier=assumed_metric(y_true, y_prob, brier_score_loss),
    assumed_brier_neg=assumed_metric(y_true, y_prob, brier_score_partial_loss, label=0),
    assumed_f1=assumed_metric(y_true, y_pred, f1_score),
    assumed_f1beta10=assumed_metric(y_true, y_pred, fbeta_score, beta=10),
    confusion_matrix_un=assumed_metric(y_true, y_pred, confusion_matrix),
    pu_score=pu_score(y_true, y_pred),
)

# Sort by metric name so the printed report has a stable order.
data_s = collections.OrderedDict(sorted(data.items()))
for k, v in data_s.items():
    print(k, ': ', v)


assumed_brier :  0.00640939879289
assumed_brier_neg :  0.00353866878448
assumed_f1 :  0.190697674419
assumed_f1beta10 :  0.475627124874
confusion_matrix_lab :  [[199  17]
 [213 205]]
confusion_matrix_un :  [[86195  1527]
 [  213   205]]
labeled_acc :  0.637223974763
labeled_avg_prec :  0.874908095272
labeled_brier :  0.410339547276
labeled_brier_neg :  0.0261572522252
labeled_brier_pos :  0.6088643696
labeled_f1 :  0.640625
labeled_prec :  0.923423423423
labeled_recall :  0.49043062201
labeled_roc_auc :  0.705863459153
pr_one_unlabeled :  0.0172559595913
pu_score :  12.2399689767
