# KNN Model Classification

A comprehensive KNN model classification model in python

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

In [52]:
file_location = 'C:\\Users\\achyuthuni.harsha\\Documents\\GPI-UD IND Level IMS Sales.xlsx'
pred_df = pd.read_excel(file_location)
train_df = pred_df[pred_df.CALMONTH == 201802].drop(['CALMONTH', 'DOL_TOT', 'DOL_TOT_EAAP', 'DOL_TOT_EUAP'], axis = 1)

In [33]:
train_df[0:9]

Unnamed: 0,GPI,UNIT_DOSE_IND,LOG_DOL_TOT,IMS_NUM_SPLR_PP_MNTH,RFP_BNFT,CLASS
0,21550080102120,N,5.162008,2,33,1
8,94100030006100,N,8.387862,2,-24,0
41,38000083100320,N,5.76732,2,15,1
66,47300020000100,N,5.266678,1,74,1
81,75100090100320,,,1,-97,0
89,,N,8.767409,2,10,1
103,6110001000H410,Y,5.738052,1,-43,0
132,15000050000305,N,5.722823,1,-22,0
133,85158060100320,N,5.556284,2,-4,0


In [28]:
clss_continuous_cut = 0
print "Definition of Positive and Negative benefit class"
past_clss_array = {0:'<= ${:,}'.format(clss_continuous_cut), 1:'> ${:,}'.format(clss_continuous_cut)}
print past_clss_array

train_df['CLASS'] = 9999

train_df.loc[train_df['RFP_BNFT'] <= clss_continuous_cut, 'CLASS'] = 0
train_df.loc[train_df['RFP_BNFT'] > clss_continuous_cut, 'CLASS'] = 1

Definition of Positive and Negative benefit class
{0: '<= $0', 1: '> $0'}


In [35]:
#defining a global function to create ftr_array
def create_ftr_array(data_df, ftr_col_nam_list):    
    ftr_col_vals_list = []
    for ftr in ftr_col_nam_list:
        ftr_col_vals_list.append(data_df[ftr].tolist())
    ftr_array = np.transpose(ftr_col_vals_list)
    return ftr_array

In [36]:
#defining a global class to create a data array
class dataArrayClass(object):
    def __init__(self, ftr_array, ftr_labels, clss_array=None, clss_labels_dict=None, clss_continuous_array=None):
        """ Init dataArrayClass. """
        self.ftr_array = ftr_array
        self.ftr_labels = ftr_labels
        self.clss_array = clss_array
        if clss_array is None:
            self.classes = None
            self.n_classes = None
            self.clss_labels_dict = None
        else:
            self.classes = np.unique(clss_array)
            self.n_classes = len(self.classes)
            if clss_labels_dict:
                self.clss_labels_dict = clss_labels_dict
#             else:
#                 self.clss_labels_dict = {[(clss, '%s Class'%(str(clss))) for clss in self.classes]}       
        self.clss_continuous_array = clss_continuous_array   
        self.num_entries = ftr_array.shape[0]
        self.n_ftrs = ftr_array.shape[1]
        self.standardized_flg = False
        self.stndrd_ftr_array = None
        self.filled_nan_flg = False
        self.fill_nan_method = None
        self.train_test_split_frac = None
        self.train_ftr_array = None
        self.train_clss_array = None
        self.train_clss_continuous_array = None
        self.test_ftr_array = None
        self.test_clss_array = None
        self.test_clss_continuous_array = None
        self.stndrd_train_ftr_array = None
        self.stndrd_test_ftr_array = None
        self.train_test_split_resampling_flg = None
        self.balance_flg = None
        self.balance_oversample_flg = None
    
    def standardize_ftr_array(self):
        self.standardized_flg = True

        ftr_mean = np.nanmean(self.ftr_array, axis=0)
        ftr_stddev = np.nanstd(self.ftr_array, axis=0)
        self.stndrd_ftr_array = (self.ftr_array - ftr_mean) / ftr_stddev
        

    def fill_ftr_array_missing_values(self, fill_nan_method='mean'):
        self.filled_nan_flg = True
        self.fill_nan_method = fill_nan_method
        nan_indices = np.where(np.isnan(self.ftr_array))
        if fill_nan_method == 'mean':
            self.ftr_array[nan_indices] = np.nanmean(self.ftr_array, axis=0)[nan_indices[1]]
            if self.standardized_flg:
                self.stndrd_ftr_array[nan_indices] = np.nanmean(self.stndrd_ftr_array, axis=0)[nan_indices[1]]
                
    def create_test_train_sets(self, train_test_split_frac=0.8, train_test_split_resampling_flg=False, balance_flg=True, balance_oversample_flg=True):
        self.train_test_split_frac = train_test_split_frac
        self.train_test_split_resampling_flg = train_test_split_resampling_flg
        num_entries_train = int(np.floor(self.num_entries * self.train_test_split_frac))

        # Fill a list of indices for the training set
        train_index_list = np.random.choice(self.num_entries, size=num_entries_train, replace=train_test_split_resampling_flg)

        # Fill a list of indices for the test set
        test_index_list = [i for i in range(self.num_entries) if i not in train_index_list]
        self.train_ftr_array = self.ftr_array[train_index_list, :]
        self.train_clss_array = self.clss_array[train_index_list]
        self.train_clss_continuous_array = self.clss_continuous_array[train_index_list]

        self.test_ftr_array = self.ftr_array[test_index_list, :]
        self.test_clss_array = self.clss_array[test_index_list]
        self.test_clss_continuous_array = self.clss_continuous_array[test_index_list]

        if self.standardized_flg:
            self.stndrd_train_ftr_array = self.stndrd_ftr_array[train_index_list, :]
            self.stndrd_test_ftr_array = self.stndrd_ftr_array[test_index_list, :]

        # If balance_flag, resample to balance classes
        if balance_flg:
            self.balance_train_set()

    def balance_train_set(self, balance_oversample_flg=True):
        self.balance_flg = True
        self.balance_oversample_flg = balance_oversample_flg
        nevts_per_class = [len(self.clss_array[self.clss_array==c]) for c in self.classes]
        max_class_nevts = max(nevts_per_class)
        min_class_nevts = min(nevts_per_class)
        if balance_oversample_flg:
            final_nevts = max_class_nevts * self.n_classes
            class_indices = zip(range(0, final_nevts, max_class_nevts),
                                range(max_class_nevts, final_nevts + 1, max_class_nevts)); 
        else:
            final_nevts = min_class_nevts * self.n_classes
            class_indices = zip(range(0, final_nevts, min_class_nevts),
                                range(min_class_nevts, final_nevts + 1, min_class_nevts));

        balanced_ftr_array = np.empty([final_nevts, self.n_ftrs])
        if self.standardized_flg:
            balanced_stndrd_ftr_array = np.empty([final_nevts, self.n_ftrs])
        balanced_clss_array = np.empty([final_nevts])
        balanced_clss_continuous_array = np.empty([final_nevts])
        # Oversample or undersample each class to have the same # of evts
        for class_label, nevts_class, array_rng in zip(self.classes, nevts_per_class, class_indices):
            class_ftr_array = self.ftr_array[self.clss_array==class_label]
            if self.standardized_flg:
                class_stndrd_ftr_array = self.stndrd_ftr_array[self.clss_array==class_label]

            class_clss_array = self.clss_array[self.clss_array==class_label]
            clss_continuous_array = self.clss_continuous_array[self.clss_array==class_label]
            if balance_oversample_flg and nevts_class < max_class_nevts:
                oversample_numevts = max_class_nevts - nevts_class
                oversample_indices = np.random.choice(nevts_class, size=oversample_numevts, replace=True)
                balanced_ftr_array[array_rng[0]:array_rng[1], :] = np.append(class_ftr_array, class_ftr_array[oversample_indices], axis=0)
                if self.standardized_flg:
                    balanced_stndrd_ftr_array[array_rng[0]:array_rng[1], :] = np.append(class_stndrd_ftr_array, class_stndrd_ftr_array[oversample_indices], axis=0)
                balanced_clss_array[array_rng[0]:array_rng[1]]  = np.append(class_clss_array, class_clss_array[oversample_indices])
                balanced_clss_continuous_array[array_rng[0]:array_rng[1]]  = np.append(clss_continuous_array, clss_continuous_array[oversample_indices])

            elif not balance_oversample_flg and nevts_class > min_class_nevts:
                undersample_indices = np.random.choice(nevts_class, size=min_class_nevts, replace=False)
                balanced_ftr_array[array_rng[0]:array_rng[1], :] = class_ftr_array[undersample_indices]
                if self.standardized_flg:
                    balanced_stndrd_ftr_array[array_rng[0]:array_rng[1], :] = class__stndrdftr_array[undersample_indices]
                balanced_clss_array[array_rng[0]:array_rng[1]]  = class_clss_array[undersample_indices]
                balanced_clss_continuous_array[array_rng[0]:array_rng[1]]  = clss_continuous_array[undersample_indices]
            else:
                balanced_ftr_array[array_rng[0]:array_rng[1], :] = class_ftr_array
                if self.standardized_flg:
                    balanced_stndrd_ftr_array[array_rng[0]:array_rng[1], :] = class_stndrd_ftr_array
                balanced_clss_array[array_rng[0]:array_rng[1]]  = class_clss_array
                balanced_clss_continuous_array[array_rng[0]:array_rng[1]]  = clss_continuous_array
        self.train_ftr_array = balanced_ftr_array
        if self.standardized_flg:
            self.stndrd_train_ftr_array = balanced_stndrd_ftr_array
        self.train_clss_array = balanced_clss_array
        self.train_clss_continuous_array = balanced_clss_continuous_array

In [38]:
ftr_opt_list = ['LOG_DOL_TOT','IMS_NUM_SPLR_PP_MNTH']
past_clss_array = np.array(train_df['CLASS'].tolist())
past_clss_continuous_array = np.array(train_df['RFP_BNFT'].tolist())

opt_ftr_train_ftr_array = create_ftr_array(train_df, ftr_opt_list)
optFtrTrainData = dataArrayClass(opt_ftr_train_ftr_array, ftr_opt_list, clss_array=past_clss_array, clss_continuous_array=past_clss_continuous_array)
optFtrTrainData.standardize_ftr_array()
optFtrTrainData.fill_ftr_array_missing_values()
optFtrTrainData.create_test_train_sets(train_test_split_frac=0.8, train_test_split_resampling_flg=False, balance_flg=True, balance_oversample_flg=True)

In [42]:
opt_ftr_train_ftr_array

array([[ 5.16200849,  2.        ],
       [ 8.38786247,  2.        ],
       [ 5.7673199 ,  2.        ],
       ..., 
       [ 3.7767738 ,  2.        ],
       [ 5.48785924,  2.        ],
       [ 6.7842386 ,  1.        ]])

In [67]:
#global class to define classifiers
class clfClass(object):
    def __init__(self, clf):
        """Inits clfClass."""
        self.clf = clf
        self.label = None
        self.param_grid = None
        self.param_grid_scores = None
        self.scoring = None
        self.best_params = None
        self.best_score = None
        self.roc_params = None
        self.roc_auc = None
        self.roc_false_pos_rate = None
        self.roc_true_pos_rate = None
        self.roc_threshold = None
        self.use_standardized_data_flg = False
        self.use_sample_weights_flg = False

    def optimize(self, data_array, scoring, n_folds_cv=5, filename=None):
        self.scoring = scoring
     
        clf_opt = GridSearchCV(self.clf, self.param_grid, scoring=scoring, cv=n_folds_cv)
        if self.use_standardized_data_flg:
            clf_opt.fit(data_array.stndrd_ftr_array, data_array.clss_array)
        else:
            clf_opt.fit(data_array.ftr_array, data_array.clss_array)

        # Save scores for parameter combos
        self.param_grid_scores = clf_opt.grid_scores_
        if filename is not None:
            with open(filename, 'a') as f:
                for grid_score in clf_opt.grid_scores_:
                    param_str = '%s,'%(grid_score.parameters)
                    if type(scoring) == str:
                        score_type_str = '%s,'%(scoring)
                    elif 'func_name' in dir(scoring):
                        score_type_str = '%s,'%(scoring.func_name)
                    scores_str = n_folds_cv*'%9.8f,'%tuple(grid_score.cv_validation_scores)
                    mean_score_str = '%9.8f'%(grid_score.mean_validation_score)
                    f.write(self.label + ',' + score_type_str + param_str + scores_str + mean_score_str + '\r\n')

        # Set clf's params to best params
        self.best_params = clf_opt.best_params_
        self.clf.set_params(**self.best_params)
        self.best_score = clf_opt.best_score_

    def predict(self, train_ftr_array, train_clss_array, pred_ftr_array, sample_weights=None):
        if 'decision_function' in dir(self.clf):
            if sample_weights is None:
                return self.clf.fit(train_ftr_array, train_clss_array).decision_function(pred_ftr_array)
            else:
                return self.clf.fit(train_ftr_array, train_clss_array, sample_weight=sample_weights).decision_function(pred_ftr_array)
        elif 'predict_proba' in dir(self.clf):
            # Get column 1 to get prediction for positive class
            prediction = self.clf.fit(train_ftr_array, train_clss_array).predict_proba(pred_ftr_array)
            return prediction


    def calc_AUC(self, train_ftr_array, train_clss_array, test_ftr_array, test_clss_array, sample_weights=None):
        if train_ftr_array.size != 0:
            prediction = self.predict(train_ftr_array, train_clss_array, test_ftr_array, sample_weights=sample_weights)
            if 'decision_function' in dir(self.clf):
                self.roc_false_pos_rate, self.roc_true_pos_rate, self.roc_threshold = metrics.roc_curve(test_clss_array, prediction)
            elif 'predict_proba' in dir(self.clf):
                pos_clss_prediction = prediction[:, 1]    
                self.roc_false_pos_rate, self.roc_true_pos_rate, self.roc_threshold = metrics.roc_curve(test_clss_array, pos_clss_prediction)
            self.roc_auc = metrics.auc(self.roc_false_pos_rate, self.roc_true_pos_rate)
            self.roc_params = self.clf.get_params()
            return self.roc_auc
        else:
            return None

    def calc_pos_class_accuracy(self,  train_ftr_array, train_clss_array, test_ftr_array, test_clss_array, threshold=0.5):
        if train_ftr_array.size != 0:
            prediction = self.predict(train_ftr_array, train_clss_array, test_ftr_array)
            if 'decision_function' in dir(self.clf):
                above_threshold_evts = prediction >= threshold
                n_evts = len(prediction[above_threshold_evts])
            elif 'predict_proba' in dir(self.clf):
                pos_clss_prediction = prediction[:, 1]
                above_threshold_evts = pos_clss_prediction >= threshold
                n_evts = len(pos_clss_prediction[above_threshold_evts])
            accuracy = 1.0*sum(test_clss_array[above_threshold_evts]==1)
            return accuracy, 100
        else:
            return None

    def custom_precision_scorer(estimator, test_ftr_array, test_clss_array):  
        threshold = 0.8
        prediction = estimator.fit(train_ftr_array, train_clss_array).predict_proba(pred_ftr_array)[:, 1]
        above_threshold_evts = prediction >= threshold
        n_evts = len(prediction[above_threshold_evts])
        precision = 1.0*sum(test_clss_array[above_threshold_evts]==1)
        return precision

In [68]:
clf_KNN =  clfClass(KNeighborsClassifier(n_neighbors=50))
clf_KNN.label = 'KNN'
clf_KNN.use_standardized_data_flg = True

In [69]:
pred_ftr_array = create_ftr_array(pred_df, ftr_opt_list)
predData = dataArrayClass(pred_ftr_array, ftr_opt_list)
predData.standardize_ftr_array()
predData.fill_ftr_array_missing_values()

In [70]:
prediction = clf_KNN.predict(optFtrTrainData.stndrd_ftr_array, optFtrTrainData.clss_array, predData.stndrd_ftr_array)

In [71]:
accuracy_08, num_evts_08 = clf_KNN.calc_pos_class_accuracy(optFtrTrainData.stndrd_train_ftr_array, (optFtrTrainData.train_clss_array>0)*1, optFtrTrainData.stndrd_test_ftr_array, (optFtrTrainData.test_clss_continuous_array>0)*1, threshold=0.8)

In [72]:
accuracy_08, num_evts_08

(0.0, 100)